In [1]:
# Load API credentials from a local text file so that no secret is hardcoded
# in the notebook. Expected format: one `NAME:VALUE` pair per line.
with open('../../my_key.txt') as f:
    key_list = [line.strip() for line in f]

keys = {}
for entry in key_list:
    # Split on the first ':' only, so a value that itself contains ':' is kept whole.
    name, sep, value = entry.partition(':')
    if not sep:
        # Blank or malformed line (no ':'): skip it instead of raising IndexError.
        continue
    keys[name.strip()] = value.strip()

In [2]:
import os
# Export the Google credentials as environment variables; later cells read
# them back through os.environ.
for _name in ("GOOGLE_CSE_ID", "GOOGLE_API_KEY"):
    os.environ[_name] = keys[_name]

# Information Gathering
- Objectives
    - Gather Information From Web
- I/O
    - Input: Query
    - Output: Text relevant to answering the query
- Features
    - All Web Search
        - The search range can only be configured in the Google Programmable Search Engine's settings
    - Domain Search
        - Only search in some specific domain

## 1. Query Construction

In [23]:
# Natural-language question used as the search input in the cells below.
query = "Why Elon Musk bought twitter?"

In [24]:
# Site-restricted query variants that were tried; the trailing note on each
# line is the author's assessment of the results. Only the last, unrestricted
# form is active and is stored in query_c (the original `query` is left untouched).
# query = f"site:youtube.com search:{query}"  # Good
# query = f"site:arxiv.org/pdf search:{query}"  # Good
# query = f"site:arxiv.org search:{query}"  # Good
# query = f"site:arxiv.org/abs search:{query}"  # Good
# query = f"site:scholar.google.com search:{query}"  # Need Process
# query = f"site:wikipedia.org search:{query}"  # Good
# query = f"site:news.google.com search:{query}"  # Good
# query = f"site:news.google.com OR site:cnn.com search:{query}"  # Not Good enough
query_c = f"search:{query}"  # Good

## 2. Search
- I/O
    - Input: Query
    - Output: {"Link", "Title", "Snippet"}
- Performance
    - Runtime: Less than 2s

### Google Custom Search Engine
- Method:
    - Google Official Python API
    - Langchain - GoogleSearchAPIWrapper
- Problem

#### Google Official API

In [4]:
# Build a Google Search Service
# Creates a client for the "customsearch" API, version "v1", using the API key
# that was exported to the GOOGLE_API_KEY environment variable.
from googleapiclient.discovery import build

service = build(
    "customsearch", "v1",
    developerKey=os.environ["GOOGLE_API_KEY"]
)

In [5]:
# Get Search Result
# Sends the constructed query (query_c) to the search engine identified by
# GOOGLE_CSE_ID and stores the response in search_result.
search_result = service.cse().list(
    q = query_c,
    cx = os.environ["GOOGLE_CSE_ID"]
).execute()

In [6]:
# Display the raw API response to inspect its structure.
search_result

{'kind': 'customsearch#search',
 'url': {'type': 'application/json',
  'template': 'https://www.googleapis.com/customsearch/v1?q={searchTerms}&num={count?}&start={startIndex?}&lr={language?}&safe={safe?}&cx={cx?}&sort={sort?}&filter={filter?}&gl={gl?}&cr={cr?}&googlehost={googleHost?}&c2coff={disableCnTwTranslation?}&hq={hq?}&hl={hl?}&siteSearch={siteSearch?}&siteSearchFilter={siteSearchFilter?}&exactTerms={exactTerms?}&excludeTerms={excludeTerms?}&linkSite={linkSite?}&orTerms={orTerms?}&relatedSite={relatedSite?}&dateRestrict={dateRestrict?}&lowRange={lowRange?}&highRange={highRange?}&searchType={searchType}&fileType={fileType?}&rights={rights?}&imgSize={imgSize?}&imgType={imgType?}&imgColorType={imgColorType?}&imgDominantColor={imgDominantColor?}&alt=json'},
 'queries': {'request': [{'title': 'Google Custom Search - search:Why Elon Musk bought twitter?',
    'totalResults': '55100000',
    'searchTerms': 'search:Why Elon Musk bought twitter?',
    'count': 10,
    'startIndex': 1,
  

#### Langchain GoogleSearchAPIWrapper


In [7]:
from langchain.utilities import GoogleSearchAPIWrapper

# LangChain wrapper around Google Custom Search.
# NOTE(review): presumably picks up GOOGLE_API_KEY / GOOGLE_CSE_ID from the
# environment variables set earlier — confirm against the LangChain docs.
google_search = GoogleSearchAPIWrapper(
    k = 10,  # presumably the default number of results — TODO confirm
    siterestrict = False
)


In [8]:
# TODO: this call is not robust (original note: "really low robustness");
# its failure modes still need to be investigated.
# Rebinds search_result (previously the official-API response) to the wrapper's
# output; later cells read the 'title', 'link' and 'snippet' keys of each item.
search_result = google_search.results(query_c, 10)

In [9]:
# Print the wrapper's results as plain text.
print(search_result)

[{'title': 'Acquisition of Twitter by Elon Musk - Wikipedia', 'link': 'https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk', 'snippet': 'Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake. Twitter invited\xa0...'}, {'title': "Hate Speech's Rise on Twitter Under Elon Musk Is Unprecedented ...", 'link': 'https://www.nytimes.com/2022/12/02/technology/twitter-hate-speech.html', 'snippet': "Dec 2, 2022 ... Hate Speech's Rise on Twitter Is Unprecedented, Researchers Find ... Before Elon Musk bought Twitter, slurs against Black Americans showed\xa0..."}, {'title': 'Why Elon Musk Bought Twitter | The New Yorker', 'link': 'https://www.newyorker.com/news/q-and-a/why-elon-musk-bought-twitter', 'snippet': "Apr 26, 2022 ... Why do you think Elon Musk is buying Twitter? I assume it's because he has some genuine set of political and social beliefs about how Twitter\xa0..."}, {'title': 'Elon Musk - Wik

In [10]:
# List each result's URL prefixed by its 0-based rank.
# enumerate() replaces the hand-maintained counter.
for i, sr in enumerate(search_result):
    print(i,' ', sr['link'])

0   https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk
1   https://www.nytimes.com/2022/12/02/technology/twitter-hate-speech.html
2   https://www.newyorker.com/news/q-and-a/why-elon-musk-bought-twitter
3   https://en.wikipedia.org/wiki/Elon_Musk
4   https://www.cnn.com/2022/10/27/tech/elon-musk-twitter/index.html
5   https://www.npr.org/2022/10/07/1127337447/heres-what-elon-musk-will-likely-do-with-twitter-if-he-buys-it
6   https://twitter.com/elonmusk
7   https://www.npr.org/2022/04/25/1094671225/elon-musk-bought-twitter-plans
8   https://www.pbs.org/newshour/economy/elon-musk-says-hes-found-a-woman-to-lead-twitter-as-new-ceo
9   https://help.twitter.com/en/using-twitter/twitter-blue


In [11]:
# List each result's title prefixed by its 0-based rank.
# enumerate() replaces the hand-maintained counter.
for i, sr in enumerate(search_result):
    print(i,' ', sr['title'])

0   Acquisition of Twitter by Elon Musk - Wikipedia
1   Hate Speech's Rise on Twitter Under Elon Musk Is Unprecedented ...
2   Why Elon Musk Bought Twitter | The New Yorker
3   Elon Musk - Wikipedia
4   Elon Musk has taken control of Twitter and fired its top executives ...
5   What Elon Musk will likely do with Twitter if he buys it : NPR
6   Elon Musk (@elonmusk) / Twitter
7   Elon Musk bought Twitter. Here's what he says he'll do next : NPR
8   Elon Musk says he's found a new CEO to lead Twitter | PBS NewsHour
9   About Twitter Blue


In [12]:
# List each result's snippet prefixed by its 0-based rank.
# enumerate() replaces the hand-maintained counter.
for i, sr in enumerate(search_result):
    print(i,' ', sr['snippet'])

0   Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake. Twitter invited ...
1   Dec 2, 2022 ... Hate Speech's Rise on Twitter Is Unprecedented, Researchers Find ... Before Elon Musk bought Twitter, slurs against Black Americans showed ...
2   Apr 26, 2022 ... Why do you think Elon Musk is buying Twitter? I assume it's because he has some genuine set of political and social beliefs about how Twitter ...
3   Elon Reeve Musk is a business magnate and investor. He is the founder, CEO and chief ... Musk expressed interest in buying Twitter as early as 2017, ...
4   Oct 28, 2022 ... Elon Musk has completed his $44 billion deal to buy Twitter, a source familiar with the deal told CNN ... Elon Musk just bought Twitter.
5   Oct 7, 2022 ... Billionaire Elon Musk's on-again, off-again bid to acquire Twitter advanced this week after he agreed to pay the $44 billion he had ...
6   And on June 6, the Dragon 2 flee

## 3. Text Processing
- I/O
    - Input: {"Link", "Title", "Snippet"}
    - Output: Text String

### 3.1. To Recommend Algorithm
- Performance
    - Runtime: Almost 0s

- Problem

#### Without Filter and Construct Text
- Structure: {Link Index} - {Title} - {Snippet}

In [29]:
# Construct Text to Recommend Algorithm
# First line is the original query; then one line per search hit in the form
# "source: url{i} - title: ... - snippet:...", where i is the hit's 0-based rank.
ra_lines = [query]
for i, sr in enumerate(search_result):
    ra_lines.append(' - '.join([
        'source: ' + f'url{i}',
        'title: ' + sr['title'],
        'snippet:' + sr['snippet']
    ]))
# Join once instead of growing the string inside the loop; every line,
# including the last one, is terminated by '\n'.
text_to_ra = '\n'.join(ra_lines) + '\n'
print(text_to_ra)

Why Elon Musk bought twitter?
source: url0 - title: Acquisition of Twitter by Elon Musk - Wikipedia - snippet:Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake. Twitter invited ...
source: url1 - title: Hate Speech's Rise on Twitter Under Elon Musk Is Unprecedented ... - snippet:Dec 2, 2022 ... Hate Speech's Rise on Twitter Is Unprecedented, Researchers Find ... Before Elon Musk bought Twitter, slurs against Black Americans showed ...
source: url2 - title: Why Elon Musk Bought Twitter | The New Yorker - snippet:Apr 26, 2022 ... Why do you think Elon Musk is buying Twitter? I assume it's because he has some genuine set of political and social beliefs about how Twitter ...
source: url3 - title: Elon Musk - Wikipedia - snippet:Elon Reeve Musk is a business magnate and investor. He is the founder, CEO and chief ... Musk expressed interest in buying Twitter as early as 2017, ...
source: url4 - title: Elo

In [30]:
# Save the Text to .txt
# Write as UTF-8 explicitly: the snippets contain non-ASCII characters
# (e.g. '\xa0'), which a platform-default encoding may fail to encode.
with open('text_to_ra.txt', 'w', encoding='utf-8') as f:
    f.write(text_to_ra)

### 3.2 To Understanding Synthesis Algorithm
- Problem:
    - Filtering the text gathered from the web (the problem is more serious with Langchain)

#### Web filtering with Top-K Method
- Description
    - Directly select the top-k results returned by Google CSE
    - Load the Web Link and Get the Text String

In [31]:
# List each result's URL prefixed by its 0-based rank, as a reminder of the
# candidates before selecting the top k.
# enumerate() replaces the hand-maintained counter.
for i, sr in enumerate(search_result):
    print(i,' ', sr['link'])

0   https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk
1   https://www.nytimes.com/2022/12/02/technology/twitter-hate-speech.html
2   https://www.newyorker.com/news/q-and-a/why-elon-musk-bought-twitter
3   https://en.wikipedia.org/wiki/Elon_Musk
4   https://www.cnn.com/2022/10/27/tech/elon-musk-twitter/index.html
5   https://www.npr.org/2022/10/07/1127337447/heres-what-elon-musk-will-likely-do-with-twitter-if-he-buys-it
6   https://twitter.com/elonmusk
7   https://www.npr.org/2022/04/25/1094671225/elon-musk-bought-twitter-plans
8   https://www.pbs.org/newshour/economy/elon-musk-says-hes-found-a-woman-to-lead-twitter-as-new-ceo
9   https://help.twitter.com/en/using-twitter/twitter-blue


In [32]:
# Top k Method
# Keep the links of the first k search hits. Slicing tolerates fewer than k
# results, whereas indexing search_result[i] for i in range(k) raises IndexError.
k = 3
web_url = [sr['link'] for sr in search_result[:k]]

print(web_url)

['https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk', 'https://www.nytimes.com/2022/12/02/technology/twitter-hate-speech.html', 'https://www.newyorker.com/news/q-and-a/why-elon-musk-bought-twitter']


#### Read with Requests + bs4

In [33]:
import requests
from bs4 import BeautifulSoup

def get_text_content_from_link(url, timeout=10):
    """Download a web page and return the text of its <p> elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request can block indefinitely on an unresponsive host.

    Returns:
        The text of every <p> element, each followed by a newline, or None
        when the request fails or the response status code is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failure (DNS, connection, timeout): report it like a
        # bad status code so callers only have to handle the None result.
        print(f"Failed to retrieve content from {url}. Error: {exc}")
        return None

    # Anything other than 200 is treated as a failed fetch.
    if response.status_code != 200:
        print(f"Failed to retrieve content from {url}. Status code: {response.status_code}")
        return None

    # Parse the HTML and concatenate the text of all <p> tags, one per line.
    soup = BeautifulSoup(response.content, 'html.parser')
    return ''.join(element.get_text() + '\n' for element in soup.find_all('p'))

In [48]:
# Build the text for the synthesis step: for each fetched page keep only its
# long lines and tag them with the index of the source URL.
MIN_LINE_LEN = 100  # lines of this length or shorter are dropped

text_to_su = ''
for i, url in enumerate(web_url):
    try:
        web_content = get_text_content_from_link(url)
    except Exception as exc:
        # Best effort: one bad page must not abort the run, but say why it was skipped.
        print(f"Skipping url{i} ({url}): {exc}")
        continue
    if web_content is None:
        # The fetch failed; get_text_content_from_link has already printed the reason.
        continue
    # Collect the long lines into a separate string. Appending them back onto
    # web_content would duplicate them instead of filtering out the short lines.
    filtered = ''.join(
        line + '\n' for line in web_content.split('\n') if len(line) > MIN_LINE_LEN
    )
    text_to_su += ''.join(['\n',f'[this text is from url{i}]', '\n', filtered])
    print(i)

0
Failed to retrieve content from https://www.nytimes.com/2022/12/02/technology/twitter-hate-speech.html. Status code: 403
2


In [49]:
# Write as UTF-8 explicitly: scraped page text is not limited to ASCII and a
# platform-default encoding may fail to encode it.
with open('text_to_su.txt', 'w', encoding='utf-8') as f:
    f.write(text_to_su)

# print(text_to_su)

#### Read with Langchain-Unstructured

In [50]:
from langchain.document_loaders import UnstructuredURLLoader

# Load the selected pages (web_url) through LangChain's UnstructuredURLLoader.
url_loader = UnstructuredURLLoader(
    urls = web_url,
    continue_on_failure = True,  # determines whether the loader should continue loading files even if there is a failure
    mode = "single"  # determines whether the loader should return a single document or a list of documents
)

# url_data is the sequence of loaded documents; the next cell reads each
# document's text from its .page_content attribute.
url_data = url_loader.load()

In [51]:
# Build the text for the synthesis step from the loaded documents: keep only
# the long lines of each document and tag them with the document's index.
MIN_LINE_LEN = 100  # lines of this length or shorter are dropped

text_to_su = ''

for i, doc in enumerate(url_data):
    # NOTE(review): the loader was created with continue_on_failure=True, so a
    # failed URL may be missing from url_data and i may not match the position
    # in web_url — confirm before relying on the url{i} tag.
    try:
        text = doc.page_content
        # Collect the long lines into a separate string. Appending them back
        # onto `text` would duplicate them instead of filtering out the short lines.
        filtered = ''.join(
            line + '\n' for line in text.split('\n') if len(line) > MIN_LINE_LEN
        )
    except Exception as exc:
        # Best effort: skip a malformed document, but say why.
        print(f"Skipping document {i}: {exc}")
        continue
    text_to_su += ''.join(['\n',f'[this text is from url{i}]', '\n', filtered])
    print(i)

0
1
2


In [52]:
# Write as UTF-8 explicitly: scraped page text is not limited to ASCII and a
# platform-default encoding may fail to encode it.
with open('text_to_su.txt', 'w', encoding='utf-8') as f:
    f.write(text_to_su)

# print(text_to_su)