In [2]:
from googleapiclient.discovery import build

API_KEY = 'AIzaSyCMizf1CsepV8Psf6pnU3hy0FZXQTAXZFA'
CSE_ID = '86970aef48dab4539'


'''
Method which utilizes the Google Custom Search API to take a query and retrieve the top
    resulting website titles and links to them
@param query is the query to search the web with
@returns a list of tuples where the first element is the website title, the second is
    the website link, and the third is a site description
'''
def get_sites(query):
    # Instantiate list of websites to be returned from query
    listOfSites = []

    # Build custom serach service and retrieve top 10 website titles and links
    service = build("customsearch", "v1", developerKey=API_KEY)
    titlesAndLinks = service.cse().list(q=query, cx=CSE_ID).execute()

    # Append top 5 websites to listOfSites and return them
    for item in titlesAndLinks.get('items', []):
        listOfSites.append((item['title'], item['link'], item['snippet']))

    # Return list of tuples
    return listOfSites

In [4]:
sites = get_sites("fact check Paris capital of France")
print(sites)

[('Old video shows independence day rally in Warsaw, not Paris ...', 'https://factcheck.afp.com/doc.afp.com.74HE7WE', "Sep 21, 2025 ... The footage was in fact filmed in Poland's capital Warsaw and ... Is there content that you would like AFP to fact-check? Get in touch\xa0..."), ("Fact-check: Did Paris requisition the military to clean up the capital's ...", 'https://www.euronews.com/2023/03/22/fact-check-did-paris-requisition-the-military-to-clean-up-the-capitals-garbage-filled-stre', "Mar 22, 2023 ... News World France. Fact-check: Did Paris requisition the military to clean up the capital's garbage-filled streets?"), ('Rio becomes the global capital of fact-checking - AFP steps up | AFP ...', 'http://www.afp.com/en/agency/inside-afp/inside-afp/rio-becomes-global-capital-fact-checking-afp-steps', 'Jun 23, 2025 ... Paris (AFP) | 30/10/2025 - 08:35:16 | France arrests five new suspects in Louvre heist probe: prosecutor ... Rio becomes the global capital of\xa0...'), ('FACT CHECK: Did 

In [17]:
import arxiv
'''
Method for querying academic database arxiv to extract relevant research papers
@param query is the query to search the database with
@returns a list of tuples where the first element is the paper title, the second is
    a summary of the paper, and the third is a link to the pdf
'''
def get_research_papers(query):
    # Instantiate list of relevant paper infor to be returned
    threeTuples = []

    # Get top 5 results from the entered search query
    papers = arxiv.Search(query=query, max_results=5)

    # Loop through returned papers' info and append a tuple of title, summary, and link
    for paper in papers.results():
        threeTuples.append((paper.title, paper.summary, paper.pdf_url))
    
    # Return the list of three tuples
    return threeTuples
    


In [21]:
query = "large language models"
print(get_research_papers(query))

  for paper in papers.results():


[('Lost in Translation: Large Language Models in Non-English Content Analysis', "In recent years, large language models (e.g., Open AI's GPT-4, Meta's LLaMa,\nGoogle's PaLM) have become the dominant approach for building AI systems to\nanalyze and generate language online. However, the automated systems that\nincreasingly mediate our interactions online -- such as chatbots, content\nmoderation systems, and search engines -- are primarily designed for and work\nfar more effectively in English than in the world's other 7,000 languages.\nRecently, researchers and technology companies have attempted to extend the\ncapabilities of large language models into languages other than English by\nbuilding what are called multilingual language models.\n  In this paper, we explain how these multilingual language models work and\nexplore their capabilities and limits. Part I provides a simple technical\nexplanation of how large language models work, why there is a gap in available\ndata between Engli

In [10]:
import requests
from bs4 import BeautifulSoup
from readability import Document

'''
Method that takes in a url and scrapes the text content of that webpage
@param url is the link for a website to be scraped
@returns a string with the main textual content of a webpage
'''
def get_text_content(url):
    # Get raw html content of webpage
    htmlContent = requests.get(url).text

    # Convert raw html content into Document object
    document = Document(htmlContent)
    
    # Extract main textual content (HTML format most likely)
    textContentHTML = document.summary()

    # Parse using bs4
    soup = BeautifulSoup(textContentHTML, "html.parser")

    # Return parsed textual content
    return soup.get_text(separator='\n').strip()

In [8]:
# Rank retrieved links

def rank_links(links):
    # raise links ending with .edu, .gov, or .org to top of list
    credibleLinks = []
    nonCredibleLinks = []
    for link in links:
        if '.edu' in link or '.gov' in link or '.org' in link:
            credibleLinks.append(link)
        else:
            nonCredibleLinks.append(link)
    
    # Concatenate lists with credible ones first
    allLinks = credibleLinks + nonCredibleLinks

    # Scrape web content from link page and move to end of list if not text rich
    newLinks = []
    notContentRich = []

    for link in allLinks:
        content = get_text_content(link)
        if len(content) < 1000:
            notContentRich.append(link)
            continue
        newLinks.append(link)
    
    # Return new list of links sorted by domain and content
    return newLinks + notContentRich


In [23]:
links = ['https://en.wikipedia.org/wiki/American_Revolutionary_War',
         'https://www.battlefields.org/learn/revolutionary-war',
         'https://www.history.com/articles/american-revolution-history',
         'https://www.whitehouse.gov/america250/founders-museum/major-events/timeline/',
         'https://www.britannica.com/event/American-Revolution']
print(rank_links(links))

['https://www.history.com/articles/american-revolution-history', 'https://www.britannica.com/event/American-Revolution', 'https://en.wikipedia.org/wiki/American_Revolutionary_War', 'https://www.battlefields.org/learn/revolutionary-war', 'https://www.whitehouse.gov/america250/founders-museum/major-events/timeline/']


In [19]:
# Remove irrelevant results
def remove_irrelevant_results(links):
    newLinks = []
    removed = 0
    for link in links:
        content = get_text_content(link)
        if len(content) < 250:
            if removed < 2:
                removed += 1
                continue
        newLinks.append(link)
    return newLinks

In [24]:
print(remove_irrelevant_results(links))

['https://www.battlefields.org/learn/revolutionary-war', 'https://www.history.com/articles/american-revolution-history', 'https://www.britannica.com/event/American-Revolution']
