In [2]:
from tavily import TavilyClient
import os
from dotenv import load_dotenv

load_dotenv()


def tavily_search(query: str) -> str:
    tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])
    response = tavily_client.search(
        query,
        max_results=5,
        search_depth="advanced",
        include_answer=True,
        include_raw_content=True,
        include_image=False,
    )
    return response


response = tavily_search("What is the population of US in 2020?")
response

{'query': 'What is the population of US in 2020?',
 'follow_up_questions': None,
 'answer': 'The population of the United States in 2020 was estimated to be 331.5 million people according to the data from the U.S. Census Bureau.',
 'images': [],
 'results': [{'title': 'Population change in 2020: The most deaths, fewest births ... - USAFacts',
   'url': 'https://usafacts.org/articles/population-change-in-2020-us/',
   'content': 'The US population was an estimated 329 million on July 1, 2020, growing by 1.2 million in a year, according to newly released estimates from the Census Bureau. This annual growth was a decrease of 22% since 2019.',
   'score': 0.93338853,
   'raw_content': "Population change in 2020: The most deaths, fewest births, and fewest new immigrants in a decade\nUSAFacts\n\nEconomy\nCrime\nEducation\nHealth\nPopulation\nAbout\n\nMore\n\nHealth\nPopulation\nAbout\nGovernment spending\nDefense and security\nEnvironment\n\n\n\nArticles\nGuides and reports\n\n\n\nSubscribe\

In [3]:
import re


def filter_garbage(text: str) -> str:
    """
    Removes garbage characters while keeping printable ASCII, Chinese characters, and emojis intact.

    Args:
        text (str): The input string containing various characters.

    Returns:
        str: A cleaned string with garbage characters removed.
    """
    # Define character ranges
    PRINTABLE_ASCII = lambda c: 32 <= ord(c) <= 126
    CONTROL_CHARS = "\n\t\r"
    CHINESE = lambda c: "\u4e00" <= c <= "\u9fff"
    MISC_SYMBOLS = lambda c: "\u2600" <= c <= "\u26FF"
    DINGBATS = lambda c: "\u2700" <= c <= "\u27BF"
    EMOJIS = lambda c: "\U0001F300" <= c <= "\U0001F9FF"

    def is_valid_char(c):
        return PRINTABLE_ASCII(c) or c in CONTROL_CHARS or CHINESE(c) or MISC_SYMBOLS(c) or DINGBATS(c) or EMOJIS(c)

    cleaned_text = "".join(char for char in text if is_valid_char(char))
    cleaned_text = re.sub(r"\n{3,}", r"\n\n", cleaned_text)
    cleaned_text = re.sub(r" {2,}", " ", cleaned_text)

    return cleaned_text

In [30]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

summarizer = ChatOpenAI(model="gpt-4o-mini", temperature=0)
raw_content_list = [result["raw_content"] for result in response["results"]]
none_idx = [i for i, content in enumerate(raw_content_list) if content is None]
prompt = ChatPromptTemplate.from_messages([("system", "Summarize the following content: {raw_content}")])
chain = prompt | summarizer
summarized_content_list = chain.batch(raw_content_list)
content_list = [None if i in none_idx else msg.content for i, msg in enumerate(summarized_content_list)]
content_list

['In 2020, the U.S. population growth was the lowest in over a century, with a mere 0.4% increase, marking the smallest annual growth rate since 1918. The population was estimated at 329 million, growing by 1.2 million people, a 22% decrease from 2019. The year saw 3.7 million births (down 0.6% from 2019) and 3.1 million deaths (up 8%), resulting in a net natural population increase of 677,000, which is a significant decline from previous years. Immigration also hit a decade-low, contributing only 477,000 to the population growth. \n\nDemographically, the Northeast experienced a population decline for the second consecutive year, with significant losses in states like New York and New Jersey. The Midwest also saw its first population drop, particularly in Illinois. In contrast, states in the South and West, such as Florida and Texas, gained population through both domestic migration and immigration. Overall, the trends indicate a slowdown in both natural population growth and immigrati

In [29]:
content_list

['In 2020, the U.S. population growth was the lowest in over a century, with a mere 0.4% increase, marking the smallest annual growth rate since 1918. The population was estimated at 329 million, growing by 1.2 million people, a 22% decrease from 2019. The year saw 3.7 million births (down 0.6% from 2019) and 3.1 million deaths (up 8%), resulting in a net natural population increase of 677,000, which is a significant decline from previous years. Immigration also hit a decade-low, contributing only 477,000 to the population growth. \n\nDemographically, the Northeast experienced its second consecutive year of population loss, with a notable decline in states like New York and New Jersey. The Midwest also saw a population drop for the first time, particularly in Illinois. Conversely, states in the South and West, such as Florida and Texas, gained population through both domestic migration and immigration. A total of eight states and Puerto Rico reported negative natural growth, with West 

In [34]:
[result["content"] for result in response["results"]]

['The US population was an estimated 329 million on July 1, 2020, growing by 1.2 million in a year, according to newly released estimates from the Census Bureau. This annual growth was a decrease of 22% since 2019.',
 'The first numbers to come out of the 2020 Census show the U.S. population on April 1, 2020 Census Day was 331.5 million people, an increase of just 7.4 percent between 2010 and 2020.',
 'List of U.S. states by population - Simple English Wikipedia, the free encyclopedia Show any page List of U.S. states by population Page The total population of the United States was 331,449,281 at the 2020 Census, not including U.S. territories, or 335,073,176, including U.S. territories.[1][2] Map of population change in U.S. states from April 1, 2010 to April 1, 2020 [note 1] | Absolute change, 2010-2020 | Total seats in the U.S. House of Representatives, 20232033 | Census population per electoral vote 1.0 1.1 1.2 "Change in Resident Population of the 50 States, the District of Columb

In [23]:
none_idx

[1]

In [19]:
raw_content_list = [result["raw_content"] for result in response["results"]]
raw_content_list

["Population change in 2020: The most deaths, fewest births, and fewest new immigrants in a decade\nUSAFacts\n\nEconomy\nCrime\nEducation\nHealth\nPopulation\nAbout\n\nMore\n\nHealth\nPopulation\nAbout\nGovernment spending\nDefense and security\nEnvironment\n\nArticles\nGuides and reports\n\nSubscribe\n\nNo results found\nWe're sorry, but there are no results that match your search criteria. Try checking your spelling or using alternate search terms. \nWe add new data to USAFacts all the time; you can subscribe to our newsletter to get unbiased, data-driven insights sent to your inbox weekly, no searching required.\nSee all results\nSubscribe to our newsletter\nGet unbiased, data-driven insights sent to your inbox weekly. To learn more, explore our newsletter archive.\nTopics\n\nEconomy\nCrime\nEducation\nHealth\nPopulation\nGovernment spending\nDefense and security\nEnvironment\nArticles\nGuides and reports\nAbout\n\nSubscribe to our newsletter\nGet unbiased, data-driven insights sent

In [37]:
def process_tavily_response(response: dict, filter_score: float = 0.5, summarize_content: bool = False) -> dict:
    """Process Tavily search response by filtering results and cleaning content. Can optionally summarize content using LLM.

    Args:
        response (dict): Raw Tavily API response
        filter_score (float): Minimum score threshold for filtering results
        summarize_content (bool): Whether to summarize content using LLM

    Returns:
        dict: Processed response with filtered and cleaned search results
    """
    # Filter results by score
    response["results"] = [result for result in response["results"] if result["score"] >= filter_score]

    # Clean content
    for result in response["results"]:
        if result["content"] is not None:
            result["content"] = filter_garbage(result["content"])
        if result["raw_content"] is not None:
            result["raw_content"] = filter_garbage(result["raw_content"])

    if summarize_content:
        _summarize_content(response)

    final_string = f"Query: {response['query']}\n\n" + f"Sources:\n\n"
    for result in response["results"]:
        final_string += f"Relevance Score: {result['score']}\nURL: {result['url']}\nContent: {result['content']}\n\n"
    final_string += f"Suggested Answer: {response['answer']}"

    return final_string


def _summarize_content(response: dict) -> None:
    """Helper function to summarize content using LLM."""
    from langchain_openai import ChatOpenAI
    from langchain_core.prompts import ChatPromptTemplate

    # Setup summarizer
    summarizer = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    prompt = ChatPromptTemplate.from_messages([("system", "Summarize the following content: {raw_content}")])
    chain = prompt | summarizer

    # Get raw content
    raw_content_list = [result["raw_content"] for result in response["results"]]
    none_idx = [i for i, content in enumerate(raw_content_list) if content is None]

    # Summarize content
    summarized_content_list = chain.batch(raw_content_list)
    content_list = [None if i in none_idx else msg.content for i, msg in enumerate(summarized_content_list)]

    # Update response with summarized content
    for i, result in enumerate(response["results"]):
        if content_list[i] is not None:
            result["content"] = content_list[i]


a = process_tavily_response(response, summarize_content=True)
a

"Query: What is the population of US in 2020?\n\nSources:\n\nRelevance Score: 0.93338853\nURL: https://usafacts.org/articles/population-change-in-2020-us/\nContent: In 2020, the U.S. population growth was the lowest in over a century, with a mere 0.4% increase, marking the smallest annual growth rate since 1918. The population was estimated at 329 million, growing by 1.2 million people, a 22% decrease from 2019. The year saw 3.7 million births (down 0.6% from 2019) and 3.1 million deaths (up 8%), resulting in a net natural population increase of 677,000, which is a significant decline from previous years. Immigration also hit a decade-low, contributing only 477,000 to the population growth. \n\nDemographically, the Northeast experienced its second consecutive year of population loss, with a notable decline in states like New York and New Jersey. The Midwest also saw a population drop for the first time, particularly in Illinois. Conversely, states in the South and West, such as Florida

In [38]:
print(
    "Query: What is the population of US in 2020?\n\nSources:\n\nRelevance Score: 0.93338853\nURL: https://usafacts.org/articles/population-change-in-2020-us/\nContent: In 2020, the U.S. population growth was the lowest in over a century, with a mere 0.4% increase, marking the smallest annual growth rate since 1918. The population was estimated at 329 million, growing by 1.2 million people, a 22% decrease from 2019. The year saw 3.7 million births (down 0.6% from 2019) and 3.1 million deaths (up 8%), resulting in a net natural population increase of 677,000, which is a significant decline from previous years. Immigration also hit a decade-low, contributing only 477,000 to the population growth. \n\nDemographically, the Northeast experienced its second consecutive year of population loss, with a notable decline in states like New York and New Jersey. The Midwest also saw a population drop for the first time, particularly in Illinois. Conversely, states in the South and West, such as Florida and Texas, gained population through both domestic migration and immigration. Overall, the trends indicate a slowdown in both natural population growth and immigration, leading to a significant impact on the overall population dynamics in the U.S.\n\nRelevance Score: 0.9220975\nURL: https://www.washingtonpost.com/dc-md-va/interactive/2021/2020-census-us-population-results/\nContent: The first numbers to come out of the 2020 Census show the U.S. population on April 1, 2020 Census Day was 331.5 million people, an increase of just 7.4 percent between 2010 and 2020.\n\nRelevance Score: 0.9073207\nURL: https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population\nContent: The content provides a list of U.S. states ranked by population based on the 2020 United States Census. The total population of the U.S. was approximately 331.4 million, making it the third most populous country in the world after China and India. The list includes the population of each state, the percentage change from 2010 to 2020, and the number of congressional seats each state holds. California is the most populous state, followed by Texas and Florida. The document also includes data on U.S. territories and the overall population growth rate.\n\nRelevance Score: 0.8891543\nURL: https://www.census.gov/library/stories/2021/04/2020-census-data-release.html\nContent: The U.S. Census Bureau has released the first population counts from the 2020 Census, showing a total resident population of 331,449,281 as of April 1, 2020, which is a 7.4% increase since the 2010 Census. This data is crucial for congressional apportionment, which determines the distribution of the 435 seats in the U.S. House of Representatives among the states. Based on the 2020 Census, Texas will gain two seats, while several states including California and New York will lose seats. The apportionment population, which includes residents and overseas military and federal employees, totals 331,108,434. The average congressional district population will increase to 761,169 people. The Census also reported that 37.2% of the U.S. population resides in the five most populous states, while the five least populous states account for only 1.0% of the total population. Additional data and resources regarding the apportionment process are available on the Census Bureau's website.\n\nSuggested Answer: The population of the United States in 2020 was estimated to be 331.5 million people according to the data from the U.S. Census Bureau."
)

Query: What is the population of US in 2020?

Sources:

Relevance Score: 0.93338853
URL: https://usafacts.org/articles/population-change-in-2020-us/
Content: In 2020, the U.S. population growth was the lowest in over a century, with a mere 0.4% increase, marking the smallest annual growth rate since 1918. The population was estimated at 329 million, growing by 1.2 million people, a 22% decrease from 2019. The year saw 3.7 million births (down 0.6% from 2019) and 3.1 million deaths (up 8%), resulting in a net natural population increase of 677,000, which is a significant decline from previous years. Immigration also hit a decade-low, contributing only 477,000 to the population growth. 

Demographically, the Northeast experienced its second consecutive year of population loss, with a notable decline in states like New York and New Jersey. The Midwest also saw a population drop for the first time, particularly in Illinois. Conversely, states in the South and West, such as Florida and Texa

In [4]:
from langchain_community.tools.tavily_search import TavilySearchResults


def invoke(query: str) -> str:
    """
    Performs a comprehensive web search using Tavily API with maximum content retrieval.

    Args:
        query (str): The search query

    Returns:
        str: Formatted search results combining all relevant content
    """
    search_tool = TavilySearchResults(
        max_results=5,
        search_depth="advanced",
        response_format="json",
        include_raw_content=True,
    )
    results = search_tool.run(query)

    # Combine all content from results
    all_content = []
    for result in results:
        all_content.append(f"Source: {result['url']}\n{result['content']}\n")

    # return "\n\n".join(all_content)
    return results


r = invoke("What is the population of US in 2020?")
r

TypeError: list indices must be integers or slices, not str

In [27]:
def filter_garbage(text: str) -> str:
    """
    Removes garbage characters while keeping printable ASCII, Chinese characters, and emojis intact.

    Args:
        text (str): The input string containing various characters.

    Returns:
        str: A cleaned string with garbage characters removed.
    """
    # Define character ranges
    PRINTABLE_ASCII = lambda c: 32 <= ord(c) <= 126
    CONTROL_CHARS = "\n\t\r"
    CHINESE = lambda c: "\u4e00" <= c <= "\u9fff"
    MISC_SYMBOLS = lambda c: "\u2600" <= c <= "\u26FF"
    DINGBATS = lambda c: "\u2700" <= c <= "\u27BF"
    EMOJIS = lambda c: "\U0001F300" <= c <= "\U0001F9FF"

    def is_valid_char(c):
        return PRINTABLE_ASCII(c) or c in CONTROL_CHARS or CHINESE(c) or MISC_SYMBOLS(c) or DINGBATS(c) or EMOJIS(c)

    cleaned_text = "".join(char for char in text if is_valid_char(char))

    return cleaned_text

In [29]:
from langchain_playground.UniversalChain.tools import webloader

for result in r:
    print(webloader(result["url"]))

