In [1]:
import pandas as pd

df = pd.read_csv("data/resources.csv")
df.replace('\n', '', regex=True, inplace=True)
df.replace(' ', '', regex=True, inplace=True)
df

Unnamed: 0,Resources,Category
0,https://www.epa.gov/watersense/fix-leak-week,Water
1,https://www.epa.gov/watersense/statistics-and-...,Water
2,https://www.nidwater.com/5-facts-you-need-to-k...,Water
3,https://parade.com/1192402/jessicasager/earth-...,History
4,https://www.intermountainhistories.org/tours/s...,History
...,...,...
164,https://education.nationalgeographic.org/resou...,Water
165,https://www.worldatlas.com/articles/what-is-a-...,Water
166,https://environmentamerica.org/pennsylvania/ar...,Technology
167,https://www.plasticsoupfoundation.org/en/the-n...,Technology


In [2]:
df.sample(1)['Resources'].iloc[0]

'https://www.gpi.org/glass-facts'

In [3]:
def get_categories():
    return df['Category'].unique().tolist()



In [4]:
get_categories()

['Water',
 'History',
 'Policy',
 'Air/Atmosphere',
 'Energy',
 'Actions/Behaviors',
 'Solar',
 'Technology',
 'Wildlife',
 'Recycling',
 'WIldlife']

In [5]:
def get_urls(category: str = 'Water', number: int = 1):
    cat_df = df.loc[df['Category'] == category]
    cat_df = cat_df.sample(number)['Resources']
    return cat_df.values.tolist()

get_urls(number=2)


['https://astrobiology.nasa.gov/education/alp/water-so-important-for-life/',
 'https://education.nationalgeographic.org/resource/lagoon/']

In [6]:
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
import nest_asyncio

nest_asyncio.apply()

def extract_urls(urls: list):
    loader = AsyncChromiumLoader(urls)
    docs = loader.load()
    bs_transformer = BeautifulSoupTransformer()

    docs_transformed = bs_transformer.transform_documents(docs,       
            unwanted_tags=["script", "style", "footer", "header", "nav", "form", "noscript"],
            remove_lines=True,
            remove_comments=True)
    
    formatted = [
        f"Source ID: {i}\nArticle Title: {doc.metadata} \nArticle Snippet: {doc.page_content}"
        for i, doc in enumerate(docs_transformed)
    ]
    return formatted

In [18]:
extracted = extract_urls(get_urls(number=1))
extracted

["Source ID: 0\nArticle Title: {'source': 'https://sitn.hms.harvard.edu/uncategorized/2019/biological-roles-of-water-why-is-water-necessary-for-life/'} \nArticle Snippet: Skip to content (#content)          by Molly Sargen  figures by Daniel Utter  Water makes up 60-75% of human body weight. A loss of just 4% of total body water leads to dehydration, and a loss of 15% can be fatal. Likewise, a person could survive a month without food but wouldn’t survive 3 days without water. This crucial dependence on water broadly governs all life forms.\xa0 Clearly water is vital for survival, but what makes it so necessary?  The Molecular Make-up of Water  Many of water’s roles in supporting life are due to its molecular structure and a few special properties. Water is a simple molecule composed of two small, positively charged hydrogen atoms and one large negatively charged oxygen atom. When the hydrogens bind to the oxygen, it creates an asymmetrical molecule with positive charge on one side and

In [16]:
extracted[1]

context = "\n".join(doc for doc in extracted)

In [19]:
import dotenv
from langchain_openai import ChatOpenAI

from citation_chain import create_trivia_with_citation_chain


dotenv.load_dotenv()

llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")
llm_chain = create_trivia_with_citation_chain(llm)

question = "4"
result = llm_chain.run(num_questions=question, context=extracted)
result

ValidationError: 1 validation error for QuestionAnswerList
question_answers
  field required (type=value_error.missing)

In [None]:
questions=[QuestionAnswer(question='What is a lagoon?', wrong_answer='A lake', difficulty='Easy', 
                          answer=FactWithEvidence(fact='A lagoon is a body of water separated from larger bodies of water by a natural barrier.', 
                                                  trivia_answer='A body of water separated by a natural barrier', 
                                                  substring_quote='A lagoon is a body of water separated from larger bodies of water by a natural barrier.', 
                                                  sources='https://oceanservice.noaa.gov/facts/lagoon.html')), 
            QuestionAnswer(question='How are atoll lagoons formed?', wrong_answer='By volcanic eruptions', difficulty='Medium', 
                           answer=FactWithEvidence(fact='Atoll lagoons form when an island completely subsides beneath the water, leaving a ring of coral that continues to grow upwards.', 
                                                   trivia_answer='When an island subsides beneath the water, leaving a ring of coral', 
                                                   substring_quote='Atoll lagoons form when an island completely subsides beneath the water, leaving a ring of coral that continues to grow upwards.', 
                                                   sources='https://oceanservice.noaa.gov/facts/lagoon.html')), 
            QuestionAnswer(question='What are coastal lagoons sheltered by?', wrong_answer='Mountains', difficulty='Hard', 
                           answer=FactWithEvidence(fact='Lagoons sheltered by sandbars or barrier islands are called coastal lagoons.', 
                                                   trivia_answer='Sandbars or barrier islands', 
                                                   substring_quote='Lagoons sheltered by sandbars or barrier islands are called coastal lagoons.', 
                                                   sources='https://education.nationalgeographic.org/resource/lagoon/')), 
            QuestionAnswer(question='What is the nickname of the city of Venice?', wrong_answer='City of Canals', difficulty='Easy', 
                           answer=FactWithEvidence(fact='Venice\'s nickname is "Queen of the Adriatic."', 
                                                   trivia_answer='Queen of the Adriatic', 
                                                   substring_quote='Venice\'s nickname is "Queen of the Adriatic."', 
                                                   sources='https://education.nationalgeographic.org/resource/lagoon/')), 
            QuestionAnswer(question='What is the Blue Lagoon in Iceland known for?', wrong_answer='Natural formation', difficulty='Medium', 
                           answer=FactWithEvidence(fact='The Blue Lagoon in Iceland is a manmade feature where water from a local geothermal power plant is pumped over a lava bed rich in silica and sulfur.', 
                                                   trivia_answer='Manmade feature with geothermal water', 
                                                   substring_quote="The world's most famous lagoon, the Blue Lagoon in Iceland, is not a lagoon at all. It is a manmade feature where water from a local geothermal power plant is pumped over a lava bed rich in silica and sulfur.", 
                                                   sources='https://education.nationalgeographic.org/resource/lagoon/'))]

In [24]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate


messages = [
    SystemMessage(
        content=(
            "You are a world class algorithm to generate trivia "
            "questions and answers with correct and exact citations."
        )
    ),
    HumanMessagePromptTemplate.from_template(
            "Generate {num_questions} sets of trivia-style questions "
            "and answers using the following context"
    ),
    HumanMessagePromptTemplate.from_template("{context}"),
    HumanMessage(
        content=(
            "Tips: Make sure to cite your sources, "
            "and use the exact words from the context."
        )
    ),
]
messages

[SystemMessage(content='You are a world class algorithm to generate trivia questions and answers with correct and exact citations.'),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['num_questions'], template='Generate {num_questions} sets of trivia-style questions and answers using the following context')),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='{context}')),
 HumanMessage(content='Tips: Make sure to cite your sources, and use the exact words from the context.')]

In [27]:
prompt = ChatPromptTemplate(messages=messages)
prompt.pretty_print()


You are a world class algorithm to generate trivia questions and answers with correct and exact citations.


Generate [33;1m[1;3m{num_questions}[0m sets of trivia-style questions and answers using the following context


[33;1m[1;3m{context}[0m


Tips: Make sure to cite your sources, and use the exact words from the context.


In [26]:
first=RunnableAssign(mapper={
  prompt: ChatPromptTemplate(input_variables=['context', 'num_questions'], messages=[
      SystemMessage(content='You are a world class algorithm to generate trivia questions and answers with correct and exact citations.'), 
      HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['num_questions'], template='Generate {num_questions} sets of trivia-style questions and answers using the following context')), 
      HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='{context}')), 
      HumanMessage(content='Tips: Make sure to cite your sources, and use the exact words from the context.')
      ])
}) 
last=RunnableAssign(mapper={
  output: RunnableBinding(bound=ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x1213bacc0>, 
                                           async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x12127f890>, temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy=''), 
                                           kwargs={'tools': [{'type': 'function', 'function': {
                                               'name': 'QuestionAnswersList', 
                                               'description': 'Class representing a list of trivia style questions and answers with correct and exact citations.\neach sentence contains a body and a list of sources.', 
                                               'parameters': {'type': 'object', 'properties': {'trivias': {'description': 'List of questions and answers as a list of facts', 'type': 'array', 'items': {'description': 'Trivia style questions and its answers as a list of facts each one should have a source.\neach sentence contains a body and a list of sources.', 'type': 'object', 'properties': {'question': {'description': 'Question that was asked', 'type': 'string'}, 'wrong_answer': {'description': 'A trivia-style answer that is wrong', 'type': 'string'}, 'difficulty': {'description': 'Difficulty level of the question', 'type': 'string'}, 'answer': {'description': 'Body of the answer, the fact should be its separate object with a body and a list of sources', 'allOf': [{'title': 'FactWithEvidence', 'description': 'Class representing a single statement.\n\nEach fact has a body and a list of sources.\nIf there are multiple facts make sure to break them apart\nsuch that each one only uses a set of sources that are relevant to it.', 'type': 'object', 'properties': {'fact': {'title': 'Fact', 'description': 'Body of the sentence, as part of a response', 'type': 'string'}, 'trivia_answer': {'title': 'Trivia Answer', 'description': 'Trivia-style answer, like one word or phrase', 'type': 'string'}, 'substring_quote': {'title': 'Substring Quote', 'description': 'The source should be a direct quote from the context, as a substring of the original content', 'type': 'string'}, 'sources': {'title': 'Sources', 'description': 'The EXACT source link corresponding to the substring_quote', 'type': 'string'}}, 'required': ['fact', 'trivia_answer', 'substring_quote', 'sources']}]}}, 'required': ['question', 'wrong_answer', 'difficulty', 'answer']}}}, 'required': ['trivias']}}}], 'tool_choice': {'type': 'function', 'function': {'name': 'QuestionAnswersList'}}})
          | PydanticToolsParser(first_tool_only=True, tools=[<class 'llmlang.QuestionAnswersList'>])
})

NameError: name 'RunnablePassthrough' is not defined

In [None]:
trivias=[
    QuestionAnswer(question="What is the oldest and deepest lake in the world, containing 20% of the world's total unfrozen freshwater reserve?", 
                   wrong_answer='Lake Superior', difficulty='easy', answer=
                   FactWithEvidence(fact="Lake Baikal is the oldest (25 million years) and deepest (1,700 m) lake in the world, containing 20% of the world's total unfrozen freshwater reserve.", 
                                    trivia_answer='Lake Baikal', 
                                    substring_quote="Lake Baikal is the oldest (25 million years) and deepest (1,700 m) lake in the world, containing 20% of the world's total unfrozen freshwater reserve.", 
                                    sources='https://whc.unesco.org/en/list/754/')), 
    QuestionAnswer(question="What is Lake Baikal known as due to its age and isolation producing one of the world's richest and most unusual freshwater faunas?", 
                   wrong_answer='Lake Michigan', difficulty='easy', answer=
                   FactWithEvidence(fact="Known as the 'Galapagos of Russia', Lake Baikal's age and isolation have produced one of the world's richest and most unusual freshwater faunas.", 
                                    trivia_answer='Galapagos of Russia', 
                                    substring_quote="Known as the 'Galapagos of Russia', its age and isolation have produced one of the world's richest and most unusual freshwater faunas.", 
                                    sources='https://whc.unesco.org/en/list/754/')), 
    QuestionAnswer(question="What percentage of the world's total unfrozen freshwater reserve does Lake Baikal contain?", 
                   wrong_answer='10%', difficulty='medium', answer=
                   FactWithEvidence(fact="Lake Baikal contains 20% of the world's total unfrozen freshwater reserve.", 
                                    trivia_answer='20%', 
                                    substring_quote="It contains 20% of the world's total unfrozen freshwater reserve.", 
                                    sources='https://whc.unesco.org/en/list/754/')), 
    QuestionAnswer(question='Where is Lake Baikal situated, and what makes it unique in terms of its age and isolation?', 
                   wrong_answer='North America, known for its size', difficulty='medium', answer=
                   FactWithEvidence(fact="Lake Baikal is situated in south-east Siberia, and its age and isolation have produced one of the world's richest and most unusual freshwater faunas.", 
                                    trivia_answer='South-east Siberia, age, and isolation', 
                                    substring_quote='Situated in south-east Siberia, the 3.15-million-ha Lake Baikal is the oldest (25 million years) and deepest (1,700 m) lake in the world.', 
                                    sources='https://whc.unesco.org/en/list/754/'))]