**Step 1:** Configure your working directory 

In [1]:
import os
print(os.getcwd())
os.chdir("..")
print(os.getcwd())

/sapmnt/home/I584060/SMU-Kaist-Gen-AI/Solutions
/sapmnt/home/I584060/SMU-Kaist-Gen-AI


**Step 2:** Import libraries and modules

In [2]:
import json
from typing import List
import openai
from langchain.llms import AzureOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

**Step 3:** Initializing Azure OpenAI client object from `LangChain`

In [7]:
with open("secrets/openai-secrets-sap-aicoe-exp.json", "r") as f:

    azure_openai_credentials = json.load(f)

 

llm = AzureOpenAI(

    openai_api_base="https://aicoe-smu-kaist-challenge.openai.azure.com/",
    openai_api_key=azure_openai_credentials["openai_api_key"],
    openai_api_type="azure",
    openai_api_version="2022-12-01",
    deployment_name="text-davinci-003",
    model = "text-davinci-003",
    temperature=0,
    max_tokens=1024
)

**Step 4:** Read data from text file

In [4]:
def read_data_from_file(file_path: str) -> str:
    with open(file_path, 'r') as file:
        data = file.read()
    return data

file_path = 'Data/state_of_the_union.txt'
data = read_data_from_file(file_path)

**Step 5:** Text Summarization with stuff summarization chain

In [5]:
documents_to_summarize = [Document(page_content=data)]
summary_chain = load_summarize_chain(llm=llm, chain_type="stuff", verbose=False)
summary_output = summary_chain.run(documents_to_summarize) 

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 10375 tokens (9351 in your prompt; 1024 for the completion). Please reduce your prompt; or completion length.

**Step 6:** Initializing text splitter object for splitting long texts

In [None]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="p50k_base",
    chunk_size=2000,
    chunk_overlap=100,
    separators=["\n\n", "\n"],
)

**Step 7:** Split long text into smaller chunks 

In [None]:
documents_to_summarize = text_splitter.create_documents([data])
print(documents_to_summarize[0])
print(documents_to_summarize[1])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.  \n\nLast year COVID-19 kept us apart. This year we are finally together again. \n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \n\nWith a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \n\nHe met the Ukrainian people. \n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world. \n\nGroups of citizens blockin


**Step 8:** Using summarization chain with map reduce from LangChain out-of-the-box

In [None]:
# Set verbose=True for ease of debugging, set verbose=False for production
summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce", verbose=False)
summary_output = summary_chain.run(documents_to_summarize) 
print(f"Summary output: {summary_output}")
print(f"Number of tokens in summary output: {llm.get_num_tokens(summary_output)}")

Summary output:  President Biden is taking steps to combat the COVID-19 pandemic, provide economic relief, and create jobs. He is also proposing a Unity Agenda for the Nation, which includes protecting the rights of women, access to health care, and the LGBTQ+ community. He is also addressing Russian aggression in Ukraine and calling for Congress to pass the Freedom to Vote Act, the John Lewis Voting Rights Act, and the Disclose Act.
Number of tokens in summary output: 86
CPU times: user 11.3 ms, sys: 10.7 ms, total: 22 ms
Wall time: 11.2 s


**Step 9:** Using custom prompt templates for map reduce with summarization chain, together with output parsing

In [None]:
class OutputSchema(BaseModel):
    key_points: List[str] = Field(description="List of key points of the text")

In [None]:
parser = PydanticOutputParser(pydantic_object=OutputSchema)

map_prompt_template_str = """Summarize the following text delimited with triple backticks
```{text}```

SUMMARY:
"""

combine_prompt_template_str = """Summarize the following text delimited with triple backticks, in bullet points
```{text}```

SUMMARY:
"""
reformat_prompt_template_str = """Reformat the following key points delimited with triple backticks

```{key_points}```

{format_instructions}

REFORMATTED TEXT:
"""

map_prompt_template = PromptTemplate(template=map_prompt_template_str, input_variables=["text"])
combine_prompt_template = PromptTemplate(
    template=combine_prompt_template_str,
    input_variables=["text"],
)
reformat_prompt_template = PromptTemplate(
    template=reformat_prompt_template_str,
    input_variables=["key_points"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)


# Set verbose=True for ease of debugging, set verbose=False for production
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=False
)
reformat_chain = LLMChain(llm=llm, prompt=reformat_prompt_template, verbose=False)
summary_output = summary_chain.run(documents_to_summarize)
reformatted_output = reformat_chain.run(summary_output)
print(f"Number of tokens in summary output: {llm.get_num_tokens(summary_output)}")
print(f"Number of tokens in reformatted output: {llm.get_num_tokens(reformatted_output)}")
parsed_output = parser.parse(reformatted_output)
print(f"Parsed output:\n{parsed_output.dict()}")

Number of tokens in summary output: 185
Number of tokens in reformatted output: 193
Parsed output:
{'key_points': ['President Biden addressed the nation on the situation in Ukraine and announced the release of 60 million barrels of oil from reserves around the world to help blunt gas prices', 'The American Rescue Plan was passed to help combat COVID-19 and provide economic relief to millions of Americans', 'President Biden has proposed a plan to fight inflation and lower costs for Americans, including cutting the cost of prescription drugs, energy costs, and child care; closing loopholes for corporations and the wealthy; and increasing competition', 'He is also taking steps to combat COVID-19, including providing free treatments, tests, and masks, and launching the "Test to Treat" initiative', "He is advocating for the protection of women's rights, access to healthcare, and the LGBTQ+ community, and proposing a Unity Agenda for the Nation", 'He believes that the American people are str

<timed exec>:49: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.2/migration/
