### Installing necessary packages


In [None]:
!pip install langchain langchain-community langchain-huggingface unstructured


Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting unstructured
  Downloading unstructured-0.18.15-py3-none-any.whl.metadata (24 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB

In [3]:
# importing the necessary libraries here
import getpass
import os

from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_core.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [4]:
# Schema for the StructuredOutputParser: the summary of a news article or blog
# post is returned as exactly three named fields, one per sentence.
_summary_fields = [
    ("summary 1", "summary 1 about the topic"),
    ("summary 2", "summary 2 about the topic"),
    ("summary 3", "summary 3 about the topic"),
]
schema = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in _summary_fields
]

In [5]:
# Build the output parser from the three-field schema; it is used both to
# generate the format instructions for the prompt and to parse the model's reply.
parser=StructuredOutputParser.from_response_schemas(schema)

In [6]:
# Supply the Hugging Face token without writing it into the notebook: reuse the
# HUGGINGFACEHUB_API_TOKEN environment variable when it is already set, and
# otherwise prompt for it (getpass does not echo the typed value).
if not os.environ.get('HUGGINGFACEHUB_API_TOKEN'):
    os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass.getpass("Hugging Face API token: ")

In [7]:
# Hugging Face endpoint for the Llama 3.1 8B Instruct model, wrapped in the
# chat interface that the summarization chain is given as its LLM.
llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.1-8B-Instruct",
    task="text-generation",
)
model = ChatHuggingFace(llm=llm)

In [8]:
# Source article: a Moneycontrol.com sports-news live blog, scraped for summarization.
article_urls = [
    "https://www.moneycontrol.com/sports/cricket/ind-vs-pak-live-score-today-india-vs-pakistan-asia-cup-2025-final-live-cricket-scorecard-dubai-liveblog-13586345.html",
]
loader = UnstructuredURLLoader(urls=article_urls)
docs = loader.load()  # fetch the page(s) and load them as Document objects

In [9]:
# Inspect what the loader returned: a list of Document objects, each with
# `metadata` (the source URL) and the scraped `page_content`.
docs

[Document(metadata={'source': 'https://www.moneycontrol.com/sports/cricket/ind-vs-pak-live-score-today-india-vs-pakistan-asia-cup-2025-final-live-cricket-scorecard-dubai-liveblog-13586345.html'}, page_content="HomeSportscricketHighlights | India vs Pakistan, Asia Cup final: India edge Pakistan by 5 wickets in nail-biting thriller\n\nLive\n\nAuto Refresh\n\nHighlights | India vs Pakistan, Asia Cup final: India edge Pakistan by 5 wickets in nail-biting thriller\n\nMoneycontrol News\n\n330\n\nSeptember 29, 2025· 00:18 IST\n\n\n\n\n\n\n\nHighlights | India vs Pakistan, Asia Cup final: India skipper Suryakumar Yadav won the toss and chose to bowl first in the Asia Cup 2025 final against Pakistan at the Dubai International Cricket Stadium on Sunday (September 28). Pakistan in reply posted 146 runs which at one point looked an easy target for the Men in Blue to chase but the Pakistan bowlers handed the defending champions early blows in the powerplay with wickets of Shubman Gill, Abhishek Sha

In [10]:
len(docs)  # only one URL was passed to the loader, so a single Document is expected

1

In [11]:
# Splitter used to cut the scraped page into chunks small enough for the model:
# chunk_size=5000 with a chunk_overlap of 20 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=20,
)

In [12]:
# Split the loaded document(s) into chunks; these chunks are the input to the
# map-reduce summarization chain below.
chunks=text_splitter.split_documents(docs)

In [13]:
len(chunks) # number of chunks produced by the splitter

7

In [14]:
chunks[3] # each chunk is a Document with its own metadata and page_content

Document(metadata={'source': 'https://www.moneycontrol.com/sports/cricket/ind-vs-pak-live-score-today-india-vs-pakistan-asia-cup-2025-final-live-cricket-scorecard-dubai-liveblog-13586345.html'}, page_content="India vs Pakistan Live Score | Haris Rauf to Shivam Dube : FOUR! Smacked away! Haris Rauf lands on a good length, outside off. Shivam Dube stands tall and just powers it through extra cover for a cracking boundary.\n\n330\n\nSeptember 28, 2025· 23:22 IST\n\n🏏1 RunOver 13.5India 82/4\n\nIndia vs Pakistan Live Score | Saim Ayub to Tilak Varma : At 90.9 kph, short and on middle, Tilak bends and pulls it to deep backward square leg for one more.\n\n330\n\nSeptember 28, 2025· 23:22 IST\n\n⚪Dot BallOver 13.4India 81/4\n\nIndia vs Pakistan Live Score | Saim Ayub to Tilak Varma : Slower and fuller, on middle, Tilak knocks it back to Ayub.\n\n330\n\nSeptember 28, 2025· 23:21 IST\n\n🏏1 RunOver 13.3India 81/4\n\nIndia vs Pakistan Live Score | Saim Ayub to Shivam Dube : Serves the carrom ball

## Note: Each chunk must fit within the model’s token limit, and the combined chunk summaries passed to the final reduce step must also stay within that limit.

In [35]:
# Map step: instructions applied to every chunk on its own. `{text}` is the
# placeholder that receives the chunk's content.
chunks_prompt = """
    You are given a chunk of text scraped from a website.

    Task:
    - Focus only on the NEWS ARTICLE content (headlines, article body, analysis).
    - Ignore navigation menus, login/logout info, calculators, tools, alerts, or any unrelated website UI.
    - Summarize only the important business/news details in 2–3 concise sentences.

    input_documents:
    {text}

    Summary:
    """

# Prompt template for the per-chunk summaries.
map_prompt_template = PromptTemplate(
    template=chunks_prompt,
    input_variables=['text'],
)


# Combine (reduce) step: the LLM first summarizes every chunk, the chunk
# summaries are then merged and passed in as `{text}`, and the final summary
# is generated from them.
final_prompt = """
    You are given multiple summaries of text chunks from a news article.

    Instructions:
    - First, read all the chunk summaries.
    - Identify the 3 most important insights from the news article.
    - Write a final summary of exactly three sentences (no more, no less).
    - Focus only on the news content. Ignore menus, calculators, login/logout info, or other website sections.

    input_documents:
    {text}

    {format_instruction}


    """

# `{format_instruction}` is pre-filled with the parser's format instructions,
# so only `text` remains to be supplied when the chain runs.
parser_format_instructions = parser.get_format_instructions()
final_prompt_template = PromptTemplate(
    template=final_prompt,
    input_variables=['text'],
    partial_variables={"format_instruction": parser_format_instructions},
)


# Build the map-reduce summarization chain (it is run in a later cell):
# `map_prompt` is used for each chunk, `combine_prompt` for the final merge.
summary_chain = load_summarize_chain(
    llm=model,
    chain_type="map_reduce",
    combine_prompt=final_prompt_template,
    map_prompt=map_prompt_template,
    verbose=False,
)


In [36]:
# Run the chain on the chunks (it expects them under "input_documents"),
# then parse the raw text reply into a dict keyed by the schema field names.
results = summary_chain.invoke({"input_documents": chunks})
raw_summary_text = results['output_text']
output_finally = parser.parse(raw_summary_text)

In [38]:
# Parsed result: a dict with the keys 'summary 1', 'summary 2' and 'summary 3'.
print(output_finally)

{'summary 1': 'India won the Asia Cup 2025 final by 5 wickets, defeating Pakistan in a thrilling match, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style.', 'summary 2': "India's batting lineup struggled, but Tilak Varma and Sanju Samson played crucial roles, scoring 34 and 24 runs respectively, to keep India in the game against Pakistan.", 'summary 3': "India overcame a tough target of 146 runs set by Pakistan, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style, securing India's ninth Asia Cup title."}


In [49]:
# Same parsed dict, shown through the notebook's display instead of print().
output_finally

{'summary 1': 'India won the Asia Cup 2025 final by 5 wickets, defeating Pakistan in a thrilling match, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style.',
 'summary 2': "India's batting lineup struggled, but Tilak Varma and Sanju Samson played crucial roles, scoring 34 and 24 runs respectively, to keep India in the game against Pakistan.",
 'summary 3': "India overcame a tough target of 146 runs set by Pakistan, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style, securing India's ninth Asia Cup title."}

In [37]:
# Raw model reply before parsing: a fenced ```json block holding the three fields.
print(results['output_text'])

```json
{
    "summary 1": "India won the Asia Cup 2025 final by 5 wickets, defeating Pakistan in a thrilling match, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style.",
    "summary 2": "India's batting lineup struggled, but Tilak Varma and Sanju Samson played crucial roles, scoring 34 and 24 runs respectively, to keep India in the game against Pakistan.",
    "summary 3": "India overcame a tough target of 146 runs set by Pakistan, with Tilak Varma scoring 60 runs and Rinku Singh finishing off the match in style, securing India's ninth Asia Cup title."
}
```


#### Let's build a chain and remove unnecessary data

In [39]:
from langchain.schema.runnable import RunnableLambda


In [42]:
# End-to-end pipeline for the final output:
# summarize -> keep only the 'output_text' field -> parse it into a dict.
extract_output_text = RunnableLambda(lambda chain_result: chain_result['output_text'])
main_chain = summary_chain | extract_output_text | parser

In [44]:
# Run the whole pipeline on the chunks. This invokes the summarization again,
# so the wording can differ from the earlier `output_finally` result.
final_output=main_chain.invoke({"input_documents":chunks})

In [48]:
## The entire document has been summarized into three sentences.

In [46]:
# Parsed dict produced by `main_chain`, one entry per summary sentence.
final_output

{'summary 1': 'India won the Asia Cup 2025 final against Pakistan by 5 wickets in a nail-biting thriller, with Tilak Varma scoring 60 off 50 balls and Rinku Singh hitting the winning four.',
 'summary 2': 'India required 17 runs from the last 2 overs to win the match, but eventually chased down the target of 146 runs to win the match against Pakistan.',
 'summary 3': "Tilak Varma's half-century (50 off 41 balls) and Shivam Dube's decent score helped India reach a total of 113/4 in the 16th over, setting up a thrilling win against Pakistan."}

In [47]:
# Print the three summary sentences, one per line, in the order the parser
# returned them. Only the dict values are needed; the field names are dropped.
summary_sentences = list(final_output.values())
for sentence in summary_sentences:
    print(sentence)

India won the Asia Cup 2025 final against Pakistan by 5 wickets in a nail-biting thriller, with Tilak Varma scoring 60 off 50 balls and Rinku Singh hitting the winning four.
India required 17 runs from the last 2 overs to win the match, but eventually chased down the target of 146 runs to win the match against Pakistan.
Tilak Varma's half-century (50 off 41 balls) and Shivam Dube's decent score helped India reach a total of 113/4 in the 16th over, setting up a thrilling win against Pakistan.
