In [None]:
pip install sumy transformers



In [None]:
# Import NLTK and fetch the tokenizer / WordNet resources used in this notebook
import nltk

# Newer NLTK releases (3.8.2+) look up "punkt_tab" instead of "punkt" when
# tokenizing, so both resources are requested to work across versions.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#importing the dependencies
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer


# 1. Summarize the text using extractive summarizers


In [None]:
# Download the Wikipedia article via web scraping and extract its readable text
url = "https://en.wikipedia.org/wiki/Sri_Lanka"
# The context manager closes the HTTP response even if reading/decoding fails
with urlopen(url) as page:
    text = page.read().decode("utf-8")
soup = BeautifulSoup(text, "html.parser")
# Keep only the text of <p> elements: soup.get_text() would also return menus,
# navigation boxes and footer text, which the summarizers then treat as sentences.
txt = " ".join(para.get_text() for para in soup.find_all("p"))

In [None]:
# Wrap the scraped text in a sumy plaintext parser that uses the English tokenizer
my_parser = PlaintextParser.from_string(
    txt,
    Tokenizer("english"),
)

In [None]:
# Extractive summary with sumy's TextRank summarizer, limited to 3 sentences
tex_rank_summarizer = TextRankSummarizer()
summary1 = tex_rank_summarizer(my_parser.document, sentences_count=3)
summary1

(<Sentence: Economy Main article: Economy of Sri Lanka See also: Agriculture in Sri Lanka, Tea production in Sri Lanka, Tourism in Sri Lanka, and Transport in Sri Lanka Development of real GDP per capita, 1820 to 2018 According to the International Monetary Fund, Sri Lanka's GDP in terms of purchasing power parity is the second highest in the South Asian region in terms of per capita income.>,
 <Sentence: Visual, literary and performing arts Main articles: Cinema of Sri Lanka, Music of Sri Lanka, Dances of Sri Lanka, Theatre of Sri Lanka, and Sri Lankan literature Female dancers in traditional Kandyan dress The Nelum Pokuna Mahinda Rajapaksa Theatre was constructed as a major venue for the performing arts The movie Kadawunu Poronduwa (The Broken Promise), produced by S. M. Nayagam of Chitra Kala Movietone, heralded the coming of Sri Lankan cinema in 1947.>,
 <Sentence: Retrieved from "https://en.wikipedia.org/w/index.php?title=Sri_Lanka&oldid=1225410621" Categories: Sri LankaRepublics 

In [None]:
#importing LexRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

In [None]:
# Extractive summary with sumy's LexRank summarizer, limited to 3 sentences
lex_rank_summarizer = LexRankSummarizer()
summary2 = lex_rank_summarizer(
    my_parser.document,
    sentences_count=3,
)
summary2

(<Sentence: Sri Lanka>, <Sentence: [397] and 2022.>, <Sentence: Sri Lanka.>)

In [None]:
#importing the LSA summarizer
from sumy.summarizers.lsa import LsaSummarizer

In [None]:
# Extractive summary with sumy's LSA summarizer, limited to 3 sentences
lsasummarizer = LsaSummarizer()
summary3 = lsasummarizer(
    my_parser.document,
    sentences_count=3,
)
summary3

(<Sentence: vteSri Lanka topicsOverviews Sri Lankans Outline Bibliography Timeline Years 2024 HistoryPeriods Prehistory Pre Anuradhapura period Anuradhapura period Polonnaruwa period Transitional period Kandyan period British Ceylon period Sri Lanka (1948–present) Epochs Mahāvaṃsa Vijaya Tambapanni Anuradhapura Chola conquest of Anuradhapura Polonnaruwa Jaffna Dambadeniya Gampola Kotte Crisis of the Sixteenth Century Sitawaka Kandy Portuguese Ceylon Dutch Ceylon British Ceylon Kandyan Wars Uva Rebellion Matale rebellion Independence movement Dominion of Ceylon Civil War Topics Chronicles Monarchs Demographic Economic Education Military Sexual Minorities GovernmentLaw Constitution Constitutional Council Executive President Executive Office Cabinet / Ministries Administrative Service National Security Council Law enforcement Judiciary Supreme Court Court of appeal High Courts District courts Magistrate's Courts Primary Courts Labour Tribunal Legislature Parliament Prime Minister Office S

In [None]:
# Sanity check: number of sentences returned in the TextRank summary
len(summary1)

3

# 2. Apply abstractive summarization

In [None]:
from transformers import pipeline

In [None]:
# Build a Hugging Face summarization pipeline backed by the
# "sshleifer/distilbart-cnn-12-6" model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

In [None]:
# Download the article via web scraping and prepare to divide it into 18 equal-length parts
url = "https://en.wikipedia.org/wiki/Sachin_Tendulkar"

# Fetch the page content; the context manager closes the HTTP response when done
with urlopen(url) as page_new:
    text = page_new.read().decode("utf-8")

# Parse the HTML content using Beautiful Soup
soup = BeautifulSoup(text, "html.parser")

# Find all paragraph tags in the parsed HTML
paragraphs = soup.find_all('p')

# Join the text of all paragraphs into one string (split into 18 parts below)
full_text = " ".join(para.get_text() for para in paragraphs)

# Length in characters of each of the 18 parts
total_length = len(full_text)
part_length = total_length // 18


In [None]:
# Number of slices to summarize; must equal the divisor used to compute part_length
num_parts = 18

summaries = []
# Summarize each part
for i in range(num_parts):
    start_index = i * part_length
    # The last part runs to the end of the text so the remainder is not dropped
    end_index = start_index + part_length if i < num_parts - 1 else total_length
    text_part = full_text[start_index:end_index]

    # Skip blank slices (possible when the text has fewer characters than num_parts)
    if not text_part.strip():
        continue

    # truncation=True clips a slice that exceeds the model's maximum input
    # length instead of letting the pipeline fail on it
    partial_summary = summarizer(
        text_part,
        max_length=100,
        min_length=50,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']
    summaries.append(partial_summary)  # collect the partial summaries that make up the final summary

# Print the collected partial summaries
print(summaries)

[" Sachin Ramesh Tendulkar is an Indian cricketer who captained the Indian national team . He is widely regarded as one of the greatest batsmen in the history of cricket . Hailed as the world's most prolific batsman of all time, he is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 Test runs . He also holds the record for receiving the most player of the", ' Tendulkar spent his formative years in the Sahitya Sahawas Cooperative Housing Society in Bandra (East) As a child, he was interested in both tennis and cricket . He idolised American tennis player John McEnroe, and emulated his hero by growing his hair long at the age of 7 or 8 years . As a young boy, he considered a bully, and he often picked fights with new children in his school . To help curb his bullying tendencies,', ' Tendulkar was selected to represent Bombay in the 1987–88 season, but he was not selected for the final eleven in any of the matches, though he was often used

# 3. Apply word sense disambiguation


In [None]:
#installing dependencies
from nltk.corpus import wordnet
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize

In [None]:
# 1.a "He used the key to unlock the door."
# Disambiguate 'key' with the Lesk algorithm, using the tokenized sentence as context
a1 = lesk(
    context_sentence=word_tokenize('He used the key to unlock the door.'),
    ambiguous_word='key',
)
a1.definition()


'mechanical device used to wind another device that is driven by a spring (as a clock)'

In [None]:
# 1.b "The answer to the problem lies in the key details."
# NOTE(review): no `pos` hint is passed, so lesk may pick a sense from any
# part of speech — consider constraining it if the result looks wrong.
a2 = lesk(
    context_sentence=word_tokenize('The answer to the problem lies in the key details.'),
    ambiguous_word='key',
)
a2.definition()

'vandalize a car by scratching the sides with a key'

In [None]:
# 2.a "The weight is measured on a scale weighing instrument."
# Disambiguate 'scale' with the tokenized sentence as context
b1 = lesk(
    context_sentence=word_tokenize('The weight is measured on a scale weighing instrument.'),
    ambiguous_word='scale',
)
b1.definition()

'size or measure according to a scale'

In [None]:
# 2.b "The project is too large in scale for one person."
# Disambiguate 'scale' with the tokenized sentence as context
b2 = lesk(
    context_sentence=word_tokenize('The project is too large in scale for one person'),
    ambiguous_word='scale',
)
b2.definition()

'size or measure according to a scale'

In [None]:
# 3.a "The knife has a very sharp blade."
# Disambiguate 'sharp' with the tokenized sentence as context
c1 = lesk(
    context_sentence=word_tokenize('The knife has a very sharp blade.'),
    ambiguous_word='sharp',
)
c1.definition()

'having or emitting a high-pitched and sharp tone or tones'

In [None]:
# 3.b "His mind was always sharp and quick."
# Disambiguate 'sharp' with the tokenized sentence as context
c2 = lesk(
    context_sentence=word_tokenize('His mind was always sharp and quick.'),
    ambiguous_word='sharp',
)
c2.definition()

'having or emitting a high-pitched and sharp tone or tones'

# 4. Use the following sentences to build a bag of words using CountVectorizer.


In [None]:
import pandas as pd


In [None]:
from sklearn.feature_extraction.text import CountVectorizer


In [None]:

# Bag-of-words vectorizer created with CountVectorizer's default settings
cv = CountVectorizer()


In [None]:
# The five example sentences used to build the bag of words
sentence_1, sentence_2, sentence_3, sentence_4, sentence_5 = (
    "The postman delivered the package to the wrong address.",
    "I wrapped a beautiful present for my friend's birthday.",
    "The delivery truck arrived late due to heavy traffic.",
    "We need to check the shipping address before sending the order.",
    "Online shopping offers a wide variety of products with fast delivery.",
)

In [None]:
# Lower-case and tokenize every sentence, concatenating the tokens in sentence order
words = [
    token
    for sentence in (sentence_1, sentence_2, sentence_3, sentence_4, sentence_5)
    for token in word_tokenize(sentence.lower())
]
# Distinct tokens across all five sentences
tokens = set(words)

In [None]:
# Per-sentence token lists (lower-cased before tokenizing)
tokens1, tokens2, tokens3, tokens4, tokens5 = (
    word_tokenize(sentence.lower())
    for sentence in (sentence_1, sentence_2, sentence_3, sentence_4, sentence_5)
)


In [None]:
# Count how often each vocabulary term occurs in each tokenized sentence.
# The vocabulary is learned from `cv` here instead of being read from
# `df.columns`, because `df` is only created in a later cell and is therefore
# undefined when the notebook is run top to bottom on a fresh kernel.
vocabulary = cv.fit(
    [sentence_1, sentence_2, sentence_3, sentence_4, sentence_5]
).get_feature_names_out()
counts1 = [tokens1.count(term) for term in vocabulary]
counts2 = [tokens2.count(term) for term in vocabulary]
counts3 = [tokens3.count(term) for term in vocabulary]
counts4 = [tokens4.count(term) for term in vocabulary]
counts5 = [tokens5.count(term) for term in vocabulary]


In [None]:
# Build the bag-of-words matrix: one row per sentence, one column per vocabulary term
mydata = cv.fit_transform([sentence_1, sentence_2, sentence_3, sentence_4, sentence_5])
# Convert the sparse count matrix to a dense array and label the columns with the learned terms
df = pd.DataFrame(data=mydata.toarray(),
                  columns=cv.get_feature_names_out())
df

Unnamed: 0,address,arrived,beautiful,before,birthday,check,delivered,delivery,due,fast,...,the,to,traffic,truck,variety,we,wide,with,wrapped,wrong
0,1,0,0,0,0,0,1,0,0,0,...,3,1,0,0,0,0,0,0,0,1
1,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,1,1,0,...,1,1,1,1,0,0,0,0,0,0
3,1,0,0,1,0,1,0,0,0,0,...,2,1,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,1,1,0,0
