In [1]:
import nltk
nltk.download('punkt') # punkt tokenizer for sentence tokenization
nltk.download('stopwords') # list of stop words, such as 'a', 'an', 'the', 'in', etc, which would be dropped
from collections import Counter # Imports the Counter class from the collections module, used for counting the frequency of words in a text.
from nltk.corpus import stopwords # Imports the stop words list from the NLTK corpus
# corpus is a large collection of text or speech data used for statistical analysis
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
import numpy as np
import rouge
from rouge import Rouge
from sklearn.feature_extraction.text import CountVectorizer
# Imports the sentence tokenizer and word tokenizer from the NLTK tokenizer module. 
# Sentence tokenizer is for splitting text into sentences
# word tokenizer is for splitting sentences into words

# this function would take 2 inputs, one being the text, and the other being the summary which would contain the number of lines
def generate_summary(text, n):
    sentences = sent_tokenize(text)
    # Tokenize each sentence into individual words and remove stopwords
    stop_words = set(stopwords.words('english'))
    # the following line would tokenize each sentence from sentences into individual words using the word_tokenize function of nltk.tokenize module
    # Then removes any stop words and non-alphanumeric characters from the resulting list of words and converts them all to lowercase
    words = [word.lower() for word in word_tokenize(text) if word.lower() not in stop_words and word.isalnum()]
    word_freq = Counter(words)
    sentence_scores = {}
    for sentence in sentences:
        sentence_words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()]
        sentence_score = sum([word_freq[word] for word in sentence_words])
        if len(sentence_words) < 20:
            sentence_scores[sentence] = sentence_score
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:n]
    summary = ' '.join(summary_sentences)

    return summary

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedpr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedpr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# this function would take 2 inputs, one being the text, and the other being the summary which would contain the number of lines
def generate_summary(text, n):
    # Tokenize the text into individual sentences
    sentences = sent_tokenize(text)
    
    # Tokenize each sentence into individual words and remove stopwords
    stop_words = set(stopwords.words('english'))
    # the following line would tokenize each sentence from sentences into individual words using the word_tokenize function of nltk.tokenize module
    # Then removes any stop words and non-alphanumeric characters from the resulting list of words and converts them all to lowercase
    words = [word.lower() for word in word_tokenize(text) if word.lower() not in stop_words and word.isalnum()]
    
    # Compute the frequency of each word
    word_freq = Counter(words)
    
# Compute the score for each sentence based on the frequency of its words
# After this block of code is executed, sentence_scores will contain the scores of each sentence in the given text, 
# where each score is a sum of the frequency counts of its constituent words

    # empty dictionary to store the scores for each sentence
    sentence_scores = {}

    for sentence in sentences:
        sentence_words = [word.lower() for word in word_tokenize(sentence) if word.lower() not in stop_words and word.isalnum()]
        sentence_score = sum([word_freq[word] for word in sentence_words])
        if len(sentence_words) < 20:
            sentence_scores[sentence] = sentence_score
    
# checks if the length of the sentence_words list is less than 20 (parameter can be adjusted based on the desired length of summary sentences)
# If condition -> true, score of the current sentence is added to the sentence_scores dictionary with the sentence itself as the key
# This is to filter out very short sentences that may not provide meaningful information for summary generation

    # Select the top n sentences with the highest scores
    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:n]
    summary = ' '.join(summary_sentences)
    
    return summary


text = '''
Weather is the day-to-day or hour-to-hour change in the atmosphere. 
Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more. 
Energy from the Sun affects the weather too. 
Climate tells us what kinds of weather usually happen in an area at different times of the year. 
Changes in weather can affect our mood and life. We wear different clothes and do different things in different weather conditions. 
We choose different foods in different seasons.
Weather stations around the world measure different parts of weather. 
Ways to measure weather are wind speed, wind direction, temperature and humidity. 
People try to use these measurements to make weather forecasts for the future. 
These people are scientists that are called meteorologists. 
They use computers to build large mathematical models to follow weather trends.'''

summary = generate_summary(text, 5)
summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)
print(formatted_summary)

We wear different clothes and do different things in different weather conditions.
Weather stations around the world measure different parts of weather.
Climate tells us what kinds of weather usually happen in an area at different times of the year.
Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more.
Ways to measure weather are wind speed, wind direction, temperature and humidity.


In [3]:
def read_article(text):
    sentences = sent_tokenize(text)
    return sentences

def sentence_similarity(sent1, sent2, stop_words):
    words1 = [word.lower() for word in sent1.split() if word.isalnum() and word.lower() not in stop_words]
    words2 = [word.lower() for word in sent2.split() if word.isalnum() and word.lower() not in stop_words]
    
    all_words = list(set(words1 + words2))
    
    vectorizer = CountVectorizer().fit_transform([sent1, sent2])
    vectors = vectorizer.toarray()
    
    return 1 - cosine_distance(vectors[0], vectors[1])

def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j], stop_words)
    
    return similarity_matrix

def generate_summary(text, num_sentences=5):
    stop_words = set(stopwords.words("english"))
    sentences = read_article(text)
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    
    # Apply PageRank algorithm
    scores = np.array([np.sum(row) for row in sentence_similarity_matrix])
    
    # Get top sentences based on scores
    ranked_sentences = [sentences[i] for i in np.argsort(scores)[-num_sentences:]]
    
    return " ".join(ranked_sentences)

# Example usage:
input_text = """
Weather is the day-to-day or hour-to-hour change in the atmosphere. 
Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more. 
Energy from the Sun affects the weather too. 
Climate tells us what kinds of weather usually happen in an area at different times of the year. 
Changes in weather can affect our mood and life. We wear different clothes and do different things in different weather conditions. 
We choose different foods in different seasons.
Weather stations around the world measure different parts of weather. 
Ways to measure weather are wind speed, wind direction, temperature and humidity. 
People try to use these measurements to make weather forecasts for the future. 
These people are scientists that are called meteorologists. They use computers to build large mathematical models to follow weather trends."""

summary = generate_summary(input_text)
print("Summary:")
print(summary)


Summary:
Climate tells us what kinds of weather usually happen in an area at different times of the year. People try to use these measurements to make weather forecasts for the future. 
Weather is the day-to-day or hour-to-hour change in the atmosphere. We wear different clothes and do different things in different weather conditions. Weather stations around the world measure different parts of weather.


In [4]:
# a defined function called evaluate_rouge taking two arguments, 
# one being reference text and the other summary text, 
# and uses the ROUGE metric to evaluate the quality of the summary text compared to the reference text.
# The function uses the rouge library to compute the ROUGE scores and returns the F1 score of the ROUGE-1 metric.
def evaluate_rouge(reference_text, summary_text):
    rouge = Rouge()
    scores = rouge.get_scores(reference_text, summary_text)
    return scores[0]['rouge-1']['f']


# the following is a human generated summary
reference_summary = '''
Weather is a gradual slow change through days and hours in the atmosphere and can vary from wind to snow. 
Climate tells a lot about the weather in an area.
The livelihood of people changes according to the change in weather.
Weather stations measure different parts of weather.
People who use measurements to make weather forecasts for the future are called meteorologists, and are scientists.'''

# the sample text from Wikipedia
text = '''
Weather is the day-to-day or hour-to-hour change in the atmosphere. 
Weather includes wind, lightning, storms, hurricanes, tornadoes (also known as twisters), rain, hail, snow, and lots more. 
Energy from the Sun affects the weather too. 
Climate tells us what kinds of weather usually happen in an area at different times of the year. 
Changes in weather can affect our mood and life. We wear different clothes and do different things in different weather conditions. 
We choose different foods in different seasons.
Weather stations around the world measure different parts of weather. 
Ways to measure weather are wind speed, wind direction, temperature and humidity. 
People try to use these measurements to make weather forecasts for the future. 
These people are scientists that are called meteorologists. 
They use computers to build large mathematical models to follow weather trends.'''

# Generate summary using frequency-based/TF-IDF approach
summary = generate_summary(text, 5)

# Evaluate the summary using ROUGE
rouge_score = evaluate_rouge(reference_summary, summary)

print(f"ROUGE score: {rouge_score}")


ROUGE score: 0.5319148886192849


In [5]:
# importing the required libraries

# importing TfidfVectorizer class to convert a collection of raw documents to a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

# importing cosine_similarity function to compute the cosine similarity between two vectors.
from sklearn.metrics.pairwise import cosine_similarity

# importing nlargest to return the n largest elements from an iterable in descending order.
from heapq import nlargest

def generate_summary(text, n):
    # Tokenize the text into individual sentences
    sentences = sent_tokenize(text)

    # Create the TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(sentences)

    # Compute the cosine similarity between each sentence and the document
    sentence_scores = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])[0]

    # Select the top n sentences with the highest scores
    summary_sentences = nlargest(n, range(len(sentence_scores)), key=sentence_scores.__getitem__)

    summary_tfidf = ' '.join([sentences[i] for i in sorted(summary_sentences)])

    return summary_tfidf

In [6]:
summary = generate_summary(text, 5)
summary_sentences = summary.split('. ')
formatted_summary = '.\n'.join(summary_sentences)
print(formatted_summary)

Energy from the Sun affects the weather too.
Changes in weather can affect our mood and life.
We wear different clothes and do different things in different weather conditions.
Weather stations around the world measure different parts of weather.
People try to use these measurements to make weather forecasts for the future.
