In [27]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.utils import get_stop_words
from rouge import Rouge

# Load the dataset from a CSV file

data = pd.read_csv("data.csv",nrows=10000)

# Create a dataframe
df = pd.DataFrame(data)

# Load and preprocess the dataset
abstracts = df['abstract'].tolist()
titles = df['title'].tolist()

# Text preprocessing
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')









[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pawan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pawan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
def preprocess_text(text):
    if isinstance(text, str):
        # Remove unnecessary characters and convert to lowercase
        text = text.lower()
        # Tokenize into sentences
        sentences = tokenizer.tokenize(text)
        # Tokenize into words
        words = [word for sentence in sentences for word in nltk.word_tokenize(sentence)]
        # Remove stopwords and punctuation
        words = [word for word in words if word.isalnum() and word not in stop_words]
        return ' '.join(words), sentences
    else:
        return '', []


processed_abstracts, abstract_sentences = zip(*[preprocess_text(abstract) if abstract else ('', []) for abstract in abstracts])
processed_titles, _ = zip(*[preprocess_text(title) for title in titles])


In [29]:
# Extract relevant features - TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_abstracts)
from scipy.sparse import csr_matrix

tfidf_matrix = csr_matrix(tfidf_matrix)

# Sentence scoring - TF-IDF scores
def calculate_tfidf_scores(matrix, sentences):
    tfidf_scores = matrix.toarray()
    num_documents = tfidf_scores.shape[0]
    for i in range(num_documents):
        abstract_sentence_scores = tfidf_scores[i]
        sentence_indices = [j for j in range(num_documents) if j != i]  # Exclude current abstract
        for j in sentence_indices:
            abstract_sentence_scores += tfidf_scores[j]
        tfidf_scores[i] = abstract_sentence_scores / (num_documents - 1)  # Divide by number of other sentences
    return tfidf_scores.mean(axis=1)

tfidf_scores = calculate_tfidf_scores(tfidf_matrix, abstract_sentences)


In [30]:
# Select top sentences
def select_top_sentences(scores, sentences, n=3):
    sorted_sentences = [sentence for _, sentence in sorted(zip(scores, sentences), reverse=True)]
    return sorted_sentences[:n]

selected_sentences = select_top_sentences(tfidf_scores, abstracts)

In [34]:
# Generate summary using LexRank
def generate_summary(sentences, n=1000):
    parser = PlaintextParser.from_string(' '.join(sentences), Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summarizer.stop_words = get_stop_words("english")
    summary = summarizer(document=parser.document, sentences_count=n)
    return [str(sentence) for sentence in summary]

generated_summary = generate_summary(selected_sentences)
generated_summary

['[1] Hourly sea level observations measured by five tide gauges at Santa Cruz harbor (Tenerife Island), in the Northeastern Tropical Atlantic, have been merged to build a consistent and almost continuous sea level record starting in 1927.',
 'Datum continuity was ensured using high precision leveling information.',
 'The time series underwent a detailed quality control in order to remove outliers, time drifts, and datum shifts.',
 'The resulting sea level record was then used to describe the low frequency (interannual to decadal) sea level variability at Tenerife.',
 'It was found that at interannual and longer time scales, the observed sea level changes are primarily driven by steric sea level variations.',
 'Such steric changes are originated by coastal trapped waves induced by longshore winds along the continental coast and propagate poleward.',
 'Observed sea level rise at Tenerife was 2.09 6 0.04 mm/yr since 1927.',
 'According to the hydrographic observations in the area, only h

In [41]:
from itertools import chain
from rouge import Rouge

# Flatten the lists of summaries and titles
generated_summary_flat = list(chain.from_iterable(generated_summary))
processed_titles_flat = list(chain.from_iterable(processed_titles))

# print("Number of generated summaries:", len(generated_summary_flat))
# print("Number of processed titles:", len(processed_titles_flat))
print("Number of generated summaries:", len(generated_summary_flat))
print("Number of processed titles:", len(processed_titles_flat))
# # Evaluate the performance
# rouge = Rouge()
# scores = rouge.get_scores(generated_summary_flat, processed_titles_flat, avg=True)

# print("ROUGE Scores:", scores)


Number of generated summaries:10000
Number of processed titles:10000


In [38]:
# Additional evaluation using beLU measure
def calculate_belue_measure(generated, ground_truth):
    generated_tokens = nltk.word_tokenize(generated)
    ground_truth_tokens = nltk.word_tokenize(ground_truth)
    
    intersection = len(set(generated_tokens) & set(ground_truth_tokens))
    recall = intersection / len(ground_truth_tokens)
    precision = intersection / len(generated_tokens)
    
    if precision + recall > 0:
        f1_score = (2 * precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    
    return f1_score

belue_scores = [calculate_belue_measure(summary, title) for summary, title in zip(generated_summary, titles)]
average_belue_score = (sum(belue_scores) / len(belue_scores))

print("beLU Measure:", average_belue_score)




beLU Measure: 0.3953212504743868
