In [1]:
# Import required libraries
import nltk
import re
import pickle
import numpy as np

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, confusion_matrix


In [3]:
# Download the NLTK resources used below (only needs to succeed once per machine).
# 'punkt' serves older NLTK releases; recent releases (3.9+) load the tokenizer
# data from 'punkt_tab' instead, so both are fetched to keep sent_tokenize and
# word_tokenize working regardless of the installed version.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Input document for the summarizer; swap in any longer passage of text.
text = """
Data science is an interdisciplinary field that combines statistics, computer science, and domain expertise to extract meaningful insights from data. It involves collecting, cleaning, analyzing, and visualizing data to support decision-making and predictions. Data scientists use tools such as Python, R, SQL, and machine learning algorithms to build predictive models and uncover hidden patterns. In industries like healthcare, finance, marketing, and transportation, data science is transforming how organizations operate and deliver value. However, working with data also requires ethical considerations, such as ensuring privacy, avoiding bias, and maintaining transparency. As data continues to grow in volume and complexity, data science is becoming an essential skill in the modern digital economy.

"""

In [5]:
# Split the document into sentences; these originals are what the summary quotes.
sentences = sent_tokenize(text, language='english')

In [6]:
# Shared preprocessing state: the list that will hold one cleaned string per
# sentence, a Porter stemmer, and the English stopword set for fast lookups.
processed_sentences = []
stemmer = PorterStemmer()
stop_words = {word for word in stopwords.words('english')}

In [7]:
# Start from an empty list on every run so that re-executing this cell does not
# append a second copy of each sentence (which would misalign the TF-IDF rows
# with the entries of `sentences`).
processed_sentences = []
for sentence in sentences:
    # Replace every non-letter character with a space, then lowercase
    cleaned = re.sub(r'[^a-zA-Z]', ' ', sentence).lower()
    words = word_tokenize(cleaned)
    # Drop stopwords and stem the remaining words
    filtered = [stemmer.stem(word) for word in words if word not in stop_words]
    # One space-joined string per sentence, in the same order as `sentences`
    processed_sentences.append(' '.join(filtered))


In [8]:
# Fit a TF-IDF model on the cleaned sentences and materialise the result as a
# dense array with one row per sentence.
vectorizer = TfidfVectorizer()
tfidf_matrix = (
    vectorizer
    .fit_transform(processed_sentences)
    .toarray()
)


In [9]:
# Score each sentence by adding up the TF-IDF weights along its row.
sentence_scores = tfidf_matrix.sum(axis=1)


In [10]:
# Number of sentences to keep in the summary.
num_sentences = 3
# Rank sentence indices from highest to lowest score and keep the first
# `num_sentences` of them. Slicing from the front of the descending order
# (instead of `[-num_sentences:]` on the ascending order) means a value of 0
# selects nothing rather than everything; negative values are treated as 0,
# and values larger than the number of sentences simply select all of them.
ranked_indices = np.argsort(sentence_scores)[::-1]
# Sort the chosen indices so the summary follows the original document order.
top_indices = sorted(ranked_indices[:max(num_sentences, 0)])

In [11]:
# Collect the original (unprocessed) sentences at the selected positions.
summary = []
for idx in top_indices:
    summary.append(sentences[idx])

In [12]:
# Show the heading, then each selected sentence on its own line.
print("=== Extractive Summary ===\n")
if summary:
    print("\n".join(summary))


=== Extractive Summary ===

Data scientists use tools such as Python, R, SQL, and machine learning algorithms to build predictive models and uncover hidden patterns.
In industries like healthcare, finance, marketing, and transportation, data science is transforming how organizations operate and deliver value.
However, working with data also requires ethical considerations, such as ensuring privacy, avoiding bias, and maintaining transparency.


In [14]:
# Bundle the fitted vectorizer with the original and cleaned sentences, then
# persist the bundle to disk with pickle.
model_data = dict(
    vectorizer=vectorizer,
    original_sentences=sentences,
    processed_sentences=processed_sentences,
)
with open('tfidf_summarizer.pkl', 'wb') as handle:
    pickle.dump(model_data, handle)

print("\nModel saved as tfidf_summarizer.pkl")


Model saved as tfidf_summarizer.pkl


In [15]:
# NOTE(review): this is a self-consistency check, not an independent evaluation.
# Both label vectors are derived from the same `sentence_scores`: `y_true` marks
# the sentences the summarizer itself selected, and `y_pred` marks the sentences
# scoring above the median. Meaningful precision/recall would need
# human-annotated reference sentences in place of `y_true`.
selected = set(top_indices)  # set membership instead of scanning the list each time
y_true = [1 if i in selected else 0 for i in range(len(sentences))]
threshold = np.median(sentence_scores)
y_pred = [1 if score > threshold else 0 for score in sentence_scores]

print("\nEvaluation Metrics:")
# zero_division=0 reports 0.0 (without a warning) when no sentence is predicted
# or selected; labels=[0, 1] keeps the confusion matrix 2x2 even if one class
# is absent from both vectors.
print("Precision:", precision_score(y_true, y_pred, zero_division=0))
print("Recall:", recall_score(y_true, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred, labels=[0, 1]))



Evaluation Metrics:
Precision: 1.0
Recall: 1.0
Confusion Matrix:
 [[3 0]
 [0 3]]
