In [1]:
# 7a
# 1. Import Required Libraries
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources used below: the Punkt sentence-tokenizer data
# (needed by nltk.sent_tokenize) and the stopword lists.
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so both are requested to keep the script working on old and
# new installations. nltk.download reports an unavailable package and returns
# False rather than raising, so the extra request is harmless where unused.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# 2. Define Sample Text
# Hard-coded passage that serves as the document to summarize. The line
# breaks inside the literal (including the leading and trailing ones) are part
# of the string.
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data.
Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.
"""

# 3. Preprocess the Text
# Split the text into sentences with NLTK's sentence tokenizer. These
# unfiltered sentences are the ones later passed to generate_summary, so the
# summary is built from the original wording.
sentences = nltk.sent_tokenize(text)

# English stopword list from NLTK, held as a set so the membership test in
# preprocess_sentence is a hash lookup.
stop_words = set(stopwords.words('english'))

# Function to preprocess each sentence by removing stopwords
def preprocess_sentence(sentence):
    """Return *sentence* with stopwords removed.

    The sentence is split on whitespace and each token is compared, in
    lowercase, against the module-level ``stop_words`` set. Tokens are not
    stripped of punctuation before the comparison, so a token such as
    ``"the,"`` is kept. Surviving tokens keep their original casing and are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stopwords from every sentence before vectorizing
preprocessed_sentences = list(map(preprocess_sentence, sentences))

# 4. Compute TF-IDF Matrix
# Fit a default TfidfVectorizer on the filtered sentences and encode them in
# one step; each row of the resulting matrix represents one sentence.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

# 5. Compute Cosine Similarity
# Pairwise cosine similarity between all sentence vectors. When only one
# matrix is given, cosine_similarity compares it against itself.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# 6. Generate Summary
# Function to generate a summary by ranking sentences based on their similarity scores
def generate_summary(sentences, sim_matrix, top_n=2):
    """Build an extractive summary from the highest-scoring sentences.

    Each sentence is scored by the sum of its row in ``sim_matrix``, i.e. its
    total similarity to every sentence (itself included). The ``top_n``
    best-scoring sentences are selected and joined with single spaces in the
    order in which they appear in ``sentences``.

    Args:
        sentences: Sequence of sentence strings; ``sentences[i]`` corresponds
            to row ``i`` of ``sim_matrix``.
        sim_matrix: Two-dimensional pairwise-similarity array providing
            ``.sum(axis=1)`` (for example a NumPy ndarray).
        top_n: Maximum number of sentences to keep. Values of zero or less
            produce an empty summary; values larger than the number of
            sentences keep all of them.

    Returns:
        The selected sentences joined by single spaces, or ``''`` when
        nothing is selected.
    """
    if top_n <= 0:
        # Guard explicitly: a tail slice such as [-0:] would select every
        # sentence instead of none.
        return ''

    # Compute the sum of similarity scores for each sentence
    scores = sim_matrix.sum(axis=1)

    # Negate so that a stable ascending sort lists the best score first and
    # resolves ties in favour of the earlier sentence.
    best_first = (-scores).argsort(kind='stable')[:top_n]

    # Emit the chosen sentences in document order so the summary reads
    # naturally rather than in score order.
    return ' '.join(sentences[i] for i in sorted(best_first))

# Summarize the sample text with the default sentence count, then show the
# heading and the summary on separate lines
summary = generate_summary(sentences, cosine_sim_matrix)
print("Summary:", summary, sep="\n")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Summary:

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial
intelligence concerned with the interactions between computers and human language, in
particular how to program computers to process and analyze large amounts of natural language
data. Challenges in natural language processing frequently involve speech recognition, natural
language understanding, and natural language generation.
