In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found anywhere under /kaggle/input,
# then print them one per line in the order os.walk yields them.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

A. BASIC TEXT SUMMARIZATION USING TF-IDF AND COSINE SIMILARITY

In [2]:
# Step 1: Import Required Libraries
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary resources for nltk.
# 'punkt' is the sentence-tokenizer model used by older NLTK releases; newer
# releases load 'punkt_tab' instead, so both are requested to keep
# nltk.sent_tokenize working on a fresh kernel regardless of the installed version.
# 'stopwords' provides the English stopword list used during preprocessing.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Step 2: Define Sample Text
# The literal is kept exactly as written, including its leading newline and the
# line breaks inside each sentence.
# NOTE(review): the printed summary further down still contains these line breaks,
# so the tokenizer appears to keep them inside each sentence string.
text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
concerned with the interactions between computers and human language, in particular how to program computers 
to process and analyze large amounts of natural language data. 
Challenges in natural language processing frequently involve speech recognition, 
natural language understanding, and natural language generation.
"""

# Step 3: Preprocess the Text
# Split text into sentences (one string per sentence; these original strings are
# what the summary is assembled from later on)
sentences = nltk.sent_tokenize(text)

# Define stopwords
# Stored as a set because preprocess_sentence tests membership once per word.
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def preprocess_sentence(sentence):
    """Return `sentence` with every stopword removed.

    The sentence is split on whitespace; a token is dropped when its
    lower-cased form is in the module-level `stop_words` collection.
    Surviving tokens keep their original casing and attached punctuation
    and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stopwords from every sentence before vectorizing
preprocessed_sentences = list(map(preprocess_sentence, sentences))

# Step 4: Compute TF-IDF Matrix (one row per preprocessed sentence)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

# Step 5: Compute Cosine Similarity
# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, i.e. every sentence against every sentence.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Step 6: Generate Summary
def generate_summary(sentences, sim_matrix, top_n=2):
    """Build an extractive summary from the most "central" sentences.

    Each sentence is scored by the sum of its similarities to all sentences
    (its row sum in `sim_matrix`). The `top_n` highest-scoring sentences are
    selected and returned in their original document order, so the summary
    reads naturally.

    Args:
        sentences: Sequence of sentence strings, aligned with the rows of
            `sim_matrix`.
        sim_matrix: Square pairwise-similarity matrix exposing
            `.sum(axis=1)` (e.g. the array returned by `cosine_similarity`).
        top_n: Maximum number of sentences to keep. Values larger than the
            number of sentences keep all of them; values <= 0 keep none.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when nothing is selected.
    """
    # Guard explicitly: a slice of [-0:] would otherwise select *every* sentence,
    # and a negative top_n would silently drop the lowest-scoring ones instead.
    if top_n <= 0 or len(sentences) == 0:
        return ''

    # Row sums; flattened so that matrix-like results behave like a 1-D array.
    scores = np.asarray(sim_matrix.sum(axis=1)).ravel()

    # Indices of the top_n best-scoring sentences, then restored to document order.
    top_indices = sorted(int(i) for i in scores.argsort()[-top_n:])
    return ' '.join(sentences[i] for i in top_indices)

# Rank the original sentences with the similarity matrix, then show the result
# under a "Summary:" heading (heading and summary on separate lines).
summary = generate_summary(sentences, cosine_sim_matrix)
print("Summary:", summary, sep="\n")


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Summary:

Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
concerned with the interactions between computers and human language, in particular how to program computers 
to process and analyze large amounts of natural language data. Challenges in natural language processing frequently involve speech recognition, 
natural language understanding, and natural language generation.


B. ABSTRACTIVE TEXT SUMMARIZATION WITH TRANSFORMERS

In [1]:
# Import required libraries
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset

# Load the dataset (using 1% of the test split of CNN/DailyMail)
# The "test[:1%]" slice keeps only the first 1% of the test split, which keeps
# this demo cell quick.
# NOTE(review): the path points at a Kaggle input directory, so this presumably
# requires that dataset to be attached to the notebook — confirm.
dataset = load_dataset("/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail", split="test[:1%]")

# Load pre-trained BART model and tokenizer
# Both are created from the same checkpoint name so the tokenizer's vocabulary
# matches the model. The cell output shows the checkpoint files (config, weights,
# vocab, merges) being downloaded on first use.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Function to generate a summary using the BART model
def summarize(text, max_length=150, min_length=40, num_beams=4,
              length_penalty=2.0, max_input_length=1024):
    """Generate an abstractive summary of `text` with the module-level BART model.

    Uses the module-level `tokenizer` and `model` objects. Calling
    `summarize(text)` with no extra arguments uses the same generation
    settings as before; the keyword arguments only make them tunable.

    Args:
        text: The document to summarize.
        max_length: Upper bound on the generated summary length, in tokens.
        min_length: Lower bound on the generated summary length, in tokens.
        num_beams: Beam width used for beam search.
        length_penalty: Length penalty passed to beam search.
        max_input_length: Inputs longer than this many tokens are truncated.
            NOTE(review): 1024 is presumably the checkpoint's maximum input
            length — confirm against the model config before raising it.

    Returns:
        The decoded summary string, with special tokens removed.
    """
    # Tokenize input text as a batch of one; over-long inputs are truncated.
    inputs = tokenizer([text], max_length=max_input_length, return_tensors='pt', truncation=True)

    # Generate summary ids. The attention mask is forwarded explicitly so the
    # model is told which positions are real tokens rather than inferring it.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Only one sequence was submitted, so decode the first (and only) result.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Take the text of the first record in the loaded split
article = dataset[0]['article']

# Run it through the BART summarizer
summary = summarize(article)

# Show the source article followed by the model's summary, each under its heading
for heading, body in (
    ("Original Article:\n", article),
    ("\nGenerated Summary:\n", summary),
):
    print(heading, body)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Original Article:
 Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee. 'It is time that the DOT and FAA take a stand for humane treatment of passengers.' But could crowding on planes lead to more serious issues than 