Importing required libraries

In [3]:
import pandas as pd
import re
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import re
import spacy
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset, load_from_disk
import accelerate

Loading the dataset & data exploration

In [5]:
train_data = pd.read_csv('train.csv')
val_data = pd.read_csv('validation.csv')
test_data = pd.read_csv('test.csv')

In [7]:
print(train_data.head())
print(val_data.head())
print(test_data.head())

                                         id  \
0  0001d1afc246a7964130f43ae940af6bc6c57f01   
1  0002095e55fcbd3a2f366d9bf92a95433dc305ef   
2  00027e965c8264c35cc1bc55556db388da82b07f   
3  0002c17436637c4fe1837c935c04de47adb18e9a   
4  0003ad6ef0c37534f80b55b4235108024b407f0b   

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                          highlights  
0  Bishop John Folda, of North Dakota, is taking ...  
1  Criminal complaint: Cop used his role to help ...  
2  Craig Eccleston-Todd, 27, had drunk at least t...  
3  Nina dos Santos says Europe must be ready to a...  
4  Fleetwood top of League One after 2-0 win at S...  
                                         id  \
0  

Preprocessing

In [None]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

train_data['article'] = train_data['article'].apply(preprocess)
train_data['highlights'] = train_data['highlights'].apply(preprocess)

val_data['article'] = val_data['article'].apply(preprocess)
val_data['highlights'] = val_data['highlights'].apply(preprocess)

test_data['article'] = test_data['article'].apply(preprocess)
test_data['highlights'] = test_data['highlights'].apply(preprocess)


In [None]:
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)


In [None]:
train_data.to_csv('cleaned_train_data.csv', index=False)
val_data.to_csv('cleaned_val_data.csv', index=False)
test_data.to_csv('cleaned_test_data.csv', index=False)

In [None]:
##Remove common words that don't add much meaning (like "the", "is", "in", etc.).

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

train_data['article'] = train_data['article'].apply(remove_stopwords)
train_data['highlights'] = train_data['highlights'].apply(remove_stopwords)

val_data['article'] = val_data['article'].apply(remove_stopwords)
val_data['highlights'] = val_data['highlights'].apply(remove_stopwords)

test_data['article'] = test_data['article'].apply(remove_stopwords)
test_data['highlights'] = test_data['highlights'].apply(remove_stopwords)

# Save data after removing stopwords
train_data.to_csv('stopwords_removed_train_data.csv', index=False)
val_data.to_csv('stopwords_removed_val_data.csv', index=False)
test_data.to_csv('stopwords_removed_test_data.csv', index=False)


In [None]:
##Use SpaCy for lemmatization to convert words to their base forms.

In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm


In [2]:
# !pip install spacy tqdm
# !python -m spacy download en_core_web_sm


In [15]:
import pandas as pd
import re
import spacy
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter

# Load datasets
train_data = pd.read_csv('cleaned_train_data.csv')
val_data = pd.read_csv('cleaned_val_data.csv')
test_data = pd.read_csv('cleaned_test_data.csv')

# Function to clean text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Apply cleaning to all datasets
train_data['article'] = train_data['article'].apply(clean_text)
train_data['highlights'] = train_data['highlights'].apply(clean_text)

val_data['article'] = val_data['article'].apply(clean_text)
val_data['highlights'] = val_data['highlights'].apply(clean_text)

test_data['article'] = test_data['article'].apply(clean_text)
test_data['highlights'] = test_data['highlights'].apply(clean_text)

# Load the pre-trained SpaCy model globally
nlp = spacy.load('en_core_web_sm')

# Function for lemmatization using nlp.pipe for batch processing
def lemmatize_texts_with_progress(texts):
    lemmatized_texts = []
    for doc in tqdm(nlp.pipe(texts, batch_size=50, disable=['parser', 'ner']), total=len(texts)):
        lemmatized_texts.append(' '.join([token.lemma_ for token in doc]))
    return lemmatized_texts

# Apply lemmatization with progress bar
train_data['article'] = lemmatize_texts_with_progress(train_data['article'].tolist())
train_data['highlights'] = lemmatize_texts_with_progress(train_data['highlights'].tolist())

val_data['article'] = lemmatize_texts_with_progress(val_data['article'].tolist())
val_data['highlights'] = lemmatize_texts_with_progress(val_data['highlights'].tolist())

test_data['article'] = lemmatize_texts_with_progress(test_data['article'].tolist())
test_data['highlights'] = lemmatize_texts_with_progress(test_data['highlights'].tolist())

# Save lemmatized data
train_data.to_csv('lemmatized_train_data.csv', index=False)
val_data.to_csv('lemmatized_val_data.csv', index=False)
test_data.to_csv('lemmatized_test_data.csv', index=False)


100%|████████████████████████████████████████████████████████████████████████| 287113/287113 [3:35:38<00:00, 22.19it/s]
100%|█████████████████████████████████████████████████████████████████████████| 287113/287113 [14:04<00:00, 339.91it/s]
100%|████████████████████████████████████████████████████████████████████████████| 13368/13368 [08:01<00:00, 27.76it/s]
100%|███████████████████████████████████████████████████████████████████████████| 13368/13368 [00:41<00:00, 318.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 11490/11490 [07:00<00:00, 27.33it/s]
100%|███████████████████████████████████████████████████████████████████████████| 11490/11490 [00:35<00:00, 326.70it/s]


In [17]:
from collections import Counter

# Function to replace all numbers in the text with a placeholder <NUM>
def replace_numbers(text):
    return re.sub(r'\d+', '<NUM>', text)

# Function to replace rare words in the text with a placeholder <RARE>
# A word is considered rare if its frequency is below the specified threshold (default is 5)
def remove_rare_words(text, freq_threshold=5):
    words = text.split()  # Split the text into words
    word_freq = Counter(words)  # Count the frequency of each word
    # Identify words that occur less than the frequency threshold
    rare_words = {word for word, freq in word_freq.items() if freq < freq_threshold}
    # Replace rare words with <RARE>, keep other words unchanged
    filtered_text = [word if word not in rare_words else '<RARE>' for word in words]
    return ' '.join(filtered_text)  # Join the words back into a single string

# Apply number replacement and rare word removal to the 'article' and 'highlights' columns of all datasets
train_data['article'] = train_data['article'].apply(replace_numbers).apply(remove_rare_words)
train_data['highlights'] = train_data['highlights'].apply(replace_numbers).apply(remove_rare_words)

val_data['article'] = val_data['article'].apply(replace_numbers).apply(remove_rare_words)
val_data['highlights'] = val_data['highlights'].apply(replace_numbers).apply(remove_rare_words)

test_data['article'] = test_data['article'].apply(replace_numbers).apply(remove_rare_words)
test_data['highlights'] = test_data['highlights'].apply(replace_numbers).apply(remove_rare_words)

# Save the processed datasets with handled numbers and rare words
train_data.to_csv('handled_train_data.csv', index=False)  # Save the training data
val_data.to_csv('handled_val_data.csv', index=False)      # Save the validation data
test_data.to_csv('handled_test_data.csv', index=False)    # Save the test data

In [21]:
# Apply sentence tokenization to the 'article' column
# This splits the text of each article into a list of sentences
train_data['sentences'] = train_data['article'].apply(sent_tokenize)
val_data['sentences'] = val_data['article'].apply(sent_tokenize)
test_data['sentences'] = test_data['article'].apply(sent_tokenize)

# Save the datasets with the new 'sentences' column to CSV files
train_data.to_csv('tokenized_train_data.csv', index=False)  # Save the training data
val_data.to_csv('tokenized_val_data.csv', index=False)      # Save the validation data
test_data.to_csv('tokenized_test_data.csv', index=False)    # Save the test data

In [17]:
# Convert pandas DataFrames to Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

Preparing for Model Building

In [17]:
# Load the pre-trained T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define a function to tokenize the 'article' and 'highlights' columns
def tokenize_function(examples):
    # Tokenize the 'article' text with a maximum length of 512 tokens, truncating longer sequences
    model_inputs = tokenizer(examples['article'], max_length=512, truncation=True)
    
    # Tokenize the 'highlights' text (used as labels) with a maximum length of 150 tokens
    labels = tokenizer(examples['highlights'], max_length=150, truncation=True)
    
    # Set the 'input_ids' from the tokenized highlights as labels for the model
    model_inputs['labels'] = labels['input_ids']
    
    return model_inputs

# Apply the tokenization function to the training, validation, and test datasets
# The map function applies the tokenization in batches for efficiency
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [23]:
# from datasets import load_from_disk

# # Load the tokenized datasets
# tokenized_train_dataset = load_from_disk('tokenized_datasets/train')
# tokenized_val_dataset = load_from_disk('tokenized_datasets/val')
# tokenized_test_dataset = load_from_disk('tokenized_datasets/test')

In [1]:
# !pip install accelerate -U


In [33]:
import accelerate
print(accelerate.__version__)


0.33.0


In [3]:
# pip install accelerate transformers torch datasets


In [1]:
from datasets import load_from_disk

# Load the tokenized datasets
tokenized_train_dataset = load_from_disk('tokenized_datasets/train')
tokenized_val_dataset = load_from_disk('tokenized_datasets/val')
tokenized_test_dataset = load_from_disk('tokenized_datasets/test')


Adjusting Padding for Tokenized Text Data in T5 Model Training

In [35]:
import torch
from transformers import T5Tokenizer

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)

# Function to adjust padding
def adjust_padding(examples, max_length=512):
    # Adjust inputs
    inputs = tokenizer.pad(
        {"input_ids": examples["input_ids"], "attention_mask": examples["attention_mask"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Adjust labels
    labels = tokenizer.pad(
        {"input_ids": examples["labels"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Convert tensors to lists
    examples["input_ids"] = inputs["input_ids"].tolist()
    examples["attention_mask"] = inputs["attention_mask"].tolist()
    examples["labels"] = labels["input_ids"].tolist()
    return examples

# Adjust padding for each dataset
max_length_article = 512
max_length_summary = 150

tokenized_train_dataset = tokenized_train_dataset.map(lambda examples: adjust_padding(examples, max_length=max_length_article), batched=True)
tokenized_val_dataset = tokenized_val_dataset.map(lambda examples: adjust_padding(examples, max_length=max_length_article), batched=True)
tokenized_test_dataset = tokenized_test_dataset.map(lambda examples: adjust_padding(examples, max_length=max_length_article), batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Shuffling, Subsampling, and Padding Adjustments for T5 Model Training on Smaller Datasets

In [1]:
from datasets import load_from_disk
import random
from tqdm import tqdm
from transformers import T5Tokenizer

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)

# Load the tokenized datasets
tokenized_train_dataset = load_from_disk('tokenized_datasets/train')
tokenized_val_dataset = load_from_disk('tokenized_datasets/val')
tokenized_test_dataset = load_from_disk('tokenized_datasets/test')

# Function to shuffle and select a subset of the dataset with progress bar
def shuffle_and_select(dataset, num_samples, seed=42):
    indices = list(range(len(dataset)))
    random.seed(seed)
    random.shuffle(indices)
    selected_indices = indices[:num_samples]
    subset = dataset.select(selected_indices)
    return subset

# Select a smaller subset of the dataset with progress bar
print("Shuffling and selecting smaller train dataset...")
small_train_dataset = shuffle_and_select(tokenized_train_dataset, 5000)
print("Shuffling and selecting smaller val dataset...")
small_val_dataset = shuffle_and_select(tokenized_val_dataset, 1000)

# Adjust padding for smaller datasets
def adjust_padding(examples, max_length=512):
    # Adjust inputs
    inputs = tokenizer.pad(
        {"input_ids": examples["input_ids"], "attention_mask": examples["attention_mask"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Adjust labels
    labels = tokenizer.pad(
        {"input_ids": examples["labels"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Convert tensors to lists
    examples["input_ids"] = inputs["input_ids"].tolist()
    examples["attention_mask"] = inputs["attention_mask"].tolist()
    examples["labels"] = labels["input_ids"].tolist()
    return examples

print("Adjusting padding for smaller train dataset...")
small_train_dataset = small_train_dataset.map(lambda examples: adjust_padding(examples, max_length=512), batched=True)
print("Adjusting padding for smaller val dataset...")
small_val_dataset = small_val_dataset.map(lambda examples: adjust_padding(examples, max_length=512), batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Shuffling and selecting smaller train dataset...
Shuffling and selecting smaller val dataset...
Adjusting padding for smaller train dataset...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Adjusting padding for smaller val dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [17]:
def tokenize_function(examples):
    model_inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['highlights'], max_length=150, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


Training the model

In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Load the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Adjust training parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Reduce the number of epochs
    per_device_train_batch_size=8,  # Increase the batch size if you have enough GPU memory
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    fp16=True,  # Use mixed precision training if supported by your hardware
    disable_tqdm=False,  # Ensure tqdm progress bar is enabled
    report_to="none"  # Ensure no integration with external logging services
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_val_dataset,
)

# Show progress with tqdm
print("Starting training...")
trainer.train()
print("Training completed.")

# Save the model and tokenizer
model.save_pretrained('saved_model/t5-small')
tokenizer.save_pretrained('saved_model/t5-small')




Starting training...


Epoch,Training Loss,Validation Loss
1,0.3353,0.321402


Training completed.


('saved_model/t5-small\\tokenizer_config.json',
 'saved_model/t5-small\\special_tokens_map.json',
 'saved_model/t5-small\\spiece.model',
 'saved_model/t5-small\\added_tokens.json')

In [7]:
# Load the model and tokenizer for inference
model = T5ForConditionalGeneration.from_pretrained('saved_model/t5-small')
tokenizer = T5Tokenizer.from_pretrained('saved_model/t5-small')

# Generate summaries function
def generate_summary(text, model, tokenizer, max_length=150, min_length=40, num_beams=4):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, num_beams=num_beams, length_penalty=2.0, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test the model with a sample text from the test set
sample_text = tokenized_test_dataset['article'][0]
print("Article:", sample_text)
print("Summary:", generate_summary(sample_text, model, tokenizer))


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Article: ever noticed how plane seats appear to be getting smaller and smaller with increasing numbers of people taking to the skies some experts are questioning if having such packed out planes is putting passengers at risk they say that the shrinking space on aeroplanes is not only uncomfortable  its putting our health and safety in danger more than squabbling over the arm rest shrinking space on planes putting our health and safety in danger this week a us consumer advisory group set up by the department of transportation said at a public hearing that while the government is happy to set standards for animals flying on planes it doesnt stipulate a minimum amount of space for humans in a world where animals have more rights to space and food than humans said charlie leocha consumer representative on the committee it is time that the dot and faa take a stand for humane treatment of passengers but could crowding on planes lead to more serious issues than fighting for space in the overh

In [13]:
from datasets import load_from_disk
from transformers import T5Tokenizer

# Load the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small', legacy=False)

# Load the tokenized datasets
tokenized_train_dataset = load_from_disk('tokenized_datasets/train')
tokenized_val_dataset = load_from_disk('tokenized_datasets/val')
tokenized_test_dataset = load_from_disk('tokenized_datasets/test')

# Function to adjust padding
def adjust_padding(examples, max_length=512):
    # Adjust inputs
    inputs = tokenizer.pad(
        {"input_ids": examples["input_ids"], "attention_mask": examples["attention_mask"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Adjust labels
    labels = tokenizer.pad(
        {"input_ids": examples["labels"]},
        padding="max_length",
        max_length=max_length,
        return_tensors="pt"
    )
    # Convert tensors to lists
    examples["input_ids"] = inputs["input_ids"].tolist()
    examples["attention_mask"] = inputs["attention_mask"].tolist()
    examples["labels"] = labels["input_ids"].tolist()
    return examples

# Adjust padding for each dataset
print("Adjusting padding for train dataset...")
tokenized_train_dataset = tokenized_train_dataset.map(lambda examples: adjust_padding(examples, max_length=512), batched=True)
print("Adjusting padding for val dataset...")
tokenized_val_dataset = tokenized_val_dataset.map(lambda examples: adjust_padding(examples, max_length=512), batched=True)
print("Adjusting padding for test dataset...")
tokenized_test_dataset = tokenized_test_dataset.map(lambda examples: adjust_padding(examples, max_length=512), batched=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Adjusting padding for train dataset...


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Adjusting padding for val dataset...


Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Adjusting padding for test dataset...


Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [4]:
# !pip install rouge-score


In [5]:
# !pip install evaluate


In [6]:
# !pip install flask


In [7]:
# !pip install gradio


In [13]:
import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('saved_model/t5-small')
tokenizer = T5Tokenizer.from_pretrained('saved_model/t5-small')

def generate_summary(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, num_beams=4, length_penalty=2.0, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Define the Gradio interface
iface = gr.Interface(
    fn=generate_summary,
    inputs="textbox",
    outputs="textbox",
    title="Text Summarizer",
    description="Enter a text to generate its summary using T5 model."
)

# Launch the interface
iface.launch()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


