In [1]:
! pip install transformers torch pandas numpy rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curan

In [2]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import torch
gpu_available = torch.cuda.is_available()
if gpu_available:
    num_gpus = torch.cuda.device_count()
    print(f"Number of available GPUs: {num_gpus}")

    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")
else:
    print("CUDA is not available. Using CPU.")

Number of available GPUs: 1
GPU 0: Tesla T4


In [None]:
import pandas as pd

file_path="/content/drive/MyDrive/IR-A4/Reviews.csv"
#Red the csv file
encoding = 'latin1'
valid_chunks = []

df=pd.read_csv(file_path)
print(df.info())

#Dropping other columns
df=df[['Text','Summary']]

#Dropping Missing Values
new_df=df.dropna()

#Dropping Duplicates
df=new_df.drop_duplicates()

#Resetting the index
df.reset_index(drop=True, inplace=True)

print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394967 entries, 0 to 394966
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Text  

In [29]:
#Preprocessing the text
import re
import unicodedata
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Predefined dictionary for expanding acronyms - tried apis
acronym_dict = {
    "isn't": "is not",
    "can't": "cannot",
    "won't": "will not",
    "shouldn't": "should not",
     "lol": "laugh out loud",
    "brb": "be right back",
    "btw": "by the way",
    "omg": "oh my god",
    "imo": "in my opinion",
    "idk": "I don't know",
    "tbh": "to be honest",
    "rofl": "rolling on the floor laughing",
    "gtg": "got to go",
    "smh": "shaking my head",
    "np": "no problem",
    "fyi": "for your information",
    "iirc": "if I recall correctly",
    "afaik": "as far as I know","faq": "frequently asked questions",
    "qc": "quality control",
    "oem": "original equipment manufacturer",
    "odm": "original design manufacturer",
    "sku": "stock keeping unit",
    "eol": "end of life",
    "nib": "new in box",
    "bnib": "brand new in box",
    "nwot": "new without tags",
    "nwt": "new with tags",
    "obo": "or best offer",
    "bnwt": "brand new with tags",
    "bnip": "brand new in package",
    "asap": "as soon as possible",
    "eta": "estimated time of arrival",
    "nos": "new old stock",
    "nip": "new in package",
    "dslr": "digital single lens reflex",
    "led": "light emitting diode",
    "lcd": "liquid crystal display",
    "oled": "organic light emitting diode",
    "hdr": "high dynamic range",
    "ips": "in-plane switching",
    "va": "vertical alignment",
    "tn": "twisted nematic",
    "g-sync": "nvidia's adaptive sync technology",
    "freesync": "amd's adaptive sync technology",
    "hdr10": "high dynamic range (10-bit)",
    "hdmi": "high-definition multimedia interface",
    "usb-c": "universal serial bus type-c",
    "ssd": "solid state drive",
    "hdd": "hard disk drive",
    "nvme": "non-volatile memory express",
    "ram": "random access memory",
    "cpu": "central processing unit",
    "gpu": "graphics processing unit",
    "vram": "video random access memory",
    "mbps": "megabits per second",
    "gbps": "gigabits per second",
    "ghz": "gigahertz",
    "mhz": "megahertz",
    "rgb": "red green blue",
    "cmyk": "cyan magenta yellow black",
    "ppi": "pixels per inch",
    "dpi": "dots per inch",
    "arc": "audio return channel",
    "psu": "power supply unit",
    "ups": "uninterruptible power supply",
    "vga": "video graphics array",
    "dvi": "digital visual interface",
    "osd": "on-screen display",
    "dpi": "dots per inch",
    "hifi": "high fidelity",
    "thx": "george lucas' audio certification program",
    "dsp": "digital signal processing",
    "nfc": "near field communication",
    "rf": "radio frequency",
    "ir": "infrared",
    "usb": "universal serial bus",
    "mp3": "mpeg audio layer-3",
    "flac": "free lossless audio codec",
    "aac": "advanced audio coding",
    "aptx": "audio codec for bluetooth",
    "dts": "digital theater systems",
    "bt": "bluetooth",
    "nfc": "near field communication",
    "ai": "artificial intelligence",
    "iot": "internet of things",
    "ar": "augmented reality",
    "vr": "virtual reality",
    "ml": "machine learning",
    "dl": "deep learning",
    "cnn": "convolutional neural network",
    "rnn": "recurrent neural network",
    "nlp": "natural language processing",
    "lstm": "long short-term memory",
    "aws": "amazon web services",
    "gcp": "google cloud platform",
    "azure": "microsoft azure",
    "saas": "software as a service",
    "paas": "platform as a service",
    "iaas": "infrastructure as a service",
    "devops": "development and operations",
    "ci/cd": "continuous integration/continuous deployment",
    "agile": "adaptive, iterative, and incremental",
    "scrum": "a framework for agile development",
    "kubernetes": "container orchestration platform",
    "docker": "containerization platform",
    "git": "version control system",
    "github": "online git repository hosting service",
    "bitbucket": "git repository management tool",
    "jira": "project management tool",
    "slack": "team collaboration tool",
    "crm": "customer relationship management",
    "erp": "enterprise resource planning",
    "pos": "point of sale",
    "seo": "search engine optimization",
    "sem": "search engine marketing",
    "ppc": "pay per click",
    "cta": "call to action",
    "ctr": "click-through rate",
    "cpa": "cost per acquisition",
    "cpc": "cost per click",
    "cpl": "cost per lead",
    "roi": "return on investment",
    "ux": "user experience"
}


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [28]:
#Preprocessing Text
# Function to remove HTML tags
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Function to remove accented characters
def remove_accented_chars(text):
    normalized_text = unicodedata.normalize('NFKD', text)
    return normalized_text.encode('ascii', 'ignore').decode('utf-8', 'ignore')

# Function to expand acronyms using predefined dictionary
def expand_acronyms(text):
    for acronym, expanded in acronym_dict.items():
        text = text.replace(acronym, expanded)
    return text

# Function to remove special characters
def remove_special_characters(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Function for lemmatization
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(lemmatized_tokens)

# Function for text normalization (including stopwords removal)
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = remove_html_tags(text)
    text = remove_accented_chars(text)
    text = expand_acronyms(text)
    text = remove_special_characters(text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    preprocessed_text = lemmatize_text(' '.join(filtered_tokens))
    return preprocessed_text


In [None]:
df['CleanedText'] = df['Text'].apply(preprocess_text)
df['CleanedSummary'] = df['Summary'].apply(preprocess_text)

print(df.head())
df_cleaned = df.dropna()
df_cleaned.reset_index(drop=True, inplace=True)
df_cleaned.drop_duplicates(inplace=True)
print("Before dropping Missing Values count")
print(df.info())
# Verify the updated DataFrame info
print("After dropping Missing Values count")
print(df_cleaned.info())
df_cleaned.to_csv("/content/drive/MyDrive/IR-A4/CleanedReviews.csv",index=False)

In [93]:
#RUN FROM HERE

import pandas as pd
import torch
df=pd.read_csv("/content/drive/MyDrive/IR-A4/CleanedReviews.csv")
print(df.info())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("device:",device)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

df['training'] = df['CleanedText'] + " TL;DR " + df['CleanedSummary']
print(df.info())
print(df.head(3))

# Now, you can proceed with your operations on the cleaned DataFrame
reviews = []
for i in df['training']:
  i=str(i)
  reviews.append(i)

print(len(reviews))
print(reviews[:5])
# Example operations after dropping NaN rows
# reviews = [review.replace("\n", " TL;DR ") for review in reviews]
avg_length = sum([len(review.split()) for review in reviews]) / len(reviews)

print(avg_length)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394967 entries, 0 to 394966
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Text            394967 non-null  object
 1   Summary         394967 non-null  object
 2   CleanedText     394966 non-null  object
 3   CleanedSummary  394191 non-null  object
dtypes: object(4)
memory usage: 12.1+ MB
None
device: cuda
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394191 entries, 0 to 394190
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Text            394191 non-null  object
 1   Summary         394191 non-null  object
 2   CleanedText     394191 non-null  object
 3   CleanedSummary  394191 non-null  object
 4   training        394191 non-null  object
dtypes: object(5)
memory usage: 15.0+ MB
None
                                                Text                Summary  \
0  I have b

In [94]:
max_length = 100

In [95]:
from transformers import AutoTokenizer, AutoModelWithLMHead
tokenizer = AutoTokenizer.from_pretrained("gpt2")

tokenizer.encode(" TL;DR ")
extra_length = len(tokenizer.encode(" TL;DR "))

In [96]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPT2ReviewDataset(Dataset):
    def __init__(self, tokenizer, reviews, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.reviews = reviews
        self.result = []

        for review in self.reviews:
            # Encode the text using tokenizer.encode(). We add EOS at the end
            tokenized = self.tokenizer.encode(review + self.eos, truncation=True, max_length=self.max_len)


            # Padding/truncating the encoded sequence to max_len
            padded = self.pad_truncate(tokenized)

            # Creating a tensor and adding to the result
            self.result.append(torch.tensor(padded, dtype=torch.long))


    def __len__(self):
        return len(self.result)


    def __getitem__(self, item):
        return self.result[item]

    def pad_truncate(self, name):
        name_length = len(name) - extra_length
        if name_length < self.max_len:
            difference = self.max_len - name_length
            result = name + [self.eos_id] * difference
        elif name_length > self.max_len:
            result = name[:self.max_len + 3]+[self.eos_id]
        else:
            result = name
        return result

In [97]:
reviews_df=df.sample(20000)
reviews_df.reset_index(drop=True, inplace=True)
reviews_df.info()
reviews=reviews_df.training.values.tolist()
reviews_dataset = GPT2ReviewDataset(tokenizer,reviews, max_length)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Text            20000 non-null  object
 1   Summary         20000 non-null  object
 2   CleanedText     20000 non-null  object
 3   CleanedSummary  20000 non-null  object
 4   training        20000 non-null  object
dtypes: object(5)
memory usage: 781.4+ KB


In [98]:
train_reviews=reviews_dataset[:15000]
test_reviews=reviews_dataset[15000:]
print(len(train_reviews))
print(len(test_reviews))
print(type(train_reviews))
print(type(test_reviews))
train_summaries=reviews_df['CleanedSummary'][:15000].tolist()
test_summaries=reviews_df['CleanedSummary'][15000:].tolist()

import random
random_numbers = random.sample(range(1, 100 + 1), 5)
for i in random_numbers:
  #Training verification
  ind=i
  train_review_tensor=train_reviews[ind]
  train_review=tokenizer.decode(train_review_tensor)
  train_summary=train_summaries[ind]
  print("Train")
  print("Review",train_review)
  print('SUmmary',train_summary)

  #testing verification
  ind=i
  test_review_tensor=test_reviews[ind]
  test_review=tokenizer.decode(test_review_tensor)
  test_summary=test_summaries[ind]
  print("\nTest")
  print("Review",test_review)
  print('Summary',test_summary)


# for item in reviews_dataset:
#   print(tokenizer.decode(item))
#   break
# print(train_summaries[:1])

15000
5000
<class 'list'>
<class 'list'>
Train
Review love coffee bold black one become new favorite terrifically rich bold taste thats sure get going morning love great price TL;DR black tiger coffee people kcup<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

In [99]:
for item in train_reviews:
  print(tokenizer.decode(item)), print(item)
  break

ordered coffee le expensive doughnut shop coffee usually order pleasantly surprised taste price im coffee snob great buy would definitely order run TL;DR pay<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext

In [100]:
BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 3e-4

In [101]:
train_dataloader = DataLoader(train_reviews, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [102]:
for batch in train_dataloader:
    print(batch.shape)  # Output the shape of each batch (should be [batch_size, max_length])
    break

torch.Size([32, 104])


In [103]:
import torch
from tqdm import tqdm

def train(model, optimizer, dl, epochs):
    for epoch in range(epochs):
        for idx, batch in enumerate(dl):
             with torch.set_grad_enabled(True):
                optimizer.zero_grad()
                batch = batch.to(device)
                output = model(batch, labels=batch)
                loss = output[0]
                loss.backward()
                optimizer.step()
                if idx % 100 == 0:
                    print("loss: %f, %d"%(loss, idx))

In [104]:
import torch.optim as optim

from transformers import AutoModelWithLMHead
model = AutoModelWithLMHead.from_pretrained("gpt2")
model = model.to(device)
# Prepare optimizer
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
train(model, optimizer, train_dataloader, epochs=EPOCHS)



loss: 9.946204, 0
loss: 2.400096, 100
loss: 2.154653, 200
loss: 2.383995, 300
loss: 2.185949, 400
loss: 1.936072, 0
loss: 1.903428, 100
loss: 2.182784, 200
loss: 1.934762, 300
loss: 2.214087, 400
loss: 1.719444, 0
loss: 1.795128, 100
loss: 1.893429, 200
loss: 1.780359, 300
loss: 2.146274, 400


In [105]:
import torch

# Assuming `model` is your trained PyTorch model
# Specify the file path where you want to save the model
model_path = '/content/drive/MyDrive/IR-A4/trained_model-with-cleaning.pth'

# Save the model's state dictionary to the specified file path
torch.save(model.state_dict(), model_path)

print(f"Model saved successfully at: {model_path}")

Model saved successfully at: /content/drive/MyDrive/IR-A4/trained_model-with-cleaning.pth


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead

# Specify the file path where your trained model is saved
model_path = '/content/drive/MyDrive/IR-A4/trained_model-without-cleaning.pth'

# Create an instance of the model (ensure it's the same architecture as the trained model)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")

# Load the saved state dictionary into the model
model.load_state_dict(torch.load(model_path, map_location=torch.device(device)))

print(f"Model loaded successfully from: {model_path}")


In [106]:
import torch
import numpy as np

def topk(probs, n=9):
    # The scores are initially softmaxed to convert to probabilities
    probs = torch.softmax(probs, dim= -1)

    # PyTorch has its own topk method, which we use here
    tokensProb, topIx = torch.topk(probs, k=n)

    # The new selection pool (9 choices) is normalized
    tokensProb = tokensProb / torch.sum(tokensProb)

    # Send to CPU for numpy handling
    tokensProb = tokensProb.cpu().detach().numpy()

    # Make a random choice from the pool based on the new prob distribution
    choice = np.random.choice(n, 1, p = tokensProb)
    tokenId = topIx[choice][0]

    return int(tokenId)

In [107]:
def model_infer(model, tokenizer, review, max_length=15):
    # Preprocess the init token (task designator)
    review_encoded = tokenizer.encode(review)
    result = review_encoded
    initial_input = torch.tensor(review_encoded).unsqueeze(0).to(device)

    with torch.set_grad_enabled(False):
        # Feed the init token to the model
        output = model(initial_input)

        # Flatten the logits at the final time step
        logits = output.logits[0,-1]

        # Make a top-k choice and append to the result
        result.append(topk(logits))

        # For max_length times:
        for _ in range(max_length):
            # Feed the current sequence to the model and make a choice
            input = torch.tensor(result).unsqueeze(0).to(device)
            output = model(input)
            logits = output.logits[0,-1]
            res_id = topk(logits)

            # If the chosen token is EOS, return the result
            if res_id == tokenizer.eos_token_id:
                return tokenizer.decode(result)
            else: # Append to the sequence
                result.append(res_id)
    # IF no EOS is generated, return after the max_len
    return tokenizer.decode(result)

In [108]:
print(device)

cuda


In [109]:
predicted_summaries=[]
for review_tensor in test_reviews:
  test_review=tokenizer.decode(review_tensor)
  summary = model_infer(model, tokenizer, test_review + " TL;DR ").split(" TL;DR ")[1].strip()
  predicted_summaries.append(summary)


print(len(test_summaries))
print(len(predicted_summaries))

5000
5000


In [110]:
eval_dict={
    'Test Summaries':test_summaries,
    'Predicted Summaries':predicted_summaries
}

eval_df=pd.DataFrame(eval_dict)
eval_df.info()
eval_df.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Test Summaries       5000 non-null   object
 1   Predicted Summaries  5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


Unnamed: 0,Test Summaries,Predicted Summaries
0,good,good<|endoftext|><|endoftext|><|endoftext|><|e...
1,delicious strong decaffinated coffee,delicious strong decaffinated coffee<|endoftex...
2,msgsensitive,msgsensitive<|endoftext|><|endoftext|><|endoft...
3,bewaugmented realitye food poisoning,bewaugmented realitye food poisoning<|endoftex...
4,fantastic,fantastic<|endoftext|><|endoftext|><|endoftext...
...,...,...
4995,hot sauce,hot sauce<|endoftext|><|endoftext|><|endoftext...
4996,fast country,fast country<|endoftext|><|endoftext|><|endoft...
4997,best whey ever,best whey ever<|endoftext|><|endoftext|><|endo...
4998,great product,great product<|endoftext|><|endoftext|><|endof...


In [None]:
#With Cleaning

In [111]:
test_summaries = []
predicted_summaries = []

# Iterate over rows of the DataFrame using iterrows()
for idx, row in eval_df.iterrows():
    test_summary = row['Test Summaries']
    predicted_summary = row['Predicted Summaries']

    # Check if either test_summary or predicted_summary is empty or None
    if not test_summary or not predicted_summary:
        continue  # Skip empty or None values

    # Append valid summaries to respective lists
    test_summaries.append(test_summary)
    predicted_summaries.append(predicted_summary)

# Update DataFrame with filtered summaries
eval_df['Test Summaries'] = test_summaries
eval_df['Predicted Summaries'] = predicted_summaries

# Display DataFrame information after filtering
print(eval_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Test Summaries       5000 non-null   object
 1   Predicted Summaries  5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None


In [None]:
#Without Ckleaning 5000 rows -> 4998 rows

In [88]:
test_summaries = []
predicted_summaries = []

# Iterate over rows of the DataFrame using iterrows()
for idx, row in eval_df.iterrows():
    test_summary = row['Test Summaries']
    predicted_summary = row['Predicted Summaries']

    # Check if either test_summary or predicted_summary is empty or None
    if not test_summary or not predicted_summary:
        continue  # Skip empty or None values

    # Append valid summaries to respective lists
    test_summaries.append(test_summary)
    predicted_summaries.append(predicted_summary)

# Update DataFrame with filtered summaries
eval_df['Test Summaries'] = test_summaries
eval_df['Predicted Summaries'] = predicted_summaries

# Display DataFrame information after filtering
print(eval_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4998 entries, 0 to 4997
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Test Summaries       4998 non-null   object
 1   Predicted Summaries  4998 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB
None


In [112]:
from rouge import Rouge
def calculate_rouge_scores(actual_summaries, predicted_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(predicted_summaries, actual_summaries, avg=True)
    return scores

rouge_scores = calculate_rouge_scores(eval_df['Predicted Summaries'],eval_df['Test Summaries'])
print("ROUGE Scores:")
print(rouge_scores)

# Print in specified format
print("ROUGE Scores - With Cleaning Text and Summary")
print("=" * 20)
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores['p']:.2f}")  # Precision
    print(f"Recall: {scores['r']:.2f}")     # Recall
    print(f"F1-Score: {scores['f']:.2f}")    # F1-Score
    print("=" * 20)

ROUGE Scores:
{'rouge-1': {'r': 0.4230503535353553, 'p': 0.42332051948052124, 'f': 0.4226614135712477}, 'rouge-2': {'r': 0.26477308080808054, 'p': 0.2637912554112551, 'f': 0.26401380395327834}, 'rouge-l': {'r': 0.4230503535353553, 'p': 0.42332051948052124, 'f': 0.4226614135712477}}
ROUGE Scores - With Cleaning Text and Summary
rouge-1:
Precision: 0.42
Recall: 0.42
F1-Score: 0.42
rouge-2:
Precision: 0.26
Recall: 0.26
F1-Score: 0.26
rouge-l:
Precision: 0.42
Recall: 0.42
F1-Score: 0.42


In [90]:
from rouge import Rouge
def calculate_rouge_scores(actual_summaries, predicted_summaries):
    rouge = Rouge()
    scores = rouge.get_scores(predicted_summaries, actual_summaries, avg=True)
    return scores

rouge_scores = calculate_rouge_scores(eval_df['Predicted Summaries'],eval_df['Test Summaries'])
print("ROUGE Scores:")
print(rouge_scores)

# Print in specified format
print("ROUGE Scores - Without Cleaning Text and Summary")
print("=" * 20)
for metric, scores in rouge_scores.items():
    print(f"{metric}:")
    print(f"Precision: {scores['p']:.2f}")  # Precision
    print(f"Recall: {scores['r']:.2f}")     # Recall
    print(f"F1-Score: {scores['f']:.2f}")    # F1-Score
    print("=" * 20)

ROUGE Scores:
{'rouge-1': {'r': 0.37920268379628763, 'p': 0.3843781977803248, 'f': 0.38031376413088885}, 'rouge-2': {'r': 0.26290339643068894, 'p': 0.2690307557993447, 'f': 0.2646454367117812}, 'rouge-l': {'r': 0.37920268379628763, 'p': 0.3843781977803248, 'f': 0.38031376413088885}}
ROUGE Scores - Without Cleaning Text and Summary
rouge-1:
Precision: 0.38
Recall: 0.38
F1-Score: 0.38
rouge-2:
Precision: 0.27
Recall: 0.26
F1-Score: 0.26
rouge-l:
Precision: 0.38
Recall: 0.38
F1-Score: 0.38


In [113]:
#With Text Preprocessing
print("Input Text")
review_text=input("Enter your input Text")
print("Give your actual summary")
actual_summary=input("Enter your actual summary")


input_text=preprocess_text(review_text)
input_summary=preprocess_text(actual_summary)
predicted_summary= model_infer(model, tokenizer, input_text + " TL;DR ").split(" TL;DR ")[1].strip()
scores= calculate_rouge_scores([input_summary],[predicted_summary])


print("Input Text:", review_text)
print("Actual Summary:", actual_summary)
print("Predicted Summary:", predicted_summary)
print("ROUGE Scores:", scores)
print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-1']['p'], scores['rouge-1']['r'], scores['rouge-1']['f']))
print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-2']['p'], scores['rouge-2']['r'], scores['rouge-2']['f']))
print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-l']['p'], scores['rouge-l']['r'], scores['rouge-l']['f']))

Input Text
Enter your input TextThis tea is not expensive, but is the same full-bodied brew you get at a good Irish B&B.  m-m-m. 
Give your actual summary
Enter your actual summaryLyon's Irish tea bags
Input Text: This tea is not expensive, but is the same full-bodied brew you get at a good Irish B&B.  m-m-m. 
Actual Summary: Lyon's Irish tea bags
Predicted Summary: erya krindt
ROUGE Scores: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}
ROUGE-1: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-2: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-L: Precision: 0.00, Recall: 0.00, F1-Score: 0.00


In [114]:
#Without Text preprocessing

In [91]:
print("Input Text")
review_text=input("Enter your input Text")
print("Give your actual summary")
actual_summary=input("Enter your actual summary")


input_text=review_text
input_summary=actual_summary
predicted_summary= model_infer(model, tokenizer, input_text + " TL;DR ").split(" TL;DR ")[1].strip()
scores= calculate_rouge_scores([input_summary],[predicted_summary])


print("Input Text:", review_text)
print("Actual Summary:", actual_summary)
print("Predicted Summary:", predicted_summary)
print("ROUGE Scores:", scores)
print("ROUGE-1: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-1']['p'], scores['rouge-1']['r'], scores['rouge-1']['f']))
print("ROUGE-2: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-2']['p'], scores['rouge-2']['r'], scores['rouge-2']['f']))
print("ROUGE-L: Precision: {:.2f}, Recall: {:.2f}, F1-Score: {:.2f}".format(scores['rouge-l']['p'], scores['rouge-l']['r'], scores['rouge-l']['f']))


Input Text
Enter your input TextThis tea is not expensive, but is the same full-bodied brew you get at a good Irish B&B.  m-m-m. 
Give your actual summary
Enter your actual summary Lyon's Irish tea bags
Input Text: This tea is not expensive, but is the same full-bodied brew you get at a good Irish B&B.  m-m-m. 
Actual Summary:  Lyon's Irish tea bags
Predicted Summary: Tea
ROUGE Scores: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}}
ROUGE-1: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-2: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
ROUGE-L: Precision: 0.00, Recall: 0.00, F1-Score: 0.00
