In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import numpy as np
import pandas as pd
import requests
import re
from tqdm import tqdm
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments,EarlyStoppingCallback
import warnings

# Set pandas display options
pd.set_option("display.max_colwidth", 200)

# Ignore warnings
warnings.filterwarnings("ignore")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/Reviews.csv")

In [None]:
data.drop_duplicates(subset=['Text'],inplace=True)#dropping duplicates
data.dropna(axis=0,inplace=True)#dropping na

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data[['Text', 'Summary']].sample(5)

In [None]:
stop_words = set(stopwords.words('english'))

def text_cleaner(text,num):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    # newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = re.sub('[m]{2,}', 'mm', newString)
    if(num==0):
        tokens = [w for w in newString.split() if not w in stop_words]
    else:
        tokens=newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:                                                 #removing short word
            long_words.append(i)
    return (" ".join(long_words)).strip()

In [None]:
#call the function
cleaned_text = []
for t in data['Text']:
    cleaned_text.append(text_cleaner(t,0))

In [None]:
cleaned_text[:5]

In [None]:
cleaned_summary = []
for t in data['Summary']:
    cleaned_summary.append(text_cleaner(t,1))

In [None]:
cleaned_summary[:10]

In [None]:
data['cleaned_text']=cleaned_text
data['cleaned_summary']=cleaned_summary

In [None]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0,inplace=True)

In [None]:
import matplotlib.pyplot as plt

text_word_count = []
summary_word_count = []

# populate the lists with sentence lengths
for i in data['cleaned_text']:
      text_word_count.append(len(i.split()))

for i in data['cleaned_summary']:
      summary_word_count.append(len(i.split()))

length_df = pd.DataFrame({'text':text_word_count, 'summary':summary_word_count})

length_df.hist(bins = 30)
plt.show()

In [None]:
cnt=0
for i in data['cleaned_summary']:
    if(len(i.split())<=8):
        cnt=cnt+1
print(cnt/len(data['cleaned_summary']))

In [None]:
max_text_len=30
max_summary_len=8

In [None]:
cleaned_text =np.array(data['cleaned_text'])
cleaned_summary=np.array(data['cleaned_summary'])

short_text=[]
short_summary=[]

for i in range(len(cleaned_text)):
    if(len(cleaned_summary[i].split())<=max_summary_len and len(cleaned_text[i].split())<=max_text_len):
        short_text.append(cleaned_text[i])
        short_summary.append(cleaned_summary[i])

df=pd.DataFrame({'text':short_text,'summary':short_summary})

In [None]:
df.sample(10)

In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/preprocessed_df.csv', index=False)

In [None]:
df.sample(10)

## Loading processed data

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/preprocessed_df.csv')

In [6]:
traning_set = df.sample(30000)

In [7]:
train_df, eval_df = train_test_split(traning_set, test_size=0.2, random_state=42)

## XLnet Approach

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetLMHeadModel
from torch import nn

In [None]:
class TextSummaryDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_text_len=30, max_summary_len=8):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_text_len = max_text_len
        self.max_summary_len = max_summary_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['text']
        summary = self.dataframe.iloc[idx]['summary']

        # Tokenize input text and summary
        text_enc = self.tokenizer(
            text, max_length=self.max_text_len, truncation=True, padding="max_length", return_tensors="pt"
        )
        summary_enc = self.tokenizer(
            summary, max_length=self.max_summary_len, truncation=True, padding="max_length", return_tensors="pt"
        )

        return {
            'input_ids': text_enc['input_ids'].squeeze(),
            'attention_mask': text_enc['attention_mask'].squeeze(),
            'labels': summary_enc['input_ids'].squeeze()
        }

In [None]:
# Load the XLNet tokenizer and model
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLNetLMHeadModel(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0-11): 12 x XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_loss): Linear(in_features=768, out_features=32000, bias=True)
)

In [None]:
from torch import nn

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [None]:
train_dataset = TextSummaryDataset(train_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [None]:
# Assuming train_loader is a DataLoader instance for the dataset
epochs = 1
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # print(outputs)
        # loss = outputs.loss
        break
    break
        # labels = labels.type(torch.float64).to(device)
        # loss = criterion(outputs, labels
        # loss = outputs.loss
        # epoch_loss += loss.item()

        # # Backward pass and optimization
        # loss.backward()
        # optimizer.step()

    # avg_loss = epoch_loss / len(train_loader)
    # print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

Training Epoch 1/1:   0%|          | 0/1500 [00:00<?, ?it/s]


In [None]:
def generate_summary(text, model, tokenizer, max_length=8):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=30).to(device)
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Example input text for prediction
sample = data[['Text', 'Summary']].sample()
print('Text:---', sample['Text'].values[0])
print('Actual Summary:---', sample['Summary'].values[0])
# Generate summary
text = sample['Text'].values[0]
summary = generate_summary(text, model, tokenizer)
print("Generated Summary:", summary)

In [None]:
hf_api = 'c951279b61000ec196651e2204f6d35ad0374e33'

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [9]:
# Convert the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)


# Initialize the T5 tokenizer and model (or any other summarization model like BART)
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
# Preprocess the data for tokenization
def preprocess_function(examples):
    inputs = [f"summarize: {doc}" for doc in examples['summary']]
    model_inputs = tokenizer(inputs,
                             max_length=30,
                             truncation=True,
                             padding="max_length")

    # Tokenize the summaries
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'],
                           max_length=8,
                           truncation=True,
                           padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [16]:
# Filter out invalid values in 'summary' column of eval_dataset
def filter_invalid_summary(example):
    # Check if the summary is a string and not empty
    return isinstance(example['summary'], str) and example['summary']

# Apply the filter to eval_dataset
eval_dataset = eval_dataset.filter(filter_invalid_summary)

Filter:   0%|          | 0/5999 [00:00<?, ? examples/s]

In [21]:
# Apply preprocessing for tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5999 [00:00<?, ? examples/s]

In [22]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    remove_unused_columns=False,
    logging_dir='./logs',
    save_total_limit=3,
    fp16=True,
)

# Define early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of epochs with no improvement after which training will stop
    early_stopping_threshold=0.0   # Minimum improvement to consider for early stopping
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Use the evaluation dataset
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [29]:
for batch in train_dataset:
    print(batch)
    break

{'text': 'tastes delicous worth extra penny big dunkin fan step kind middle dunkin starbucks weak strong', 'summary': 'love cbou', '__index_level_0__': 31122, 'input_ids': [21603, 10, 13434, 20, 2176, 1162, 1494, 996, 23925, 600, 146, 29, 2917, 1819, 1147, 773, 2214, 146, 29, 2917, 2213, 13863, 7, 5676, 1101, 1, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], 'labels': [333, 3, 75, 4076, 1, 0, 0, 0]}


In [23]:
# Train the model
trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# Save the fine-tuned model
trainer.save_model('/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/fine_tuned_t5')

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model_name = '/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/fine_tuned_t5'  # Path to your fine-tuned model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

def summarize_text(input_text):
    # Prepend the "summarize: " prefix
    input_text = f"summarize: {input_text}"

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary
    with torch.no_grad():
        summary_ids = model.generate(input_ids, max_length=8, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode the summary ids to text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
# Example input text for prediction
sample = data[['Text', 'Summary']].sample()
print('Text:---', sample['Text'].values[0])
print('Actual Summary:---', sample['Summary'].values[0])
# Generate summary
summary = summarize_text(sample['Text'].values[0])
print("predicted summary:---", summary)

## Experiment with BART

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
# Convert the DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

# Choose BART model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can also try "facebook/bart-base" if memory is an issue
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    inputs = [f"summarize: {doc}" for doc in examples['text']]
    model_inputs = tokenizer(inputs, max_length=30, truncation=True, padding="max_length")

    # Tokenize summaries as labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=8, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Apply preprocessing for tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/24000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
# Define training arguments with save best model only
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end of training
    save_total_limit=1,           # Keep only the best checkpoint
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_dir='./logs',
    save_steps=500,               # Required for saving steps; adjust for intermediate saves
    fp16=True,
)

# Define early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Stop training after 3 epochs of no improvement
    early_stopping_threshold=0.0  # Minimum improvement required to count as progress
)

# Initialize Trainer with early stopping callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [None]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,2.0631,2.216404
2,1.4922,2.123227
3,1.1841,2.206438
4,0.9378,2.565966
5,0.6908,2.796913


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3750, training_loss=1.2580758382161459, metrics={'train_runtime': 1587.0638, 'train_samples_per_second': 151.223, 'train_steps_per_second': 4.726, 'total_flos': 7618727116800000.0, 'train_loss': 1.2580758382161459, 'epoch': 5.0})

In [None]:
# Save the fine-tuned model
trainer.save_model('/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/BART/fine_tuned_t5')

In [None]:
# Load the fine-tuned model and tokenizer
model_name = '//content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/BART/fine_tuned_t5'  # Path to your fine-tuned model
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

def summarize_text(input_text):
    # Prepend the "summarize: " prefix
    input_text = f"summarize: {input_text}"

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=30, truncation=True)

    # Generate the summary
    with torch.no_grad():
        summary_ids = model.generate(input_ids, max_length=8, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode the summary ids to text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
# Example input text for prediction
sample = eval_df[['text', 'summary']].sample()
print('Text:---', sample['text'].values[0])
print('Actual Summary:---', sample['summary'].values[0])
# Generate summary
summary = summarize_text(sample['text'].values[0])
print("predicted summary:---", summary)

## Experiment with Mistral

In [None]:
# Proceed with your task setup
from transformers import AutoModelForCausalLM, AutoTokenizer

# Replace with the desired model's name
model_name = "mistralai/Mistral-7B-v0.1"
hf_token = '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'

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token)

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    use_auth_token=hf_token,
    # trust_remote_code=True
)


In [None]:
def preprocess_function(examples):
    inputs = [f"summarize: {doc}" for doc in examples['text']]
    model_inputs = tokenizer(inputs, max_length=30, truncation=True, padding="max_length")

    # Tokenize summaries as labels
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['summary'], max_length=8, truncation=True, padding="max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Apply preprocessing for tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
    load_best_model_at_end=True,  # Load the best model at the end
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    logging_dir='./logs',
    save_total_limit=3,
    fp16=True,
)

# Define early stopping callback
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3,  # Number of epochs with no improvement after which training will stop
    early_stopping_threshold=0.0   # Minimum improvement to consider for early stopping
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Use the evaluation dataset
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

In [None]:
# Train the model
trainer.train()

In [None]:
# Save the fine-tuned model
trainer.save_model('/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/Mistral/fine_tuned_t5')

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model_name = '/content/drive/MyDrive/Colab Notebooks/amazon_food_review_xlnet/fine_tuned_t5'  # Path to your fine-tuned model
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Set the model to evaluation mode
model.eval()

def summarize_text(input_text):
    # Prepend the "summarize: " prefix
    input_text = f"summarize: {input_text}"

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary
    with torch.no_grad():
        summary_ids = model.generate(input_ids, max_length=8, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode the summary ids to text
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [None]:
# Example input text for prediction
sample = data[['Text', 'Summary']].sample()
print('Text:---', sample['Text'].values[0])
print('Actual Summary:---', sample['Summary'].values[0])
# Generate summary
summary = summarize_text(sample['Text'].values[0])
print("predicted summary:---", summary)

## Experiment with ROBERTa and mlflow for model tracking

In [None]:
!pip install mlflow
import mlflow



In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import RobertaTokenizer, RobertaModel
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Define a custom dataset class
class TextSummaryDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_text_len=30, max_summary_len=8):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_text_len = max_text_len
        self.max_summary_len = max_summary_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text = self.dataframe.iloc[idx]['text']
        summary = self.dataframe.iloc[idx]['summary']

        text = str(text)
        summary = str(summary)

        # Tokenize text and summary
        text_enc = self.tokenizer(
            text, max_length=self.max_text_len, truncation=True, padding="max_length", return_tensors="pt"
        )
        summary_enc = self.tokenizer(
            summary, max_length=self.max_summary_len, truncation=True, padding="max_length", return_tensors="pt"
        )

        return {
            'input_ids': text_enc['input_ids'].squeeze(),
            'attention_mask': text_enc['attention_mask'].squeeze(),
            'labels': summary_enc['input_ids'].squeeze()
        }

# Define a custom model for regression
class SummarizationModel(nn.Module):
    def __init__(self, model_name, hidden_size=768, output_size=8):
        super(SummarizationModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state[:, 0, :]  # Use CLS token output
        return self.fc(last_hidden_state)

In [None]:
# Initialize the tokenizer and dataset
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [None]:
train_dataset = TextSummaryDataset(train_df, tokenizer)
eval_dataset = TextSummaryDataset(eval_df, tokenizer)

In [None]:
# Set up DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)

In [None]:
# Initialize model, loss, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SummarizationModel(model_name).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop
epochs = 1
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss
        labels = labels.type(torch.float64).to(device)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        break
    break

    # avg_train_loss = train_loss / len(train_loader)
    # print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}")

    # # Evaluation
    # model.eval()
    # eval_loss = 0
    # with torch.no_grad():
    #     for batch in tqdm(eval_loader, desc="Evaluating"):
    #         input_ids = batch['input_ids'].to(device)
    #         attention_mask = batch['attention_mask'].to(device)
    #         labels = batch['labels'].to(device)

    #         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    #         labels = labels.type(torch.float64).to(device)
    #         loss = criterion(outputs, labels)
    #         eval_loss += loss.item()

    # avg_eval_loss = eval_loss / len(eval_loader)
    # print(f"Epoch {epoch + 1}, Evaluation Loss: {avg_eval_loss}")

Training Epoch 1/1:   0%|          | 0/1500 [00:00<?, ?it/s]


In [None]:
outputs

tensor([[ -7.7873,   2.1722,   1.4601,   0.7657,   1.0459,   0.5854,   0.2639,
          -7.1481],
        [ -9.1604,   1.7268,   1.9047,   0.9589,   1.1601,   0.8705,   0.5243,
          -8.5502],
        [ -6.7615,   1.5140,   1.7342,   0.7446,   0.9319,   0.5183,   0.0981,
          -6.1580],
        [ -7.5883,   1.8031,   1.9015,   0.8972,   0.8649,   0.5684,  -0.0737,
          -6.9250],
        [ -5.0001,   0.8504,   1.0189,   0.6480,   0.8240,   0.5995,   0.3138,
          -4.7593],
        [ -6.6955,   1.4337,   1.8030,   0.8500,   0.8874,   0.4506,   0.1272,
          -6.1935],
        [ -5.8108,   0.8134,   1.4245,   0.7045,   0.9025,   0.6717,   0.1360,
          -5.5033],
        [ -7.2322,   1.7904,   1.6117,   0.9275,   0.8297,   0.5292,  -0.1275,
          -6.6621],
        [ -5.4705,   1.2401,   1.1800,   0.5290,   0.9235,   0.5243,   0.3108,
          -5.0931],
        [-10.0969,   2.7529,   2.4469,   0.8445,   1.0063,   0.2650,  -0.0748,
          -9.1199],
        [ 

In [None]:
labels

tensor([[0.0000e+00, 1.1613e+04, 4.1040e+03, 6.5920e+03, 2.1000e+02, 3.4000e+01,
         2.4000e+01, 2.0000e+00],
        [0.0000e+00, 2.8200e+02, 1.7498e+04, 1.0200e+02, 7.8600e+02, 1.3900e+02,
         2.0000e+00, 1.0000e+00],
        [0.0000e+00, 1.2338e+04, 3.6260e+03, 7.0000e+00, 3.8950e+03, 2.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 1.7693e+04, 2.4000e+01, 2.0000e+00, 1.0000e+00, 1.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 2.9000e+01, 1.6300e+03, 1.9880e+03, 2.6930e+04, 5.6260e+03,
         9.7490e+03, 2.0000e+00],
        [0.0000e+00, 1.6319e+04, 2.6380e+03, 2.4000e+01, 2.0000e+00, 1.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 1.4656e+04, 3.9990e+03, 2.2290e+03, 4.5600e+02, 2.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 4.3800e+02, 1.9750e+03, 4.6371e+04, 6.8100e+02, 2.0000e+00,
         1.0000e+00, 1.0000e+00],
        [0.0000e+00, 4.1700e+02, 9.6630e+03, 5.3000e+01, 1.0964e+04, 2.0000e+00,

In [None]:
# Save the model (using PyTorch directly instead of mlflow)
torch.save(model.state_dict(), "roberta_model.pth")

In [None]:
mlflow.end_run()

In [None]:
# Initialize MLflow
mlflow.set_experiment("RoBERTa_Summarization_PyTorch")
with mlflow.start_run() as run:
    mlflow.log_param("epochs", 5)
    mlflow.log_param("learning_rate", 2e-5)
    mlflow.log_param("batch_size", 16)
    mlflow.log_param("model_name", model_name)

    # Training loop
    epochs = 3
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Compute loss
            # outputs = outputs.view(-1, outputs.size(-1))          # Flatten to [batch_size * seq_len, vocab_size]
            # labels = labels.view(-1)
            # batch_size, seq_len, vocab_size = outputs.size()
            labels = labels.type(torch.float64).to(device)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch + 1}, Training Loss: {avg_train_loss}")

        # Evaluation
        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for batch in tqdm(eval_loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                labels = labels.type(torch.float64).to(device)
                loss = criterion(outputs, labels)
                eval_loss += loss.item()

        avg_eval_loss = eval_loss / len(eval_loader)
        print(f"Epoch {epoch + 1}, Evaluation Loss: {avg_eval_loss}")

        # Log metrics
        mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
        mlflow.log_metric("eval_loss", avg_eval_loss, step=epoch)

    # ending experiment
    mlflow.end_run()

    # Save the model with MLflow
    mlflow.pytorch.log_model(model, "model")

Training Epoch 1/3: 100%|██████████| 1500/1500 [03:15<00:00,  7.69it/s]


Epoch 1, Training Loss: 46988.56999214683


Evaluating: 100%|██████████| 375/375 [00:13<00:00, 27.33it/s]


Epoch 1, Evaluation Loss: 47283.91938757075


Training Epoch 2/3: 100%|██████████| 1500/1500 [03:13<00:00,  7.74it/s]


Epoch 2, Training Loss: 46614.68234745845


Evaluating: 100%|██████████| 375/375 [00:13<00:00, 27.91it/s]


Epoch 2, Evaluation Loss: 47397.212080616366


Training Epoch 3/3: 100%|██████████| 1500/1500 [03:12<00:00,  7.77it/s]


Epoch 3, Training Loss: 45797.62204018029


Evaluating: 100%|██████████| 375/375 [00:13<00:00, 27.70it/s]


Epoch 3, Evaluation Loss: 47847.65279782519




In [None]:
# Save the model
torch.save(model.state_dict(), "model.pth")
print("Model saved successfully.")

Model saved successfully.


In [None]:
!mlflow ui --host localhost --port 5000 &

[2024-12-01 08:54:21 +0000] [36283] [INFO] Starting gunicorn 23.0.0
[2024-12-01 08:54:21 +0000] [36283] [INFO] Listening at: http://127.0.0.1:5000 (36283)
[2024-12-01 08:54:21 +0000] [36283] [INFO] Using worker: sync
[2024-12-01 08:54:21 +0000] [36284] [INFO] Booting worker with pid: 36284
[2024-12-01 08:54:21 +0000] [36285] [INFO] Booting worker with pid: 36285
[2024-12-01 08:54:21 +0000] [36290] [INFO] Booting worker with pid: 36290
[2024-12-01 08:54:21 +0000] [36291] [INFO] Booting worker with pid: 36291
[2024-12-01 08:54:54 +0000] [36283] [INFO] Handling signal: int
[2024-12-01 08:54:54 +0000] [36284] [INFO] Worker exiting (pid: 36284)
[2024-12-01 08:54:54 +0000] [36290] [INFO] Worker exiting (pid: 36290)
[2024-12-01 08:54:54 +0000] [36291] [INFO] Worker exiting (pid: 36291)
[2024-12-01 08:54:54 +0000] [36285] [INFO] Worker exiting (pid: 36285)
[2024-12-01 08:54:55 +0000] [36283] [INFO] Shutting down: Master


In [None]:
import torch
import torch.nn as nn
from transformers import XLNetModel, XLNetTokenizer
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

def prepare_tokenizer():
    """
    Initialize and return XLNet tokenizer

    Returns:
        XLNetTokenizer: Configured XLNet tokenizer
    """
    return XLNetTokenizer.from_pretrained('xlnet-base-cased')

def create_dataset(texts, summaries, tokenizer, max_length=512):
    """
    Create a dataset for text summarization

    Args:
        texts (list): Input texts
        summaries (list): Corresponding summaries
        tokenizer (XLNetTokenizer): Tokenizer to encode texts
        max_length (int): Maximum sequence length

    Returns:
        SummarizationDataset: Custom PyTorch dataset
    """
    class SummarizationDataset(Dataset):
        def __init__(self, texts, summaries, tokenizer, max_length):
            self.texts = texts
            self.summaries = summaries
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            text_encoding = self.tokenizer(
                self.texts[idx],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            summary_encoding = self.tokenizer(
                self.summaries[idx],
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            return {
                'input_ids': text_encoding['input_ids'].flatten(),
                'attention_mask': text_encoding['attention_mask'].flatten(),
                'labels': summary_encoding['input_ids'].flatten()
            }

    return SummarizationDataset(texts, summaries, tokenizer, max_length)

def create_data_loaders(train_dataset, val_dataset=None, batch_size=4):
    """
    Create DataLoaders for training and validation

    Args:
        train_dataset (Dataset): Training dataset
        val_dataset (Dataset, optional): Validation dataset
        batch_size (int): Batch size for DataLoaders

    Returns:
        tuple: Training and validation DataLoaders
    """
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size) if val_dataset else None

    return train_loader, val_loader

def create_xlnet_summarization_model(learning_rate=2e-5):
    """
    Create XLNet-based summarization model

    Args:
        learning_rate (float): Learning rate for optimizer

    Returns:
        XLNetSummarizationModel: Custom PyTorch Lightning model
    """
    class XLNetSummarizationModel(pl.LightningModule):
        def __init__(self, learning_rate):
            super().__init__()

            # Load pre-trained XLNet model
            self.xlnet_base = XLNetModel.from_pretrained('xlnet-base-cased')

            # Freeze base model parameters
            for param in self.xlnet_base.parameters():
                param.requires_grad = False

            # Custom layers for summarization
            self.summary_head = nn.Sequential(
                nn.Linear(self.xlnet_base.config.hidden_size, 512),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(512, self.xlnet_base.config.vocab_size)
            )

            self.learning_rate = learning_rate

        def forward(self, input_ids, attention_mask):
            xlnet_output = self.xlnet_base(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            last_hidden_state = xlnet_output.last_hidden_state
            summary_logits = self.summary_head(last_hidden_state)

            return summary_logits

        def training_step(self, batch, batch_idx):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            logits = self(input_ids, attention_mask)
            loss = nn.CrossEntropyLoss()(
                logits.view(-1, logits.size(-1)),
                labels.view(-1)
            )

            self.log('train_loss', loss)
            return loss

        def configure_optimizers(self):
            return torch.optim.AdamW(
                self.parameters(),
                lr=self.learning_rate
            )

    return XLNetSummarizationModel(learning_rate)

def train_model(model, train_loader, val_loader=None, max_epochs=5):
    """
    Train the summarization model

    Args:
        model (pl.LightningModule): PyTorch Lightning model
        train_loader (DataLoader): Training data loader
        val_loader (DataLoader, optional): Validation data loader
        max_epochs (int): Maximum number of training epochs

    Returns:
        pl.LightningModule: Trained model
    """
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu'
    )

    trainer.fit(model, train_loader, val_loader)
    return model

def generate_summary(model, tokenizer, text, max_length=150):
    """
    Generate summary for given text

    Args:
        model (nn.Module): Trained summarization model
        tokenizer (XLNetTokenizer): Tokenizer
        text (str): Input text to summarize
        max_length (int): Maximum summary length

    Returns:
        str: Generated summary
    """
    inputs = tokenizer(
        text,
        max_length=512,
        return_tensors='pt',
        padding=True,
        truncation=True
    )

    model.eval()
    with torch.no_grad():
        output = model(inputs['input_ids'], inputs['attention_mask'])

    summary_ids = torch.argmax(output, dim=-1)
    summary = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        max_length=max_length
    )

    return summary

def main():
    """
    Main function to demonstrate model training and inference
    """
    # Sample data
    train_texts = [
        "Long input text about a complex topic...",
        "Another lengthy document to summarize..."
    ]
    train_summaries = [
        "Concise summary of the first text",
        "Brief overview of the second document"
    ]

    # Prepare tokenizer
    tokenizer = prepare_tokenizer()

    # Create dataset
    train_dataset = create_dataset(train_texts, train_summaries, tokenizer)
    train_loader, _ = create_data_loaders(train_dataset)

    # Create and train model
    model = create_xlnet_summarization_model()
    trained_model = train_model(model, train_loader)

    # Perform inference
    test_text = "Input text you want to summarize..."
    summary = generate_summary(trained_model, tokenizer, test_text)
    print("Generated Summary:", summary)

if __name__ == "__main__":
    main()