# Importing Libraries

In [None]:
!pip install numpy==1.26.4

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Loading the Data

In [None]:
# Load the raw CSV from the Hugging Face Hub dataset repository (hf:// path) into a pandas DataFrame
raw_data = pd.read_csv("hf://datasets/kartikay/review-summarizer/raw/data.csv")

# Basic Data Exploration

In [None]:
# Shape of the raw frame: (number of rows, number of columns)
raw_data.shape

In [None]:
# Total number of cells in the frame (rows x columns)
raw_data.size

In [None]:
# Column labels of the raw frame
raw_data.columns

In [None]:
# Number of axes of the frame (2 for a DataFrame)
raw_data.ndim

In [None]:
# Data type of each column
raw_data.dtypes

In [None]:
# Work on a copy so that raw_data itself stays unmodified
df = raw_data.copy()

In [None]:
# Preview the first five rows
df.head()

# Splitting of data

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the rows; the remaining 80% becomes the training set.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Stage 2: halve the held-out rows into validation and test (about 10% of the data each).
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

# Report how many rows ended up in each split.
print(
    ", ".join(
        f"{label}: {len(frame)}"
        for label, frame in (("Train", train_df), ("Val", val_df), ("Test", test_df))
    )
)

# Note: three explicit splits are produced here so that they can later be
# wrapped into a Hugging Face DatasetDict (train / validation / test).

# Data Pre-processing

### 1. Converting the pandas DataFrames to a DatasetDict

In [None]:
from datasets import Dataset, DatasetDict

# Wrap each split in a Hugging Face Dataset and group the three under one
# DatasetDict so they can be processed together.
# preserve_index=False: after train_test_split the frames carry a shuffled,
# non-default index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column in every split.
dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df, preserve_index=False),
    "validation": Dataset.from_pandas(val_df, preserve_index=False),
    "test": Dataset.from_pandas(test_df, preserve_index=False),
})

# Note: this conversion makes the data compatible with the Hugging Face tooling used below.

In [None]:
# Down-sample every split: shuffle with a fixed seed, then keep the first N rows
# (7000 train / 1000 validation / 2000 test).
small_dataset = DatasetDict({
    split_name: dataset[split_name].shuffle(seed=42).select(range(n_rows))
    for split_name, n_rows in (("train", 7000), ("validation", 1000), ("test", 2000))
})

### 2.Tokenization

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the google/flan-t5-small checkpoint
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

In [None]:
# Token budgets used by preprocess_function; longer sequences are truncated.
max_input_length = 512   # prompt (instruction prefix + review text)
max_target_length = 64   # reference summary used as label

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Prepends the "summarize: " instruction to every entry of examples["text"],
    tokenizes those prompts (truncated to max_input_length) and the entries of
    examples["target"] (truncated to max_target_length), and returns the prompt
    encoding with the target token ids stored under "labels".
    """
    prompts = []
    for review in examples["text"]:
        prompts.append("summarize: " + review)

    encoded = tokenizer(prompts, truncation=True, max_length=max_input_length)

    # The token ids of the reference summaries become the training labels.
    target_encoding = tokenizer(
        examples["target"], truncation=True, max_length=max_target_length
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Tokenize every split; batched=True hands preprocess_function batches of examples instead of single rows
tokenized_df = small_dataset.map(preprocess_function, batched=True)

In [None]:
# Display the splits of the tokenized DatasetDict
tokenized_df

### 3.Dynamic Padding

In [None]:
# Dynamic padding: the collator pads the inputs and the labels of every batch
# to that batch's longest sequence; labels are padded with -100.
# The optional `model` argument expects a model *instance* (it is only used to
# pre-compute decoder_input_ids); a checkpoint name string is silently ignored,
# so the argument is omitted here.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    return_tensors="tf",
    label_pad_token_id=-100,
)


### 4.Feeding to datacollator

* Example to see how datacollator work

In [None]:
#First, we need to remove the columns with strings because the collator won’t know how to pad these elements
#tokenized_df = tokenized_df.remove_columns(
    #dataset["train"].column_names
#)
#note: this column-removal step is intentionally left disabled in this notebook

* Since the collator expects a list of dicts, where each dict represents a single example in the dataset, we also need to wrangle the data into the expected format before passing it to the data collator.

In [None]:
# Take the first two tokenized training examples as a list of dicts and collate them into one padded batch
features = [tokenized_df["train"][i] for i in range(2)]
data_collator(features)

* The data collator's job here is dynamic padding: it pads the inputs (and the labels) of each batch so that they all have equal length.

# Model Building

In [None]:
#for mounting our drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Create a TrainingArguments object whose output directory is /kaggle/working/.
# NOTE(review): training_args is not referenced by any other visible cell
# (training below goes through Keras model.fit) — confirm whether this cell is still needed.
from transformers import TrainingArguments
training_args = TrainingArguments("/kaggle/working/") # change the output directory as needed

In [None]:
# Load the pretrained google/flan-t5-small checkpoint as a TensorFlow seq2seq model
from transformers import TFAutoModelForSeq2SeqLM
import tensorflow as tf
model = TFAutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [None]:
# Preprocessing, final step: turn the tokenized Hugging Face splits into batched
# tf.data datasets, since training is done with TensorFlow/Keras.
# Only the training split is shuffled; both share the collator and a batch size of 8.
tf_train_dataset, tf_eval_dataset = (
    model.prepare_tf_dataset(
        tokenized_df[split_name],
        batch_size=8,
        shuffle=shuffle_split,
        collate_fn=data_collator,
    )
    for split_name, shuffle_split in (("train", True), ("validation", False))
)

* Note: Hugging Face T5 models usually provide their own loss internally when you pass labels during training, so no need to specify loss explicitly.

In [None]:
from transformers import create_optimizer

epochs = 8  # maximum number of epochs; early stopping may end training sooner
train_data_len = len(tokenized_df["train"])

# The batch size is fixed where the tf.data datasets are built (prepare_tf_dataset).
# Derive the schedule length from the batched training dataset that is actually
# passed to model.fit, so that warmup and decay always span the real number of
# optimizer steps instead of a separately maintained batch-size constant.
steps_per_epoch = len(tf_train_dataset)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # warm up during the first 10% of steps

optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5,  # peak learning rate reached at the end of warmup
    num_warmup_steps=num_warmup_steps,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)

# No loss is passed: the model falls back to its own internal loss computed from the labels.
model.compile(optimizer=optimizer)

In [None]:
# Stop training once val_loss has gone 3 epochs without improving, and
# restore the weights from the best epoch seen.
earlystop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Train for at most `epochs` epochs, evaluating on the validation set after each one.
model.fit(
    tf_train_dataset,
    epochs=epochs,
    callbacks=[earlystop],
    validation_data=tf_eval_dataset,
)

In [None]:
# Metrics recorded per epoch by the model.fit call above
model.history.history

In [None]:
!pip install rouge_score
!pip install evaluate

# Evaluating the model

In [None]:
import evaluate

# Load the ROUGE metric from the evaluate library
rouge = evaluate.load("rouge")

In [None]:
from tqdm import tqdm
def generate_predictions(
    model,
    tokenizer,
    dataset,
    input_col="text",
    target_col="target",
    prefix="summarize: ",
    max_input_length=512,
    max_new_tokens=20,
    min_length=2,
    num_beams=4,
    length_penalty=0.8,
):
    """Generate one summary per example and collect it with its reference.

    Args:
        model: Object exposing ``generate(**encoded_inputs, **generation_options)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        dataset: Iterable of mappings holding the input and reference texts.
        input_col: Key of the text to summarize in each example.
        target_col: Key of the reference summary in each example.
        prefix: Instruction prepended to every input; should match the prefix
            used when the training data was tokenized.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_new_tokens: Upper bound on the number of generated tokens.
        min_length: ``min_length`` value forwarded to ``model.generate``.
        num_beams: Beam width used for beam search.
        length_penalty: ``length_penalty`` value forwarded to ``model.generate``.

    Returns:
        A ``(predictions, references)`` tuple of two equally long lists of
        strings; predictions are stripped of surrounding whitespace.
    """
    # Generation settings are identical for every example, so build them once.
    generation_options = dict(
        max_new_tokens=max_new_tokens,
        min_length=min_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        early_stopping=True,
    )

    predictions = []
    references = []
    for example in tqdm(dataset):
        inputs = tokenizer(
            prefix + example[input_col],
            return_tensors="tf",
            padding=True,
            truncation=True,
            max_length=max_input_length,
        )
        summary_ids = model.generate(**inputs, **generation_options)

        # Only the first returned sequence is decoded.
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
        predictions.append(generated_summary)
        references.append(example[target_col])
    return predictions, references

In [None]:
# Evaluate on the first 100 rows of the raw (untokenized) test split.
subset = small_dataset["test"].select(range(100))

# generate_predictions iterates over per-example dicts, so zip the two text columns into that form.
dataset_samples = [
    dict(text=review, target=summary)
    for review, summary in zip(subset["text"], subset["target"])
]

# Run generation, then score the predictions against the references with ROUGE.
predictions, references = generate_predictions(model=model, tokenizer=tokenizer, dataset=dataset_samples)
rouge = evaluate.load("rouge")  # reloaded here so the cell does not depend on the earlier load
results = rouge.compute(references=references, predictions=predictions)

In [None]:
# Print every ROUGE metric with four decimal places
for metric_name, score in results.items():
    print(f"{metric_name}: {score:.4f}")

# Test On model

In [None]:
def print_summary(idx):
    """Print the review, reference summary and generated summary of one test example.

    Reads example ``idx`` from the raw (untokenized) ``small_dataset["test"]``
    split, prepends the "summarize: " instruction, generates a summary with the
    global ``model``/``tokenizer`` and prints the three texts.
    """
    example = small_dataset["test"][idx]
    review, target = example["text"], example["target"]

    # Same prompt format and truncation length as the evaluation function.
    encoded = tokenizer(
        "summarize: " + review,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

    # Same generation settings as the evaluation function.
    generated = model.generate(
        **encoded,
        early_stopping=True,
        length_penalty=0.8,
        num_beams=4,
        min_length=2,
        max_new_tokens=20,
    )
    summary = tokenizer.decode(generated[0], skip_special_tokens=True)

    for line in (
        f">>> Review: {review}",
        f"\n>>> Target: {target}",
        f"\n>>> Summary: {summary}",
    ):
        print(line)

In [None]:
# Show the review, reference summary and generated summary for test example 11
print_summary(11)

# Saving the model

In [None]:
# Save the model and the tokenizer to the local directories "model" and "tokenizer"
model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")

# Exporting to Hugging face

In [None]:
!pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Interactive login to the Hugging Face Hub (prompts for an access token instead of hardcoding one)
notebook_login()

In [None]:
# Upload the model and the tokenizer to the same Hugging Face Hub repository
model.push_to_hub("suryaummadi/Flan-T5-Short-review-summarizer")
tokenizer.push_to_hub("suryaummadi/Flan-T5-Short-review-summarizer")


In [None]:
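# SECURITY: a Hugging Face access token was stored here in plain text and has been removed. Revoke that token in the Hub account settings, and authenticate with notebook_login() or the HF_TOKEN environment variable instead of keeping tokens in the notebook.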

# Trying with uploaded hugging Face model

In [None]:
# Reload model and tokenizer from the Hub repository used in the upload step; this replaces the in-memory objects
model = TFAutoModelForSeq2SeqLM.from_pretrained("suryaummadi/Flan-T5-Short-review-summarizer")
tokenizer = AutoTokenizer.from_pretrained("suryaummadi/Flan-T5-Short-review-summarizer")

In [None]:
def summarize_review(review):
    """Return the generated summary for a single review string.

    The review is prefixed with "summarize: ", tokenized (truncated to 512
    tokens) with the global ``tokenizer`` and passed to the global ``model``
    for beam-search generation; the first generated sequence is decoded
    without special tokens.
    """
    tokenizer_options = {
        "return_tensors": "tf",
        "padding": True,
        "truncation": True,
        "max_length": 512,
    }
    # Same generation settings as the test code earlier in the notebook.
    generation_options = {
        "max_new_tokens": 20,
        "min_length": 2,
        "num_beams": 4,
        "length_penalty": 0.8,
        "early_stopping": True,
    }

    encoded = tokenizer("summarize: " + review, **tokenizer_options)
    generated = model.generate(**encoded, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
# Quick check of the reloaded model on a hand-written review
summarize_review("The product is good but not upto the mark")

In [None]:
## The end