## Preprocessing

In [1]:
import pandas as pd
import re

In [5]:
# Load the dataset; the columns used below are 'Title', 'Summary' and 'Analysis'.
data = pd.read_csv('test_selenium.csv')

# read_csv turns empty cells into NaN, and formatting NaN into an f-string would
# inject the literal token "nan" into the corpus. Blank such cells out instead;
# rows are kept (not dropped) so corpus positions still line up with `data`.
summaries = data['Summary'].fillna('')
analyses = data['Analysis'].fillna('')
titles = data['Title'].fillna('')

# One document per row: title, summary and analysis joined by single spaces.
corpus = [
    f"{title} {summary} {analysis}"
    for title, summary, analysis in zip(titles, summaries, analyses)
]

def preprocess_text(text):
    """Normalize a raw string for embedding training and lookup.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of digits are removed, and leading/trailing
    whitespace is trimmed. Interior whitespace is left as-is.
    """
    cleaned = text.lower()
    # Order matters only for readability here: punctuation first, then digits.
    for pattern in (r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

# Clean every document, then split each cleaned document on whitespace.
preprocessed_corpus = list(map(preprocess_text, corpus))

tokenized_corpus = [document.split() for document in preprocessed_corpus]

## Training Word Embedding Model To Obtain Embeddings

In [8]:
from gensim.models import Word2Vec

# Hyper-parameters for the Word2Vec model trained on the tokenized corpus.
word2vec_settings = {
    "vector_size": 100,
    "window": 5,
    "min_count": 1,
    "workers": 4,
    "sg": 1,
}
Word2Vec_model_sg = Word2Vec(tokenized_corpus, **word2vec_settings)

# Persist the trained model to disk.
Word2Vec_model_sg.save('Word2Vec_model_sg')

In [10]:
# Sanity check: fetch and show the learned embedding of one vocabulary word.
probe_word = 'darkness'
test_vector = Word2Vec_model_sg.wv[probe_word]
print(test_vector)

[-0.18757609 -0.02523065  0.10822345  0.25279427  0.09404436 -0.14556998
  0.31167802  0.56639236 -0.07748945 -0.11939991 -0.2481058  -0.37848467
 -0.0188977  -0.1670922  -0.10497189 -0.08356968 -0.0615345   0.10890697
 -0.10079912 -0.40617284 -0.00819069 -0.1524098   0.16265742 -0.03898332
 -0.06940196  0.04204768 -0.03904338 -0.06583715 -0.15557028  0.20045884
  0.3325572   0.00232724  0.14604664 -0.25277588 -0.20083642  0.21060328
  0.01625244 -0.34341818 -0.21040498 -0.30221328  0.05520817 -0.08441182
  0.0313545   0.01122725  0.27458957 -0.2581353  -0.3079668  -0.02439034
 -0.22528283  0.05035021 -0.07226291 -0.00409833 -0.03894206  0.06516527
  0.09041492 -0.08996686  0.08819439 -0.06942315  0.07562473  0.10474674
  0.00300599 -0.16347326  0.36412808  0.02851762 -0.01989603  0.17973474
  0.06861334  0.25913286 -0.15470749  0.06786061 -0.00504779  0.20889491
  0.2030788  -0.25446317  0.15321966  0.13806863  0.31823     0.15149638
 -0.43128183 -0.07699223 -0.12870276 -0.00841347  0

In [18]:
# Words whose embeddings are closest to that of "friend", with their similarity scores.
similar_words = Word2Vec_model_sg.wv.most_similar(positive=['friend'])
print(similar_words)

[('brother', 0.8718932867050171), ('wife', 0.8443353176116943), ('servant', 0.8382858633995056), ('sister', 0.8214103579521179), ('aunt', 0.8185446858406067), ('girlfriend', 0.8092584609985352), ('acquaintance', 0.802879810333252), ('grandfather', 0.8003736138343811), ('roommate', 0.7951571345329285), ('companion', 0.7915104627609253)]


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def sentence_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Whitespace-separated text.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by word) and
            ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all words found in ``model.wv``,
        or a zero vector of length ``model.vector_size`` when none are found.
    """
    keyed_vectors = model.wv
    found = []
    for token in sentence.split():
        if token in keyed_vectors:
            found.append(keyed_vectors[token])

    # No known word at all: fall back to the zero vector.
    if not found:
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

user_input = "I am feeling very sad and lonely. I don't have any friends to talk to."

# Embed every (already cleaned) corpus entry with the same averaging scheme as the query.
corpus_vectors = [
    sentence_vector(document, Word2Vec_model_sg)
    for document in preprocessed_corpus
]

# The query goes through the same cleaning as the corpus before being embedded.
preprocessed_input = preprocess_text(user_input)
input_vector = sentence_vector(preprocessed_input, Word2Vec_model_sg)

# One row of similarities: the query against every corpus entry.
similarities = cosine_similarity([input_vector], corpus_vectors)

most_similar_index = similarities.argmax()
print(f"Most similar entry: {corpus[most_similar_index]}")


Most similar entry: the-killers George brings Max and Al their meals, but they can’t remember who ordered what. As they eat, they catch George looking at them. Al suggests that “maybe the boy meant it for a joke,” and George laughs. Max tells him not to laugh and George says alright. To Al, Max says “he thinks it’s all right,” and Al replies, “Oh, he’s a thinker.” Max and Al want to seem like they are in control of the situation and know what they are doing, but their confusion over who ordered what reveals how easy it is to fluster them (and shows them to be either a little stupid or unobservant—an inauspicious beginning for criminals). Frustrated by this confusion, they again emasculate George to put him in his place as their inferior.


# Fine-Tuning Pretrained Models

In [1]:
import pandas as pd

# Expose the summary as the model input and the analysis as the target,
# then write just those two columns out for the fine-tuning pipeline.
data = pd.read_csv('test_selenium.csv')
data = data.assign(Input=data['Summary'], Output=data['Analysis'])

fine_tuning_columns = ['Input', 'Output']
data[fine_tuning_columns].to_csv('fine_tuning_data.csv', index=False)

In [2]:
from datasets import load_dataset

dataset = load_dataset('csv', data_files={"train": 'fine_tuning_data.csv'})

# Fixed seed so train/validation/test membership is identical on every run;
# without it each execution shuffles differently and results are not comparable.
SPLIT_SEED = 42

# Hold out 10% of all rows as the test set ...
split = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split['train']
test_dataset = split['test']

# ... then hold out 10% of the remaining rows for validation.
split = train_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split['train']
val_dataset = split['test']

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")
print(f"Test size: {len(test_dataset)}")


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 5095 examples [00:00, 51968.66 examples/s]

Train size: 4126
Validation size: 459
Test size: 510





In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Checkpoint identifier shared by the model and its tokenizer.
model_name = 't5-small'

model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
def preprocess_function(examples, max_length=256):
    """Tokenize Input/Output pairs for seq2seq fine-tuning.

    Args:
        examples: Mapping with "Input" and "Output" entries, each either a list
            of strings (batched ``map``) or a single string.
        max_length: Length that both inputs and targets are truncated and
            padded to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the token ids of the targets.

    Raises:
        ValueError: If a column is missing, a column is empty, or any single
            input/output is not a non-blank string.
    """
    if "Input" not in examples or "Output" not in examples:
        raise ValueError("Input or output column missing in dataset.")

    inputs = examples["Input"]
    targets = examples["Output"]

    if not inputs or not targets:
        raise ValueError("Empty input or output found in the dataset.")

    # The truthiness check above only catches an empty batch. Inspect each
    # example too, so a missing (None) or blank cell is reported clearly
    # instead of failing somewhere inside the tokenizer.
    for column in (inputs, targets):
        texts = [column] if isinstance(column, str) else column
        if any(not isinstance(text, str) or not text.strip() for text in texts):
            raise ValueError("Empty input or output found in the dataset.")

    # Tokenize inputs and outputs to the same fixed length.
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )
    labels = tokenizer(
        targets,
        max_length=max_length,
        truncation=True,
        padding="max_length"
    )

    # NOTE(review): labels keep the tokenizer's pad ids as padding, so padded
    # positions presumably count towards the training loss. Masking them would
    # need the metric code to map the ignore index back before decoding.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Run the batched tokenization over each split, in train / validation / test order.
tokenized_train, tokenized_val, tokenized_test = (
    subset.map(preprocess_function, batched=True)
    for subset in (train_dataset, val_dataset, test_dataset)
)



[A
[A
[A
[A
Map: 100%|██████████| 4126/4126 [00:03<00:00, 1352.93 examples/s]

[A
Map: 100%|██████████| 459/459 [00:00<00:00, 1203.57 examples/s]

[A
Map: 100%|██████████| 510/510 [00:00<00:00, 1285.08 examples/s]


In [5]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object itself, not its checkpoint name: the collator's `model`
# argument is used to call methods on the model when assembling batches, which
# a plain string cannot provide.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [6]:
import os
# Set the CUDA allocator option through the environment before torch.cuda is
# imported in this cell.
# NOTE(review): presumably this must be in place before the first CUDA
# allocation to take effect — confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

from torch import cuda
print(cuda.is_available())  # Should return True if GPU is available

# Release cached GPU memory held by PyTorch before training starts.
cuda.empty_cache()



True


In [7]:
from datasets import load_metric

metric = load_metric("rouge")  # ROUGE scorer; used by compute_metrics to score decoded predictions

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the ROUGE metric.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure.
    """
    # Local import: this section of the notebook does not import numpy itself.
    import numpy as np

    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore index used for masked/padded positions; it is not a
    # valid token id, so swap it for the pad id before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a list of strings
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {key: value.mid.fmeasure for key, value in result.items()}

  metric = load_metric("rouge")  # Replace with the appropriate metric for your task
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [12]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import os

# Release cached GPU memory and set the CUDA allocator option before building the trainer.
torch.cuda.empty_cache()
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

training_args = Seq2SeqTrainingArguments(
    # --- checkpoints: written once per epoch, only the latest two are kept ---
    output_dir="./fine_tuned_t5",
    save_strategy="epoch",
    save_total_limit=2,
    # --- evaluation: once per epoch, with generated text; best checkpoint chosen by eval loss ---
    evaluation_strategy="epoch",
    predict_with_generate=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    # --- memory: batch size 1 with 4 accumulation steps gives an effective batch size of 4 ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    fp16=False,
    # --- logging: one loss line every 200 optimisation steps ---
    logging_steps=200,
)

# Seq2Seq trainer wired to the tokenized train/validation splits and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

trainer.train()

# Write the fine-tuned weights and the tokenizer files to their own directories.
model.save_pretrained('fine_tuned_t5')
tokenizer.save_pretrained('fine_tuned_t5_tokenizer')


 33%|███▎      | 1031/3093 [09:19<18:38,  1.84it/s]
  6%|▋         | 200/3093 [00:46<10:56,  4.41it/s]

{'loss': 2.2385, 'grad_norm': 0.6222633123397827, 'learning_rate': 4.6766892984157775e-05, 'epoch': 0.19}


 13%|█▎        | 400/3093 [01:33<10:42,  4.19it/s]

{'loss': 1.2009, 'grad_norm': 0.702217161655426, 'learning_rate': 4.3533785968315555e-05, 'epoch': 0.39}


 19%|█▉        | 600/3093 [02:20<09:48,  4.24it/s]

{'loss': 1.1696, 'grad_norm': 0.6178997755050659, 'learning_rate': 4.030067895247333e-05, 'epoch': 0.58}


 26%|██▌       | 800/3093 [03:07<09:05,  4.20it/s]

{'loss': 1.1847, 'grad_norm': 0.7727011442184448, 'learning_rate': 3.706757193663111e-05, 'epoch': 0.78}


 32%|███▏      | 1000/3093 [03:55<08:40,  4.02it/s]

{'loss': 1.2205, 'grad_norm': 0.6378457546234131, 'learning_rate': 3.383446492078888e-05, 'epoch': 0.97}


 33%|███▎      | 1031/3093 [04:03<08:21,  4.11it/s]
 33%|███▎      | 1031/3093 [06:04<08:21,  4.11it/s]

{'eval_loss': 1.1342835426330566, 'eval_rouge1': 0.14481260142274638, 'eval_rouge2': 0.027660204847849115, 'eval_rougeL': 0.1178565143934396, 'eval_rougeLsum': 0.11788527549730675, 'eval_runtime': 120.8576, 'eval_samples_per_second': 3.798, 'eval_steps_per_second': 3.798, 'epoch': 1.0}


 39%|███▉      | 1200/3093 [06:45<07:36,  4.15it/s]   

{'loss': 1.2068, 'grad_norm': 0.7466623187065125, 'learning_rate': 3.0601357904946654e-05, 'epoch': 1.16}


 45%|████▌     | 1400/3093 [07:33<06:53,  4.09it/s]

{'loss': 1.1732, 'grad_norm': 1.4678646326065063, 'learning_rate': 2.736825088910443e-05, 'epoch': 1.36}


 52%|█████▏    | 1600/3093 [08:22<05:50,  4.26it/s]

{'loss': 1.1821, 'grad_norm': 0.46581369638442993, 'learning_rate': 2.4135143873262206e-05, 'epoch': 1.55}


 58%|█████▊    | 1800/3093 [09:10<05:02,  4.28it/s]

{'loss': 1.1682, 'grad_norm': 0.605596125125885, 'learning_rate': 2.0902036857419983e-05, 'epoch': 1.75}


 65%|██████▍   | 2000/3093 [09:57<04:19,  4.22it/s]

{'loss': 1.1023, 'grad_norm': 0.568564772605896, 'learning_rate': 1.766892984157776e-05, 'epoch': 1.94}



 67%|██████▋   | 2063/3093 [12:01<04:08,  4.14it/s]

{'eval_loss': 1.1275582313537598, 'eval_rouge1': 0.15079385505966109, 'eval_rouge2': 0.029024563117201262, 'eval_rougeL': 0.12142024200429173, 'eval_rougeLsum': 0.12138241741218958, 'eval_runtime': 109.2471, 'eval_samples_per_second': 4.201, 'eval_steps_per_second': 4.201, 'epoch': 2.0}


 71%|███████   | 2200/3093 [12:35<03:42,  4.01it/s]  

{'loss': 1.1564, 'grad_norm': 0.4658587574958801, 'learning_rate': 1.4435822825735532e-05, 'epoch': 2.13}


 78%|███████▊  | 2400/3093 [13:22<02:48,  4.12it/s]

{'loss': 1.1353, 'grad_norm': 0.6698911786079407, 'learning_rate': 1.1202715809893308e-05, 'epoch': 2.33}


 84%|████████▍ | 2600/3093 [14:09<01:58,  4.16it/s]

{'loss': 1.139, 'grad_norm': 0.6918927431106567, 'learning_rate': 7.969608794051083e-06, 'epoch': 2.52}


 91%|█████████ | 2800/3093 [14:57<01:09,  4.22it/s]

{'loss': 1.1572, 'grad_norm': 0.891018807888031, 'learning_rate': 4.736501778208859e-06, 'epoch': 2.71}


 97%|█████████▋| 3000/3093 [15:44<00:21,  4.25it/s]

{'loss': 1.1589, 'grad_norm': 0.5955147743225098, 'learning_rate': 1.5033947623666343e-06, 'epoch': 2.91}



100%|██████████| 3093/3093 [17:59<00:00,  4.13it/s]

{'eval_loss': 1.1250534057617188, 'eval_rouge1': 0.15096450782852516, 'eval_rouge2': 0.028817421582187856, 'eval_rougeL': 0.12104128063610468, 'eval_rougeLsum': 0.12105277787846327, 'eval_runtime': 112.6125, 'eval_samples_per_second': 4.076, 'eval_steps_per_second': 4.076, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 3093/3093 [18:01<00:00,  2.86it/s]


{'train_runtime': 1081.2941, 'train_samples_per_second': 11.447, 'train_steps_per_second': 2.86, 'train_loss': 1.2379047900698466, 'epoch': 3.0}


('fine_tuned_t5_tokenizer\\tokenizer_config.json',
 'fine_tuned_t5_tokenizer\\special_tokens_map.json',
 'fine_tuned_t5_tokenizer\\spiece.model',
 'fine_tuned_t5_tokenizer\\added_tokens.json')