In [None]:
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds

import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer

import os
import nltk
import evaluate

import matplotlib.pyplot as plt

import re
import textwrap

from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration, TFAutoModelForSeq2SeqLM, AutoTokenizer
import sentencepiece as spm

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Read Data

In [None]:
# Load the filtered dataset CSV from the mounted Drive folder.
df_model = pd.read_csv("/content/drive/My Drive/DATA/urban_dict_filtered_v2.csv")
# Alternative path (different folder and file name), currently disabled:
#df_model = pd.read_csv("/content/drive/My Drive/NLP_Final_Project/DATA/urban_dict_filtered.csv")
# Lift the column display limit so head() shows every column.
pd.set_option('display.max_columns', None)
df_model.head()

Unnamed: 0,word,definition,example,author,thumbs_up,thumbs_down,bracketed_dfn,bracketed_exmpl,vote_ratio,vote_diff,compare,compare2,question,compare3,masked_example
0,cartossin,The act of giving up on both physical and ment...,Bobby's been going through a lot lately. His g...,efnet-truth,512786,100442,"['giving up', 'on both', 'growth']","[""Bobby's"", 'Cartossin', 'see what happens']",0.836208,412344,Yes,Yes,What is the meaning of cartossin in the follow...,Yes,bobby's been going through a lot lately. his g...
1,feeding the fish,Smoking the dank marijuana,Who's feeding the fish tonight?,meistergoat,115220,3,"['Smoking', 'the dank', 'marijuana']",['tonight'],0.999974,115217,Yes,Yes,What is the meaning of feeding the fish in the...,Yes,who's <extra_id_0> tonight?
2,woody,n A wooden roller coaster,"I like steel coaster, but I prefer the classic...",kwood4800,92128,1137,"['wooden', 'roller coaster']","['steel', 'coaster', 'woodie']",0.987809,90991,Yes,Yes,What is the meaning of woody in the following ...,Yes,"i like steel coaster, but i prefer the classic..."
3,WMAF,White male Asian Female couple.,Look at that WMAF couple over-there.,Indian Bastard,155821,89921,"['White male', 'Asian Female', 'couple']","['Look at that', 'couple']",0.634084,65900,Yes,Yes,What is the meaning of WMAF in the following e...,Yes,look at that <extra_id_0> couple over-there.
4,Buzzfeed,"When Barack Obama used to smoked pot at : AM, ...",I remember when Buzzfeed was something I did b...,Polly Tick,53561,554,"['Barack Obama', '2:00 AM', '2013']","['I remember when', 'college', '2 AM']",0.989763,53007,Yes,Yes,What is the meaning of Buzzfeed in the followi...,Yes,i remember when <extra_id_0> was something i d...


In [None]:
def clean_text(text):
    """Normalize one raw text field before it is placed into a prompt or target.

    The value is converted with ``str`` and then, in order:

    1. ``</s>`` markers followed by optional digits and a comma are dropped;
    2. all digits, parentheses, tabs and the characters ``*``, ``#``, ``_``
       are removed, and line breaks are turned into spaces;
    3. a few space-delimited typos are corrected;
    4. doubled double-quotes are reduced to a single quote character;
    5. every run of spaces is collapsed to one space;
    6. leading and trailing whitespace is stripped.

    Args:
        text: Value to clean. Non-string values are stringified as-is
            (so, for example, a float NaN becomes the text ``"nan"``).

    Returns:
        The cleaned string.
    """
    text = str(text)
    # Remove "</s>" markers together with the optional digits and comma after them
    text = re.sub(r"</s>\d*,", "", text)

    # Remove numeric characters and parentheses
    text = re.sub(r"[0-9]+", "", text)  # Removes all numbers
    text = re.sub(r"[()]", "", text)  # Removes parentheses
    text = re.sub(r"\t", "", text)  # Removes tabs
    text = re.sub(r"[\r\n]", " ", text)  # Turns each line-break character into a space
    text = re.sub(r"[*#_]", "", text)  # Removes some non-standard punctuation

    # Replace common typos. Keys include the surrounding spaces so that only
    # stand-alone tokens in the middle of the text are rewritten.
    corrections = {
        " teh ": " the ",
        " u " : " you ",
        " adn " : " and ",
        " tho " : " though ",
        " . " : " ",
        # Add more corrections here if needed
    }
    for wrong, right in corrections.items():
        text = text.replace(wrong, right)

    # Remove extra quotation marks
    text = text.replace('""', '"')

    # Collapse every run of spaces. A single replace("  ", " ") pass would
    # leave a double space behind whenever three or more spaces were adjacent
    # (e.g. after digits/parentheses were removed or several line breaks).
    text = re.sub(r" {2,}", " ", text)

    # Trim leading and trailing whitespace
    return text.strip()

## Clean and Format Text Inputs

In [None]:
def _to_prompt_pair(record):
    """Turn one dataset row (a dict) into a cleaned prompt/target pair."""
    term = clean_text(record['word'])
    meaning = clean_text(record['definition'])
    usage = clean_text(record['example'])
    return {
        'orig': f"What is the meaning of {term} in the following example sentence?: {usage}",
        'target': f"The definition of {term} is {meaning}",
    }


# One {'orig': prompt, 'target': answer} dict per row of df_model
text_inputs = [_to_prompt_pair(record) for record in df_model.to_dict('records')]

print(text_inputs[:5])

[{'orig': "What is the meaning of cartossin in the following example sentence?: Bobby's been going through a lot lately. His girlfriend dumped him and he just lost his job, he's decided to Cartossin the rest of the year and see what happens in the new year.", 'target': 'The definition of cartossin is The act of giving up on both physical and mental growth.'}, {'orig': "What is the meaning of feeding the fish in the following example sentence?: Who's feeding the fish tonight?", 'target': 'The definition of feeding the fish is Smoking the dank marijuana'}, {'orig': 'What is the meaning of woody in the following example sentence?: I like steel coaster, but I prefer the classic woody. see also woodie', 'target': 'The definition of woody is n A wooden roller coaster'}, {'orig': 'What is the meaning of WMAF in the following example sentence?: Look at that WMAF couple over-there.', 'target': 'The definition of WMAF is White male Asian Female couple.'}, {'orig': 'What is the meaning of Buzzfee

In [None]:
# Create splits: 15% validation, 15% test, and the remainder for training.
# A fixed seed makes the split identical on every run, so a model restored
# from a checkpoint in a later session is still evaluated on examples that
# were not part of its training split.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)

# Shuffle through an index permutation instead of in place: text_inputs is
# left untouched, so re-running this cell always yields the same split.
shuffled_inputs = [text_inputs[i] for i in split_rng.permutation(len(text_inputs))]

num_valid_samples = int(0.15 * len(shuffled_inputs))
num_train_samples = len(shuffled_inputs) - 2 * num_valid_samples
train_pairs = shuffled_inputs[:num_train_samples]
valid_pairs = shuffled_inputs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = shuffled_inputs[num_train_samples + num_valid_samples :]

# Every example must land in exactly one split
assert len(train_pairs) + len(valid_pairs) + len(test_pairs) == len(text_inputs)

print(f"{len(text_inputs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

17324 total pairs
12128 training pairs
2598 validation pairs
2598 test pairs


In [None]:
# Persist each split as its own CSV so that later cells can load only part
# of the data at a time.
# (Alternative Drive folder, disabled: '/content/drive/My Drive/NLP_Final_Project/DATA/')
train_file = '/content/drive/My Drive/DATA/train_pairs.csv'
valid_file = '/content/drive/My Drive/DATA/valid_pairs.csv'
test_file = '/content/drive/My Drive/DATA/test_pairs.csv'

for split_pairs, split_path in (
    (train_pairs, train_file),
    (valid_pairs, valid_file),
    (test_pairs, test_file),
):
    pd.DataFrame(split_pairs).to_csv(split_path)

In [None]:
def preprocess_data(text_pairs, tokenizer, model, max_length=128):
    """Tokenize (source, target) text pairs into model inputs and labels.

    Args:
        text_pairs: Sequence of two-item (source text, target text) pairs.
        tokenizer: Object exposing ``batch_encode_plus``.
        model: Object exposing ``_shift_right``; it is used to derive the
            decoder inputs from the label ids.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A tuple ``([input_ids, attention_mask, decoder_input_ids], label_ids)``.
        The first two arrays are int32.

    NOTE(review): padded label positions are returned as-is; whether the loss
    ignores them depends on the caller -- confirm.
    """
    sources = [source for source, _ in text_pairs]
    targets = [target for _, target in text_pairs]

    # Options common to both tokenizer calls
    shared_options = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    source_encoding = tokenizer.batch_encode_plus(
        sources, return_attention_mask=True, **shared_options
    )
    target_encoding = tokenizer.batch_encode_plus(targets, **shared_options)

    encoder_ids = np.array(source_encoding["input_ids"], dtype="int32")
    encoder_mask = np.array(source_encoding["attention_mask"], dtype="int32")

    # Labels are the target token ids; the decoder sees them shifted right
    label_ids = np.array(target_encoding['input_ids'])
    shifted_ids = model._shift_right(label_ids)

    model_inputs = [encoder_ids, encoder_mask, shifted_ids]
    return model_inputs, label_ids

In [None]:
class TranslationDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that loads one batch of text pairs from a CSV at a time.

    For every batch the CSV is re-read with all rows outside the batch skipped,
    so only ``batch_size`` examples are parsed into a DataFrame at once. The
    file must contain ``orig`` and ``target`` columns under a header line.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_data``.
        model: Model forwarded to ``preprocess_data``.
        n_examples: Number of data rows in ``data_filename``.
        data_filename: Path of the CSV file holding the pairs.
        max_length: Sequence length forwarded to ``preprocess_data``.
        batch_size: Number of examples per batch.
        shuffle: If true, the row order is re-permuted when the generator is
            created and at the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=16,
                 shuffle=True):
        # Let the Keras base class set up whatever state it needs
        super().__init__()

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # File row numbers of the data rows. Row 0 is left out so the header
        # line is never skipped. on_epoch_end shuffles them if requested.
        self.row_order = np.arange(1, self.n_examples + 1)
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Read, tokenize and return batch number ``idx``.

        Returns:
            Whatever ``preprocess_data`` returns for the rows of this batch.

        Raises:
            IndexError: If ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")

        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Rows to skip are the ones in row_order before and after the chunk
        # used for this batch. np.concatenate is used because "+" on NumPy
        # arrays (the state when shuffle=False) adds element-wise instead of
        # joining the two slices.
        row_order = np.asarray(self.row_order)
        batch_idx_skip = np.concatenate(
            [row_order[:batch_start], row_order[batch_end:]]
        ).tolist()
        df = pd.read_csv(self.data_filename, skiprows=batch_idx_skip)

        text_pairs = df[['orig', 'target']].values.astype(str).tolist()

        batch_data = preprocess_data(
            text_pairs,
            self.tokenizer,
            self.model,
            self.max_length
        )

        return batch_data

    def __call__(self):
        """Yield every batch in order, then trigger the end-of-epoch hook."""
        n_batches = len(self)
        for i in range(n_batches):
            yield self[i]

            if i == n_batches - 1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = np.random.permutation(self.row_order)

## Load Model and Tokenizer

In [None]:
# Fetch the pretrained TensorFlow seq2seq model and its tokenizer.
# model_name is kept separate because it is reused in the checkpoint file name.
model_name = 'flan-t5-base'
pretrained_id = "google/" + model_name
t5_tokenizer = AutoTokenizer.from_pretrained(pretrained_id)
t5_model = TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Hyperparameters shared by the data generators, the Keras wrapper model,
# the training call and the checkpoint file name.
epochs = 3            # number of passes requested from fit()
learning_rate = 1e-4  # step size handed to the Adam optimizer
batch_size = 8        # examples per generator batch (can tune)
max_length = 256      # length sequences are padded/truncated to (can tune)

In [None]:
# Build the train and validation batch generators (TensorFlow version).
# Both share every setting except the split size and the CSV they read from,
# and both keep the generator's default shuffle behaviour.
_generator_options = dict(
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=max_length,
    batch_size=batch_size,
)

train_data_generator = TranslationDataGenerator(
    n_examples=len(train_pairs),
    data_filename=train_file,
    **_generator_options,
)

valid_data_generator = TranslationDataGenerator(
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    **_generator_options,
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length):
    """Wrap a T5 model in a compiled Keras functional model usable with ``fit``.

    The wrapper takes three int32 inputs of length ``max_length`` -- encoder
    token ids, encoder attention mask and decoder input ids -- and outputs the
    first element of the T5 model's result, which is treated as logits.

    Args:
        t5_model: Callable model accepting ``input_ids`` plus the
            ``attention_mask`` and ``decoder_input_ids`` keyword arguments.
        max_length: Fixed sequence length of all three inputs.

    Returns:
        A compiled ``tf.keras`` model (Adam, sparse categorical cross-entropy
        on logits, accuracy metric).

    Note:
        The optimizer's step size is read from the module-level
        ``learning_rate`` variable, which must be defined before calling.
    """
    # Keras expects `shape` to be a tuple; `(max_length)` is just an int and
    # is only tolerated by legacy Keras versions.
    sequence_shape = (max_length,)
    input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=sequence_shape, dtype=tf.int32, name='attention_mask')
    # This input layer is named 'labels' although it carries the decoder inputs
    decoder_input_ids = layers.Input(shape=sequence_shape, dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    # NOTE(review): loss and accuracy are computed over every position,
    # presumably including padding -- confirm this is intended.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Compiled Keras wrapper around the pretrained model, sized for max_length tokens
model_wrapper = build_t5_training_wrapper_model(
    t5_model=t5_model,
    max_length=max_length,
)

In [None]:
# Folder on Drive that receives the weight files
# (alternative folder, disabled: '/content/drive/My Drive/NLP_Final_Project/model_checkpoints/')
checkpoint_dir = '/content/drive/My Drive/model_checkpoints/'

# The run-specific prefix is formatted now; the {epoch} and {val_accuracy}
# placeholders are left in the string for the callback to fill in on save.
_run_prefix = f"{checkpoint_dir}{model_name}_{max_length}_{epochs}_"
checkpoint_filepath = _run_prefix + "slang_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5"

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    verbose=1,
)

### Optional: Load Weights

In [None]:
#model_wrapper.load_weights(checkpoint_dir + "flan-t5-base_128_3_slang_weights.03-0.81.hdf5", skip_mismatch=False)

## Train Model

In [None]:
# Fine-tune on the training generator; the validation generator is evaluated
# alongside and the checkpoint callback writes the weights.
history = model_wrapper.fit(
    train_data_generator,
    validation_data=valid_data_generator,
    epochs=epochs,
    callbacks=[model_checkpoint_callback],
)

### Optional: Visualize Training Loss

In [None]:
# Plot training and validation loss against the (1-based) epoch number
loss_curves = history.history
epoch_axis = np.arange(1, len(loss_curves['loss']) + 1)

fig = plt.figure(figsize=(24, 8))
ax = fig.add_subplot(1, 2, 1)  # left cell of a 1x2 grid; the right cell stays empty

for series_key, line_format, series_label in (
    ('loss', '-o', 'Train loss'),
    ('val_loss', '--<', 'Validation loss'),
):
    ax.plot(epoch_axis, loss_curves[series_key], line_format, label=series_label)

ax.legend(fontsize=15)
ax.set_xlabel('Epoch', size=15)
ax.set_ylabel('Loss', size=15)

In [None]:
# Test pairs as a DataFrame, with clean_text applied to every cell
df_model_test = pd.DataFrame(test_pairs).apply(
    lambda column: column.map(clean_text)
)
df_model_test.head()

## Generate Predictions and References

In [None]:
predictions = []
predictions_dict = {}

# Only the first 500 test prompts are used; the slice and the output path are
# built once instead of on every iteration.
eval_prompts = df_model_test['orig'][:500]
predictions_file = f'/content/drive/My Drive/DATA/FLAN_T5_fine_tuned_predictions_{max_length}.csv'

for example in eval_prompts:
    # Save what has been generated so far whenever the count is a multiple
    # of 100 (this includes the empty list before the first prompt)
    if len(predictions) % 100 == 0:
        pd.DataFrame(predictions).to_csv(predictions_file)
        print("Saved to My Drive")

    test_inputs = t5_tokenizer([example], return_tensors='tf')
    # NOTE(review): temperature only matters when sampling and output_scores
    # only when a dict is returned; with plain beam search both are presumably
    # ignored -- confirm before relying on them.
    test_output_ids = t5_model.generate(test_inputs['input_ids'],
                                        num_beams=3,
                                        no_repeat_ngram_size=3,
                                        min_length=30,
                                        max_length=128,
                                        temperature=0.97,
                                        output_scores = True)

    # Decode each returned sequence once and reuse the result for both the
    # per-prompt dict and the flat prediction list
    decoded_outputs = [
        t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                            clean_up_tokenization_spaces=False)
        for out_ids in test_output_ids
    ]
    predictions_dict[example] = decoded_outputs
    predictions.extend(decoded_outputs)
    print(f"Progress: {len(predictions)} out of {len(eval_prompts)}")

print(predictions_dict)

In [None]:
# Write the complete prediction list to Drive
predictions_path = f'/content/drive/My Drive/DATA/FLAN_T5_fine_tuned_predictions_{max_length}.csv'
pd.DataFrame(predictions).to_csv(predictions_path)

In [None]:
# Reference answers for the first 500 test rows: a flat list in row order,
# plus a prompt -> reference lookup (a repeated prompt keeps its last target)
reference_rows = df_model_test[:500]
references = reference_rows['target'].tolist()
references_dict = dict(zip(reference_rows['orig'].tolist(), references))

print(references)



In [None]:
# Write the reference list to Drive, next to the predictions file
references_path = f'/content/drive/My Drive/DATA/FLAN_T5_fine_tuned_references_{max_length}.csv'
pd.DataFrame(references).to_csv(references_path)

## Evaluate

In [None]:
# BLEU over all predictions, counting n-grams up to length 3
bleu = evaluate.load('bleu')

bleu_options = {'max_order': 3}
results = bleu.compute(
    predictions=predictions,
    references=references,
    **bleu_options,
)

print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.10704725527788037, 'precisions': [0.36728927954795165, 0.1649384838553699, 0.11560757467012643], 'brevity_penalty': 0.5595042066215838, 'length_ratio': 0.632629418773997, 'translation_length': 19113, 'reference_length': 30212}


In [None]:
# ROUGE scores for the same predictions/references
rouge = evaluate.load('rouge')

results = rouge.compute(
    references=references,
    predictions=predictions,
)

print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.31918552566819947, 'rouge2': 0.18470383881176877, 'rougeL': 0.2830156894111816, 'rougeLsum': 0.2825959388367477}


In [None]:
# nltk.word_tokenize needs the Punkt sentence models. Older NLTK releases load
# the 'punkt' resource, recent ones look for 'punkt_tab', so fetch both.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

In [None]:
# Calculate a sentence-level BLEU score for every generated output
from nltk.translate.bleu_score import sentence_bleu

for prompt, generated_sentences in predictions_dict.items():
    reference_entry = references_dict[prompt]
    # references_dict stores one reference string per prompt. Iterating over
    # that string directly would walk through its characters and score each
    # output against one-character "references", so wrap it in a list.
    if isinstance(reference_entry, str):
        reference_list = [reference_entry]
    else:
        reference_list = list(reference_entry)

    for reference_sentence in reference_list:
        # Tokenize the reference once; it is reused for every generated output
        reference_tokens = nltk.word_tokenize(reference_sentence.lower())
        for generated_sentence in generated_sentences:
            generated_tokens = nltk.word_tokenize(generated_sentence.lower())

            bleu_score = sentence_bleu([reference_tokens], generated_tokens)
            print(f'BLEU Score for "{prompt}": {bleu_score}')
