In [None]:
# Install BLEURT from source: upgrade pip, clone the repository, install the
# package from inside the checkout, then return to the starting directory.
!pip install --upgrade pip
!git clone https://github.com/google-research/bleurt.git
%cd bleurt
!pip install .
%cd ..

In [None]:
# Install the remaining third-party packages imported below: transformers and
# sentencepiece for the T5 model/tokenizer, evaluate and rouge_score for the
# BLEU/ROUGE metrics.
!pip install transformers
!pip install sentencepiece
!pip install evaluate
!pip install rouge_score

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from ev

In [None]:
import os
import re
import numpy as np
import pandas as pd

from bleurt import score

import evaluate
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from transformers import T5Tokenizer, TFT5ForConditionalGeneration, TFAutoModelForSeq2SeqLM, AutoTokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Authenticate and mount Google Drive at /content/drive. Every dataset,
# prediction and checkpoint path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
## USED THE ORIGINAL FILTERED DATASET
# Load the filtered Urban Dictionary CSV from Drive into `df_model`.

df_model = pd.read_csv("/content/drive/My Drive/DATA/urban_dict_filtered.csv")
# Alternate Drive layout (kept for reference):
# df_model = pd.read_csv("/content/drive/My Drive/NLP_Final_Project/DATA/urban_dict_filtered.csv")
# Lift pandas' column display limit so wide frames are shown in full.
pd.set_option('display.max_columns', None)
df_model.head()

Unnamed: 0.1,Unnamed: 0,word,definition,example,author,vote_ratio,vote_diff,masked_example
0,0,Tilt,Usually a term in video games (though it can r...,Phage: Dude I lost a lot of Ranked games in Le...,Lance Ted Mosby Hardwood,0.909168,964,phage: dude i lost a lot of ranked games in le...
1,1,aishiteru,"A Japanese term, meaning ""I'm in love with you""","Aishiteru, Ami.",Ami,0.827114,1385,"<extra_id_0>, ami."
2,2,Felch,(1) verb. The act of sucking or licking ejacu...,I couldn't belive it. After Nigel packed my a...,Pymp,0.811438,2897,i couldn't belive it. after nigel packed my a...
3,3,funemployment,a happy time in one's life when one is not emp...,people wonder how I pay my bills when I'm on f...,JBMason,0.852373,802,people wonder how i pay my bills when i'm on <...
4,4,remove kebab,Memetic euphemism for ethnic cleansing directe...,"1. ""REMOVE KEBAB remove kebab you are worst tu...",cwm,0.823952,1082,"1. ""<extra_id_0> <extra_id_0> you are worst tu..."


In [None]:
def clean_text(text):
    """Normalize a raw dataset field into a single clean line of text.

    The value is coerced with ``str`` and then, in order: ``</s>`` markers
    followed by optional digits and a comma are removed; digits, parentheses,
    tabs and the characters ``*``, ``#`` and ``_`` are removed; carriage
    returns and newlines become spaces; a few common typos/slang spellings are
    expanded; doubled double-quotes are halved; every run of spaces is
    collapsed to a single space; surrounding whitespace is trimmed.

    Args:
        text: The value to clean. Anything accepted by ``str`` (non-string
            values are converted to their string form first).

    Returns:
        The cleaned string.
    """
    text = str(text)
    # Remove end-of-sequence markers of the form "</s>", optional digits, then ","
    text = re.sub(r"</s>\d*,", "", text)

    # Remove numeric characters and parentheses
    text = re.sub(r"[0-9]+", "", text)  # Removes all numbers
    text = re.sub(r"[()]", "", text)  # Removes parentheses
    text = re.sub(r"\t", "", text)  # Removes tabs
    text = re.sub(r"[\r\n]", " ", text)  # Replaces line breaks with spaces
    text = re.sub(r"[*#_]", "", text)  # Removes some non-standard punctuation

    # Replace common typos or slangs. Keys are space-delimited so only whole
    # words in the middle of the text are rewritten.
    corrections = {
        " teh ": " the ",
        " u " : " you ",
        " adn " : " and ",
        " tho " : " though ",
        " . " : " ",
        # Add more corrections here if needed
    }
    for wrong, right in corrections.items():
        text = text.replace(wrong, right)

    # Remove extra quotation marks
    text = text.replace('""', '"')

    # Collapse every run of spaces to one. A single str.replace("  ", " ")
    # pass only halves each run, so three or more spaces (common after the
    # removals above) would still leave a double space behind.
    text = re.sub(r" {2,}", " ", text)

    # Trim leading and trailing whitespace
    text = text.strip()

    return text

In [None]:
def _build_prompt_pair(record):
    """Turn one dataset row (a dict) into a cleaned prompt/target dict."""
    term = clean_text(record['word'])
    meaning = clean_text(record['definition'])
    usage = clean_text(record['example'])
    return {
        'orig': f"What is the meaning of {term} in the following example sentence?: {usage}",
        'target': f"The definition of {term} is {meaning}",
    }


# One {'orig': prompt, 'target': answer} record per row of df_model.
text_inputs = [_build_prompt_pair(record) for record in df_model.to_dict('records')]

print(text_inputs[:5])

[{'orig': "What is the meaning of Tilt in the following example sentence?: Phage: Dude I lost a lot of Ranked games in League of Legends and it's all my stupid noob teammates' faults. Sheen: Dude you're on tilt. Take a break from League for a bit man.", 'target': "The definition of Tilt is Usually a term in video games though it can really be used in any activity or hobby, tilt is an emotional state when doing the exact same thing activity over and over produces negative results. It's an emotional breakdown and fustration of your hard work not resulting in the success that you crave so desperately. When you or someone is in a tilt state of mind, the best thing to do is take a break from that activity and try not to think about it as much."}, {'orig': 'What is the meaning of aishiteru in the following example sentence?: Aishiteru, Ami.', 'target': 'The definition of aishiteru is A Japanese term, meaning "I\'m in love with you"'}, {'orig': "What is the meaning of Felch in the following e

In [None]:
# Let's create some splits (70% train / 15% validation / 15% test).
#
# Shuffle a *copy* with a fixed seed so that split membership is identical on
# every run and re-running this cell is idempotent. With an unseeded in-place
# shuffle, a checkpoint trained in one session could be evaluated in another
# session on a "test" set that contains its own training examples.
SPLIT_SEED = 42
shuffled_pairs = list(text_inputs)
np.random.default_rng(SPLIT_SEED).shuffle(shuffled_pairs)

num_valid_samples = int(0.15 * len(shuffled_pairs))
# Validation and test are the same size; training gets everything else.
num_train_samples = len(shuffled_pairs) - 2 * num_valid_samples
train_pairs = shuffled_pairs[:num_train_samples]
valid_pairs = shuffled_pairs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = shuffled_pairs[num_train_samples + num_valid_samples :]

print(f"{len(text_inputs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

19316 total pairs
13522 training pairs
2897 validation pairs
2897 test pairs


In [None]:
# Write each split to its own CSV on Drive so that later cells can read back
# only the rows they need instead of keeping every pair in memory.
train_file = '/content/drive/My Drive/DATA/train_pairs.csv'
valid_file = '/content/drive/My Drive/DATA/valid_pairs.csv'
test_file = '/content/drive/My Drive/DATA/test_pairs.csv'
# Alternate Drive layout (kept for reference):
# train_file = '/content/drive/My Drive/NLP_Final_Project/DATA/train_pairs.csv'
# valid_file = '/content/drive/My Drive/NLP_Final_Project/DATA/valid_pairs.csv'
# test_file = '/content/drive/My Drive/NLP_Final_Project/DATA/test_pairs.csv'

for split_pairs, split_path in (
    (train_pairs, train_file),
    (valid_pairs, valid_file),
    (test_pairs, test_file),
):
    pd.DataFrame(split_pairs).to_csv(split_path)

In [None]:
def preprocess_data(text_pairs, tokenizer, model, max_length=128):
    """Tokenize (source, target) text pairs into model inputs and labels.

    Args:
        text_pairs: Sequence of two-item (source text, target text) pairs.
        tokenizer: Object exposing ``batch_encode_plus``; both sides are padded
            and truncated to ``max_length``.
        model: Object exposing ``_shift_right``, used to derive the decoder
            inputs from the label ids.
        max_length: Token length every sequence is padded/truncated to.

    Returns:
        A tuple ``([input_ids, attention_mask, decoder_input_ids], label_ids)``
        where the first two arrays are int32 and ``decoder_input_ids`` is
        ``model._shift_right(label_ids)``.
    """
    # Options shared by the source-side and target-side tokenizer calls.
    shared_kwargs = dict(
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )

    # Encoder side: token ids plus the mask marking real (non-pad) tokens.
    sources = [source for source, _ in text_pairs]
    source_enc = tokenizer.batch_encode_plus(
        sources, return_attention_mask=True, **shared_kwargs
    )
    encoder_ids = np.array(source_enc["input_ids"], dtype="int32")
    encoder_mask = np.array(source_enc["attention_mask"], dtype="int32")

    # Decoder side: the labels are the target ids; the decoder inputs are the
    # same ids shifted by the model's own helper.
    targets = [target for _, target in text_pairs]
    target_enc = tokenizer.batch_encode_plus(targets, **shared_kwargs)
    label_ids = np.array(target_enc['input_ids'])
    shifted_ids = model._shift_right(label_ids)

    return [encoder_ids, encoder_mask, shifted_ids], label_ids

In [None]:
class TranslationDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that builds tokenized batches from a CSV of text pairs.

    Each batch is produced by re-reading ``data_filename`` while skipping every
    data row that does not belong to the batch, so only the batch's rows are
    loaded into the DataFrame. The ``orig``/``target`` columns of those rows
    are then tokenized with ``preprocess_data``.

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_data``.
        model: Model forwarded to ``preprocess_data``.
        n_examples: Number of data rows in the CSV (header row excluded).
        data_filename: Path of a CSV file with ``orig`` and ``target`` columns.
        max_length: Token length forwarded to ``preprocess_data``.
        batch_size: Number of rows per batch.
        shuffle: If true, the row order is permuted at construction time and
            again at the end of every epoch.
    """

    def __init__(self,
                 tokenizer,
                 model,
                 n_examples,
                 data_filename,
                 max_length=128,
                 batch_size=16,
                 shuffle=True):

        self.tokenizer = tokenizer
        self.model = model
        self.n_examples = n_examples
        self.data_filename = data_filename
        self.max_length = max_length
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Data rows are numbered 1..n_examples; file row 0 is the CSV header
        # and must never be skipped. The order is stored as a plain list so
        # that `+` in __getitem__ concatenates even when shuffle=False (an
        # ndarray would add element-wise instead).
        self.row_order = list(range(1, self.n_examples + 1))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches; a trailing partial batch is dropped."""
        return self.n_examples // self.batch_size

    def __getitem__(self, idx):
        """Read, tokenize and return batch number ``idx``.

        Returns:
            The ``(inputs, labels)`` tuple produced by ``preprocess_data``.
        """
        batch_start = idx * self.batch_size
        batch_end = (idx + 1) * self.batch_size

        # Indices to skip are the ones in the (possibly shuffled) row order
        # before and after the chunk used for this batch. list() guards against
        # row_order having been replaced by an array.
        order = list(self.row_order)
        batch_idx_skip = order[:batch_start] + order[batch_end:]
        df = pd.read_csv(self.data_filename, skiprows=batch_idx_skip)

        text_pairs = df[['orig', 'target']].values.astype(str).tolist()

        batch_data = preprocess_data(
            text_pairs,
            self.tokenizer,
            self.model,
            self.max_length
        )

        return batch_data

    def __call__(self):
        """Yield every batch of one epoch, then trigger the end-of-epoch reshuffle."""
        n_batches = len(self)
        for i in range(n_batches):
            yield self[i]

            if i == n_batches - 1:
                self.on_epoch_end()

    def on_epoch_end(self):
        """Re-permute the row order when shuffling is enabled."""
        if self.shuffle:
            self.row_order = list(np.random.permutation(self.row_order))

In [None]:
# Load the pretrained tokenizer and the TensorFlow seq2seq model for the
# checkpoint named by `model_name`. `model_name` is also reused below as the
# prefix of the checkpoint file names.

model_name = 't5-base'
t5_tokenizer = AutoTokenizer.from_pretrained(model_name)
t5_model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [None]:
# Training configuration shared by the cells below.
max_length = 128 #can tune  (token length used for padding/truncation)
batch_size = 16 #can tune
# NOTE(review): this value is not passed to the model-building call below;
# confirm the optimizer actually uses the intended learning rate.
learning_rate = 1e-4
epochs = 3

In [None]:
# Create the data generators for train and validation data, tensorflow version.
# Both generators share the tokenizer, model and batching settings and differ
# only in the CSV they read and the number of rows it holds.
generator_settings = dict(
    tokenizer=t5_tokenizer,
    model=t5_model,
    max_length=max_length,
    batch_size=batch_size,
)

train_data_generator = TranslationDataGenerator(
    n_examples=len(train_pairs),
    data_filename=train_file,
    **generator_settings
)

valid_data_generator = TranslationDataGenerator(
    n_examples=len(valid_pairs),
    data_filename=valid_file,
    **generator_settings
)

In [None]:
def build_t5_training_wrapper_model(t5_model, max_length, learning_rate=1e-4):
    """Wrap a TF T5 model in a compiled Keras functional model usable with ``fit``.

    The wrapper takes three integer inputs of length ``max_length`` (encoder
    token ids, encoder attention mask, decoder input ids) and outputs the
    first element of the T5 model's output (the logits).

    Args:
        t5_model: The TensorFlow T5 model to wrap; it is called inside the
            wrapper, so training the wrapper updates this model's weights.
        max_length: Fixed sequence length of all three inputs.
        learning_rate: Adam learning rate. Defaults to 1e-4, the value set in
            the notebook's configuration cell; previously the optimizer was
            built with no arguments, so that configured value was silently
            ignored in favour of Adam's library default.

    Returns:
        The compiled ``tf.keras`` model.
    """
    # shape must be a tuple; `(max_length)` without the trailing comma is just an int.
    input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')
    # The layer name 'labels' is kept as-is; this input carries the decoder input ids.
    decoder_input_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='labels')

    t5_logits = t5_model(input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids)[0]

    model = tf.keras.models.Model(inputs=[input_ids, attention_mask, decoder_input_ids],
                                  outputs=[t5_logits])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    return model

In [None]:
# Build the compiled Keras wrapper around the pretrained T5 model; training and
# weight loading below go through this wrapper.
model_wrapper = build_t5_training_wrapper_model(t5_model, max_length)

In [None]:
# Checkpoint callback: saves only the model weights, one file per epoch, to
# the Drive folder below. Point `checkpoint_dir` at a folder in your own Drive.
# The file name embeds the epoch number and the validation accuracy.

checkpoint_dir = '/content/drive/My Drive/model_checkpoints/'
# checkpoint_dir = '/content/drive/My Drive/NLP_Final_Project/model_checkpoints/'
checkpoint_filepath = ''.join([
    checkpoint_dir,
    model_name,
    '_slang_weights.{epoch:02d}-{val_accuracy:.2f}.hdf5',
])
checkpoint_options = {
    'filepath': checkpoint_filepath,
    'save_weights_only': True,
}
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Now call .fit on the model_wrapper, passing in the data generators and the
# model checkpoint callback

# history = model_wrapper.fit(train_data_generator,
#                   validation_data=valid_data_generator,
#                   epochs=epochs,
#                   callbacks=[model_checkpoint_callback])

In [None]:
# Restore fine-tuned weights from a checkpoint written by an earlier training
# run instead of retraining. The result of load_weights() is not a training
# History, so it is deliberately not bound to `history`; doing so would also
# overwrite the real History if fit() had been run in this session.
checkpoint_weights_path = checkpoint_dir + "t5-base_slang_weights.01-0.53.hdf5"
model_wrapper.load_weights(checkpoint_weights_path, skip_mismatch=False)

In [None]:
# Plot per-epoch training and validation loss from the Keras History object
# returned by fit(). When the weights were loaded from a checkpoint instead of
# being trained in this session there is no History to plot, so the plot is
# skipped instead of raising.
try:
    hist = history.history
except (NameError, AttributeError):
    hist = None

if not hist:
    print("No training history available (model was not trained in this session); skipping loss plot.")
else:
    x_arr = np.arange(len(hist['loss'])) + 1

    fig = plt.figure(figsize=(24, 8))
    ax = fig.add_subplot(1, 2, 1)
    ax.plot(x_arr, hist['loss'], '-o', label='Train loss')
    ax.plot(x_arr, hist['val_loss'], '--<', label='Validation loss')

    ax.legend(fontsize=15)
    ax.set_xlabel('Epoch', size=15)
    ax.set_ylabel('Loss', size=15)

In [None]:
# Build the evaluation frame (columns 'orig' and 'target') from the held-out
# test pairs, applying clean_text to every cell.
# NOTE(review): the pairs were already cleaned when text_inputs was built, so
# this second pass looks redundant — confirm before removing.
df_model_test = pd.DataFrame(test_pairs).applymap(clean_text)
df_model_test.head()

In [None]:
# Generate a definition for each of the first NUM_EVAL_EXAMPLES test prompts.
# The count must match the number of references collected below (500), since
# the metrics compare predictions and references position by position.
NUM_EVAL_EXAMPLES = 500
PREDICTION_SAVE_EVERY = 100  # write a partial CSV to Drive every N predictions
partial_predictions_file = f'/content/drive/My Drive/DATA/T5_base_predictions_{max_length}.csv'

predictions = []
predictions_dict = {}

eval_prompts = df_model_test['orig'][:NUM_EVAL_EXAMPLES]
for example in eval_prompts:
    test_inputs = t5_tokenizer([example], return_tensors='tf')
    # NOTE(review): generation runs beam search without sampling, so
    # `temperature` (and `output_scores`, whose scores are not read here)
    # presumably have no effect on the result — confirm against the
    # transformers generate() documentation before relying on them.
    test_output_ids = t5_model.generate(test_inputs['input_ids'],
                                        num_beams=3,
                                        no_repeat_ngram_size=3,
                                        min_length=30,
                                        max_length=128,
                                        temperature=0.97,
                                        output_scores = True)

    # Decode once and reuse the result for both containers.
    decoded = [t5_tokenizer.decode(out_ids, skip_special_tokens=True,
                                   clean_up_tokenization_spaces=False)
               for out_ids in test_output_ids]
    predictions_dict[example] = decoded
    predictions.extend(decoded)
    print(f"Progress: {len(predictions)} out of {len(eval_prompts)}")

    # Periodic checkpoint of the predictions so far; done after appending so an
    # empty file is never written before the first prediction exists.
    if len(predictions) % PREDICTION_SAVE_EVERY == 0:
        pd.DataFrame(predictions).to_csv(partial_predictions_file)
        print("Saved to My Drive")

print(predictions_dict)

In [None]:
# Save the generated predictions as a one-column CSV; the column is named "0"
# in the file, which is the name get_bleurt_scores reads back.
# pd.DataFrame(predictions_dict).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/T5_baseline_predictions.csv')
pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/T5_baseline_predictions.csv')

In [None]:
# Collect the reference definition for each of the first 500 test examples, in
# the same order the prompts are read from df_model_test['orig'].
references = []
references_dict = {}

for line in df_model_test[:500].to_dict('records'):
    prompt = line['orig']
    definition = line['target']
    # Key by the prompt itself. The literal key 'word' used previously made
    # every iteration overwrite the same single entry.
    references_dict[prompt] = definition
    references.append(definition)
print(references)



In [None]:
# Save the reference definitions as a one-column CSV; the column is named "0"
# in the file, which is the name get_bleurt_scores reads back.
# pd.DataFrame(predictions_dict).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/T5_baseline_references.csv')
pd.DataFrame(references).to_csv('/content/drive/My Drive/DATA/T5_baseline_references.csv')

In [None]:
# BLEU of the generated definitions against the references, limited to
# n-grams of order 1-3 via max_order. `predictions` and `references` must be
# the same length and in the same order.
bleu = evaluate.load('bleu')

results = bleu.compute(predictions=predictions, references=references,
          max_order = 3)
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.01536045570932169, 'precisions': [0.32557158525267865, 0.07619131507090637, 0.029024232930986823], 'brevity_penalty': 0.17138252177454802, 'length_ratio': 0.3618131868131868, 'translation_length': 11853, 'reference_length': 32760}


In [None]:
# ROUGE of the generated definitions against the references.
# Note: this rebinds `results`, replacing the BLEU result from the cell above.
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions, references=references)

print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.21923789924524284, 'rouge2': 0.06207326116492841, 'rougeL': 0.16979007694658543, 'rougeLsum': 0.1694196253281292}


In [None]:
# Download and unpack the BLEURT-20 checkpoint into the working directory.
# -nc skips the download when the archive is already present, and -n makes
# unzip keep existing files instead of prompting, so the cell can be re-run.
# (The stray trailing "." was dropped: wget treated it as a second URL.)
!wget -nc https://storage.googleapis.com/bleurt-oss-21/BLEURT-20.zip
!unzip -n BLEURT-20.zip

In [None]:
# Create the BLEURT scorer from the 'BLEURT-20' checkpoint path (relative to
# the working directory). `scorer` is read as a global by get_bleurt_scores.
bleurt_checkpoint= 'BLEURT-20'
scorer = score.BleurtScorer(bleurt_checkpoint)

In [None]:
def get_bleurt_scores(references_fp, predictions_fp):
    """Score saved predictions against saved references with the global BLEURT scorer.

    Args:
        references_fp: Path of a CSV whose column ``'0'`` holds the reference texts.
        predictions_fp: Path of a CSV whose column ``'0'`` holds the candidate texts.

    Returns:
        Whatever ``scorer.score`` returns for the two lists; it is also printed.
    """
    reference_texts = list(pd.read_csv(references_fp)['0'])
    candidate_texts = list(pd.read_csv(predictions_fp)['0'])
    bleurt_scores = scorer.score(references=reference_texts, candidates=candidate_texts)
    print(bleurt_scores)
    return bleurt_scores

In [None]:
# Score the prediction/reference files written by the save cells above. The
# paths are spelled out in full, matching those cells, because no `drive_fp`
# prefix is defined in this notebook.
references_fp = '/content/drive/My Drive/DATA/T5_baseline_references.csv'
predictions_fp = '/content/drive/My Drive/DATA/T5_baseline_predictions.csv'
scores = get_bleurt_scores(references_fp, predictions_fp)

In [None]:
# Average BLEURT score over all scored prediction/reference pairs.
print(np.mean(scores))