In [1]:
!pip install transformers[torch] -U
!pip install evaluate
!pip install rouge_score
!pip install sentencepiece

Collecting transformers[torch]
  Downloading transformers-4.36.0-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed accelerate-0.25.0 transformers-4.36.0
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading

In [2]:
from transformers import AutoTokenizer, OPTForQuestionAnswering, OPTForCausalLM, TFAutoModelForSeq2SeqLM
import torch
import pandas as pd
import tensorflow as tf
import numpy as np
import re
import sentencepiece as spm
import evaluate
import rouge_score
import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


## Load Data

In [5]:
df_model = pd.read_csv("/content/drive/My Drive/NLP_Final_Project/DATA/urban_dict_filtered_v2.csv")
#df_model = pd.read_csv("/content/drive/My Drive/DATA/urban_dict_filtered_v2.csv")
pd.set_option('display.max_columns', None)
df_model.head()

Unnamed: 0,word,definition,example,author,thumbs_up,thumbs_down,bracketed_dfn,bracketed_exmpl,vote_ratio,vote_diff,compare,compare2,question,compare3,masked_example
0,cartossin,The act of giving up on both physical and ment...,Bobby's been going through a lot lately. His g...,efnet-truth,512786,100442,"['giving up', 'on both', 'growth']","[""Bobby's"", 'Cartossin', 'see what happens']",0.836208,412344,Yes,Yes,What is the meaning of cartossin in the follow...,Yes,bobby's been going through a lot lately. his g...
1,feeding the fish,Smoking the dank marijuana,Who's feeding the fish tonight?,meistergoat,115220,3,"['Smoking', 'the dank', 'marijuana']",['tonight'],0.999974,115217,Yes,Yes,What is the meaning of feeding the fish in the...,Yes,who's <extra_id_0> tonight?
2,woody,n A wooden roller coaster,"I like steel coaster, but I prefer the classic...",kwood4800,92128,1137,"['wooden', 'roller coaster']","['steel', 'coaster', 'woodie']",0.987809,90991,Yes,Yes,What is the meaning of woody in the following ...,Yes,"i like steel coaster, but i prefer the classic..."
3,WMAF,White male Asian Female couple.,Look at that WMAF couple over-there.,Indian Bastard,155821,89921,"['White male', 'Asian Female', 'couple']","['Look at that', 'couple']",0.634084,65900,Yes,Yes,What is the meaning of WMAF in the following e...,Yes,look at that <extra_id_0> couple over-there.
4,Buzzfeed,"When Barack Obama used to smoked pot at : AM, ...",I remember when Buzzfeed was something I did b...,Polly Tick,53561,554,"['Barack Obama', '2:00 AM', '2013']","['I remember when', 'college', '2 AM']",0.989763,53007,Yes,Yes,What is the meaning of Buzzfeed in the followi...,Yes,i remember when <extra_id_0> was something i d...


In [6]:
def clean_text(text):
    text = str(text)
    # Remove special characters like "</s>"
    text = re.sub(r"</s>\d*,", "", text)

    # Remove numeric characters and parentheses
    text = re.sub(r"[0-9]+", "", text)  # Removes all numbers
    text = re.sub(r"[()]", "", text)  # Removes parentheses
    text = re.sub(r"\t", "", text)  # Removes tabs
    text = re.sub(r"[\r\n]", " ", text)  # Removes new lines
    text = re.sub(r"[*#_]", "", text)  # Removes some non-standard punctuation

    # Replace common typos or slangs
    corrections = {
        " teh ": " the ",
        " u " : " you ",
        " adn " : " and ",
        " tho " : " though ",
        " . " : " ",
        # Add more corrections here if needed
    }
    for wrong, right in corrections.items():
        text = text.replace(wrong, right)

    # Remove extra quotation marks and correct double spaces
    text = text.replace('""', '"').replace("  ", " ")

    # Trim leading and trailing whitespace
    text = text.strip()

    return text

## Clean and Format Text Inputs

In [7]:
# preprocessing data
prefix = 'Define this slang word or phrase: '
text_inputs = []

for line in df_model.to_dict('records'):
    word = clean_text(line['word'])
    definition = clean_text(line['definition'])
    example = clean_text(line['example'])
    combined = prefix + word + '. Example: ' + example + '. Definition: ' + definition
    test = prefix + word + '. Example: ' + example
    if len(combined) >= 5:
      text_inputs.append({'combined': combined, 'word': word, 'test': test, 'definition': definition, 'example': example})

In [8]:
df_OPT = pd.DataFrame(text_inputs)
df_OPT.dropna()
df_OPT.drop_duplicates(inplace=True)

In [9]:
# Create some splits
np.random.shuffle(text_inputs)
num_valid_samples = int(0.15 * len(text_inputs))
num_train_samples = len(text_inputs) - 2 * num_valid_samples
train_pairs = text_inputs[:num_train_samples]
valid_pairs = text_inputs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = text_inputs[num_train_samples + num_valid_samples :]

print(f"{len(text_inputs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

16811 total pairs
11769 training pairs
2521 validation pairs
2521 test pairs


In [10]:
df_model_test = pd.DataFrame(test_pairs)
df_model_test.head()

Unnamed: 0,combined,word,test,definition,example
0,Define this slang word or phrase: uwu. Example...,uwu,Define this slang word or phrase: uwu. Example...,A word which is used to describe a cuteness ov...,man: uwu russian government: `excecute plan on...
1,Define this slang word or phrase: nd. Example:...,nd,Define this slang word or phrase: nd. Example:...,Slang word for and. One of the many reasons I ...,nd he was like so cute
2,Define this slang word or phrase: gulag. Examp...,gulag,Define this slang word or phrase: gulag. Examp...,Where Men go to die,'boys im at the gulag'
3,Define this slang word or phrase: matthew espi...,matthew espinosa,Define this slang word or phrase: matthew espi...,cutest guy on earth,matthew espinosa is sexy
4,Define this slang word or phrase: echo chamber...,echo chamber,Define this slang word or phrase: echo chamber...,an insular communication space where everyone ...,The broadcast is just another echo chamber for...


# OPT

In [11]:
## CAUSAL LM
model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

## Generate Predictions and References

In [13]:
predictions = []

for line in df_model_test[:500].to_dict('records'):
      if len(predictions) % 100 == 0:
          #pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/OPT_base_predictions.csv')
          pd.DataFrame(predictions).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_base_predictions_500.csv')
          print("Saved to My Drive")
      word = line['word']
      example = line['test']
      test_inputs = tokenizer([example], return_tensors='pt')

      test_output_ids = model.generate(test_inputs['input_ids'],
                                          num_beams=3,
                                          no_repeat_ngram_size=3,
                                          min_length=30,
                                          max_length=256,
                                          temperature=0.97,
                                          output_scores = True,
                                          do_sample = True)

      predictions.extend([tokenizer.decode(out_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False) for out_ids in test_output_ids])
      print(f"Progress: {len(predictions)} out of {len(df_model_test[:500])}")

print(predictions)

Saved to My Drive
Progress: 1 out of 500
Progress: 2 out of 500
Progress: 3 out of 500
Progress: 4 out of 500
Progress: 5 out of 500
Progress: 6 out of 500
Progress: 7 out of 500
Progress: 8 out of 500
Progress: 9 out of 500
Progress: 10 out of 500
Progress: 11 out of 500
Progress: 12 out of 500
Progress: 13 out of 500
Progress: 14 out of 500
Progress: 15 out of 500
Progress: 16 out of 500
Progress: 17 out of 500
Progress: 18 out of 500
Progress: 19 out of 500
Progress: 20 out of 500
Progress: 21 out of 500
Progress: 22 out of 500
Progress: 23 out of 500
Progress: 24 out of 500
Progress: 25 out of 500
Progress: 26 out of 500
Progress: 27 out of 500
Progress: 28 out of 500
Progress: 29 out of 500
Progress: 30 out of 500
Progress: 31 out of 500
Progress: 32 out of 500
Progress: 33 out of 500
Progress: 34 out of 500
Progress: 35 out of 500
Progress: 36 out of 500
Progress: 37 out of 500
Progress: 38 out of 500
Progress: 39 out of 500
Progress: 40 out of 500
Progress: 41 out of 500
Progres

In [15]:
pd.DataFrame(predictions).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_base_predictions_500.csv')

In [16]:
references = []

for line in df_model_test[:500].to_dict('records'):
    references.append(line['combined'])
print(references)

['Define this slang word or phrase: uwu. Example: man: uwu russian government: `excecute plan on man`. Definition: A word which is used to describe a cuteness overload. It can also lead to people cringing at you.', 'Define this slang word or phrase: nd. Example: nd he was like so cute. Definition: Slang word for and. One of the many reasons I cannot successfully communicate with other teenagers through the internet, since they do not write in clear English', "Define this slang word or phrase: gulag. Example: 'boys im at the gulag'. Definition: Where Men go to die", 'Define this slang word or phrase: matthew espinosa. Example: matthew espinosa is sexy. Definition: cutest guy on earth', 'Define this slang word or phrase: echo chamber. Example: The broadcast is just another echo chamber for self serving interests.. Definition: an insular communication space where everyone agrees with the information and no outside input is allowed', 'Define this slang word or phrase: Bombaclot. Example: A

In [17]:
pd.DataFrame(references).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_base_references_500.csv')

## Evaluate

In [18]:
bleu = evaluate.load('bleu')

results = bleu.compute(predictions=predictions, references=references,
          max_order = 3)

print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.3313168175602875, 'precisions': [0.3856098604144796, 0.3134398680749146, 0.30090408025694915], 'brevity_penalty': 1.0, 'length_ratio': 1.5630374669543285, 'translation_length': 51438, 'reference_length': 32909}


In [19]:
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions, references=references)

print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.45846602236237977, 'rouge2': 0.38854370036751606, 'rougeL': 0.4401940584704671, 'rougeLsum': 0.4308770289765046}


# FLAN-T5

In [None]:
text_inputs = []
for line in df_model.to_dict('records'):
    word = clean_text(line['word'])
    definition = clean_text(line['definition'])
    example = clean_text(line['example'])
    text_inputs.append({'orig': f"What is the meaning of {word} in the following example sentence?: {example}", 'target': f"The definition of {word} is {definition}"})

print(text_inputs[:5])

[{'orig': "What is the meaning of cartossin in the following example sentence?: Bobby's been going through a lot lately. His girlfriend dumped him and he just lost his job, he's decided to Cartossin the rest of the year and see what happens in the new year.", 'target': 'The definition of cartossin is The act of giving up on both physical and mental growth.'}, {'orig': "What is the meaning of feeding the fish in the following example sentence?: Who's feeding the fish tonight?", 'target': 'The definition of feeding the fish is Smoking the dank marijuana'}, {'orig': 'What is the meaning of woody in the following example sentence?: I like steel coaster, but I prefer the classic woody. see also woodie', 'target': 'The definition of woody is n A wooden roller coaster'}, {'orig': 'What is the meaning of WMAF in the following example sentence?: Look at that WMAF couple over-there.', 'target': 'The definition of WMAF is White male Asian Female couple.'}, {'orig': 'What is the meaning of Buzzfee

In [None]:
np.random.shuffle(text_inputs)
num_valid_samples = int(0.15 * len(text_inputs))
num_train_samples = len(text_inputs) - 2 * num_valid_samples
train_pairs = text_inputs[:num_train_samples]
valid_pairs = text_inputs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = text_inputs[num_train_samples + num_valid_samples :]

print(f"{len(text_inputs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

16811 total pairs
11769 training pairs
2521 validation pairs
2521 test pairs


In [None]:
df_model_test = pd.DataFrame(test_pairs).applymap(clean_text)
df_model_test.head()

Unnamed: 0,orig,target
0,What is the meaning of jelly in the following ...,"The definition of jelly is Being jealous, or h..."
1,What is the meaning of franklin in the followi...,The definition of franklin is one that is extr...
2,What is the meaning of stingy in the following...,The definition of stingy is reluctant to part ...
3,What is the meaning of belle in the following ...,The definition of belle is Means beautiful in ...
4,What is the meaning of White Boy in the follow...,The definition of White Boy is Derogotory term...


In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

## Generate Predictions and References

In [None]:
predictions = []
predictions_dict = {}


for example in df_model_test['orig'][:500]:
      if len(predictions) % 100 == 0:
          pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/FLAN_T5_base_predictions.csv')
          print("Saved to My Drive")
      test_inputs = tokenizer([example], return_tensors='tf')
      test_output_ids = model.generate(test_inputs['input_ids'],
                                          num_beams=3,
                                          no_repeat_ngram_size=3,
                                          min_length=30,
                                          max_length=128,
                                          temperature=0.97,
                                          output_scores = True)

      predictions_dict[example] = [tokenizer.decode(out_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False) for out_ids in test_output_ids]
      predictions.extend([tokenizer.decode(out_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False) for out_ids in test_output_ids])
      print(f"Progress: {len(predictions)} out of {len(df_model_test['orig'][:500])}")

print(predictions_dict)

In [None]:
pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/FLAN_T5_base_predictions.csv')

In [None]:
references = []
references_dict = {}

for line in df_model_test[:500].to_dict('records'):
    example = line['orig']
    definition = line['target']

    references_dict[example] = definition
    references.append(definition)

print(references)



In [None]:
pd.DataFrame(references).to_csv('/content/drive/My Drive/DATA/FLAN_T5_base_references.csv')

## Evaluate

In [None]:
bleu = evaluate.load('bleu')

results = bleu.compute(predictions=predictions, references=references,
          max_order = 3)
print(results)

{'bleu': 0.03270879940892802, 'precisions': [0.22896837580595641, 0.041267560664112385, 0.01122381110741603], 'brevity_penalty': 0.6910185720962962, 'length_ratio': 0.7301462758504735, 'translation_length': 13028, 'reference_length': 17843}


In [None]:
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions, references=references)

print(results)

{'rouge1': 0.2038708536554356, 'rouge2': 0.05125917487891374, 'rougeL': 0.15536513079566308, 'rougeLsum': 0.15541589768960296}
