In [2]:
!pip install sentencepiece
!pip install transformers[torch]
!pip install accelerate -U
!pip install datasets
!pip install evaluate
!pip install rouge_score



In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds

import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer

import os
import nltk

import matplotlib.pyplot as plt

import re
import textwrap

from transformers import AutoTokenizer, GPT2Tokenizer, TFOPTForCausalLM
import sentencepiece as spm

import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


## Load Data

In [4]:
#df_model = pd.read_csv("/content/drive/My Drive/DATA/urban_dict_filtered_v2.csv")
df_model = pd.read_csv("/content/drive/My Drive/NLP_Final_Project/DATA/urban_dict_filtered_v2.csv")
df_model.head()
pd.set_option('display.max_columns', None)

OPT

In [5]:
def clean_text(text):
    text = str(text)
    # Remove special characters like "</s>"
    text = re.sub(r"</s>\d*,", "", text)

    # Remove numeric characters and parentheses
    text = re.sub(r"[0-9]+", "", text)  # Removes all numbers
    text = re.sub(r"[()]", "", text)  # Removes parentheses
    text = re.sub(r"\t", "", text)  # Removes tabs
    text = re.sub(r"[\r\n]", " ", text)  # Removes new lines
    text = re.sub(r"[*#_]", "", text)  # Removes some non-standard punctuation

    # Replace common typos
    corrections = {
        " teh ": " the ",
        " u " : " you ",
        " adn " : " and ",
        " tho " : " though ",
        " . " : " ",
        # Add more corrections here if needed
    }
    for wrong, right in corrections.items():
        text = text.replace(wrong, right)

    # Remove extra quotation marks and correct double spaces
    text = text.replace('""', '"').replace("  ", " ")

    # Trim leading and trailing whitespace
    text = text.strip()

    return text

In [6]:
# preprocessing data
prefix = 'Define this slang word or phrase: '

text_inputs = []
for line in df_model.to_dict('records'):
    word = clean_text(line['word'])
    definition = clean_text(line['definition'])
    example = clean_text(line['example'])
    combined = prefix + word + '. Example: ' + example + '. Definition: ' + definition
    test = prefix + word + '. Example: ' + example
    if len(combined) >= 5:
      text_inputs.append({'text': combined, 'test': test, 'word': word, 'definition': definition, 'example': example})

In [7]:
# Load data into a DataFrame
df = pd.DataFrame(text_inputs)
df['combined'] = df['text']
df['combined'] = df['combined'].astype(str)
df.head()

Unnamed: 0,text,test,word,definition,example,combined
0,Define this slang word or phrase: cartossin. E...,Define this slang word or phrase: cartossin. E...,cartossin,The act of giving up on both physical and ment...,Bobby's been going through a lot lately. His g...,Define this slang word or phrase: cartossin. E...
1,Define this slang word or phrase: feeding the ...,Define this slang word or phrase: feeding the ...,feeding the fish,Smoking the dank marijuana,Who's feeding the fish tonight?,Define this slang word or phrase: feeding the ...
2,Define this slang word or phrase: woody. Examp...,Define this slang word or phrase: woody. Examp...,woody,n A wooden roller coaster,"I like steel coaster, but I prefer the classic...",Define this slang word or phrase: woody. Examp...
3,Define this slang word or phrase: WMAF. Exampl...,Define this slang word or phrase: WMAF. Exampl...,WMAF,White male Asian Female couple.,Look at that WMAF couple over-there.,Define this slang word or phrase: WMAF. Exampl...
4,Define this slang word or phrase: Buzzfeed. Ex...,Define this slang word or phrase: Buzzfeed. Ex...,Buzzfeed,"When Barack Obama used to smoked pot at : AM, ...",I remember when Buzzfeed was something I did b...,Define this slang word or phrase: Buzzfeed. Ex...


In [8]:
df.dropna()
df.drop_duplicates(inplace=True)
df.shape

(16811, 6)

In [9]:
# Create some splits
np.random.shuffle(text_inputs)
num_valid_samples = int(0.15 * len(text_inputs))
num_train_samples = len(text_inputs) - 2 * num_valid_samples
train_pairs = text_inputs[:num_train_samples]
valid_pairs = text_inputs[num_train_samples : num_train_samples + num_valid_samples]
test_pairs = text_inputs[num_train_samples + num_valid_samples :]

print(f"{len(text_inputs)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(valid_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

16811 total pairs
11769 training pairs
2521 validation pairs
2521 test pairs


In [10]:
# Save splits to separate csv files, to load only part at a time later
# train_file = '/content/drive/My Drive/DATA/train_pairs.txt'
# valid_file = '/content/drive/My Drive/DATA/valid_pairs.txt'
# test_file = '/content/drive/My Drive/DATA/test_pairs.txt'
train_file = '/content/drive/My Drive/NLP_Final_Project/DATA/opt_fine_tune_train_pairs.csv'
valid_file = '/content/drive/My Drive/NLP_Final_Project/DATA/opt_fine_tune_valid_pairs.csv'
test_file = '/content/drive/My Drive/NLP_Final_Project/DATA/opt_fine_tune_test_pairs.csv'


pd.DataFrame(train_pairs).to_csv(train_file)
pd.DataFrame(valid_pairs).to_csv(valid_file)
pd.DataFrame(test_pairs).to_csv(test_file)

In [11]:
print(train_pairs[:10])

[{'text': 'Define this slang word or phrase: poophole loophole. Example: Guy : "I\'m bummed because this hot chick I dated is saving herself for marriage." Guy : "Just explain the poophole loophole to her and you\'ll score for sure." Chick: "Thank goodness for the poophole loophole. Now I can honestly tell God and my parents tha. Definition: The logic that allows girls who are saving their virginity for marriage to instead have premarital anal intercourse. Typically the boyfriends of such girls will convince them that, if they take it in the ass and don\'t pop the cherry, they\'re still pure an', 'test': 'Define this slang word or phrase: poophole loophole. Example: Guy : "I\'m bummed because this hot chick I dated is saving herself for marriage." Guy : "Just explain the poophole loophole to her and you\'ll score for sure." Chick: "Thank goodness for the poophole loophole. Now I can honestly tell God and my parents tha', 'word': 'poophole loophole', 'definition': "The logic that allows

## Load Train Test Data

In [None]:
from datasets import load_dataset
#datasets = load_dataset("text", data_files={"train": '/content/drive/My Drive/NLP_Final_Project/DATA/opt_fine_tune_train_pairs.csv', "validation": '/content/drive/My Drive/NLP_Final_Project/DATA/opt_fine_tune_valid_pairs.csv'})
datasets = load_dataset("text", data_files={"train": '/content/drive/My Drive/DATA/train_pairs.txt', "validation": '/content/drive/My Drive/DATA/valid_pairs.txt'})

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
#picking examples to visualize text
import pandas as pd
import random
from IPython.display import display, HTML
from datasets import ClassLabel

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])

    # Set display options to show full content
    with pd.option_context('display.max_colwidth', None):
        display(HTML(df.to_html()))


In [None]:
show_random_elements(datasets["train"])

Unnamed: 0,text
0,"5396,""Define this slang word or phrase: donald duck. Example: Hey, it's a nice day today. I think I'll donald duck it.. Definition: v. to wear a shirt with no pants."",""Define this slang word or phrase: donald duck. Example: Hey, it's a nice day today. I think I'll donald duck it."",donald duck,v. to wear a shirt with no pants.,""Hey, it's a nice day today. I think I'll donald duck it."""
1,"840,""Define this slang word or phrase: Super Saiyan. Example: When Goku turned super saiyan,he tore Frieza a new asshole!. Definition: A saiyan who has gone through a transformation which multiplies his current power and turns his hair blonde and his eyes green/light blue"",""Define this slang word or phrase: Super Saiyan. Example: When Goku turned super saiyan,he tore Frieza a new asshole!"",Super Saiyan,A saiyan who has gone through a transformation which multiplies his current power and turns his hair blonde and his eyes green/light blue,""When Goku turned super saiyan,he tore Frieza a new asshole!"""
2,"8702,""Define this slang word or phrase: Next level bullshit. Example: """"Dude the shit that's coming out of your mouth isn't just bullshit. Its some next level bullshit. Like there is no point of going any further with this explanation dude. YOU'RE DONE!!!!!!!!!!!. Definition: When the bullshit that comes out of a persons mouth reaches a point of absolute nonsense."",""Define this slang word or phrase: Next level bullshit. Example: """"Dude the shit that's coming out of your mouth isn't just bullshit. Its some next level bullshit. Like there is no point of going any further with this explanation dude. YOU'RE DONE!!!!!!!!!!!"",Next level bullshit,When the bullshit that comes out of a persons mouth reaches a point of absolute nonsense.,""""""Dude the shit that's coming out of your mouth isn't just bullshit. Its some next level bullshit. Like there is no point of going any further with this explanation dude. YOU'RE DONE!!!!!!!!!!!"""
3,"951,""Define this slang word or phrase: hair of the dog. Example: Steve: Man, I'm really paying for all those keg stands I did last night. Tony: Yeah, I'm hurting too. Hair of the dog? Steve: Why not, man. We got a few beers left in the fridge.. Definition: An alcoholic beverage consumed as a hangover rememdy. The phrase comes from the expression """"hair of the dog that bit you"""", meaning that the best cure for what ails you is to have some more of it. In ancient times it was literally used to say that if a dog"",""Define this slang word or phrase: hair of the dog. Example: Steve: Man, I'm really paying for all those keg stands I did last night. Tony: Yeah, I'm hurting too. Hair of the dog? Steve: Why not, man. We got a few beers left in the fridge."",hair of the dog,""An alcoholic beverage consumed as a hangover rememdy. The phrase comes from the expression """"hair of the dog that bit you"""", meaning that the best cure for what ails you is to have some more of it. In ancient times it was literally used to say that if a dog"",""Steve: Man, I'm really paying for all those keg stands I did last night. Tony: Yeah, I'm hurting too. Hair of the dog? Steve: Why not, man. We got a few beers left in the fridge."""
4,"10246,""Define this slang word or phrase: first kiss. Example: First Girl: I can't believe what happened last night! Jake kissed me! Second Girl: Omg your first kiss!. Definition: Your first experience, with someone of the other/same sex, of having another person put his/her lips on yours."",Define this slang word or phrase: first kiss. Example: First Girl: I can't believe what happened last night! Jake kissed me! Second Girl: Omg your first kiss!,first kiss,""Your first experience, with someone of the other/same sex, of having another person put his/her lips on yours."",First Girl: I can't believe what happened last night! Jake kissed me! Second Girl: Omg your first kiss!"
5,"8332,Define this slang word or phrase: blowing smoke up my ass. Example: Joe: That automobile salesman sure thought you were a good driver. Did you hear all the compliments he gave you? Bob: Yeah. He was just blowing smoke up my ass to get me to buy the car.. Definition: Complimenting a person merely to gain something in return.,Define this slang word or phrase: blowing smoke up my ass. Example: Joe: That automobile salesman sure thought you were a good driver. Did you hear all the compliments he gave you? Bob: Yeah. He was just blowing smoke up my ass to get me to buy the car.,blowing smoke up my ass,Complimenting a person merely to gain something in return.,Joe: That automobile salesman sure thought you were a good driver. Did you hear all the compliments he gave you? Bob: Yeah. He was just blowing smoke up my ass to get me to buy the car."
6,"4260,""Define this slang word or phrase: sweatermeat. Example: Check out the sweatermeat on the new chick behind the counter! Sweeeeeeeeeeeeeet!. Definition: The breasts, especially spectacular breasts."",Define this slang word or phrase: sweatermeat. Example: Check out the sweatermeat on the new chick behind the counter! Sweeeeeeeeeeeeeet!,sweatermeat,""The breasts, especially spectacular breasts."",Check out the sweatermeat on the new chick behind the counter! Sweeeeeeeeeeeeeet!"
7,"7797,""Define this slang word or phrase: Anarchy. Example: In , anarchists in the Spanish provinces of Catalonia and Aragon collectivised industry and agriculture, and established a working example of anarchy.. Definition: """"Anarchy"""" arises from ancient Greek """"An,"""" meaning without and """"Archos"""" meaning leader. In modern political philosophy anarchy, or anarchism the ideology which aims to create anarchy is traced back, often, to Proudhon, and in particular his work """"What is pr"",""Define this slang word or phrase: Anarchy. Example: In , anarchists in the Spanish provinces of Catalonia and Aragon collectivised industry and agriculture, and established a working example of anarchy."",Anarchy,""""""Anarchy"""" arises from ancient Greek """"An,"""" meaning without and """"Archos"""" meaning leader. In modern political philosophy anarchy, or anarchism the ideology which aims to create anarchy is traced back, often, to Proudhon, and in particular his work """"What is pr"",""In , anarchists in the Spanish provinces of Catalonia and Aragon collectivised industry and agriculture, and established a working example of anarchy."""
8,"8429,""Define this slang word or phrase: Naive. Example: Some people just vote for whoever; they're so naive.. Definition: Generally speaking, to be naive means you do not think enough. People who are """"naive"""" tend to believe in whatever they are told, without questioning whether it is right or wrong. As for age, it can be anywhere from -. Anyone who has not lived through and s"",Define this slang word or phrase: Naive. Example: Some people just vote for whoever; they're so naive.,Naive,""Generally speaking, to be naive means you do not think enough. People who are """"naive"""" tend to believe in whatever they are told, without questioning whether it is right or wrong. As for age, it can be anywhere from -. Anyone who has not lived through and s"",Some people just vote for whoever; they're so naive."
9,"304,""Define this slang word or phrase: Irina. Example: Irina Shayk. Definition: Irina is a Russian name, it comes from the Greek name Irene; meaning “peace”. Irina’s are usually very beautiful women/girls, with exotic features. They tend to be very physically attractive; face and body wise but also mentally attractive too. They make v"",Define this slang word or phrase: Irina. Example: Irina Shayk,Irina,""Irina is a Russian name, it comes from the Greek name Irene; meaning “peace”. Irina’s are usually very beautiful women/girls, with exotic features. They tend to be very physically attractive; face and body wise but also mentally attractive too. They make v"",Irina Shayk"


In [None]:
model_checkpoint = "facebook/opt-350m"

In [None]:
max_length = 128

In [None]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 13523
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2898
    })
})

## Load Tokenizer and Tokenize Datasets

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Function to tokenize a batch of texts
def tokenize_function(examples):
    return tokenizer(examples["text"])

# Apply the tokenization function to the train and validation datasets
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4,remove_columns=["text"])

## Load Model Checkpoint

In [31]:
from transformers import AutoModelForCausalLM, OPTForCausalLM
model = OPTForCausalLM.from_pretrained(model_checkpoint)

NameError: ignored

In [32]:
tokenizer.decode(tokenized_datasets["train"][10]["input_ids"])

NameError: ignored

In [None]:
block_size = 128

In [None]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=16,
    num_proc=4,
)

In [None]:
tokenizer.decode(lm_datasets["train"][2]["input_ids"])

## Define Training Arguments

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
model_name = 'VarunOPT'
training_args = TrainingArguments(
    f"{model_name}-finetuned-slangQA_V2",
    evaluation_strategy = "epoch",
    num_train_epochs= 1,
    learning_rate=1e-5,
    weight_decay=0.01,
    push_to_hub=True,
    hub_token = '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' # Varun's write token
)

In [None]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"]
)

## Train Model

In [None]:
trainer.train()
train_history = pd.DataFrame(trainer.state.log_history)
train_history.head()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.8173,2.851719


Unnamed: 0,loss,learning_rate,epoch,step,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,3.0529,8.18314e-06,0.18,500,,,,,,,,,
1,2.9367,6.366279e-06,0.36,1000,,,,,,,,,
2,2.8775,4.549419e-06,0.55,1500,,,,,,,,,
3,2.846,2.732558e-06,0.73,2000,,,,,,,,,
4,2.8173,9.156977e-07,0.91,2500,,,,,,,,,


In [None]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1702089032.9eeb714db22f.5167.0:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

'https://huggingface.co/vrvenkatesh/VarunOPT-finetuned-slangQA_V2/tree/main/'

## Load Fine-Tuned Model

In [16]:
#load fine tuned model
from transformers import AutoModelForCausalLM, OPTModel, OPTForCausalLM

model = OPTForCausalLM.from_pretrained("vrvenkatesh/VarunOPT-finetuned-slangQA_V2")

config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [17]:
tokenizer = AutoTokenizer.from_pretrained("vrvenkatesh/VarunOPT-finetuned-slangQA_V2")

tokenizer_config.json:   0%|          | 0.00/669 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

## Look at Sample Output

In [18]:
slang_word = "broke"
inputs = tokenizer(f"{slang_word} is defined as: ", return_tensors="pt")
outputs = model.generate(**inputs,
                         do_sample=True,
                         min_length=10,
                         max_length=50,
                         temperature=0.97,
                         repetition_penalty=1.5,
                         renormalize_logits=True
            )
outputs.shape

torch.Size([1, 50])

In [19]:
#output answers
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['broke is defined as:  > Any person who uses cocaine, meth or heroin to obtain money.  Noob = Used / Not used but still gets more gold then anyone else in the game that can get anything done at max lvl without']


## Evaluation

In [20]:
import torch
import pandas as pd
import tensorflow as tf
import numpy as np
import re
import sentencepiece as spm
import evaluate
import rouge_score
import nltk

In [23]:
df_model_test = pd.DataFrame(test_pairs)
df_model_test[['test']].head()

Unnamed: 0,test
0,Define this slang word or phrase: sprog. Examp...
1,Define this slang word or phrase: C-cup. Examp...
2,Define this slang word or phrase: High mainten...
3,Define this slang word or phrase: Wizard. Exam...
4,Define this slang word or phrase: bi-sexual. E...


In [24]:
predictions = []

for line in df_model_test[:500].to_dict('records'):
      if len(predictions) % 100 == 0:
          #pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/OPT_fine_tune_predictions.csv')
          pd.DataFrame(predictions).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_fine_tune_predictions_500.csv')
          print("Saved to My Drive")
      word = line['word']
      example = line['test']
      test_inputs = tokenizer([example], return_tensors='pt', max_length=255)

      test_output_ids = model.generate(test_inputs['input_ids'],
                                          num_beams=3,
                                          no_repeat_ngram_size=3,
                                          min_length=30,
                                          max_length=256,
                                          temperature=0.97,
                                          repetition_penalty=1.5,
                                          renormalize_logits=True,
                                          output_scores = True,
                                          do_sample = True)

      predictions.extend([tokenizer.decode(out_ids, skip_special_tokens=True,
                                clean_up_tokenization_spaces=False) for out_ids in test_output_ids])
      print(f"Progress: {len(predictions)} out of {len(df_model_test[:500])}")

print(predictions)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Saved to My Drive
Progress: 1 out of 500
Progress: 2 out of 500
Progress: 3 out of 500
Progress: 4 out of 500
Progress: 5 out of 500
Progress: 6 out of 500
Progress: 7 out of 500
Progress: 8 out of 500
Progress: 9 out of 500
Progress: 10 out of 500
Progress: 11 out of 500
Progress: 12 out of 500
Progress: 13 out of 500
Progress: 14 out of 500
Progress: 15 out of 500
Progress: 16 out of 500
Progress: 17 out of 500
Progress: 18 out of 500
Progress: 19 out of 500
Progress: 20 out of 500
Progress: 21 out of 500
Progress: 22 out of 500
Progress: 23 out of 500
Progress: 24 out of 500
Progress: 25 out of 500
Progress: 26 out of 500
Progress: 27 out of 500
Progress: 28 out of 500
Progress: 29 out of 500
Progress: 30 out of 500
Progress: 31 out of 500
Progress: 32 out of 500
Progress: 33 out of 500
Progress: 34 out of 500
Progress: 35 out of 500
Progress: 36 out of 500
Progress: 37 out of 500
Progress: 38 out of 500
Progress: 39 out of 500
Progress: 40 out of 500
Progress: 41 out of 500
Progres

In [25]:
#pd.DataFrame(predictions).to_csv('/content/drive/My Drive/DATA/OPT_fine_tune_predictions.csv')
pd.DataFrame(predictions).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_fine_tune_predictions_500.csv')

In [26]:
references = []

for line in df_model_test[:500].to_dict('records'):
    references.append(line['text'])
print(references)

['Define this slang word or phrase: sprog. Example: "I\'ll come by your place after I drop the sprogs at school". Definition: once disparaging term for a child, now often used affectionately. Chiefly British', 'Define this slang word or phrase: C-cup. Example: You can tell those nice breasts are c-cups.. Definition: Average and most satisfying boob size.', 'Define this slang word or phrase: High maintenance woman. Example: "If you have to reassure her through texts email or calls that you\'re lucky to have her, sheks so pretty, etc...- she\'s a high maintenance woman" "If you have to put effort into wooing her as a true gentleman should, be lucky that you\'ve snagged yourself a. Definition: Has higher than normal expectations; has a greater requirement for affection or attention; has more needs and/or demands and therefore more difficult or challenging. Doesn\'t equate to money or material possessions alone but may be needy in emotional attent', 'Define this slang word or phrase: Wizar

In [28]:
pd.DataFrame(references).to_csv('/content/drive/My Drive/NLP_Final_Project/DATA/OPT_fine_tune_references_500.csv')

In [29]:
import evaluate

bleu = evaluate.load('bleu')

results = bleu.compute(predictions=predictions, references=references,
          max_order = 3)

print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.26764834810855515, 'precisions': [0.32217711415399436, 0.25336381902584354, 0.23488453219842323], 'brevity_penalty': 1.0, 'length_ratio': 2.1527181158346913, 'translation_length': 72665, 'reference_length': 33755}


In [30]:
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions, references=references)

print(results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.434064394263483, 'rouge2': 0.35012985211906567, 'rougeL': 0.40937497754167684, 'rougeLsum': 0.40923684381121206}
