## Imports:

In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import BartTokenizer, BartModel, set_seed
from torch.utils.data import DataLoader, Dataset
from transformers import pipeline, DataCollatorForLanguageModeling

from datasets import load_dataset

import math
import statistics
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_sentences(file_path):
    final_path = "standardized_text/" + file_path
    sentences = []

    with open(final_path,"r") as f:
        sentences = f.readlines()

    return sentences

## First:
We first grab the codeswitched sentences...

In [5]:
spanglish_sentences = get_sentences("spanglish/sentimix2020.out")

We then declare the translation models... 

In [6]:
es_en_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")
en_es_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")




We now pass the sentences through the models

In [7]:
es_en_sentences = [es_en_translator(sentence) for sentence in spanglish_sentences]
en_es_sentences = [en_es_translator(sentence) for sentence in spanglish_sentences]

In [8]:
flattened_es_en_sentences = []
flattened_en_es_sentences = []

for sentence in es_en_sentences:
    flattened_es_en_sentences.append(sentence[0]['translation_text'])

for sentence in en_es_sentences:
    flattened_en_es_sentences.append(sentence[0]['translation_text'])
    

In [9]:
with open("standardized_text/spanglish_translated_en_es.out", "w") as f:
    for sentence in flattened_en_es_sentences:
        f.write(sentence + "\n")

with open("standardized_text/spanglish_translated_es_en.out", "w") as f:
    for sentence in flattened_es_en_sentences:
        f.write(sentence + "\n")

In [3]:
flattened_es_en_sentences = []
flattened_en_es_sentences = []

with open("standardized_text/spanglish_translated_en_es.out", "r") as f:
    tmp = []
    for sentence in f.readlines():
        tmp.append(sentence[:-1])

    flattened_en_es_sentences = tmp

with open("standardized_text/spanglish_translated_es_en.out", "r") as f:
    tmp = []
    for sentence in f.readlines():
        tmp.append(sentence[:-1])

    flattened_es_en_sentences = tmp

## After translation is done:
We load the model we wish to use into the notebook... 

### English

In [4]:
eng_model_id = "gpt2-large"
eng_model = GPT2LMHeadModel.from_pretrained(eng_model_id)
eng_tokenizer = GPT2TokenizerFast.from_pretrained(eng_model_id)
eng_tokenizer.add_special_tokens({'pad_token': '[PAD]'})


1

### Spanish

In [5]:
span_model_id = "flax-community/gpt-2-spanish"
span_model = GPT2LMHeadModel.from_pretrained(span_model_id)
span_tokenizer = GPT2TokenizerFast.from_pretrained(span_model_id)
span_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

We then get the text we preprocessed earlier to get perplexity scores...

In [6]:
eng_sentences = get_sentences("eng/sentiment140_converted.out")
span_sentences = get_sentences("span/combined_tass2020_converted.out")

In [16]:
print(len(eng_sentences))

1582575


and run it through the model's tokenizer...

In [7]:
eng_encodings = eng_tokenizer(eng_sentences, padding='max_length', truncation=True)
es_en_encodings = eng_tokenizer(flattened_es_en_sentences, padding='max_length', truncation=True)
span_encodings = span_tokenizer(span_sentences, padding='max_length', truncation=True)
en_es_encodings = span_tokenizer(flattened_en_es_sentences, padding='max_length', truncation=True)


# print(encodings[0])

: 

: 

In [27]:
print(eng_encodings['input_ids'].shape)
print(es_en_encodings['input_ids'].shape)
print(span_encodings['input_ids'].shape)
print(en_es_encodings['input_ids'].shape)

torch.Size([1582575, 311])
torch.Size([1999, 509])
torch.Size([7245, 739])
torch.Size([1999, 139])


## After processing the data:
We now can run it through the model to generate perplexity scores:

In [43]:
def perplexity(model, encodings):
    max_length = model.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        print(input_ids.shape)
        target_ids = input_ids.clone()
        print(target_ids.shape)
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).mean()).item()
    return ppl

In [44]:
eng_ppl = perplexity(eng_model, eng_encodings)
span_ppl = perplexity(span_model, span_encodings)
es_en_ppl = perplexity(eng_model, es_en_encodings)
en_es_ppl = perplexity(span_model, en_es_encodings)


  0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([1582575, 311])


  0%|          | 0/1 [00:01<?, ?it/s]

torch.Size([1582575, 311])





IndexError: index out of range in self

Print out the mean perplexity over all the sentences...

In [23]:
#avg_spanglish_ppl = (es_en_ppl + en_es_ppl)/2

print(eng_ppl)
print(span_ppl)
#print(avg_spanglish_ppl)

49.76630401611328
142.9220428466797


We can now do these same steps but for different models...

### Bart:

In [2]:
model_name ='facebook/bart-base'

# Masking is a random process so results will vary unless this is set
# set_seed(0)

model = BartModel.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

We need to create an Evaluation Dataset class for Bart...

In [9]:
class EvalDataset(Dataset):
    def __init__(self, input_ids, decoder_input_ids):
        assert len(input_ids) == len(decoder_input_ids)
        self.input_ids         = input_ids
        self.decoder_input_ids = decoder_input_ids

    def __getitem__(self, index):
        return {'input_ids':         self.input_ids[index],
                'decoder_input_ids': self.decoder_input_ids[index]}

    def __len__(self):
        return len(self.input_ids)

as well as some methods to run our testing...

In [10]:
def chunk(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [95]:
def run_test(model=None, samples=None, seq_len=None, tokenizer=None, num_test_chars=None, batch_size=8, mlm_prob=0.15):
        # Tokenize.  verbose=False elminates message 'token sequences too long for model'

        # Add bos and eos tokens and create the decoder_input_ids
        # mask_token_id = 50264
        bos = tokenizer.bos_token_id               # = 0
        eos = tokenizer.eos_token_id               # = 2
        dst = model.config.decoder_start_token_id  # = 2 (same as eos token id)
        input_ids   = [[bos] + sample + [eos] for sample in samples]
        decoder_ids = [[dst] + iids[:-1]      for iids   in input_ids]  # shift_tokens_right

        # Put this all into a dataset and create the loader
        # The collator will take care of randomly masking the input_id tokens and creating the 
        # 'labels' keys with -100 for any non-masked token
        dataset    = EvalDataset(input_ids, decoder_ids)
        collator   = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_prob)
        dataloader = DataLoader(dataset, collate_fn=collator, batch_size=batch_size)

        # Run evaluation
        print('Testing')
        model.eval()
        losses = []
        for step, batch in enumerate(tqdm(dataloader, ncols=100, disable=False)):
            with torch.no_grad():
                torch.set_printoptions(threshold=10000, linewidth=150)
                decoder_ids = batch['decoder_input_ids']
                input_ids   = batch['input_ids']
                labels      = batch['labels']
                outputs = model(input_ids=input_ids, labels=labels, decoder_input_ids=decoder_ids)
            losses.append(outputs.loss.item())
        try:
            perplexity = math.exp(statistics.mean(losses))
        except OverflowError:
            perplexity = float('inf')
        return perplexity

In [14]:
samples = tokenizer(eng_sentences, return_tensors="pt", padding=True, truncation=True, max_length=1024)['input_ids']

In [15]:
print(samples.shape)

torch.Size([1582575, 313])


In [96]:
ppl = run_test(model, eng_sentences, None, tokenizer)

Testing


  0%|                                                                    | 0/197822 [00:00<?, ?it/s]


IndexError: index out of range in self