## Imports:

In [4]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import BartTokenizer, BartForConditionalGeneration, set_seed
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertLMHeadModel
from transformers import RobertaForMaskedLM, RobertaTokenizer
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import pipeline, DataCollatorForLanguageModeling

from datasets import load_dataset

import math
import statistics
import torch
from tqdm import tqdm

In [8]:
def get_sentences(file_path):
    final_path = "standardized_text/" + file_path
    sentences = []

    with open(final_path,"r") as f:
        sentences = f.readlines()

    return sentences

## First:
We first grab the codeswitched sentences...

In [176]:
spanglish_sentences = get_sentences("spanglish/sentimix2020.out")

We then declare the translation models... 

In [6]:
es_en_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")
en_es_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")




We now pass the sentences through the models

In [7]:
es_en_sentences = [es_en_translator(sentence) for sentence in spanglish_sentences]
en_es_sentences = [en_es_translator(sentence) for sentence in spanglish_sentences]

In [8]:
flattened_es_en_sentences = []
flattened_en_es_sentences = []

for sentence in es_en_sentences:
    flattened_es_en_sentences.append(sentence[0]['translation_text'])

for sentence in en_es_sentences:
    flattened_en_es_sentences.append(sentence[0]['translation_text'])
    

In [9]:
with open("standardized_text/spanglish_translated_en_es.out", "w") as f:
    for sentence in flattened_en_es_sentences:
        f.write(sentence + "\n")

with open("standardized_text/spanglish_translated_es_en.out", "w") as f:
    for sentence in flattened_es_en_sentences:
        f.write(sentence + "\n")

In [7]:
flattened_es_en_sentences = []
flattened_en_es_sentences = []

with open("standardized_text/spanglish_translated_en_es.out", "r") as f:
    tmp = []
    for sentence in f.readlines():
        tmp.append(sentence[:-1])

    flattened_en_es_sentences = tmp

with open("standardized_text/spanglish_translated_es_en.out", "r") as f:
    tmp = []
    for sentence in f.readlines():
        tmp.append(sentence[:-1])

    flattened_es_en_sentences = tmp

## After translation is done:
We load the model we wish to use into the notebook... 

### English

In [335]:
eng_model_id = "gpt2-large"
eng_model = GPT2LMHeadModel.from_pretrained(eng_model_id)
eng_tokenizer = GPT2TokenizerFast.from_pretrained(eng_model_id)
eng_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

bert_eng_model_id = "bert-base-uncased"
bert_eng_model = BertLMHeadModel.from_pretrained(bert_eng_model_id)
bert_eng_tokenizer = BertTokenizer.from_pretrained(bert_eng_model_id)
bert_eng_tokenizer.add_special_tokens({'pad_token': '[PAD]'})


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0

### Spanish

In [331]:
span_model_id = "flax-community/gpt-2-spanish"
span_model = GPT2LMHeadModel.from_pretrained(span_model_id)
span_tokenizer = GPT2TokenizerFast.from_pretrained(span_model_id)
span_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

bert_span_model_id = "dccuchile/bert-base-spanish-wwm-uncased"
bert_span_model = BertLMHeadModel.from_pretrained(bert_span_model_id)
bert_span_tokenizer = BertTokenizer.from_pretrained(bert_span_model_id)
bert_span_tokenizer.add_special_tokens({'pad_token': '[PAD]'})


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Downloading (…)lve/main/config.json: 100%|██████████| 650/650 [00:00<00:00, 238kB/s]
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [00:08<00:00, 53.5MB/s] 
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Downloading (…)solve/main/vocab.txt: 100%|██████████| 248k/248k [00:00<00:00, 14.9MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 134/134 [00:00<00:00, 246kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 310/310 [00:00<00:00, 618kB/s]


0

We then get the text we preprocessed earlier to get perplexity scores...

In [9]:
eng_sentences = get_sentences("eng/sentiment140_converted.out")
span_sentences = get_sentences("span/combined_tass2020_converted.out")

In [16]:
print(len(eng_sentences))

1582575


and run it through the model's tokenizer...

In [10]:
eng_sentences = eng_sentences[:1500]
flattened_en_es_sentences = flattened_en_es_sentences[:1500]
span_sentences = span_sentences[:1500]
flattened_es_en_sentences = flattened_es_en_sentences[:1500]

In [96]:
print(type(span_sentences))

<class 'list'>


In [182]:
eng_encodings = eng_tokenizer(eng_sentences, padding='max_length', truncation=True, return_tensors="pt")
es_en_encodings = eng_tokenizer(flattened_es_en_sentences, padding='max_length', truncation=True, return_tensors="pt")
span_encodings = span_tokenizer(span_sentences, padding='max_length', truncation=True, max_length=1024, return_tensors="pt")
en_es_encodings = span_tokenizer(flattened_en_es_sentences, padding='max_length', truncation=True, max_length=1024, return_tensors="pt")

# print(encodings[0])

In [336]:
bert_eng_encodings = bert_eng_tokenizer(eng_sentences, padding='max_length', truncation=True, return_tensors="pt")
bert_es_en_encodings = bert_eng_tokenizer(flattened_es_en_sentences, padding='max_length', truncation=True, return_tensors="pt")
bert_span_encodings = bert_span_tokenizer(span_sentences, padding='max_length', truncation=True, max_length=1024, return_tensors="pt")
bert_en_es_encodings = bert_span_tokenizer(flattened_en_es_sentences, padding='max_length', truncation=True, max_length=1024, return_tensors="pt")

## After processing the data:
We now can run it through the model to generate perplexity scores:

In [346]:
def perplexity(model, sentences, tokenizer):
    max_length = model.config.n_positions
    stride = 512
    seq_len = 1024
    
    ppls = []
    for idx in tqdm(range(len(sentences))):
        inputs = tokenizer(sentences[idx], truncation=True, padding=True, max_length=1024, return_tensors="pt")
        nlls = []
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
            input_ids = inputs.input_ids[:, begin_loc:end_loc]
            #print(input_ids.shape)
            target_ids = input_ids.clone()
            #print(target_ids.shape)
            target_ids[:, :-trg_len] = -100
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)

                # loss is calculated using CrossEntropyLoss which averages over valid labels
                # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
                # to the left by 1.
                neg_log_likelihood = outputs.loss

            nlls.append(neg_log_likelihood)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        ppls.append(torch.exp(torch.stack(nlls).mean()).item())
    return statistics.mean(ppls)

In [190]:
def sentences_to_file(sentences, dest_file):
    with open(dest_file, "w") as f:
        for sentence in sentences:
            f.write(sentence + "\n")

In [191]:
sentences_to_file(eng_sentences, "standardized_text/eng/sentences.out")
sentences_to_file(span_sentences, "standardized_text/span/sentences.out")

In [184]:
eng_ppl = perplexity(eng_model, eng_sentences, eng_tokenizer)

100%|██████████| 1500/1500 [22:43<00:00,  1.10it/s]


In [185]:
span_ppl = perplexity(span_model, span_sentences, span_tokenizer)

100%|██████████| 1500/1500 [04:21<00:00,  5.73it/s]


In [186]:
es_en_ppl = perplexity(eng_model, flattened_es_en_sentences, eng_tokenizer)

100%|██████████| 1500/1500 [22:40<00:00,  1.10it/s] 


In [187]:
en_es_ppl = perplexity(span_model, flattened_en_es_sentences, span_tokenizer)

100%|██████████| 1500/1500 [04:04<00:00,  6.12it/s]


In [347]:
bert_eng_ppl = perplexity(bert_eng_model, eng_sentences, bert_eng_tokenizer)

100%|██████████| 1500/1500 [04:03<00:00,  6.16it/s]


In [342]:
bert_span_ppl = perplexity(bert_span_model, span_sentences, bert_span_tokenizer)

100%|██████████| 1500/1500 [04:11<00:00,  5.96it/s]


In [343]:
bert_es_en_ppl = perplexity(bert_eng_model, flattened_es_en_sentences, bert_eng_tokenizer)

100%|██████████| 1500/1500 [04:12<00:00,  5.94it/s]


In [344]:
bert_en_es_ppl = perplexity(bert_span_model, flattened_en_es_sentences, bert_span_tokenizer)

100%|██████████| 1500/1500 [04:10<00:00,  5.98it/s]


Print out the mean perplexity over all the sentences...

In [348]:
print(bert_eng_ppl)
print(bert_span_ppl)
print(bert_es_en_ppl)
print(bert_en_es_ppl)

8324137.729295817
1437454.7061899414
11999733.95928854
3954206.894305606


In [188]:
avg_spanglish_ppl = (es_en_ppl + en_es_ppl)/2

In [189]:
print(eng_ppl)
print(span_ppl)
print(es_en_ppl)
print(en_es_ppl)
print(avg_spanglish_ppl)

259.56097490469614
160.0516792122523
3319.269275833686
1554.115388373693
2436.6923321036893


We can now do these same steps but for different models...

### Bart:

In [303]:
bart_eng_model_name ='facebook/bart-base'
bart_span_model_name = 'vgaraujov/bart-base-spanish'
# Masking is a random process so results will vary unless this is set
# set_seed(0)

bart_span_model = BartForConditionalGeneration.from_pretrained(bart_span_model_name)
bart_span_tokenizer = AutoTokenizer.from_pretrained(bart_span_model_name)

bart_eng_model = BartForConditionalGeneration.from_pretrained(bart_eng_model_name)
bart_eng_tokenizer = BartTokenizer.from_pretrained(bart_eng_model_name)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Downloading (…)neration_config.json: 100%|██████████| 191/191 [00:00<00:00, 34.7kB/s]


RoBERTa:

In [5]:
roberta_eng_model_name = 'roberta-base'
roberta_span_model_name = 'bertin-project/bertin-roberta-base-spanish'

roberta_eng_model = RobertaForMaskedLM.from_pretrained(roberta_eng_model_name)
roberta_eng_tokenizer = RobertaTokenizer.from_pretrained(roberta_eng_model_name)


roberta_span_model = RobertaForMaskedLM.from_pretrained(roberta_span_model_name)
roberta_span_tokenizer = RobertaTokenizer.from_pretrained(roberta_span_model_name)


We need to create an Evaluation Dataset class for Bart...

In [12]:
class EvalDataset(Dataset):
    def __init__(self, input_ids, decoder_input_ids):
        assert len(input_ids) == len(decoder_input_ids)
        self.input_ids         = input_ids
        self.decoder_input_ids = decoder_input_ids

    def __getitem__(self, index):
        return {'input_ids':         self.input_ids[index],
                'decoder_input_ids': self.decoder_input_ids[index]}

    def __len__(self):
        return len(self.input_ids)

as well as some methods to run our testing...

In [13]:
def chunk(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [15]:
def run_test(model=None, sentences=None, seq_len=None, tokenizer=None, num_test_chars=None, batch_size=8, mlm_prob=0.15, d_flag=False):
        samples = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=1024)['input_ids']
        # Add bos and eos tokens and create the decoder_input_ids
        # mask_token_id = 50264
        bos = torch.LongTensor([tokenizer.bos_token_id])               # = 0
        eos = torch.LongTensor([tokenizer.eos_token_id])               # = 2
        if(model.config.decoder_start_token_id != None):
            dst = torch.LongTensor([model.config.decoder_start_token_id]) # = 2 (same as eos token id)
        else:
              dst = bos
        input_ids   = [torch.cat((torch.cat((bos, sample)), eos)) for sample in samples]
        decoder_ids = [torch.cat((dst, input_id))[:-1] for input_id in input_ids]  # shift_tokens_right

        # Put this all into a dataset and create the loader
        # The collator will take care of randomly masking the input_id tokens and creating the 
        # 'labels' keys with -100 for any non-masked token
        dataset    = EvalDataset(input_ids, decoder_ids)
        collator   = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_prob)
        dataloader = DataLoader(dataset, collate_fn=collator, batch_size=batch_size)

        # Run evaluation
        print('Testing')
        model.eval()
        losses = []
        for step, batch in enumerate(tqdm(dataloader, ncols=100, disable=False)):
            with torch.no_grad():
                torch.set_printoptions(threshold=10000, linewidth=150)
                decoder_ids = batch['decoder_input_ids']
                input_ids   = batch['input_ids']
                labels      = batch['labels']
                if d_flag:
                    outputs = model(input_ids=input_ids, labels=labels)
                else:
                    outputs = model(input_ids=input_ids, labels=labels, decoder_input_ids=decoder_ids)

            losses.append(torch.exp(torch.FloatTensor([outputs.loss.item()])))
        try:
            perplexity = torch.mean(torch.FloatTensor(losses)).item()
        except OverflowError:
            perplexity = float('inf')
        return perplexity

In [295]:
bart_eng_ppl = run_test(bart_eng_model, eng_sentences, None, bart_eng_tokenizer)

Testing


100%|█████████████████████████████████████████████████████████████| 188/188 [08:44<00:00,  2.79s/it]


In [304]:
bart_span_ppl = run_test(bart_span_model, span_sentences, None, bart_span_tokenizer)

Testing


  0%|                                                                       | 0/188 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|█████████████████████████████████████████████████████████████| 188/188 [08:27<00:00,  2.70s/it]


In [312]:
bart_es_en_ppl = run_test(bart_eng_model, flattened_es_en_sentences, None, bart_eng_tokenizer)
bart_en_es_ppl = run_test(bart_span_model, flattened_en_es_sentences, None, bart_span_tokenizer)

Testing


100%|█████████████████████████████████████████████████████████████| 188/188 [37:47<00:00, 12.06s/it]


Testing


100%|█████████████████████████████████████████████████████████████| 188/188 [44:21<00:00, 14.16s/it]


In [310]:
print(torch.mean(torch.FloatTensor(bart_eng_ppl)).item())
print(torch.mean(torch.FloatTensor(bart_span_ppl)).item())

199.4083709716797
421.5256042480469


In [313]:
print(bart_en_es_ppl)
print(bart_es_en_ppl)

909.12158203125
792.385986328125


In [16]:
roberta_eng_ppl = run_test(roberta_eng_model, eng_sentences, None, roberta_eng_tokenizer)
roberta_span_ppl = run_test(roberta_span_model, span_sentences, None, roberta_span_tokenizer)
roberta_es_en_ppl = run_test(roberta_eng_model, flattened_es_en_sentences, None, roberta_eng_tokenizer)
roberta_en_es_ppl = run_test(roberta_span_model, flattened_en_es_sentences, None, roberta_span_tokenizer)


Testing


 13%|███████▉                                                      | 24/188 [01:01<07:02,  2.58s/it]