In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn.model_selection import train_test_split
from gensim import downloader
from sklearn.metrics import accuracy_score
from torch.nn import functional as F
import pickle
from transformers import BertTokenizer, BertForMaskedLM
from tqdm import tqdm  # for our progress bar
from transformers import AdamW

In [2]:
!pip install transformers

You should consider upgrading via the '/anaconda/envs/py38_default/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
!pip install datasets

You should consider upgrading via the '/anaconda/envs/py38_default/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
device

device(type='cuda', index=0)

# Import 2015 DF

We use them because they are lighter and faster to train and eval

In [6]:
nyt_train_df = pd.read_csv('//home//student//project//project_final_files//nytcrosswords_balanced_days_train_2015.csv',parse_dates = ['Date'], encoding ="ISO-8859-1")
nyt_eval_df = pd.read_csv('/home//student//project//project_final_files//nytcrosswords_balanced_days_eval_2015.csv',parse_dates = ['Date'], encoding ="ISO-8859-1")

In [7]:
nyt_train_df.drop(columns = ['Unnamed: 0'],inplace = True)
nyt_train_df

Unnamed: 0,Date,Word,Clue,day_of_week,year
0,2016-08-20,DEW,Wet blanket?,Saturday,2016
1,2020-10-21,SHOP,Union workplace,Wednesday,2020
2,2015-01-28,AWET,Mad as ___ hen,Wednesday,2015
3,2019-12-08,NARNIA,Fictional land of books and film,Sunday,2019
4,2020-09-15,ROKU,Giant in media streaming,Tuesday,2020
...,...,...,...,...,...
117532,2020-07-15,DELTA,Charlie follower,Wednesday,2020
117533,2017-03-08,TETRAD,Group of four,Wednesday,2017
117534,2020-12-22,OLD,Aged,Tuesday,2020
117535,2017-03-05,ROADSIDEDINER,[Circled letters]-advertised establishment,Sunday,2017


In [8]:
nyt_eval_df.drop(columns = ['Unnamed: 0'],inplace = True)
nyt_eval_df

Unnamed: 0,Date,Word,Clue,day_of_week,year
0,2015-09-07,INSPIRE,Motivate,Monday,2015
1,2017-10-29,ALONE,Stag,Sunday,2017
2,2020-08-11,EDGES,Things that spheres lack,Tuesday,2020
3,2018-12-17,TYKE,Young 'un,Monday,2018
4,2015-03-20,EUCLID,"Who wrote to Ptolemy I ""There is no royal road...",Friday,2015
...,...,...,...,...,...
33843,2018-10-29,TNT,Relative of dynamite,Monday,2018
33844,2017-10-12,ATEUP,Believed with no questions asked,Thursday,2017
33845,2016-09-14,UTILITY,Gas or water,Wednesday,2016
33846,2020-10-17,CRI,"Shout, in Chamonix",Saturday,2020


# Data loader class 

Its here because it relevant for both models as it is ised in the fine-tuning loop to pass the batches

In [9]:
class CrossWordsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)        

# BERT MLM on 2015 data masked twice

In this section we will try to solve the crosswords with the basic masking technique , which is masking every answer with double mask in the traning set.
this technique is derived from the problem that we cant know from the answer that is given to us that the answer is a single word , two words or three words.
This also applies to real life New York Times crosswords , there are only hints and a total count of letters in the answer

We will use different auxiliary function in each of the model section

**Some auxiliary function for training and evaluating the model**

In [10]:
def create_riddles_and_answers_mask_position_corrected_masked_once(df):
    hint_masked_list = []
    hist_answer_list = []
    for row in df.itertuples():
        if '_' in str(row.Clue):
            masked = re.sub('___','[MASK]',row.Clue)
            answerd = re.sub('___',str(row.Word),row.Clue)
        else: # doesnt have ___ , hence second kind of hint
            masked = str(row.Clue) + ' : [MASK]'
            answerd = str(row.Clue) + ' : '+str(row.Word)
        hint_masked_list.append(masked)
        hist_answer_list.append(answerd)
    return hint_masked_list,hist_answer_list

In [11]:
def create_riddles_and_answers_mask_position_corrected_masked_twice(df):
    hint_masked_list_bigram = []
    hist_answer_list_bigram = []
    answers_list = []
    for row in df.itertuples():
        if '_' in str(row.Clue):
            masked = re.sub('___','[MASK] [MASK]',row.Clue)
            answerd = re.sub('___',str(row.Word),row.Clue)
        else: # doesnt have ___ , hence second kind of hint
            masked = str(row.Clue) + ' : [MASK] [MASK]'
            answerd = str(row.Clue) + ' : '+str(row.Word)
        hint_masked_list_bigram.append(masked)
        hist_answer_list_bigram.append(answerd)
        answers_list.append(str(row.Word))
    return hint_masked_list_bigram,hist_answer_list_bigram , answers_list

In [12]:
def get_topk_accuracy_uni_plus_bigram(model,tokenizer,masked_tokenized_unigram, masked_tokenized_bigram , answered_untokenzed_list,top_ks = 10):
    cnt_uni = 0
    cnt_bi = 0
    total = len(answered_untokenzed_list)
    loop = tqdm(range(total), leave=True)
    for i in loop:
        answer = answered_untokenzed_list[i]
        output_bi = model(input_ids = masked_tokenized_bigram.input_ids[i].to(model.device).unsqueeze(0),
                                                   token_type_ids = masked_tokenized_bigram.token_type_ids[i].to(model.device).unsqueeze(0) ,
                                                   attention_mask = masked_tokenized_bigram.attention_mask[i].to(model.device).unsqueeze(0))
        masked_id_list = []
        for mask_id in range(masked_tokenized_bigram.input_ids[i].shape[0]):
            if masked_tokenized_bigram.input_ids[i][mask_id] == 103:
#                 masked_id_list.append(mask_id+1)
                masked_id_list.append(mask_id)
        answer_input_ids = []
        for mask_id in masked_id_list:
            answer_input_ids.append(output_bi.logits[0][mask_id].topk(top_ks).indices)
        if ( len(answer_input_ids) < 2):
            continue
        predictions = []
        for tk in range(np.min([answer_input_ids[0].shape[0],answer_input_ids[1].shape[0],top_ks])):
            temp = [answer_input_ids[0][tk].item(),answer_input_ids[1][tk].item()]
            predictions.extend(tokenizer.decode(temp).split())
        if ( answer.lower() in predictions):
            cnt_bi += 1
            continue
        
        # else answer bi not in prediction , try unigram
        output_uni = model(input_ids = masked_tokenized_unigram.input_ids[i].to(model.device).unsqueeze(0),
                                                   token_type_ids = masked_tokenized_unigram.token_type_ids[i].to(model.device).unsqueeze(0) ,
                                                   attention_mask = masked_tokenized_unigram.attention_mask[i].to(model.device).unsqueeze(0))
        masked_index = -1
        for mask_id in range(masked_tokenized_unigram.input_ids[i].shape[0]):
            if masked_tokenized_unigram.input_ids[i][mask_id] == 103:
                masked_index = mask_id
                break
        if masked_index == -1:
            continue
        predictions = []
        predictions = tokenizer.decode(output_uni.logits[0][masked_index].topk(top_ks).indices).split()
        if ( answer.lower() in predictions):
            cnt_uni += 1
    print('Count for masked once was: ',cnt_uni,', Count for masked twice was: ',cnt_bi,' both of them sum up to',cnt_bi+cnt_uni ,' out of a total of ',total)    
    print('So this brings us to the accuract for top ',top_ks,' prediction model made at',round((cnt_uni + cnt_bi)/total*100,3),'%.')
    return cnt_uni,cnt_bi,total

In [13]:
def get_top_k_accuracy_by_days(model,tokenizer,df_to_check,top_ks = 10):
    days = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
    results_dict = {}
    over_all_once = 0
    over_all_twice = 0
    over_all_total = 0
    for day in days:
        once_cnt = 0
        twice_cnt = 0
        total = 0
        masked_by_day_one , answered_by_day_one = create_riddles_and_answers_mask_position_corrected_masked_once(df_to_check[df_to_check['day_of_week'] == day])
        day_inputs = tokenized_list_for_model(tokenizer , [masked_by_day_one,answered_by_day_one])
        masked_by_day_twice , answered_by_day_twice , day_answer_list = create_riddles_and_answers_mask_position_corrected_masked_twice(df_to_check[df_to_check['day_of_week'] == day])
        day_inputs_twice = tokenized_list_for_model(tokenizer ,[masked_by_day_twice , answered_by_day_twice] )
        print(day,' The results are as follows: ')
        once_cnt , twice_cnt , total = get_topk_accuracy_uni_plus_bigram(model , tokenizer , day_inputs,day_inputs_twice,day_answer_list,top_ks)
        results_dict[day] = {'total_enteries' : total , 'masked_once_cnt' : once_cnt , 'masked_twice_cnt' : twice_cnt }
        over_all_once += once_cnt
        over_all_twice += twice_cnt
        over_all_total += total
    results_dict['overall'] = {'total_enteries' : over_all_total , 'masked_once_cnt' : over_all_once , 'masked_twice_cnt' : over_all_twice}
    print('Total score as follows\nCount for masked once was: ',over_all_once,', Count for masked twice was: ',over_all_twice,' both of them sum up to',over_all_once+over_all_twice ,' out of a total of ',over_all_total)    
    print('So this brings us to the accuract for top ',top_ks,' prediction model made at',round((over_all_once + over_all_twice)/over_all_total*100,3),'%.')
    return results_dict

In [14]:
def tokenized_list_for_model(tokenizer , list_to_tokenize,pad_max_len = 64): # pass both lists masked and answered
    masked_ds_train = tokenizer(list_to_tokenize[0],padding=True,truncation=True,max_length = pad_max_len,return_tensors="pt")
    masked_ds_train['labels'] = tokenizer(list_to_tokenize[1],padding=True,truncation=True,max_length = pad_max_len,return_tensors="pt").input_ids.detach().clone()
    return masked_ds_train
# masked_ds_train = masked_ds_train.to(device)

In [15]:
def generate_days_histogram(df):
    days_week_dict = {d:df[df['day_of_week'].apply(lambda x: x == d)].shape[0] for d in set(df['day_of_week'])}
    days = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
    total = np.sum([v for v in days_week_dict.values()])
    for d in days:
        print(d,' Values is ',days_week_dict[d],' Which in proportion to other days is: ',round(days_week_dict[d]/total,3))
    plt.figure(figsize = (10,5))
    plt.bar([i for i in range(len(days_week_dict.keys()))],days_week_dict.values())
    plt.title('Days of week histogram count')
    plt.ylabel('Count for day')
    plt.xlabel('Day')
    plt.xticks([i for i in range(len(days_week_dict.keys()))],list(days_week_dict.keys()))
    plt.show()

**BERT MLM answers tokenized twice regarless**

First lets import the tokenizer and the basic model that we will fine-tune

In [16]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [17]:
model_MLM_masked_twice_2015 = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Now we tokenize the training df , not the eval , because the eval is feed to the accuracy function as a df not a tokenized list**

The function create_riddles_and_answers_mask_position_corrected_masked_twice() return three lists as follows:
1. list of the hint and answer masked twice as we said , for the input_ids for the training loop
2. list of the hist with the answer for the labels in the next function
3. list of answers alone , this is wont be used now but it is used by the accuarcy evaluation function

In [18]:
train_twice_masked , train_twice_answered , train_answers = create_riddles_and_answers_mask_position_corrected_masked_twice(nyt_train_df)

In [19]:
print('len of masked',len(train_twice_masked),' len of answered' , len(train_twice_answered),' len answers alone', len(train_answers))
print('Do all have the same length? ', len(train_twice_masked) == len(train_twice_answered) == len(train_answers))

len of masked 117537  len of answered 117537  len answers alone 117537
Do all have the same length?  True


Now we will tokenize the training input with the function tokenized_list_for_model() , which will recieve the tokenizer two lists
1. one is the list 'train_twice_masked' which will be used for input_ids
2. seconed is 'train_twice_answered' which will be used as the labels

In [20]:
train_twice_tokenized = tokenized_list_for_model(tokenizer , [train_twice_masked,train_twice_answered])

In [21]:
print('train masked twice size is ',train_twice_tokenized.input_ids.shape)

train masked twice size is  torch.Size([117537, 44])


Now we will create a Dataset and a loader for the training loop.

In [22]:
dataset_train_masked_twice = CrossWordsDataset(train_twice_tokenized)

In [23]:
loader_train_twice = torch.utils.data.DataLoader(dataset_train_masked_twice, batch_size=32, shuffle=True)

**Creating the optimizer and the training loop in next cells**

We will train the model for 3 epochs , save model after each epoch , after the third epoch we will evaluate the model based on the training and eval df and continue to fine tune two more epochs if neccesary

In [24]:
model_MLM_masked_twice_2015.to(device)
model_MLM_masked_twice_2015.train()

# initialize optimizer
optim = AdamW(model_MLM_masked_twice_2015.parameters(), lr=5e-5)

In [25]:
epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader_train_twice, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model_MLM_masked_twice_2015(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    pickle.dump(model_MLM_masked_twice_2015, open('//home//student//project//BERT_MLM_masked_twice_batch32_pad64_2015_data'+'_epoch'+str(epoch)+'.sav', 'wb'))

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 3674/3674 [28:39<00:00,  2.14it/s, loss=0.0301]
Epoch 1: 100%|██████████| 3674/3674 [28:44<00:00,  2.13it/s, loss=0.943]
Epoch 2: 100%|██████████| 3674/3674 [28:43<00:00,  2.13it/s, loss=0.172] 


**Evaluate the model based on the the training df and eval df**

In [26]:
model_MLM_masked_twice_2015.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [27]:
eval_results_masked_twice_2015_3epochs = get_top_k_accuracy_by_days(model_MLM_masked_twice_2015,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:57<00:00, 41.80it/s]


Count for masked once was:  473 , Count for masked twice was:  1020  both of them sum up to 1493  out of a total of  4919
So this brings us to the accuract for top  10  prediction model made at 30.352 %.
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:53<00:00, 45.74it/s]


Count for masked once was:  653 , Count for masked twice was:  1935  both of them sum up to 2588  out of a total of  5187
So this brings us to the accuract for top  10  prediction model made at 49.894 %.
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:56<00:00, 44.32it/s]


Count for masked once was:  560 , Count for masked twice was:  1653  both of them sum up to 2213  out of a total of  5176
So this brings us to the accuract for top  10  prediction model made at 42.755 %.
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:51<00:00, 44.47it/s]


Count for masked once was:  541 , Count for masked twice was:  1324  both of them sum up to 1865  out of a total of  4973
So this brings us to the accuract for top  10  prediction model made at 37.503 %.
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [02:00<00:00, 41.51it/s]


Count for masked once was:  519 , Count for masked twice was:  990  both of them sum up to 1509  out of a total of  4986
So this brings us to the accuract for top  10  prediction model made at 30.265 %.
Friday  The results are as follows: 


100%|██████████| 4407/4407 [01:49<00:00, 40.07it/s]


Count for masked once was:  381 , Count for masked twice was:  617  both of them sum up to 998  out of a total of  4407
So this brings us to the accuract for top  10  prediction model made at 22.646 %.
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [01:46<00:00, 39.62it/s]

Count for masked once was:  316 , Count for masked twice was:  435  both of them sum up to 751  out of a total of  4200
So this brings us to the accuract for top  10  prediction model made at 17.881 %.
Total score as follows
Count for masked once was:  3443 , Count for masked twice was:  7974  both of them sum up to 11417  out of a total of  33848
So this brings us to the accuract for top  10  prediction model made at 33.73 %.





In [28]:
train_results_masked_twice_2015_3epochs = get_top_k_accuracy_by_days(model_MLM_masked_twice_2015,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [05:56<00:00, 48.19it/s]


Count for masked once was:  1392 , Count for masked twice was:  8032  both of them sum up to 9424  out of a total of  17163
So this brings us to the accuract for top  10  prediction model made at 54.909 %.
Monday  The results are as follows: 


100%|██████████| 17980/17980 [05:32<00:00, 54.08it/s]


Count for masked once was:  1100 , Count for masked twice was:  11279  both of them sum up to 12379  out of a total of  17980
So this brings us to the accuract for top  10  prediction model made at 68.849 %.
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [05:40<00:00, 51.65it/s]


Count for masked once was:  1314 , Count for masked twice was:  10024  both of them sum up to 11338  out of a total of  17594
So this brings us to the accuract for top  10  prediction model made at 64.442 %.
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [05:42<00:00, 50.63it/s]


Count for masked once was:  1324 , Count for masked twice was:  9328  both of them sum up to 10652  out of a total of  17358
So this brings us to the accuract for top  10  prediction model made at 61.367 %.
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [05:51<00:00, 48.53it/s]


Count for masked once was:  1545 , Count for masked twice was:  7891  both of them sum up to 9436  out of a total of  17076
So this brings us to the accuract for top  10  prediction model made at 55.259 %.
Friday  The results are as follows: 


100%|██████████| 15519/15519 [05:34<00:00, 46.45it/s]


Count for masked once was:  1380 , Count for masked twice was:  6138  both of them sum up to 7518  out of a total of  15519
So this brings us to the accuract for top  10  prediction model made at 48.444 %.
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [05:28<00:00, 45.21it/s]

Count for masked once was:  1278 , Count for masked twice was:  5174  both of them sum up to 6452  out of a total of  14847
So this brings us to the accuract for top  10  prediction model made at 43.457 %.
Total score as follows
Count for masked once was:  9333 , Count for masked twice was:  57866  both of them sum up to 67199  out of a total of  117537
So this brings us to the accuract for top  10  prediction model made at 57.173 %.





**We will fine tune the model for two more epochs and see where that gets us**

In [29]:
model_MLM_masked_twice_2015.train()

# initialize optimizer
optim = AdamW(model_MLM_masked_twice_2015.parameters(), lr=5e-5)

In [30]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader_train_twice, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model_MLM_masked_twice_2015(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    pickle.dump(model_MLM_masked_twice_2015, open('//home//student//project//BERT_MLM_masked_twice_batch32_pad64_2015_data'+'_epoch'+str(epoch+3)+'.sav', 'wb'))

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 3674/3674 [28:41<00:00,  2.13it/s, loss=0.225] 
Epoch 1: 100%|██████████| 3674/3674 [28:42<00:00,  2.13it/s, loss=0.0312]


**Back to eval after 5 epochs**

In [31]:
model_MLM_masked_twice_2015.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [32]:
eval_results_masked_twice_2015_5epochs = get_top_k_accuracy_by_days(model_MLM_masked_twice_2015,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:55<00:00, 42.71it/s]


Count for masked once was:  375 , Count for masked twice was:  1259  both of them sum up to 1634  out of a total of  4919
So this brings us to the accuract for top  10  prediction model made at 33.218 %.
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:50<00:00, 46.96it/s]


Count for masked once was:  527 , Count for masked twice was:  2160  both of them sum up to 2687  out of a total of  5187
So this brings us to the accuract for top  10  prediction model made at 51.803 %.
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:53<00:00, 45.56it/s]


Count for masked once was:  484 , Count for masked twice was:  1861  both of them sum up to 2345  out of a total of  5176
So this brings us to the accuract for top  10  prediction model made at 45.305 %.
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:50<00:00, 45.09it/s]


Count for masked once was:  476 , Count for masked twice was:  1477  both of them sum up to 1953  out of a total of  4973
So this brings us to the accuract for top  10  prediction model made at 39.272 %.
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:56<00:00, 42.64it/s]


Count for masked once was:  426 , Count for masked twice was:  1210  both of them sum up to 1636  out of a total of  4986
So this brings us to the accuract for top  10  prediction model made at 32.812 %.
Friday  The results are as follows: 


100%|██████████| 4407/4407 [01:46<00:00, 41.53it/s]


Count for masked once was:  273 , Count for masked twice was:  805  both of them sum up to 1078  out of a total of  4407
So this brings us to the accuract for top  10  prediction model made at 24.461 %.
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [01:44<00:00, 40.20it/s]

Count for masked once was:  265 , Count for masked twice was:  593  both of them sum up to 858  out of a total of  4200
So this brings us to the accuract for top  10  prediction model made at 20.429 %.
Total score as follows
Count for masked once was:  2826 , Count for masked twice was:  9365  both of them sum up to 12191  out of a total of  33848
So this brings us to the accuract for top  10  prediction model made at 36.017 %.





In [33]:
train_results_masked_twice_2015_5epochs = get_top_k_accuracy_by_days(model_MLM_masked_twice_2015,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [05:04<00:00, 56.36it/s]


Count for masked once was:  654 , Count for masked twice was:  11841  both of them sum up to 12495  out of a total of  17163
So this brings us to the accuract for top  10  prediction model made at 72.802 %.
Monday  The results are as follows: 


100%|██████████| 17980/17980 [04:53<00:00, 61.18it/s]


Count for masked once was:  532 , Count for masked twice was:  14097  both of them sum up to 14629  out of a total of  17980
So this brings us to the accuract for top  10  prediction model made at 81.363 %.
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [04:55<00:00, 59.62it/s]


Count for masked once was:  546 , Count for masked twice was:  13391  both of them sum up to 13937  out of a total of  17594
So this brings us to the accuract for top  10  prediction model made at 79.215 %.
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [04:54<00:00, 58.85it/s]


Count for masked once was:  616 , Count for masked twice was:  12754  both of them sum up to 13370  out of a total of  17358
So this brings us to the accuract for top  10  prediction model made at 77.025 %.
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [05:01<00:00, 56.59it/s]


Count for masked once was:  753 , Count for masked twice was:  11773  both of them sum up to 12526  out of a total of  17076
So this brings us to the accuract for top  10  prediction model made at 73.354 %.
Friday  The results are as follows: 


100%|██████████| 15519/15519 [04:45<00:00, 54.33it/s]


Count for masked once was:  638 , Count for masked twice was:  9792  both of them sum up to 10430  out of a total of  15519
So this brings us to the accuract for top  10  prediction model made at 67.208 %.
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [04:44<00:00, 52.17it/s]


Count for masked once was:  639 , Count for masked twice was:  8649  both of them sum up to 9288  out of a total of  14847
So this brings us to the accuract for top  10  prediction model made at 62.558 %.
Total score as follows
Count for masked once was:  4378 , Count for masked twice was:  82297  both of them sum up to 86675  out of a total of  117537
So this brings us to the accuract for top  10  prediction model made at 73.743 %.


# BERT MLM smart masking

Basicaly, smart masking means that we cheat the system abit to see if we get better result and on the basis that we dont know how much tokens doest the tokenizer will split the answer to. 

So what is a smart masking , so when we create the lists of the masked hints while creating the masked hint , contrary to what we have done on the double masking which is just add '[MASK] [MASK]' to where the answer should be in the hint .
In the smart masking we first tokenize the answer , and wee how much tokens doest the okenizer return .

i.e. the answer is 'asis' and the tokenizer tokinezed it to '[101] [546] [897] [102]' ,(two middle token are made up) we know now that the correct masking for the word containes two tokens  , and we will add '[MASK] [MASK]' to the hint.

this goes for both ways , if a word is masked in three token or in one , this will be reflect in the trainig ds masking and thus we hope and predict that the learning will be more effective and we hope in geting better result with smart masking.

**Start with importing the new clean model , tokenizer is not imported again because its the same one**

In [34]:
BERT_MLM_smart_masking = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


**Auxiliary functions**

In [35]:
def create_masks_for_hist(answer_len):
    tmp = ''
    for i in range(answer_len):
        tmp += '[MASK]'
        if i < answer_len-1:
              tmp += ' '
    return tmp

In [36]:
def create_masked_answered_lists_for_tokenization(df,tokenizer,padding_len = 64):
    # we are using smart masking which means first we check how many tokens is the answer in the model and than we add this
    # number of masks to the hint in the correct position , say the answer is tokenized into 3 token than the hint answered will
    # be hist : [MASK] [MASK] [MASK]
    hint_masked_list = []
    hint_answered_list = []
    answers_list = []
    for row in df.itertuples():
        tokenized_answer_len = 0
        tokenized_answer = tokenizer(str(row.Word),padding = True,truncation = True,max_length = padding_len , return_tensors = 'pt')
        tokenized_answer_len = len(tokenized_answer.input_ids[0].tolist())-2 # because it will always be [101 , x , x , 102]
        masks_for_hist = create_masks_for_hist(tokenized_answer_len)
        if '_' in str(row.Clue):
            masked = re.sub('___',masks_for_hist,row.Clue)
            answered = re.sub('___',str(row.Word),row.Clue)
        else:
            masked = str(row.Clue) + ' : ' + masks_for_hist
            answered = str(row.Clue) + ' : ' + str(row.Word)
        hint_masked_list.append(masked)
        hint_answered_list.append(answered)
        answers_list.append(str(row.Word))
    return hint_masked_list,hint_answered_list,answers_list

In [37]:
def get_top_k_accuracy_by_days_smart_masking(model,tokenizer,df_to_check,top_ks = 10):
    days = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']
    results_dict = {}
    overall_cnt = 0
    overall_total = 0
    for day in days:
        cnt = 0
        total = 0
        masked_by_day , answered_by_day, asnwers_day = create_masked_answered_lists_for_tokenization(df_to_check[df_to_check['day_of_week'] == day],tokenizer)
        day_inputs = tokenized_list_for_model(tokenizer , [masked_by_day,answered_by_day])
        print(day,' The results are as follows: ')
        cnt , total = get_topk_accuracy_smart_masking(model , tokenizer , day_inputs,asnwers_day,top_ks)
        results_dict[day] = {'total_day' : total , 'cnt_day' : cnt }
        overall_cnt += cnt
        overall_total += total
    results_dict['overall'] = {'total_enteries' : overall_total , 'total_cnt' : overall_cnt}
    print('Overall results for df as follows\n')
    print('Count for top ',top_ks,'predictions is ',overall_cnt,' out of total of ',overall_total,' enteries.')
    print('This gives us an accuracy precentage of',round((overall_cnt/overall_total)*100,3))
    return results_dict

In [38]:
def get_topk_accuracy_smart_masking(model,tokenizer,masked_tokenized , answered_untokenzed_list,top_ks = 10):
    cnt = 0
    total = len(answered_untokenzed_list)
    loop = tqdm(range(total), leave=True)
    for i in loop:
        answer = answered_untokenzed_list[i]
        input_ids = masked_tokenized.input_ids[i].to(model.device).unsqueeze(0)
        token_type_ids = masked_tokenized.token_type_ids[i].to(model.device).unsqueeze(0)
        attention_masks = masked_tokenized.attention_mask[i].to(model.device).unsqueeze(0)
        
        output = model(input_ids = input_ids,token_type_ids = token_type_ids ,attention_mask = attention_masks)
        
        masked_id_list = []
        for mask_id in range(masked_tokenized.input_ids[i].shape[0]):
            if masked_tokenized.input_ids[i][mask_id] == 103:
                  masked_id_list.append(mask_id)
        
        topk_predictions_indices = output.logits[0][masked_id_list].topk(5).indices
        
        if ( len(topk_predictions_indices) == 0): # for some reason no masked tokens were found
            continue
        predictions = []
        
#         print(len(topk_predictions_indices[0]))
        for i in range(len(topk_predictions_indices[0])): # on top k predication
            tokens_to_decode = []
            for k in range(len(masked_id_list)): # on masked tokens
                tokens_to_decode.append(topk_predictions_indices[k][i])
            predictions.extend(tokenizer.decode(tokens_to_decode).split())
        
        if ( answer.lower() in predictions):
            cnt += 1
    print('Count for top ',top_ks,'predictions is ',cnt,' out of total of ',total,' enteries.')
    print('This gives us an accuracy precentage of',round((cnt/total)*100,3))
    return cnt,total

**Tokenizing the train df for smart masking**

In [39]:
masked_smart_train , answered_smart_train , answers_smart_list_train = create_masked_answered_lists_for_tokenization(nyt_train_df,tokenizer)

As before , the function returns three lists
1. first is the hint masked using smart masking - 'masked_smart_train'
2. second is hint with the answer in the same position as the mask - 'answered_smart_train'
3. is the answers only list , gets a different name for differantiation , not different at all

In [40]:
print('Lets see if the shape of the lists are the same: ',len(masked_smart_train) == len(answered_smart_train) == len(answers_smart_list_train))

Lets see if the shape of the lists are the same:  True


**Tokenize the lists as before , nothing different in thetokenization procedure**

In [41]:
tokenized_train_smart_masked = tokenized_list_for_model(tokenizer ,[masked_smart_train,answered_smart_train] )

In [42]:
print('shape of the tokenized train ds just to be suure',tokenized_train_smart_masked.input_ids.shape)

shape of the tokenized train ds just to be suure torch.Size([117537, 44])


**Creating the trtaining loop and its dataloaders and optimizers**

As before , we train for 3 epochs , get accuracy on eval and train and train some more if needed , this time because the masking is done smartly we do not predict the traning proccess will requre more than 3 epochs

In [43]:
dataset_train_smart = CrossWordsDataset(tokenized_train_smart_masked)

In [44]:
loader_train_smart = torch.utils.data.DataLoader(dataset_train_smart, batch_size=32, shuffle=True)

In [45]:
BERT_MLM_smart_masking.to(device)
BERT_MLM_smart_masking.train()

# initialize optimizer
optim = AdamW(BERT_MLM_smart_masking.parameters(), lr=5e-5)

In [46]:
epochs = 3

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader_train_smart, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = BERT_MLM_smart_masking(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    pickle.dump(BERT_MLM_smart_masking, open('//home//student//project//BERT_MLM_smart_masking_batch32_pad64_2015'+'_epoch'+str(epoch)+'.sav', 'wb'))

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 3674/3674 [28:42<00:00,  2.13it/s, loss=0.336]
Epoch 1: 100%|██████████| 3674/3674 [28:45<00:00,  2.13it/s, loss=0.0415]
Epoch 2: 100%|██████████| 3674/3674 [28:44<00:00,  2.13it/s, loss=0.15]  


**lets evaluate**

In [47]:
BERT_MLM_smart_masking.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

First we will evaluate the eval and train df with the accuracy function for the previous model and than with the new function for the smart masking
Generallly speaking they are not different just the second was created for smart masking and is abit faster and prints abit differently

In [48]:
eval_results_smart_masked_2015_3epochs_old_acc = get_top_k_accuracy_by_days(BERT_MLM_smart_masking,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [02:00<00:00, 40.78it/s]


Count for masked once was:  965 , Count for masked twice was:  871  both of them sum up to 1836  out of a total of  4919
So this brings us to the accuract for top  10  prediction model made at 37.325 %.
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:58<00:00, 43.91it/s]


Count for masked once was:  1349 , Count for masked twice was:  1585  both of them sum up to 2934  out of a total of  5187
So this brings us to the accuract for top  10  prediction model made at 56.564 %.
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [02:00<00:00, 42.99it/s]


Count for masked once was:  1231 , Count for masked twice was:  1396  both of them sum up to 2627  out of a total of  5176
So this brings us to the accuract for top  10  prediction model made at 50.753 %.
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:56<00:00, 42.65it/s]


Count for masked once was:  1163 , Count for masked twice was:  1042  both of them sum up to 2205  out of a total of  4973
So this brings us to the accuract for top  10  prediction model made at 44.339 %.
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [02:02<00:00, 40.62it/s]


Count for masked once was:  1016 , Count for masked twice was:  826  both of them sum up to 1842  out of a total of  4986
So this brings us to the accuract for top  10  prediction model made at 36.943 %.
Friday  The results are as follows: 


100%|██████████| 4407/4407 [01:51<00:00, 39.68it/s]


Count for masked once was:  741 , Count for masked twice was:  537  both of them sum up to 1278  out of a total of  4407
So this brings us to the accuract for top  10  prediction model made at 28.999 %.
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [01:48<00:00, 38.86it/s]

Count for masked once was:  628 , Count for masked twice was:  387  both of them sum up to 1015  out of a total of  4200
So this brings us to the accuract for top  10  prediction model made at 24.167 %.
Total score as follows
Count for masked once was:  7093 , Count for masked twice was:  6644  both of them sum up to 13737  out of a total of  33848
So this brings us to the accuract for top  10  prediction model made at 40.584 %.





In [49]:
train_results_smart_masked_2015_3epochs_old_acc = get_top_k_accuracy_by_days(BERT_MLM_smart_masking,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [06:09<00:00, 46.39it/s]


Count for masked once was:  4574 , Count for masked twice was:  6876  both of them sum up to 11450  out of a total of  17163
So this brings us to the accuract for top  10  prediction model made at 66.713 %.
Monday  The results are as follows: 


100%|██████████| 17980/17980 [05:55<00:00, 50.57it/s]


Count for masked once was:  4633 , Count for masked twice was:  9356  both of them sum up to 13989  out of a total of  17980
So this brings us to the accuract for top  10  prediction model made at 77.803 %.
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [06:01<00:00, 48.65it/s]


Count for masked once was:  4722 , Count for masked twice was:  8491  both of them sum up to 13213  out of a total of  17594
So this brings us to the accuract for top  10  prediction model made at 75.099 %.
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [06:05<00:00, 47.47it/s]


Count for masked once was:  4948 , Count for masked twice was:  7560  both of them sum up to 12508  out of a total of  17358
So this brings us to the accuract for top  10  prediction model made at 72.059 %.
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [06:13<00:00, 45.72it/s]


Count for masked once was:  5004 , Count for masked twice was:  6444  both of them sum up to 11448  out of a total of  17076
So this brings us to the accuract for top  10  prediction model made at 67.041 %.
Friday  The results are as follows: 


100%|██████████| 15519/15519 [05:50<00:00, 44.23it/s]


Count for masked once was:  4369 , Count for masked twice was:  5024  both of them sum up to 9393  out of a total of  15519
So this brings us to the accuract for top  10  prediction model made at 60.526 %.
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [05:39<00:00, 43.78it/s]

Count for masked once was:  3840 , Count for masked twice was:  4429  both of them sum up to 8269  out of a total of  14847
So this brings us to the accuract for top  10  prediction model made at 55.695 %.
Total score as follows
Count for masked once was:  32090 , Count for masked twice was:  48180  both of them sum up to 80270  out of a total of  117537
So this brings us to the accuract for top  10  prediction model made at 68.293 %.





**Now lets use the new accuracy function**

In [50]:
eval_df_results_3epochs_smart_masking_acc_2015 = get_top_k_accuracy_by_days_smart_masking(BERT_MLM_smart_masking,tokenizer , nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:06<00:00, 74.05it/s]


Count for top  10 predictions is  1710  out of total of  4919  enteries.
This gives us an accuracy precentage of 34.763
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:09<00:00, 74.49it/s]


Count for top  10 predictions is  2867  out of total of  5187  enteries.
This gives us an accuracy precentage of 55.273
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:08<00:00, 75.39it/s]


Count for top  10 predictions is  2541  out of total of  5176  enteries.
This gives us an accuracy precentage of 49.092
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:04<00:00, 77.31it/s]


Count for top  10 predictions is  2098  out of total of  4973  enteries.
This gives us an accuracy precentage of 42.188
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:06<00:00, 74.93it/s]


Count for top  10 predictions is  1733  out of total of  4986  enteries.
This gives us an accuracy precentage of 34.757
Friday  The results are as follows: 


100%|██████████| 4407/4407 [00:59<00:00, 74.57it/s]


Count for top  10 predictions is  1185  out of total of  4407  enteries.
This gives us an accuracy precentage of 26.889
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [00:56<00:00, 74.51it/s]

Count for top  10 predictions is  910  out of total of  4200  enteries.
This gives us an accuracy precentage of 21.667
Overall results for df as follows

Count for top  10 predictions is  13044  out of total of  33848  enteries.
This gives us an accuracy precentage of 38.537





In [51]:
train_df_results_3epochs_smart_masking_acc_2015 = get_top_k_accuracy_by_days_smart_masking(BERT_MLM_smart_masking,tokenizer , nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [03:50<00:00, 74.56it/s]


Count for top  10 predictions is  11669  out of total of  17163  enteries.
This gives us an accuracy precentage of 67.989
Monday  The results are as follows: 


100%|██████████| 17980/17980 [03:59<00:00, 74.93it/s]


Count for top  10 predictions is  14318  out of total of  17980  enteries.
This gives us an accuracy precentage of 79.633
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [03:56<00:00, 74.37it/s]


Count for top  10 predictions is  13470  out of total of  17594  enteries.
This gives us an accuracy precentage of 76.56
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [03:51<00:00, 74.86it/s]


Count for top  10 predictions is  12723  out of total of  17358  enteries.
This gives us an accuracy precentage of 73.298
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [03:49<00:00, 74.49it/s]


Count for top  10 predictions is  11620  out of total of  17076  enteries.
This gives us an accuracy precentage of 68.049
Friday  The results are as follows: 


100%|██████████| 15519/15519 [03:27<00:00, 74.84it/s]


Count for top  10 predictions is  9505  out of total of  15519  enteries.
This gives us an accuracy precentage of 61.248
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [03:19<00:00, 74.51it/s]

Count for top  10 predictions is  8383  out of total of  14847  enteries.
This gives us an accuracy precentage of 56.463
Overall results for df as follows

Count for top  10 predictions is  81688  out of total of  117537  enteries.
This gives us an accuracy precentage of 69.5





**Well, the result are not that bad on the train , but the eval results arenot as good , only on monday we can see a adequate results.
Monday is the easiest crossword of the week**

**Lets train the smart masking model for two more epochs , after we will try the smart masking model after 2 epochs because it has alow loss score**

In [52]:
BERT_MLM_smart_masking.to(device)
BERT_MLM_smart_masking.train()

# initialize optimizer
optim = AdamW(BERT_MLM_smart_masking.parameters(), lr=5e-5)

In [53]:
epochs = 2

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader_train_smart, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = BERT_MLM_smart_masking(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
    pickle.dump(BERT_MLM_smart_masking, open('//home//student//project//BERT_MLM_smart_masking_batch32_pad64_2015'+'_epoch'+str(epoch+3)+'.sav', 'wb'))

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 3674/3674 [28:41<00:00,  2.13it/s, loss=0.00371]
Epoch 1: 100%|██████████| 3674/3674 [28:42<00:00,  2.13it/s, loss=0.0897]


Now back to evaluating the eval and train df

In [54]:
BERT_MLM_smart_masking.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

Firstly with old twice and single masking technique

In [55]:
eval_results_smart_masked_2015_5epochs_old_acc = get_top_k_accuracy_by_days(BERT_MLM_smart_masking,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:56<00:00, 42.37it/s]


Count for masked once was:  789 , Count for masked twice was:  1198  both of them sum up to 1987  out of a total of  4919
So this brings us to the accuract for top  10  prediction model made at 40.394 %.
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:52<00:00, 46.04it/s]


Count for masked once was:  1081 , Count for masked twice was:  2014  both of them sum up to 3095  out of a total of  5187
So this brings us to the accuract for top  10  prediction model made at 59.668 %.
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:55<00:00, 44.88it/s]


Count for masked once was:  977 , Count for masked twice was:  1765  both of them sum up to 2742  out of a total of  5176
So this brings us to the accuract for top  10  prediction model made at 52.975 %.
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:52<00:00, 44.07it/s]


Count for masked once was:  956 , Count for masked twice was:  1363  both of them sum up to 2319  out of a total of  4973
So this brings us to the accuract for top  10  prediction model made at 46.632 %.
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:58<00:00, 42.00it/s]


Count for masked once was:  866 , Count for masked twice was:  1129  both of them sum up to 1995  out of a total of  4986
So this brings us to the accuract for top  10  prediction model made at 40.012 %.
Friday  The results are as follows: 


100%|██████████| 4407/4407 [01:49<00:00, 40.34it/s]


Count for masked once was:  637 , Count for masked twice was:  712  both of them sum up to 1349  out of a total of  4407
So this brings us to the accuract for top  10  prediction model made at 30.61 %.
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [01:45<00:00, 39.92it/s]

Count for masked once was:  513 , Count for masked twice was:  551  both of them sum up to 1064  out of a total of  4200
So this brings us to the accuract for top  10  prediction model made at 25.333 %.
Total score as follows
Count for masked once was:  5819 , Count for masked twice was:  8732  both of them sum up to 14551  out of a total of  33848
So this brings us to the accuract for top  10  prediction model made at 42.989 %.





In [56]:
train_results_smart_masked_2015_5epochs_old_acc = get_top_k_accuracy_by_days(BERT_MLM_smart_masking,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [05:14<00:00, 54.52it/s]


Count for masked once was:  2838 , Count for masked twice was:  11228  both of them sum up to 14066  out of a total of  17163
So this brings us to the accuract for top  10  prediction model made at 81.955 %.
Monday  The results are as follows: 


100%|██████████| 17980/17980 [05:10<00:00, 57.92it/s]


Count for masked once was:  2562 , Count for masked twice was:  12992  both of them sum up to 15554  out of a total of  17980
So this brings us to the accuract for top  10  prediction model made at 86.507 %.
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [05:10<00:00, 56.67it/s]


Count for masked once was:  2702 , Count for masked twice was:  12432  both of them sum up to 15134  out of a total of  17594
So this brings us to the accuract for top  10  prediction model made at 86.018 %.
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [05:11<00:00, 55.69it/s]


Count for masked once was:  2954 , Count for masked twice was:  11809  both of them sum up to 14763  out of a total of  17358
So this brings us to the accuract for top  10  prediction model made at 85.05 %.
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [05:16<00:00, 53.90it/s]


Count for masked once was:  3189 , Count for masked twice was:  10793  both of them sum up to 13982  out of a total of  17076
So this brings us to the accuract for top  10  prediction model made at 81.881 %.
Friday  The results are as follows: 


100%|██████████| 15519/15519 [04:57<00:00, 52.13it/s]


Count for masked once was:  2784 , Count for masked twice was:  9174  both of them sum up to 11958  out of a total of  15519
So this brings us to the accuract for top  10  prediction model made at 77.054 %.
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [04:46<00:00, 51.82it/s]

Count for masked once was:  2569 , Count for masked twice was:  8479  both of them sum up to 11048  out of a total of  14847
So this brings us to the accuract for top  10  prediction model made at 74.412 %.
Total score as follows
Count for masked once was:  19598 , Count for masked twice was:  76907  both of them sum up to 96505  out of a total of  117537
So this brings us to the accuract for top  10  prediction model made at 82.106 %.





Now lets evaluate using accuracy function made for smart msking

In [57]:
eval_df_results_5epochs_smart_masking_acc_2015 = get_top_k_accuracy_by_days_smart_masking(BERT_MLM_smart_masking,tokenizer , nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:06<00:00, 74.36it/s]


Count for top  10 predictions is  1896  out of total of  4919  enteries.
This gives us an accuracy precentage of 38.544
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:09<00:00, 74.83it/s]


Count for top  10 predictions is  3025  out of total of  5187  enteries.
This gives us an accuracy precentage of 58.319
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:08<00:00, 75.52it/s]


Count for top  10 predictions is  2680  out of total of  5176  enteries.
This gives us an accuracy precentage of 51.777
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:04<00:00, 76.86it/s]


Count for top  10 predictions is  2268  out of total of  4973  enteries.
This gives us an accuracy precentage of 45.606
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:07<00:00, 74.31it/s]


Count for top  10 predictions is  1890  out of total of  4986  enteries.
This gives us an accuracy precentage of 37.906
Friday  The results are as follows: 


100%|██████████| 4407/4407 [00:59<00:00, 74.24it/s]


Count for top  10 predictions is  1278  out of total of  4407  enteries.
This gives us an accuracy precentage of 28.999
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [00:55<00:00, 75.34it/s]

Count for top  10 predictions is  1011  out of total of  4200  enteries.
This gives us an accuracy precentage of 24.071
Overall results for df as follows

Count for top  10 predictions is  14048  out of total of  33848  enteries.
This gives us an accuracy precentage of 41.503





In [58]:
train_df_results_5epochs_smart_masking_acc_2015 = get_top_k_accuracy_by_days_smart_masking(BERT_MLM_smart_masking,tokenizer , nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [03:49<00:00, 74.62it/s]


Count for top  10 predictions is  15006  out of total of  17163  enteries.
This gives us an accuracy precentage of 87.432
Monday  The results are as follows: 


100%|██████████| 17980/17980 [04:01<00:00, 74.58it/s]


Count for top  10 predictions is  16448  out of total of  17980  enteries.
This gives us an accuracy precentage of 91.479
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [03:55<00:00, 74.78it/s]


Count for top  10 predictions is  15969  out of total of  17594  enteries.
This gives us an accuracy precentage of 90.764
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [03:53<00:00, 74.49it/s]


Count for top  10 predictions is  15617  out of total of  17358  enteries.
This gives us an accuracy precentage of 89.97
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [03:48<00:00, 74.75it/s]


Count for top  10 predictions is  14865  out of total of  17076  enteries.
This gives us an accuracy precentage of 87.052
Friday  The results are as follows: 


100%|██████████| 15519/15519 [03:29<00:00, 74.06it/s]


Count for top  10 predictions is  12880  out of total of  15519  enteries.
This gives us an accuracy precentage of 82.995
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [03:19<00:00, 74.48it/s]

Count for top  10 predictions is  11996  out of total of  14847  enteries.
This gives us an accuracy precentage of 80.797
Overall results for df as follows

Count for top  10 predictions is  102781  out of total of  117537  enteries.
This gives us an accuracy precentage of 87.446





**Lets check if the accuracy precentage is different with the smart masking function**

In [59]:
eval_results_masked_twice_2015_5epochs_smart_func_acc = get_top_k_accuracy_by_days_smart_masking(model_MLM_masked_twice_2015,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:05<00:00, 75.09it/s]


Count for top  10 predictions is  1505  out of total of  4919  enteries.
This gives us an accuracy precentage of 30.596
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:09<00:00, 75.10it/s]


Count for top  10 predictions is  2547  out of total of  5187  enteries.
This gives us an accuracy precentage of 49.104
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:08<00:00, 75.20it/s]


Count for top  10 predictions is  2174  out of total of  5176  enteries.
This gives us an accuracy precentage of 42.002
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:03<00:00, 77.81it/s]


Count for top  10 predictions is  1802  out of total of  4973  enteries.
This gives us an accuracy precentage of 36.236
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:07<00:00, 74.32it/s]


Count for top  10 predictions is  1502  out of total of  4986  enteries.
This gives us an accuracy precentage of 30.124
Friday  The results are as follows: 


100%|██████████| 4407/4407 [00:59<00:00, 74.61it/s]


Count for top  10 predictions is  969  out of total of  4407  enteries.
This gives us an accuracy precentage of 21.988
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [00:56<00:00, 74.75it/s]

Count for top  10 predictions is  751  out of total of  4200  enteries.
This gives us an accuracy precentage of 17.881
Overall results for df as follows

Count for top  10 predictions is  11250  out of total of  33848  enteries.
This gives us an accuracy precentage of 33.237





In [60]:
train_results_masked_twice_2015_5epochs_smart_func_acc = get_top_k_accuracy_by_days_smart_masking(model_MLM_masked_twice_2015,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [03:51<00:00, 74.06it/s]


Count for top  10 predictions is  12290  out of total of  17163  enteries.
This gives us an accuracy precentage of 71.608
Monday  The results are as follows: 


100%|██████████| 17980/17980 [04:02<00:00, 74.24it/s]


Count for top  10 predictions is  14532  out of total of  17980  enteries.
This gives us an accuracy precentage of 80.823
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [03:54<00:00, 74.94it/s]


Count for top  10 predictions is  13810  out of total of  17594  enteries.
This gives us an accuracy precentage of 78.493
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [03:53<00:00, 74.28it/s]


Count for top  10 predictions is  13182  out of total of  17358  enteries.
This gives us an accuracy precentage of 75.942
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [03:49<00:00, 74.40it/s]


Count for top  10 predictions is  12298  out of total of  17076  enteries.
This gives us an accuracy precentage of 72.019
Friday  The results are as follows: 


100%|██████████| 15519/15519 [03:28<00:00, 74.50it/s]


Count for top  10 predictions is  10221  out of total of  15519  enteries.
This gives us an accuracy precentage of 65.861
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [03:17<00:00, 75.24it/s]

Count for top  10 predictions is  9062  out of total of  14847  enteries.
This gives us an accuracy precentage of 61.036
Overall results for df as follows

Count for top  10 predictions is  85395  out of total of  117537  enteries.
This gives us an accuracy precentage of 72.654





# Evaluating not fune-tuned BERT MLM 

Now we will see the top 10 accuracy precentage a non fine-tuned BERT MLM model can reach

In [61]:
BERT_clean = BertForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [62]:
BERT_clean.to(device)
BERT_clean.eval()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [63]:
eval_df_results_bert_basic_2015_data_old_acc = get_top_k_accuracy_by_days(BERT_clean,tokenizer,nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [02:10<00:00, 37.78it/s]


Count for masked once was:  9 , Count for masked twice was:  69  both of them sum up to 78  out of a total of  4919
So this brings us to the accuract for top  10  prediction model made at 1.586 %.
Monday  The results are as follows: 


100%|██████████| 5187/5187 [02:15<00:00, 38.30it/s]


Count for masked once was:  31 , Count for masked twice was:  260  both of them sum up to 291  out of a total of  5187
So this brings us to the accuract for top  10  prediction model made at 5.61 %.
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [02:16<00:00, 37.90it/s]


Count for masked once was:  16 , Count for masked twice was:  151  both of them sum up to 167  out of a total of  5176
So this brings us to the accuract for top  10  prediction model made at 3.226 %.
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [02:07<00:00, 39.02it/s]


Count for masked once was:  12 , Count for masked twice was:  115  both of them sum up to 127  out of a total of  4973
So this brings us to the accuract for top  10  prediction model made at 2.554 %.
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [02:12<00:00, 37.64it/s]


Count for masked once was:  11 , Count for masked twice was:  95  both of them sum up to 106  out of a total of  4986
So this brings us to the accuract for top  10  prediction model made at 2.126 %.
Friday  The results are as follows: 


100%|██████████| 4407/4407 [01:58<00:00, 37.31it/s]


Count for masked once was:  10 , Count for masked twice was:  53  both of them sum up to 63  out of a total of  4407
So this brings us to the accuract for top  10  prediction model made at 1.43 %.
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [01:51<00:00, 37.58it/s]

Count for masked once was:  7 , Count for masked twice was:  51  both of them sum up to 58  out of a total of  4200
So this brings us to the accuract for top  10  prediction model made at 1.381 %.
Total score as follows
Count for masked once was:  96 , Count for masked twice was:  794  both of them sum up to 890  out of a total of  33848
So this brings us to the accuract for top  10  prediction model made at 2.629 %.





In [64]:
train_df_results_bert_basic_2015_data_old_acc = get_top_k_accuracy_by_days(BERT_clean,tokenizer,nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [07:36<00:00, 37.61it/s]


Count for masked once was:  34 , Count for masked twice was:  294  both of them sum up to 328  out of a total of  17163
So this brings us to the accuract for top  10  prediction model made at 1.911 %.
Monday  The results are as follows: 


100%|██████████| 17980/17980 [07:43<00:00, 38.77it/s]


Count for masked once was:  76 , Count for masked twice was:  803  both of them sum up to 879  out of a total of  17980
So this brings us to the accuract for top  10  prediction model made at 4.889 %.
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [07:45<00:00, 37.78it/s]


Count for masked once was:  63 , Count for masked twice was:  484  both of them sum up to 547  out of a total of  17594
So this brings us to the accuract for top  10  prediction model made at 3.109 %.
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [07:39<00:00, 37.74it/s]


Count for masked once was:  41 , Count for masked twice was:  409  both of them sum up to 450  out of a total of  17358
So this brings us to the accuract for top  10  prediction model made at 2.592 %.
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [07:33<00:00, 37.68it/s]


Count for masked once was:  31 , Count for masked twice was:  280  both of them sum up to 311  out of a total of  17076
So this brings us to the accuract for top  10  prediction model made at 1.821 %.
Friday  The results are as follows: 


100%|██████████| 15519/15519 [06:52<00:00, 37.59it/s]


Count for masked once was:  29 , Count for masked twice was:  203  both of them sum up to 232  out of a total of  15519
So this brings us to the accuract for top  10  prediction model made at 1.495 %.
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [06:35<00:00, 37.52it/s]

Count for masked once was:  25 , Count for masked twice was:  153  both of them sum up to 178  out of a total of  14847
So this brings us to the accuract for top  10  prediction model made at 1.199 %.
Total score as follows
Count for masked once was:  299 , Count for masked twice was:  2626  both of them sum up to 2925  out of a total of  117537
So this brings us to the accuract for top  10  prediction model made at 2.489 %.





In [65]:
eval_df_results_bert_basic_smart_masking_2015_data = get_top_k_accuracy_by_days_smart_masking(BERT_clean,tokenizer , nyt_eval_df)

Sunday  The results are as follows: 


100%|██████████| 4919/4919 [01:05<00:00, 74.63it/s]


Count for top  10 predictions is  50  out of total of  4919  enteries.
This gives us an accuracy precentage of 1.016
Monday  The results are as follows: 


100%|██████████| 5187/5187 [01:09<00:00, 74.38it/s]


Count for top  10 predictions is  165  out of total of  5187  enteries.
This gives us an accuracy precentage of 3.181
Tuesday  The results are as follows: 


100%|██████████| 5176/5176 [01:08<00:00, 75.07it/s]


Count for top  10 predictions is  89  out of total of  5176  enteries.
This gives us an accuracy precentage of 1.719
Wednesday  The results are as follows: 


100%|██████████| 4973/4973 [01:04<00:00, 76.72it/s]


Count for top  10 predictions is  69  out of total of  4973  enteries.
This gives us an accuracy precentage of 1.387
Thursday  The results are as follows: 


100%|██████████| 4986/4986 [01:06<00:00, 74.76it/s]


Count for top  10 predictions is  57  out of total of  4986  enteries.
This gives us an accuracy precentage of 1.143
Friday  The results are as follows: 


100%|██████████| 4407/4407 [00:59<00:00, 74.64it/s]


Count for top  10 predictions is  29  out of total of  4407  enteries.
This gives us an accuracy precentage of 0.658
Saturday  The results are as follows: 


100%|██████████| 4200/4200 [00:56<00:00, 74.83it/s]

Count for top  10 predictions is  33  out of total of  4200  enteries.
This gives us an accuracy precentage of 0.786
Overall results for df as follows

Count for top  10 predictions is  492  out of total of  33848  enteries.
This gives us an accuracy precentage of 1.454





In [66]:
train_df_results_bert_basic_smart_masking_2015_data = get_top_k_accuracy_by_days_smart_masking(BERT_clean,tokenizer , nyt_train_df)

Sunday  The results are as follows: 


100%|██████████| 17163/17163 [03:50<00:00, 74.47it/s]


Count for top  10 predictions is  185  out of total of  17163  enteries.
This gives us an accuracy precentage of 1.078
Monday  The results are as follows: 


100%|██████████| 17980/17980 [04:00<00:00, 74.77it/s]


Count for top  10 predictions is  481  out of total of  17980  enteries.
This gives us an accuracy precentage of 2.675
Tuesday  The results are as follows: 


100%|██████████| 17594/17594 [03:54<00:00, 74.95it/s]


Count for top  10 predictions is  300  out of total of  17594  enteries.
This gives us an accuracy precentage of 1.705
Wednesday  The results are as follows: 


100%|██████████| 17358/17358 [03:51<00:00, 74.84it/s]


Count for top  10 predictions is  235  out of total of  17358  enteries.
This gives us an accuracy precentage of 1.354
Thursday  The results are as follows: 


100%|██████████| 17076/17076 [03:48<00:00, 74.82it/s]


Count for top  10 predictions is  166  out of total of  17076  enteries.
This gives us an accuracy precentage of 0.972
Friday  The results are as follows: 


100%|██████████| 15519/15519 [03:28<00:00, 74.42it/s]


Count for top  10 predictions is  115  out of total of  15519  enteries.
This gives us an accuracy precentage of 0.741
Saturday  The results are as follows: 


100%|██████████| 14847/14847 [03:18<00:00, 74.63it/s]

Count for top  10 predictions is  90  out of total of  14847  enteries.
This gives us an accuracy precentage of 0.606
Overall results for df as follows

Count for top  10 predictions is  1572  out of total of  117537  enteries.
This gives us an accuracy precentage of 1.337





# Lets run the models on the test set which they never saw before.

**We will use the 5 epochs models to simulate the world distribution in the strongest model we trained**

First load test_df

In [67]:
nyt_test_df = pd.read_csv('//home//student//project//project_final_files//nytcrosswords_balanced_days_test_2015.csv',parse_dates = ['Date'], encoding ="ISO-8859-1")

In [68]:
nyt_test_df.drop(columns = ['Unnamed: 0'],inplace = True)
nyt_test_df

Unnamed: 0,Date,Word,Clue,day_of_week,year
0,2021-10-31,SRO,Inits. for a theatrical hit,Sunday,2021
1,2021-10-31,STASIS,State of equilibrium,Sunday,2021
2,2021-10-31,SACHS,"Samuel ___, business partner of Marcus Goldman",Sunday,2021
3,2021-10-31,YES,"""You betcha!""",Sunday,2021
4,2021-10-31,TOETOTOE,Mano a mano,Sunday,2021
...,...,...,...,...,...
16679,2015-01-01,PED,"Taxi eschewer, for short",Thursday,2015
16680,2015-01-01,SWANK,Ritzy,Thursday,2015
16681,2015-01-01,PASTE,Wallop,Thursday,2015
16682,2015-01-01,NAFTA,Clinton-backed pact,Thursday,2015


**First evaluate using the naive masking**

In [69]:
test_results_naive_masking_5epochs_MLM_twice = get_top_k_accuracy_by_days(model_MLM_masked_twice_2015,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [00:58<00:00, 41.71it/s]


Count for masked once was:  193 , Count for masked twice was:  600  both of them sum up to 793  out of a total of  2451
So this brings us to the accuract for top  10  prediction model made at 32.354 %.
Monday  The results are as follows: 


100%|██████████| 2548/2548 [00:53<00:00, 47.99it/s]


Count for masked once was:  249 , Count for masked twice was:  1110  both of them sum up to 1359  out of a total of  2548
So this brings us to the accuract for top  10  prediction model made at 53.336 %.
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [00:54<00:00, 46.26it/s]


Count for masked once was:  240 , Count for masked twice was:  905  both of them sum up to 1145  out of a total of  2511
So this brings us to the accuract for top  10  prediction model made at 45.599 %.
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [00:54<00:00, 44.84it/s]


Count for masked once was:  222 , Count for masked twice was:  776  both of them sum up to 998  out of a total of  2437
So this brings us to the accuract for top  10  prediction model made at 40.952 %.
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [00:58<00:00, 42.52it/s]


Count for masked once was:  206 , Count for masked twice was:  570  both of them sum up to 776  out of a total of  2491
So this brings us to the accuract for top  10  prediction model made at 31.152 %.
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:51<00:00, 41.50it/s]


Count for masked once was:  163 , Count for masked twice was:  379  both of them sum up to 542  out of a total of  2136
So this brings us to the accuract for top  10  prediction model made at 25.375 %.
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:52<00:00, 39.85it/s]

Count for masked once was:  112 , Count for masked twice was:  303  both of them sum up to 415  out of a total of  2110
So this brings us to the accuract for top  10  prediction model made at 19.668 %.
Total score as follows
Count for masked once was:  1385 , Count for masked twice was:  4643  both of them sum up to 6028  out of a total of  16684
So this brings us to the accuract for top  10  prediction model made at 36.13 %.





**Secondly with the smart masking technique**

In [70]:
test_results_smart_masking_5epochs_MLM_smart_masking_BERT = get_top_k_accuracy_by_days_smart_masking(model_MLM_masked_twice_2015,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [00:33<00:00, 73.98it/s]


Count for top  10 predictions is  725  out of total of  2451  enteries.
This gives us an accuracy precentage of 29.58
Monday  The results are as follows: 


100%|██████████| 2548/2548 [00:33<00:00, 77.04it/s]


Count for top  10 predictions is  1294  out of total of  2548  enteries.
This gives us an accuracy precentage of 50.785
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [00:32<00:00, 76.71it/s]


Count for top  10 predictions is  1057  out of total of  2511  enteries.
This gives us an accuracy precentage of 42.095
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [00:31<00:00, 76.39it/s]


Count for top  10 predictions is  922  out of total of  2437  enteries.
This gives us an accuracy precentage of 37.833
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [00:32<00:00, 76.75it/s]


Count for top  10 predictions is  700  out of total of  2491  enteries.
This gives us an accuracy precentage of 28.101
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:27<00:00, 77.23it/s]


Count for top  10 predictions is  489  out of total of  2136  enteries.
This gives us an accuracy precentage of 22.893
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:28<00:00, 74.50it/s]

Count for top  10 predictions is  370  out of total of  2110  enteries.
This gives us an accuracy precentage of 17.536
Overall results for df as follows

Count for top  10 predictions is  5557  out of total of  16684  enteries.
This gives us an accuracy precentage of 33.307





In [71]:
test_result_naitve_mask_func_smart_bert_model_5epochs = get_top_k_accuracy_by_days(BERT_MLM_smart_masking,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [00:58<00:00, 41.79it/s]


Count for masked once was:  378 , Count for masked twice was:  566  both of them sum up to 944  out of a total of  2451
So this brings us to the accuract for top  10  prediction model made at 38.515 %.
Monday  The results are as follows: 


100%|██████████| 2548/2548 [00:53<00:00, 47.90it/s]


Count for masked once was:  515 , Count for masked twice was:  1011  both of them sum up to 1526  out of a total of  2548
So this brings us to the accuract for top  10  prediction model made at 59.89 %.
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [00:55<00:00, 45.13it/s]


Count for masked once was:  482 , Count for masked twice was:  822  both of them sum up to 1304  out of a total of  2511
So this brings us to the accuract for top  10  prediction model made at 51.932 %.
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [00:54<00:00, 44.41it/s]


Count for masked once was:  460 , Count for masked twice was:  687  both of them sum up to 1147  out of a total of  2437
So this brings us to the accuract for top  10  prediction model made at 47.066 %.
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [00:58<00:00, 42.41it/s]


Count for masked once was:  418 , Count for masked twice was:  536  both of them sum up to 954  out of a total of  2491
So this brings us to the accuract for top  10  prediction model made at 38.298 %.
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:51<00:00, 41.84it/s]


Count for masked once was:  308 , Count for masked twice was:  356  both of them sum up to 664  out of a total of  2136
So this brings us to the accuract for top  10  prediction model made at 31.086 %.
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:53<00:00, 39.54it/s]

Count for masked once was:  268 , Count for masked twice was:  272  both of them sum up to 540  out of a total of  2110
So this brings us to the accuract for top  10  prediction model made at 25.592 %.
Total score as follows
Count for masked once was:  2829 , Count for masked twice was:  4250  both of them sum up to 7079  out of a total of  16684
So this brings us to the accuract for top  10  prediction model made at 42.43 %.





In [72]:
test_result_smart_mask_func_smart_bert_model_5epochs = get_top_k_accuracy_by_days_smart_masking(BERT_MLM_smart_masking,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [00:32<00:00, 74.43it/s]


Count for top  10 predictions is  918  out of total of  2451  enteries.
This gives us an accuracy precentage of 37.454
Monday  The results are as follows: 


100%|██████████| 2548/2548 [00:33<00:00, 76.02it/s]


Count for top  10 predictions is  1502  out of total of  2548  enteries.
This gives us an accuracy precentage of 58.948
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [00:32<00:00, 77.18it/s]


Count for top  10 predictions is  1280  out of total of  2511  enteries.
This gives us an accuracy precentage of 50.976
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [00:31<00:00, 76.97it/s]


Count for top  10 predictions is  1102  out of total of  2437  enteries.
This gives us an accuracy precentage of 45.22
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [00:32<00:00, 76.24it/s]


Count for top  10 predictions is  914  out of total of  2491  enteries.
This gives us an accuracy precentage of 36.692
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:28<00:00, 75.90it/s]


Count for top  10 predictions is  632  out of total of  2136  enteries.
This gives us an accuracy precentage of 29.588
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:28<00:00, 73.83it/s]

Count for top  10 predictions is  504  out of total of  2110  enteries.
This gives us an accuracy precentage of 23.886
Overall results for df as follows

Count for top  10 predictions is  6852  out of total of  16684  enteries.
This gives us an accuracy precentage of 41.069





In [73]:
test_results_naive_mask_func_MLM_clean_BERT = get_top_k_accuracy_by_days(BERT_clean,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [01:05<00:00, 37.42it/s]


Count for masked once was:  3 , Count for masked twice was:  51  both of them sum up to 54  out of a total of  2451
So this brings us to the accuract for top  10  prediction model made at 2.203 %.
Monday  The results are as follows: 


100%|██████████| 2548/2548 [01:04<00:00, 39.37it/s]


Count for masked once was:  13 , Count for masked twice was:  138  both of them sum up to 151  out of a total of  2548
So this brings us to the accuract for top  10  prediction model made at 5.926 %.
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [01:04<00:00, 38.95it/s]


Count for masked once was:  7 , Count for masked twice was:  83  both of them sum up to 90  out of a total of  2511
So this brings us to the accuract for top  10  prediction model made at 3.584 %.
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [01:03<00:00, 38.59it/s]


Count for masked once was:  5 , Count for masked twice was:  60  both of them sum up to 65  out of a total of  2437
So this brings us to the accuract for top  10  prediction model made at 2.667 %.
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [01:04<00:00, 38.92it/s]


Count for masked once was:  3 , Count for masked twice was:  53  both of them sum up to 56  out of a total of  2491
So this brings us to the accuract for top  10  prediction model made at 2.248 %.
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:55<00:00, 38.49it/s]


Count for masked once was:  6 , Count for masked twice was:  25  both of them sum up to 31  out of a total of  2136
So this brings us to the accuract for top  10  prediction model made at 1.451 %.
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:56<00:00, 37.26it/s]


Count for masked once was:  5 , Count for masked twice was:  22  both of them sum up to 27  out of a total of  2110
So this brings us to the accuract for top  10  prediction model made at 1.28 %.
Total score as follows
Count for masked once was:  42 , Count for masked twice was:  432  both of them sum up to 474  out of a total of  16684
So this brings us to the accuract for top  10  prediction model made at 2.841 %.


In [74]:
test_results_smart_mask_func_MLM_clean_BERT = get_top_k_accuracy_by_days_smart_masking(BERT_clean,tokenizer,nyt_test_df)

Sunday  The results are as follows: 


100%|██████████| 2451/2451 [00:32<00:00, 74.28it/s]


Count for top  10 predictions is  29  out of total of  2451  enteries.
This gives us an accuracy precentage of 1.183
Monday  The results are as follows: 


100%|██████████| 2548/2548 [00:33<00:00, 77.03it/s]


Count for top  10 predictions is  82  out of total of  2548  enteries.
This gives us an accuracy precentage of 3.218
Tuesday  The results are as follows: 


100%|██████████| 2511/2511 [00:32<00:00, 77.75it/s]


Count for top  10 predictions is  55  out of total of  2511  enteries.
This gives us an accuracy precentage of 2.19
Wednesday  The results are as follows: 


100%|██████████| 2437/2437 [00:32<00:00, 74.60it/s]


Count for top  10 predictions is  32  out of total of  2437  enteries.
This gives us an accuracy precentage of 1.313
Thursday  The results are as follows: 


100%|██████████| 2491/2491 [00:32<00:00, 76.66it/s]


Count for top  10 predictions is  31  out of total of  2491  enteries.
This gives us an accuracy precentage of 1.244
Friday  The results are as follows: 


100%|██████████| 2136/2136 [00:27<00:00, 76.49it/s]


Count for top  10 predictions is  17  out of total of  2136  enteries.
This gives us an accuracy precentage of 0.796
Saturday  The results are as follows: 


100%|██████████| 2110/2110 [00:28<00:00, 74.86it/s]

Count for top  10 predictions is  15  out of total of  2110  enteries.
This gives us an accuracy precentage of 0.711
Overall results for df as follows

Count for top  10 predictions is  261  out of total of  16684  enteries.
This gives us an accuracy precentage of 1.564



