# Prep lib and dataset

In [None]:
# Author: Yan CONG
# Make sure this .ipynb file is in the same folder as Fine_tuning_a_masked_language_model...ipynb
import pandas as pd
import csv
import math
import os
import re

In [None]:
from transformers import pipeline

In [None]:
# Smoke test: build a fill-mask pipeline on the pretrained checkpoint and
# ask it to fill the masked slot of a probe sentence.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased')
unmasker("Hello I'm a [MASK] model.")

In [None]:
# Read the 'score' field of the first candidate returned for the probe
# sentence (reuses the `unmasker` pipeline built in an earlier cell).
unmasker("Hello I'm a [MASK] model.")[0]['score']

# DistilBERT

## quantifier

### dataset

In [None]:
# Load the quantifier dataset, using the first CSV column as the row index,
# and preview the first five rows.
# NOTE(review): 'data' looks like a placeholder path -- confirm the real file.
quantifier_df = pd.read_csv(filepath_or_buffer='data', index_col=0)
quantifier_df.head(5)

### playground

In [None]:
# Playground: pretrained model, mask in sentence-initial position.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased')
unmasker("[MASK] people have lungs that are diseased by viruses.")

In [None]:
# Playground: pretrained model, mask on the object noun instead of the
# sentence-initial slot.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased')
unmasker("Some people have [MASK] that are diseased by viruses.")

In [None]:
# Playground: pretrained model, sentence-initial mask with a different
# continuation of the sentence.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased')
unmasker("[MASK] people have lungs which require good care.")

### dataset cw quantifier

In [None]:
# Load the quantifier items that are scored below (the later cells read the
# 'cw' and 'post_cw' columns), using the first CSV column as the row index,
# and preview the first five rows.
# NOTE(review): 'data' looks like a placeholder path -- confirm the real file.
new_qdf = pd.read_csv(filepath_or_buffer='data', index_col=0)
new_qdf.head(5)

In [None]:
# Score the pretrained DistilBERT on every row of `new_qdf`.
#
# For each row the sentence "[MASK] <post_cw>" is sent to the fill-mask
# pipeline and four columns are filled in:
#   distillbert_top1         token_str of the first returned candidate
#   distillbert_top1_prob    score of the first returned candidate
#   distillbert_cw_prob      score of the candidate equal to the row's 'cw'
#                            value ('' if it is not among the candidates)
#   distillbert_cw_accuracy  1 if 'cw' is among the candidates, else 0
#
# The three '' placeholder columns are created with an explicit object dtype
# so that the placeholder strings and float scores can share a column.
new_qdf['distillbert_cw_prob'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['distillbert_top1'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['distillbert_top1_prob'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['distillbert_cw_accuracy'] = 0

unmasker = pipeline('fill-mask', model='distilbert-base-uncased')

for i in new_qdf.index:
    seq = "[MASK] " + new_qdf.at[i, 'post_cw']
    temp = unmasker(seq)
    # Write through .at[row, col]: the chained form df[col][row] = value
    # assigns into a temporary under pandas Copy-on-Write and would leave
    # the frame unchanged.
    # NOTE(review): temp[0] is treated as the best candidate; this assumes the
    # pipeline returns candidates sorted by descending score -- confirm.
    new_qdf.at[i, 'distillbert_top1'] = temp[0]['token_str']
    new_qdf.at[i, 'distillbert_top1_prob'] = temp[0]['score']

    for item in temp:
        if item['token_str'] == new_qdf.at[i, 'cw']:
            new_qdf.at[i, 'distillbert_cw_prob'] = item['score']
            new_qdf.at[i, 'distillbert_cw_accuracy'] = 1


## presupposition

In [None]:
# Load the presupposition items (the later cells read the 'pre_cw', 'cw',
# 'post_cw' and 'post_cw_trigger' columns), using the first CSV column as the
# row index, and preview the first five rows.
# NOTE(review): 'data' looks like a placeholder path -- confirm the real file.
df = pd.read_csv(filepath_or_buffer='data', index_col=0)
df.head(5)

In [None]:
# Score the pretrained DistilBERT on every row of `df`.
#
# For each row the sentence
#   "<pre_cw> [MASK] <post_cw> <post_cw_trigger>"
# is sent to the fill-mask pipeline and four columns are filled in:
#   distillbert_top1         token_str of the first returned candidate
#   distillbert_top1_prob    score of the first returned candidate
#   distillbert_cw_prob      score of the candidate equal to the row's 'cw'
#                            value ('' if it is not among the candidates)
#   distillbert_cw_accuracy  1 if 'cw' is among the candidates, else 0
#
# The three '' placeholder columns are created with an explicit object dtype
# so that the placeholder strings and float scores can share a column.
df['distillbert_cw_prob'] = pd.Series('', index=df.index, dtype=object)
df['distillbert_top1'] = pd.Series('', index=df.index, dtype=object)
df['distillbert_top1_prob'] = pd.Series('', index=df.index, dtype=object)
df['distillbert_cw_accuracy'] = 0

unmasker = pipeline('fill-mask', model='distilbert-base-uncased')

for i in df.index:
    seq = df.at[i, 'pre_cw'] + " [MASK] " + df.at[i, 'post_cw'] + ' ' + df.at[i, 'post_cw_trigger']
    temp = unmasker(seq)
    # Write through .at[row, col]: the chained form df[col][row] = value
    # assigns into a temporary under pandas Copy-on-Write and would leave
    # the frame unchanged.
    # NOTE(review): temp[0] is treated as the best candidate; this assumes the
    # pipeline returns candidates sorted by descending score -- confirm.
    df.at[i, 'distillbert_top1'] = temp[0]['token_str']
    df.at[i, 'distillbert_top1_prob'] = temp[0]['score']

    for item in temp:
        if item['token_str'] == df.at[i, 'cw']:
            df.at[i, 'distillbert_cw_prob'] = item['score']
            df.at[i, 'distillbert_cw_accuracy'] = 1

# Fine-tuned DistilBERT

## quantifier

### playground

In [None]:
# Playground: quantifier fine-tuned checkpoint, mask in sentence-initial
# position.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased-finetuned-quantifier')
unmasker("[MASK] people have lungs that are diseased by viruses.")

In [None]:
# Playground: quantifier fine-tuned checkpoint, mask on the object noun
# instead of the sentence-initial slot.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased-finetuned-quantifier')
unmasker("Some people have [MASK] that are diseased by viruses.")

In [None]:
# Playground: quantifier fine-tuned checkpoint, sentence-initial mask with a
# different continuation of the sentence.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased-finetuned-quantifier')
unmasker("[MASK] people have lungs which require good care.")

### finetuned distill-bert on new SI dataset

In [None]:
# Load the quantifier fine-tuned checkpoint as a masked-language model,
# save its tokenizer, and print the model size.
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_checkpoint = "distilbert-base-uncased-finetuned-quantifier"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# The tokenizer is saved under the same name as the checkpoint it was
# loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.save_pretrained(model_checkpoint)

# Parameter count expressed in millions; the BERT figure is a fixed
# reference value, not computed here.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print("'>>> BERT number of parameters: 110M'")

In [None]:
# Score the quantifier fine-tuned DistilBERT on every row of `new_qdf`.
#
# For each row the sentence "[MASK] <post_cw>" is sent to the fill-mask
# pipeline and four columns are filled in:
#   ft_distillbert_top1       token_str of the first returned candidate
#   ft_distillbert_top1_prob  score of the first returned candidate
#   ft_distillbert_cw_prob    score of the candidate equal to the row's 'cw'
#                             value ('' if it is not among the candidates)
#   ft_distillbert_accuracy   1 if 'cw' is among the candidates, else 0
#
# The three '' placeholder columns are created with an explicit object dtype
# so that the placeholder strings and float scores can share a column.
new_qdf['ft_distillbert_cw_prob'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['ft_distillbert_top1'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['ft_distillbert_top1_prob'] = pd.Series('', index=new_qdf.index, dtype=object)
new_qdf['ft_distillbert_accuracy'] = 0

unmasker = pipeline('fill-mask', model='distilbert-base-uncased-finetuned-quantifier')

for i in new_qdf.index:
    seq = "[MASK] " + new_qdf.at[i, 'post_cw']
    temp = unmasker(seq)
    # Write through .at[row, col]: the chained form df[col][row] = value
    # assigns into a temporary under pandas Copy-on-Write and would leave
    # the frame unchanged.
    # NOTE(review): temp[0] is treated as the best candidate; this assumes the
    # pipeline returns candidates sorted by descending score -- confirm.
    new_qdf.at[i, 'ft_distillbert_top1'] = temp[0]['token_str']
    new_qdf.at[i, 'ft_distillbert_top1_prob'] = temp[0]['score']

    for item in temp:
        if item['token_str'] == new_qdf.at[i, 'cw']:
            new_qdf.at[i, 'ft_distillbert_cw_prob'] = item['score']
            new_qdf.at[i, 'ft_distillbert_accuracy'] = 1

## presupposition

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably required before loading the fine-tuned
# checkpoints in the following cells -- confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# NOTE(review): this only binds a notebook-level name; none of the visible
# from_pretrained()/pipeline() calls receive it. Confirm whether it was meant
# to be passed as a keyword argument.
use_auth_token = True

In [None]:
# Load the existence fine-tuned checkpoint as a masked-language model,
# save its tokenizer, and print the model size.
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_checkpoint = "distilbert-base-uncased-finetuned-existence"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

# The tokenizer is saved under the same name as the checkpoint it was
# loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.save_pretrained(model_checkpoint)

# Parameter count expressed in millions; the BERT figure is a fixed
# reference value, not computed here.
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> DistilBERT number of parameters: {round(distilbert_num_parameters)}M'")
print("'>>> BERT number of parameters: 110M'")

In [None]:
# Playground: existence fine-tuned checkpoint, mask in sentence-initial
# position.
unmasker = pipeline(task='fill-mask', model='distilbert-base-uncased-finetuned-existence')
unmasker("[MASK] people have lungs which require good care.")

In [None]:
# Score the existence fine-tuned DistilBERT on every row of `df`.
#
# For each row the sentence
#   "<pre_cw> [MASK] <post_cw> <post_cw_trigger>"
# is sent to the fill-mask pipeline and four columns are filled in:
#   ft_distillbert_top1       token_str of the first returned candidate
#   ft_distillbert_top1_prob  score of the first returned candidate
#   ft_distillbert_cw_prob    score of the candidate equal to the row's 'cw'
#                             value ('' if it is not among the candidates)
#   ft_distillbert_accuracy   1 if 'cw' is among the candidates, else 0
#
# The three '' placeholder columns are created with an explicit object dtype
# so that the placeholder strings and float scores can share a column.
df['ft_distillbert_cw_prob'] = pd.Series('', index=df.index, dtype=object)
df['ft_distillbert_top1'] = pd.Series('', index=df.index, dtype=object)
df['ft_distillbert_top1_prob'] = pd.Series('', index=df.index, dtype=object)
df['ft_distillbert_accuracy'] = 0

unmasker = pipeline('fill-mask', model='distilbert-base-uncased-finetuned-existence')

for i in df.index:
    seq = df.at[i, 'pre_cw'] + " [MASK] " + df.at[i, 'post_cw'] + ' ' + df.at[i, 'post_cw_trigger']
    temp = unmasker(seq)
    # Write through .at[row, col]: the chained form df[col][row] = value
    # assigns into a temporary under pandas Copy-on-Write and would leave
    # the frame unchanged.
    # NOTE(review): temp[0] is treated as the best candidate; this assumes the
    # pipeline returns candidates sorted by descending score -- confirm.
    df.at[i, 'ft_distillbert_top1'] = temp[0]['token_str']
    df.at[i, 'ft_distillbert_top1_prob'] = temp[0]['score']

    for item in temp:
        if item['token_str'] == df.at[i, 'cw']:
            df.at[i, 'ft_distillbert_cw_prob'] = item['score']
            # TODO (original author's note): this is wrong, should be
            # if bad, then 0; if good, then 1. The column that marks an item
            # as good/bad is not visible here, so the logic is left unchanged.
            df.at[i, 'ft_distillbert_accuracy'] = 1