## Creating a Virtual Environment

In [None]:
#!/usr/bin/env python
# coding: utf-8
# NOTE(review): the next three lines are shell commands, not Python; run
# them in a terminal rather than executing them as Python code.
# Create a virtual environment in the ./nlp_env directory.
python -m venv nlp_env
# Activate that environment for the current shell session.
source nlp_env/bin/activate 
# Install torch, transformers and datasets into the active environment.
pip install torch transformers datasets

## Importing the Libraries

In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from datasets import load_dataset
import numpy as np
import pandas as pd

## Loading the Model and Defining the PLL Scoring Functions

In [None]:

# One checkpoint name drives both the tokenizer and the masked-LM weights so
# the vocabulary and the model always match.
model_name = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = BertForMaskedLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

def mask_token(tokenizer, text, mask_index):
    """Return ``text`` with one of its tokens replaced by the mask token.

    The text is split with ``tokenizer.tokenize``, the piece at
    ``mask_index`` is overwritten with ``tokenizer.mask_token``, and the
    pieces are joined back with ``tokenizer.convert_tokens_to_string``.

    Args:
        tokenizer: Object providing ``tokenize``, ``mask_token`` and
            ``convert_tokens_to_string``.
        text: The sentence to mask.
        mask_index: Index into the tokenized pieces (not into the characters
            of ``text``). Assuming ``tokenize`` returns a list, normal list
            indexing rules apply, so an out-of-range index raises
            ``IndexError``.

    Returns:
        The re-assembled string containing the mask token.
    """
    pieces = tokenizer.tokenize(text)
    pieces[mask_index] = tokenizer.mask_token
    return tokenizer.convert_tokens_to_string(pieces)

def compute_pll_original(model, tokenizer, sentences):
    """Compute the token-level pseudo-log-likelihood (PLL) of each sentence.

    Every non-special token is masked in turn (one forward pass per token)
    and the log-probability the model assigns to the original token at the
    masked position is added to the sentence score.

    Args:
        model: Masked language model. It is called as ``model(input_ids)``
            and must return an object whose ``logits`` attribute is indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer matching ``model``. It must expose
            ``mask_token_id`` and honour ``return_special_tokens_mask``.
        sentences: Iterable of strings to score.

    Returns:
        A list with one float per sentence, in input order. A sentence with
        no scorable tokens gets ``0.0``.
    """
    pll_scores = []
    for sentence in sentences:
        # Encode once and mask on the id tensor. Masking the string and
        # re-tokenizing can change the tokenization, and the encoded ids
        # carry special tokens that shift every position relative to
        # ``tokenizer.tokenize`` output.
        encoding = tokenizer(
            sentence,
            return_tensors='pt',
            # Truncate to the tokenizer's configured maximum length so long
            # texts are scored on their leading tokens instead of failing.
            truncation=True,
            return_special_tokens_mask=True,
        )
        input_ids = encoding['input_ids']
        special_mask = encoding['special_tokens_mask'][0].tolist()

        sentence_score = 0.0
        for pos, is_special in enumerate(special_mask):
            if is_special:
                # Special tokens are never scored.
                continue
            masked_input_ids = input_ids.clone()
            masked_input_ids[0, pos] = tokenizer.mask_token_id
            with torch.no_grad():
                logits = model(masked_input_ids).logits
            # log_softmax avoids the underflow of log(softmax(...)).
            log_probs = torch.nn.functional.log_softmax(logits[0, pos], dim=-1)
            target_id = input_ids[0, pos].item()
            sentence_score += log_probs[target_id].item()
        pll_scores.append(sentence_score)
    return pll_scores

def compute_pll_word_l2r(model, tokenizer, sentences):
    """Compute the PLL-word-l2r score of each sentence.

    Every non-special token is scored once. When a token is scored, it and
    all later pieces of the same word are masked together, so the model
    cannot read the rest of the word from its right-hand pieces. A word is
    a token followed by any run of ``##``-prefixed continuation tokens.

    Args:
        model: Masked language model. It is called as ``model(input_ids)``
            and must return an object whose ``logits`` attribute is indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer matching ``model``. It must expose
            ``mask_token_id`` and ``convert_ids_to_tokens`` and honour
            ``return_special_tokens_mask``. Continuation pieces are assumed
            to be marked with a ``##`` prefix.
        sentences: Iterable of strings to score.

    Returns:
        A list with one float per sentence, in input order. A sentence with
        no scorable tokens gets ``0.0``.
    """
    pll_scores = []
    for sentence in sentences:
        # Encode once and mask on the id tensor, so positions in the logits
        # line up with positions in ``input_ids`` (special tokens included).
        encoding = tokenizer(
            sentence,
            return_tensors='pt',
            # Truncate to the tokenizer's configured maximum length so long
            # texts are scored on their leading tokens instead of failing.
            truncation=True,
            return_special_tokens_mask=True,
        )
        input_ids = encoding['input_ids']
        special_mask = encoding['special_tokens_mask'][0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
        seq_len = len(tokens)

        sentence_score = 0.0
        for pos in range(seq_len):
            if special_mask[pos]:
                # Special tokens are never scored.
                continue
            # Extend the mask rightwards over the remaining pieces of the
            # word that contains ``pos``.
            end = pos + 1
            while (
                end < seq_len
                and not special_mask[end]
                and tokens[end].startswith("##")
            ):
                end += 1
            masked_input_ids = input_ids.clone()
            masked_input_ids[0, pos:end] = tokenizer.mask_token_id
            with torch.no_grad():
                logits = model(masked_input_ids).logits
            # log_softmax avoids the underflow of log(softmax(...)).
            log_probs = torch.nn.functional.log_softmax(logits[0, pos], dim=-1)
            target_id = input_ids[0, pos].item()
            sentence_score += log_probs[target_id].item()
        pll_scores.append(sentence_score)
    return pll_scores

def compute_pll_whole_word(model, tokenizer, sentences):
    """Compute the PLL-whole-word score of each sentence.

    Tokens are grouped into words (a token followed by any run of
    ``##``-prefixed continuation tokens). For each word, all of its pieces
    are masked at once and the log-probabilities of the original pieces at
    their own positions are added to the sentence score. This takes one
    forward pass per word.

    Args:
        model: Masked language model. It is called as ``model(input_ids)``
            and must return an object whose ``logits`` attribute is indexed
            as ``[batch, position, vocabulary]``.
        tokenizer: Tokenizer matching ``model``. It must expose
            ``mask_token_id`` and ``convert_ids_to_tokens`` and honour
            ``return_special_tokens_mask``. Continuation pieces are assumed
            to be marked with a ``##`` prefix.
        sentences: Iterable of strings to score.

    Returns:
        A list with one float per sentence, in input order. A sentence with
        no scorable tokens gets ``0.0``.
    """
    pll_scores = []
    for sentence in sentences:
        # Encode once and mask on the id tensor, so positions in the logits
        # line up with positions in ``input_ids`` (special tokens included).
        encoding = tokenizer(
            sentence,
            return_tensors='pt',
            # Truncate to the tokenizer's configured maximum length so long
            # texts are scored on their leading tokens instead of failing.
            truncation=True,
            return_special_tokens_mask=True,
        )
        input_ids = encoding['input_ids']
        special_mask = encoding['special_tokens_mask'][0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
        seq_len = len(tokens)

        sentence_score = 0.0
        pos = 0
        while pos < seq_len:
            if special_mask[pos]:
                # Special tokens are never scored.
                pos += 1
                continue
            # Start the scan one past ``pos`` so the word always spans at
            # least one token and the outer loop is guaranteed to advance.
            end = pos + 1
            while (
                end < seq_len
                and not special_mask[end]
                and tokens[end].startswith("##")
            ):
                end += 1
            masked_input_ids = input_ids.clone()
            masked_input_ids[0, pos:end] = tokenizer.mask_token_id
            with torch.no_grad():
                logits = model(masked_input_ids).logits
            # log_softmax avoids the underflow of log(softmax(...)).
            log_probs = torch.nn.functional.log_softmax(
                logits[0, pos:end], dim=-1
            )
            # Pick, for each piece of the word, the log-probability of its
            # original id at its own position.
            targets = input_ids[0, pos:end].unsqueeze(1)
            sentence_score += log_probs.gather(1, targets).sum().item()
            pos = end
        pll_scores.append(sentence_score)
    return pll_scores


## Loading the Dataset

In [None]:

# Load the first 1% of the AG News test split and keep only the raw text of
# each example for scoring.
dataset = load_dataset(
    'ag_news',
    split='test[:1%]',
)
sentences = list(dataset['text'])

## Compare results

In [None]:


# Score every sentence with each PLL variant before building the table; the
# DataFrame below needs one score per sentence from each function.
pll_original_scores = compute_pll_original(model, tokenizer, sentences)
pll_word_l2r_scores = compute_pll_word_l2r(model, tokenizer, sentences)
pll_whole_word_scores = compute_pll_whole_word(model, tokenizer, sentences)

# One row per sentence, one column per scoring variant.
results = pd.DataFrame({
    'Sentence': sentences,
    'PLL Original': pll_original_scores,
    'PLL Word L2R': pll_word_l2r_scores,
    'PLL Whole Word': pll_whole_word_scores
})


## Save results to CSV

In [None]:

# Write the comparison table to disk without the row index, then show the
# first rows as a quick sanity check.
results.to_csv(
    'pll_scores_comparison.csv',
    index=False,
)
preview = results.head()
print(preview)