In [1]:
import json
import os

import gdown
import matplotlib.pyplot as plt
import numpy as np
from minicons import scorer
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
from transformers import BertModel, BertTokenizer

# Load the tokenizer and encoder for the checkpoint named in `model_name`.
# Note: the "Priming and Token Evaluation" cell further down rebinds both
# `tokenizer` and `model`, so these objects are only used until that cell runs.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


### Load First Dataset

In [3]:
import json
import pandas as pd

def load_jsonl(file_path):
    """Load a JSONL (one JSON document per line) file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the UTF-8 encoded JSONL file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, in file order. An empty file yields an
        empty DataFrame.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a trailing newline at EOF) are
        # not records; skipping them avoids a spurious JSONDecodeError.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Path (relative to the notebook's working directory) of the first dataset.
dataset_path = "./data/distractor_agreement_relational_noun.jsonl"

# Parse the JSONL file into a DataFrame with the helper defined above.
df_dgrn = load_jsonl(dataset_path)

# Show full cell contents instead of truncating long sentences. This changes a
# global pandas display option, so it also affects every later cell.
pd.set_option('display.max_colwidth', None) 
df_dgrn.head()


Unnamed: 0,sentence_good,sentence_bad,one_prefix_prefix,one_prefix_word_good,one_prefix_word_bad,dependency_length,field,linguistics_term,UID,simple_LM_method,one_prefix_method,two_prefix_method,lexically_identical,pairID
0,A niece of most senators hasn't descended most slopes.,A niece of most senators haven't descended most slopes.,A niece of most senators,hasn't,haven't,5,morphology,subject_verb_agreement,distractor_agreement_relational_noun,True,True,False,False,0
1,The sketch of those trucks hasn't hurt Alan.,The sketch of those trucks haven't hurt Alan.,The sketch of those trucks,hasn't,haven't,5,morphology,subject_verb_agreement,distractor_agreement_relational_noun,True,True,False,False,1
2,A newspaper article about the Borgias has disagreed with Marcus.,A newspaper article about the Borgias have disagreed with Marcus.,A newspaper article about the Borgias,has,have,6,morphology,subject_verb_agreement,distractor_agreement_relational_noun,True,True,False,False,2
3,The niece of most guests has cleaned every college campus.,The niece of most guests have cleaned every college campus.,The niece of most guests,has,have,5,morphology,subject_verb_agreement,distractor_agreement_relational_noun,True,True,False,False,3
4,A sketch of lights doesn't appear.,A sketch of lights don't appear.,A sketch of lights,doesn't,don't,4,morphology,subject_verb_agreement,distractor_agreement_relational_noun,True,True,False,False,4


In [4]:
# List the column names available in the loaded dataset.
df_dgrn.columns

Index(['sentence_good', 'sentence_bad', 'one_prefix_prefix',
       'one_prefix_word_good', 'one_prefix_word_bad', 'dependency_length',
       'field', 'linguistics_term', 'UID', 'simple_LM_method',
       'one_prefix_method', 'two_prefix_method', 'lexically_identical',
       'pairID'],
      dtype='object')

### Priming and Token Evaluation

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from tqdm import tqdm



# Define the evaluation function
def evaluate_priming_effect(model, tokenizer, row, num_labels, device, classifier=None):
    """Score the good/bad target words after each sentence of one dataset row.

    Both ``row['sentence_good']`` and ``row['sentence_bad']`` are encoded and
    passed through ``model``; the hidden state at the last sequence position
    is projected to ``num_labels`` scores by ``classifier`` and soft-maxed.
    For each target word (``row['one_prefix_word_good']`` and
    ``row['one_prefix_word_bad']``) the reported value is the largest
    probability among the ids the tokenizer produces for that word.

    Parameters
    ----------
    model : encoder exposing ``config.hidden_size`` whose output has a
        ``last_hidden_state`` attribute.
    tokenizer : tokenizer matching ``model``.
    row : mapping (e.g. a DataFrame row) with the four keys named above.
    num_labels : int
        Output width of the projection head when one has to be created.
    device : torch device (or device string) to run on.
    classifier : torch.nn.Module, optional
        Projection head mapping hidden states to ``num_labels`` scores. Pass
        the same (ideally trained) head on every call so that scores are
        comparable across rows.

        WARNING: when omitted, a *new, randomly initialised* ``nn.Linear`` is
        created on each call (the historical behaviour). Its outputs are
        untrained and differ from call to call.

    Returns
    -------
    dict
        ``{sentence: {target_word: probability}}``. A target word that encodes
        to no ids maps to ``None``. If both sentences are identical strings the
        second entry overwrites the first.
    """
    model.eval()
    if classifier is None:
        classifier = nn.Linear(model.config.hidden_size, num_labels)
    classifier = classifier.to(device)

    primer_sentences = [row['sentence_good'], row['sentence_bad']]
    target_tokens = [row['one_prefix_word_good'], row['one_prefix_word_bad']]
    results = {}

    with torch.no_grad():
        for primer in primer_sentences:
            encoded_input = tokenizer(primer, return_tensors='pt').to(device)
            outputs = model(**encoded_input)
            logits = classifier(outputs.last_hidden_state)
            # Last sequence position of the single encoded sentence.
            # NOTE(review): with a tokenizer that appends special tokens this is
            # presumably the closing special token, not the last word - confirm.
            probs_at_last_position = F.softmax(logits[0, -1, :], dim=0)

            predictions = {}
            for token in target_tokens:
                token_ids = tokenizer.encode(token, add_special_tokens=False)
                if len(token_ids) == 0:
                    # Nothing to look up (e.g. empty string); the original
                    # code crashed here on ``.item()`` of an empty tensor.
                    predictions[token] = None
                    continue
                # A word may split into several sub-word ids; keep the best one.
                predictions[token] = probs_at_last_position[token_ids].max().item()

            results[primer] = predictions

    return results

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
num_labels = tokenizer.vocab_size

# Process each row and evaluate priming effect.
# `iterrows()` is a generator, so tqdm cannot infer its length; passing
# `total` gives a real progress bar (percentage + ETA) instead of a bare counter.
# Results are appended one by one so a partial list survives an interruption.
all_results = []
for _, row in tqdm(df_dgrn.iterrows(), total=len(df_dgrn)):
    all_results.append(evaluate_priming_effect(model, tokenizer, row, num_labels, device))


Using device: cuda


1000it [01:26, 11.53it/s]


In [6]:
file_path = "./results/prime_results.json"

# Make sure the output directory exists; otherwise `open(..., 'w')` raises
# FileNotFoundError and the results computed above are not saved.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Write all per-row results as a single JSON array.
with open(file_path, 'w', encoding='utf-8') as file:
    json.dump(all_results, file)

### Text Generation

In [7]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
import pandas as pd
from tqdm import tqdm

class BertForTextGeneration(nn.Module):
    """BERT encoder followed by a linear head that scores one token per sequence.

    The linear ``decoder`` is created here with random weights and is not
    trained anywhere in this class, so its predictions are only meaningful
    after the head has been trained or loaded from elsewhere.

    Parameters
    ----------
    model_name : str
        Checkpoint passed to ``BertModel.from_pretrained``.
    vocab_size : int
        Number of output classes of the decoder. Defaults to 10000 for
        backward compatibility; pass the tokenizer's vocabulary size so every
        token id can be predicted.
    """

    def __init__(self, model_name='bert-base-uncased', vocab_size=10000):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Read the width from the loaded config instead of hard-coding 768.
        self.decoder = nn.Linear(self.bert.config.hidden_size, vocab_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return a probability distribution over the decoder classes.

        Parameters
        ----------
        input_ids : LongTensor, indexed as (batch, sequence).
        attention_mask : tensor of the same shape, optional
            1 for real tokens, 0 for padding. When given, it is forwarded to
            the encoder and the hidden state of the last *real* token of each
            sequence is decoded. Assumes right-padding - TODO confirm for the
            tokenizer in use. When omitted, the final sequence position is
            used (historical behaviour), which is a padding slot for any
            sequence shorter than the batch maximum.

        Returns
        -------
        Tensor of shape (batch, vocab_size); each row sums to 1.
        """
        if attention_mask is None:
            embeddings = self.bert(input_ids).last_hidden_state
            last_hidden = embeddings[:, -1, :]
        else:
            embeddings = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
            # Index of the last unmasked position per row; clamp guards an
            # all-zero mask so the index never becomes -1.
            last_index = attention_mask.long().sum(dim=1).clamp(min=1) - 1
            batch_index = torch.arange(embeddings.size(0), device=embeddings.device)
            last_hidden = embeddings[batch_index, last_index]
        return self.softmax(self.decoder(last_hidden))

# Load the dataset
def load_dataset(path):
    """Read the JSON-lines file at ``path`` into a pandas DataFrame."""
    return pd.read_json(path, lines=True)

def _clean_predicted_token(decoded_word):
    """Map '[unused...]' tokens to '[unknown]' and strip '##' sub-word markers."""
    if '[unused' in decoded_word:
        return '[unknown]'
    return decoded_word.replace('##', '')

def run_text_gen(dataset_path, batch_size=5, column='sentence_good', device=None):
    """Predict one token per sentence of a dataset and attach it as a column.

    Parameters
    ----------
    dataset_path : str
        JSON-lines file readable by ``load_dataset``.
    batch_size : int
        Number of sentences encoded per forward pass.
    column : str
        Name of the DataFrame column holding the input sentences.
    device : torch.device or str, optional
        Where to run the model. Defaults to CUDA when available, else CPU
        (the original code required CUDA unconditionally).

    Returns
    -------
    pandas.DataFrame
        The loaded dataset with an added ``'predicted_words'`` column.

    Notes
    -----
    The model's decoder head is freshly initialised on every call (see
    ``BertForTextGeneration``), so predictions vary between runs.
    """
    df = load_dataset(dataset_path)

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Prepare tokenizer and model; size the head to the tokenizer vocabulary so
    # any token id can be produced and decoded.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForTextGeneration(vocab_size=tokenizer.vocab_size).to(device)
    model.eval()

    sentences = df[column].tolist()
    predicted_words = []

    # Process sentences in batches
    for start in tqdm(range(0, len(sentences), batch_size)):
        batch_sentences = sentences[start:start + batch_size]
        inputs = tokenizer(batch_sentences, padding=True, return_tensors='pt', truncation=True).to(device)

        with torch.no_grad():
            # Pass the mask so padded positions are ignored and the last real
            # token of each sentence is the one that gets decoded.
            probabilities = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])

        predicted_word_ids = torch.argmax(probabilities, dim=1)
        for pid in predicted_word_ids.tolist():
            predicted_words.append(_clean_predicted_token(tokenizer.decode([pid])))

    df['predicted_words'] = predicted_words

    return df




In [8]:
# Candidate priming prefixes.
# NOTE(review): the braces below build a *set*, so iteration order is not
# guaranteed; nothing in the visible cells reads `prefix_sentence`. If it is
# meant to be an ordered list of prefixes, confirm and switch to [...].
prefix_sentence = {
    "Let me tell you a wild story.",
    "I heard a crazy story the other day.",
    "I have a story that will blow your mind.",
    "let me tell you a joke.",
}

# Run the token-prediction pipeline on the animate-subject-passive dataset.
animate_subject_passive_path = './data/animate_subject_passive.jsonl'
bert_animate_subject_passive = run_text_gen(animate_subject_passive_path)


# TODO: the quantity we want is abs(good - bad) - presumably the absolute
# difference between the scores of the good and bad variants; confirm.

100%|██████████| 200/200 [00:01<00:00, 122.07it/s]


In [9]:
# bert_animate_subject_passive['predicted_words']

In [10]:
# Run the same token-prediction pipeline on the distractor-agreement dataset.
distractor_agreement_relational_noun_path = './data/distractor_agreement_relational_noun.jsonl'
bert_distractor_agreement_relational_noun = run_text_gen(distractor_agreement_relational_noun_path)

100%|██████████| 200/200 [00:01<00:00, 128.18it/s]


In [11]:
# Make sure the output directory exists; `to_csv` does not create it and would
# otherwise fail with an OSError.
os.makedirs('./results', exist_ok=True)

# Persist both prediction tables (the DataFrame index is written as the first column).
bert_animate_subject_passive.to_csv('./results/bert_animate_subject_passive_predicted.csv')
bert_distractor_agreement_relational_noun.to_csv('./results/bert_distractor_agreement_relational_noun_predicted.csv')

Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "c:\Users\jindd\anaconda3\Lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "c:\Users\jindd\anaconda3\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = self._ssock.recv(4096)
           ^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
Exception in callback BaseSelectorEventLoop._read_from_self()
handle: <Handle BaseSelectorEventLoop._read_from_self()>
Traceback (most recent call last):
  File "c:\Users\jindd\anaconda3\Lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "c:\Users\jindd\anaconda3\Lib\asyncio\selector_events.py", line 119, in _read_from_self
    data = self._ssock.recv(4096)
           ^^^^^^^^^^^^^^^^^^^^^^
ConnectionReset