In [18]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
    
import os, sys
import argparse
import torch
import pyrootutils

import hydra
from hydra import initialize, compose
from lightning import LightningDataModule, LightningModule

import pandas as pd
import numpy as np

sys.path.append('../utils/')
sys.path.append('../modeling/joint-clm-prosody/')

from config import *
from src import utils
# import dataset_utils as utils
# from tommy_utils import nlp #nlp_utils as nlpf
import prosody_analysis_utils as analysis
from src.data.components.datasets import tokenize_text_with_labels, TokenTaggingDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
utils

<module 'src.utils' from '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/notebooks/../modeling/joint-clm-prosody/src/utils/__init__.py'>

In [10]:
from transformers import AutoTokenizer
from torch.nn import functional as F
from scipy import stats

def load_model(config_path, ckpt_path, overrides):

    with initialize(version_base="1.3", config_path=config_path):
        cfg = compose(config_name="train.yaml", overrides=overrides)

    model: LightningModule = hydra.utils.instantiate(cfg.model)

    # Load the model from a checkpoint
    checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(
        cfg.data.model_name, add_prefix_space=False
    )
    tokenizer.pad_token_id = tokenizer.eos_token_id

    return cfg, tokenizer, model

def get_prosody_model_predictions(batch, model, out_fn=None):
        
    with torch.no_grad():
        _, outputs = model.step(batch=batch)
        logits = outputs['logits'][:, -1, :]
    
    # get the probability of the logits
    probs = F.softmax(logits, dim=-1)

    # if we provide we save logits out
    if out_fn:
        torch.save(logits, out_fn)
    
    return probs



## Loading data and models

### Set task parameters

In [20]:
from src.utils.text_processing import python_remove_punctuation


MODELS_DIR = os.path.join(BASE_DIR, 'code/modeling/joint-clm-prosody/')
EXPERIMENT = ["experiment=helsinki_prosody.yaml"]

task = 'black'
window_size = 25
top_n = [1]

model_name = 'helsinki-prosody_scratch-gpt2_joint-loss_prosody-embed'
ckpt_path = os.path.join(MODELS_DIR, 'logs/train/runs/2024-09-12/07-41-15/checkpoints/epoch_014.ckpt')
overrides = EXPERIMENT + [f"model.loss_mode=joint", f"model.pretrained=False", f"model.use_prosody_embeddings=True"]

modeling_dir = os.path.join(BASE_DIR, 'code/modeling/joint-clm-prosody/')
results_dir = os.path.join(BASE_DIR, 'derivatives/joint-prosody-clm/')

pyrootutils.setup_root(modeling_dir, indicator=".project-root", pythonpath=True)

print (f'{model_name}', flush=True)
    
####################################
### Initialize hydra config file ###
####################################

# Get relative path --> path for initialize needs to be relative
config_path = os.path.join(os.path.relpath(modeling_dir, os.getcwd()), 'configs')


helsinki-prosody_scratch-gpt2_joint-loss_prosody-embed


In [35]:

    # out_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions', p.task, p.model_name, f'window-size-{p.window_size}')
    # logits_dir = os.path.join(SCRATCH_DIR, 'derivatives/model-predictions', p.task, p.model_name, f'window-size-{p.window_size}', 'logits')

    # utils.attempt_makedirs(out_dir)
    # utils.attempt_makedirs(logits_dir)

task = 'black'

# Define column names for prosody data --> remove non-words
prosody_columns = ['stim', 'start', 'end', 'word', 'prominence', 'boundary']

df_prosody = pd.read_csv(os.path.join(BASE_DIR, 'stimuli/prosody/', f'{task}.prom'), sep='\t', names=prosody_columns)
df_prosody = df_prosody[~df_prosody['word'].isin(analysis.REMOVE_WORDS)].reset_index(drop=True) # remove non-words

df_preproc = pd.read_csv(os.path.join(BASE_DIR, 'stimuli/preprocessed/', task, f'{task}_transcript-preprocessed.csv'))
df_preproc = df_preproc.rename(columns={'Word_Written': 'word', 'Punctuation': 'punctuation'})

# make sure the words match
words_preproc = df_preproc['word'].str.lower().apply(python_remove_punctuation)
words_prosody =  df_prosody['word'].str.lower().apply(python_remove_punctuation)

assert all(words_preproc == words_prosody)

# if it matches we can add in prosody as a column
df_preproc['prominence'] = df_prosody['prominence']
df_preproc.loc[df_preproc['prominence'] < 0, 'prominence'] = 0

In [41]:
df_preproc

Unnamed: 0,word,Case,POS,POS_Definition,punctuation,Stop_Word,Word_Vocab,Onset,Offset,Duration,Named_Entity,NWP_Candidate,prominence
0,So,success,RB,adverb,,True,So,0.240000,0.630000,0.39,False,False,1.016
1,I,success,PRP,"pronoun, personal",,True,I,0.680000,1.260000,0.58,False,False,3.303
2,was,success,VBD,"verb, past tense",,True,was,1.960000,2.300000,0.34,False,False,1.032
3,a,success,DT,determiner,,True,a,2.300000,2.450000,0.15,False,False,0.452
4,junior,success,JJ,"adjective or numeral, ordinal",,False,junior,2.460000,3.140000,0.68,False,True,1.378
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536,what,success,WP,WH-pronoun,,True,what,790.779999,791.009999,0.23,False,False,0.066
1537,I,success,PRP,"pronoun, personal",,True,I,791.010000,791.130000,0.12,False,False,0.656
1538,do,success,VBP,"verb, present tense, not 3rd person singular",.,True,do,791.129999,791.469999,0.34,False,False,0.000
1539,Thank,success,VB,"verb, base form",,False,Thank,792.350000,792.610000,0.26,False,True,0.599


In [12]:
cfg, tokenizer, model = load_model(config_path, ckpt_path, overrides)

print (f'Model loaded', flush=True)

Loading Huggingface model.
Initializing new model
Using joint loss
Model loaded


In [38]:
# add the first word to the dataframe --> we don't run NWP on this as there is no context
# to condition, nor do we have humans do it
df = analysis.create_results_dataframe()
first_word = df_preproc.iloc[0]['word'].lower()
df.loc[len(df)] = {'ground_truth_word': first_word}

# set up variables to be used in the loop
df_stack = {str(n): [df] for n in top_n}
prev_probs = None

# create a list of indices that we will iterate through to sample the transcript
segments = analysis.get_segment_indices(n_words=len(df_preproc), window_size=window_size)[:-1]


In [39]:
# also keep track of the current ground truth word
inputs, labels = zip(*[analysis.transcript_to_input(df_preproc, segment, add_punctuation=True) for segment in segments])

dataset = TokenTaggingDataset(inputs, labels, tokenizer, model_name='gpt2', remove_punctuation=False, buffer_missing_samples=True)

Preprocessing samples: 100%|██████████| 1540/1540 [00:03<00:00, 481.71it/s]

Failed 3/1540





In [3]:
import prosody_analysis_utils as utils

In [29]:
from torch.utils.data import Dataset, DataLoader
from src.data.components.collators import collate_fn, encode_and_pad_batch

def collate(batch):
    return collate_fn(batch, tokenizer.pad_token_id)

dataloader = DataLoader(dataset=dataset, batch_size=1, collate_fn=collate)

In [30]:
for i, batch in enumerate(dataloader):

    ground_truth_index = segments[i][-1] + 1
    ground_truth_word = df_preproc.loc[ground_truth_index, 'word']

    # we've buffered the samples and we're gonna wait for the real one
    if not any(batch['input_text']):
        continue

    probs = get_prosody_model_predictions(batch, model)

    sys.exit(0)

    if i == 6:
        sys.exit(0)

SystemExit: 0

In [4]:
word_models = {model_name: utils.load_word_model(model_name=model_name, cache_dir=CACHE_DIR) for model_name in utils.WORD_MODELS.keys()}


Loading glove.42B.300d from saved .bin file.
Loading word2vec from saved .bin file.
Loading fasttext from saved .bin file.


In [33]:
segment_stats = analysis.get_model_statistics(ground_truth_word, probs, tokenizer, prev_probs=prev_probs, word_models=word_models, top_n=1)

In [34]:
segment_stats

Unnamed: 0,ground_truth_word,ground_truth_prob,top_n_predictions,top_prob,binary_accuracy,glove_avg_accuracy,glove_max_accuracy,glove_prediction_density,word2vec_avg_accuracy,word2vec_max_accuracy,word2vec_prediction_density,fasttext_avg_accuracy,fasttext_max_accuracy,fasttext_prediction_density,entropy,relative_entropy
0,in,1e-05,[and],0.122386,False,0.650876,0.650876,,,,,0.373871,0.373871,,4.012061,


In [216]:
probs = get_prosody_model_predictions(batch, model)

tensor([[5.0969e-08, 5.5014e-08, 6.0451e-08,  ..., 3.9684e-08, 1.0778e-05,
         9.3119e-08]])

In [209]:
# get the probability of the logits
probs = F.softmax(logits, dim=-1)

In [210]:
tokenizer.batch_decode([probs.argmax()])

[' great']

In [211]:
batch

{'input_text': ['Where I grew up in the South, there was a'],
 'tokenized_text': [['Where',
   'ĠI',
   'Ġgrew',
   'Ġup',
   'Ġin',
   'Ġthe',
   'ĠSouth',
   ',',
   'Ġthere',
   'Ġwas',
   'Ġa']],
 'original_labels': [[0.232,
   1.302,
   0.202,
   0.849,
   0.108,
   0.0,
   1.245,
   None,
   1.54,
   0.072,
   0.0]],
 'tokenized_labels': tensor([[ 2.3200e-01,  1.3020e+00,  2.0200e-01,  8.4900e-01,  1.0800e-01,
           0.0000e+00,  1.2450e+00, -9.9900e+02,  1.5400e+00,  7.2000e-02,
           0.0000e+00]]),
 'input_ids': tensor([[8496,  314, 6348,  510,  287,  262, 2520,   11,  612,  373,  257]]),
 'loss_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'word_to_tokens': [['Where',
   [8496],
   ' I',
   [314],
   ' grew',
   [6348],
   ' up',
   [510],
   ' in',
   [287],
   ' the',
   [262],
   ' South',
   [2520],
   ',',
   [11],
   ' there',
   [612],
   ' was',
   [373],
   ' a',
   [257]]]}

In [184]:
    logits = model(**tokens).logits[:, -1, :]



# if we provide we save logits out
if out_fn:
    torch.save(logits, out_fn)

{'logits': tensor([[[-6.2675, -7.2080, -5.4195,  ..., -6.9068,  0.7339, -5.1757],
          [-5.4105, -6.4297, -4.1599,  ..., -6.3006,  3.3910, -4.6908],
          [-6.6323, -6.7363, -4.4950,  ..., -6.3908, -1.0139, -6.0770],
          [-6.8450, -7.3865, -5.5411,  ..., -6.9936,  0.0876, -5.6336]]]),
 'preds': tensor([[0.8823, 0.9829, 1.3903, 0.3805]]),
 'mu': tensor([[0.3559, 0.5888, 0.7342, 0.3470]]),
 'var': tensor([[0.4033, 0.5991, 0.5281, 0.9121]]),
 'loss': tensor(6.6485),
 'clm_loss': tensor(5.5781),
 'prosody_loss': tensor(1.0704)}

In [70]:
# we don't need to get the last word
for i, segment in enumerate(segments):

    ground_truth_index = segment[-1] + 1
    ground_truth_word = df_preproc.loc[ground_truth_index, 'word']
    
    # also keep track of the current ground truth word
    inputs, prosody = transcript_to_input(df_preproc, segment, add_punctuation=True)

    sys.exit(0)		

    # run the inputs through the model, get predictive distribution, and save out the logits
    # if the next word is a prediction word save logits
    if df_preproc.loc[ground_truth_index, 'NWP_Candidate']: # and p.model_name == 'gpt2-xl':
        logits_fn = os.path.join(logits_dir, f'{p.task}_window-size-{p.window_size}_logits-{str(ground_truth_index).zfill(5)}.pt')
    else:
        logits_fn = None

    probs = get_prosody_model_predictions([inputs], model, tokenizer, out_fn=logits_fn)

SystemExit: 0