In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
    
import os, sys, glob
import json
import re
import numpy as np
import pandas as pd
from natsort import natsorted
from manual_spellchecker import spell_checker
import inflect
from scipy import stats

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')

from config import *
# import dataset_utils as utils
from tommy_utils import nlp, statistics
from preproc_utils import load_model_results, divide_nwp_dataframe
import analysis_utils as analysis

2024-11-17 18:23:02.221150: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 18:23:02.221228: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 18:23:02.221259: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 18:23:02.231024: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Functions -- cleaning, aggregation, and analysis

### Check and clean meta file

In [3]:
# importing shutil module  
import shutil
from pathlib import Path

def check_used_files(experiment_name, experiment_version, task, clean_errors=False, max_missing_responses=5):
    '''
    Grabs used files based on the meta file and results directory
    Returns list of subjects that were used and ones that had errors
    '''
    
    checker = {
        'complete': [],
        'incomplete': [],
        'error': [],
        'missing': [],
    }

    meta_dir = os.path.join('/dartfs/rc/lab/F/FinnLab/tommy/jspsych_experiments/utils/experiment_meta/', experiment_name)
    meta_file = pd.read_csv(os.path.join(meta_dir, f'{experiment_version}-{task}.csv'))
    
    source_dir = os.path.join(BASE_DIR, 'stimuli',  'presentation_orders', experiment_version, task, 'jspsych')
    results_dir = os.path.join(BASE_DIR, 'experiments',  experiment_name, 'results', experiment_version)
    
    # grab the used files
    used_fns = meta_file[meta_file['used'].fillna(0).astype(bool)]
    
    approve_ids = []
    
    # go through each used file
    for i, fn in used_fns.iterrows():
        # grab info regarding subject name and modality
        curr_path = Path(fn['subject_fns'])
        sub = curr_path.stem.split('_')[0]
        modality = curr_path.parents[0].stem
        
        # find the corresponding parameter file for the current subject
        parameter_fn = glob.glob(os.path.join(source_dir, f'{sub}*.json'))
        assert (len(parameter_fn) == 1)
        
        # load the parameter file to compare to the subject's results
        df_parameters = pd.read_json(parameter_fn[0], orient='records')
        df_parameters = df_parameters.dropna()
        df_parameters['word_index'] = df_parameters['word_index'].dropna().astype(int)
        
        # then grab the subject results
        sub_results_dir = os.path.join(results_dir, task, modality, sub)
        
        # load results from the completed experiment
        try:
            current_id, demographics, experience, responses = load_participant_results(sub_results_dir, sub)
            
            # append if approving
            approve_ids.append(current_id)
        except:
#             if os.path.exists(sub_results_dir):
            checker['error'].append((i, modality, sub))
            continue
            
        # check that all indices of trials match and all responses are there
        all_trials_complete = np.all(responses['word_index'] == df_parameters['word_index'])
        missing_response_threshold = sum(pd.isnull(responses['response'])) <= max_missing_responses
        
#         all_responses_complete = np.all(~pd.isnull(responses['response']))
        
        # also ensure that we have the right amount of demographics/experience questions
        all_checks_complete = np.all([
            all_trials_complete, 
            missing_response_threshold, 
            len(demographics)==4,
            len(experience)==2,
        ])
        
        if all_trials_complete and missing_response_threshold:
            # add to list of people completed
            checker['complete'].append((i, modality, sub, current_id))
        else:
            checker['incomplete'].append((i, modality, sub, current_id))
            
        del current_id
        
    if clean_errors:
        clean_meta_errors(checker, experiment_name, experiment_version, task)
        
        # run again and return from here now that its updated
        return check_used_files(experiment_name, experiment_version, task, clean_errors=False)
    else:
        return checker, approve_ids

def clean_meta_errors(checker, experiment_name, experiment_version, task):
    
    meta_dir = os.path.join('/dartfs/rc/lab/F/FinnLab/tommy/jspsych_experiments/utils/experiment_meta/', experiment_name)
    meta_fn = os.path.join(meta_dir, f'{experiment_version}-{task}.csv')
    
    meta_file = pd.read_csv(meta_fn)
    
    results_dir = os.path.join(BASE_DIR, 'experiments',  experiment_name, 'results', experiment_version)
    
    errors = checker['error']
    modalities = ['text', 'audio']
    
    if any(checker['error']):
        
        remove_idxs, _, _ = zip(*checker['error']) 
        remove_idxs = list(remove_idxs)
        
        for modality in modalities:

            # get errors for the current modality
            modality_errors = [error for error in errors if error[1] == modality]
            errors_dir = os.path.join(results_dir, task, modality, 'error')

            # get new errors dir if previous has files in it
            batch_errors = sorted(glob.glob(os.path.join(errors_dir, '*')))

            if any(batch_errors):
                last_error_dir = Path(batch_errors[-1]).stem
            else:
                last_error_dir = 'batch_1'

            if any(glob.glob(os.path.join(errors_dir, last_error_dir, '*'))):
                curr_batch_num = int(last_error_dir.split('_')[-1]) + 1
                curr_error_dir = os.path.join(errors_dir, f'batch_{curr_batch_num}')
                os.makedirs(curr_error_dir)
            else:
                curr_error_dir = os.path.join(errors_dir, last_error_dir)

            for item in modality_errors:
                file_idx, modality, sub = item

                # then grab the subject results
                sub_results_dir = os.path.join(results_dir, task, modality, sub)

                if os.path.exists(sub_results_dir):
                    print ('Here')
                    shutil.move(sub_results_dir, curr_error_dir)


        print (f'Cleaned meta file!')
        meta_file.loc[list(remove_idxs), 'used'] = None
        meta_file['used'] = meta_file['used'].astype('Int64')
        meta_file.to_csv(meta_fn, index=False)


### Loading data 

In [4]:
# def load_participant_results(sub_dir, sub):
    
#     # load and filter down to response trials
#     df_results = pd.read_csv(os.path.join(sub_dir, f'{sub}_next-word-prediction.csv')).fillna(False)
#     df_results['word_index'] = df_results['word_index'].astype(int)
    
#     # grab the prolific id
#     prolific_id = list(set(df_results['prolific_id']))

#     # filter down demographics
#     demographics = df_results[df_results['experiment_phase'].str.contains('demographics').fillna(False)]
#     demographics = demographics[['experiment_phase', 'response']].reset_index(drop=True)
    
#     # age, race, ethnicity, gender
#     assert (len(demographics) == 4)
    
#     # filter down to questinos about moth/story experience
#     experience = df_results[df_results['experiment_phase'].str.contains('experience').fillna(False)]
#     experience = experience[['experiment_phase', 'response']].reset_index(drop=True)
    
#     # moth experience + story experience
#     assert (len(experience) == 2)
    
#     # filter down to get the responses
#     responses = df_results[df_results['experiment_phase'] == 'test']
#     responses.loc[:,'response'] = responses['response'].str.lower()
#     responses = responses[['critical_word', 'word_index', 'entropy_group', 'accuracy_group', 'response', 'rt']].reset_index(drop=True)
    
#     return prolific_id[0], demographics, experience, responses


# import librosa

# def get_audio_duration(filepath):
#     """Gets the duration of an audio file in seconds."""
#     y, sr = librosa.load(filepath, sr=None)
#     return librosa.get_duration(y=y, sr=sr)

# def get_subject_audio_durations(stim_dir, n_orders=3):

#     orders = np.arange(1, n_orders + 1) # 3 orders for all tasks except black
#     df = pd.DataFrame(columns=['stim_order', 'audio_filename', 'duration'])

#     for order in orders:
#         fns = sorted(glob.glob(os.path.join(stim_dir, f'sub-{str(order).zfill(5)}', '*')))

#         for fn in fns:
#             duration = get_audio_duration(fn)

#             df.loc[len(df)] = {
#                 'stim_order': order - 1,
#                 'audio_filename': fn,
#                 'duration': duration * 1000
#             }
#     return df

# def aggregate_participant_responses(results_dir, stim_dir, task, sub_mod_list, n_orders=3):
    
#     df_results = pd.DataFrame(columns=['prolific_id', 'modality', 'subject',  'word_index', 'response', 'ground_truth', 'entropy_group', 'accuracy_group', 'rt'])
    
#     all_ids = []

#     df_order_durations = get_subject_audio_durations(os.path.join(stim_dir, task), n_orders=n_orders)
    
#     for sub, mod in sub_mod_list: 

#         # go through each task and get participant data
#         sub_dir = os.path.join(results_dir, task, mod, sub)
#         sub_stim_dir = os.path.join(stim_dir, task, sub)

#         current_order = (int(sub.split('-')[-1]) - 1) % n_orders
#         print (f'Current order: {current_order}')
        
#         df_duration = df_order_durations[df_order_durations['stim_order'] == current_order].reset_index(drop=True)

#         print (sub, mod)
#         if os.path.exists(sub_dir):
#             current_id, demographics, experience, responses = load_participant_results(sub_dir, sub)
#             responses['response'] = responses['response'].fillna('')

#             # for right now only focus on responses
#             for i, (index, response, critical_word, entropy_group, accuracy_group, rt) in enumerate(responses[['word_index', 'response', 'critical_word', 'entropy_group', 'accuracy_group', 'rt']].values):
                
#                 df_results.loc[len(df_results)] = {
#                     'prolific_id': current_id,
#                     'modality': mod,
#                     'subject': sub,
#                     'word_index': index,
#                     'response': response,
#                     'ground_truth': critical_word.lower(),
#                     'entropy_group': entropy_group, 
#                     'accuracy_group': accuracy_group,
#                     'rt': float(rt) - df_duration.loc[i, 'duration']
#                 }
            
#             all_ids.append(current_id)
#         else:
#             print (f'File not exists: {mod}, {sub}')
            
#     return df_results

# def strip_punctuation(text):
    
#     full_text = re.sub('[^A-Za-z0-9]+', '', text)
    
#     return full_text

### Interactive data cleaning

In [5]:
import enchant, string, time
from IPython.display import clear_output

AUTO_REPLACE = {
    'mum': 'mom',
    'mums': 'mom',
    'okay': 'ok',
    'sh': 'she',
    'barefoote': 'barefoot',
    'cellphone': 'phone',
    'fag': 'cigarette',
    'lite': 'light',
    'yea': 'yes',
    'yeah': 'yes'
}

def clean_participant_responses(df_results, df_transcript):
    
    # grab indices of responses --> used to index back in
    response_indices = df_results['experiment_phase'] == 'test'
    response_indices = np.where(response_indices)[0]

    checked_indices = []
    
    # filter down to get the responses
    df_responses = df_results.iloc[response_indices, :].reset_index(drop=True)
    df_responses.loc[df_responses['response'] == False, 'response'] = ""
    df_responses['response'] = df_responses['response'].apply(lambda x: x.strip().lower())

    inflect_engine = inflect.engine()
    enc_dict = enchant.Dict("en_US")

    ##############################
    ###### Run autoreplace #######
    ##############################

    for k, v in AUTO_REPLACE.items():
        df_responses.loc[df_responses['response'] == k, 'response'] = v

    ##############################
    #### Run numbers replace #####
    ##############################

    for index, df in df_responses.iterrows():
        response = df['response']
         
        if response.isdigit():
            response = inflect_engine.number_to_words(response)
            df['response'] = response
            df_responses.iloc[df.name] = df
            checked_indices.append(index)

    ##############################
    ###### Run spell-check #######
    ##############################

    print (f'##########################\n' +
           f'### Running spellcheck ###\n' +
           f'##########################\n\n')

    time.sleep(2)

    for index, df in df_responses.iterrows():

        response = df['response']
        
        if response == '':
            continue
        
        # tokens = df['response'].split()
        if not enc_dict.check(response) or response in string.punctuation and index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=False)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)
            

    ##############################
    ######## Find phrases ########
    ##############################

    clear_output(wait=False)
    print (f'##########################\n' +
           f'####### Find phrases #####\n' +
           f'##########################\n\n')

    time.sleep(2)

    # go through each row
    for index, df in df_responses.iterrows():
        response = df['response'].split()

        if len(response) > 1 and index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=False)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)
        else:
            continue

    ##############################
    ######## Final check #########
    ##############################

    clear_output(wait=False)
    print (f'##########################\n' +
           f'####### Final check ######\n' +
           f'##########################\n\n')

    time.sleep(2)

    # go through each row
    for index, df in df_responses.iterrows():

        if index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=True)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)
        
    df_results.iloc[response_indices] = df_responses

    return df_results

def prompt_correct_response(df_response, df_transcript, enc_dict, range_display=7, prompt_correction=False):
    
    word_index = df_response['word_index']
    response = df_response['response']
    ground_truth = df_response['critical_word']
    
    start_index = (word_index - range_display) if (word_index - range_display) >= 0 else 0 
    end_index = (word_index + range_display) if (word_index + range_display) - len(df_transcript) <= 0 else None
    
    # grab the context    
    start_context = df_transcript['Word_Written'].iloc[start_index:word_index]
    end_context = df_transcript['Word_Written'].iloc[word_index + 1:end_index]
    
    # display the word
    string_to_print = ""
    
    if start_index > 0:
        string_to_print+= ".... "
    
    string_to_print+= " ".join(start_context) + " " + "\033[43;30m" + response + "\033[m" + " " + " ".join(end_context)
    
    if end_index is not None and end_index < len(df_transcript):
        string_to_print+= " ...."

    clear_output(wait=False)

    print("\n\nCurrent Word: " + string_to_print)
    print ("Ground Truth: ", ground_truth)
    
    # suggestions = enc_dict.suggest(misspelled_word)
    if prompt_correction:
        prompt_correction = input('\nNeeds correction? [y/n]: ')
    
    if prompt_correction == 'y' or prompt_correction == False:
        suggestions = enc_dict.suggest(response)
        print("\Suggestions: ", suggestions)
        correct_word = input("\nCorrect Version: ")
        
        if correct_word == "-999":
            break_flag = True
            sys.exit(0)
        elif correct_word.isdigit() and int(correct_word) - 1 < len(suggestions): # User wants to use suggestion
            print (f'Using word: {suggestions[int(correct_word)-1]}')
            df_response['response'] = df_response['response'].replace(response, suggestions[int(correct_word)-1])
            time.sleep(2)
        elif len(correct_word) == 0: # User wants to Skip
            return df_response
        elif correct_word == "''" or correct_word == '""': # User wants to remove the word
            df_response['response'] = df_response['response'].replace(response, "")
        else:
            time.sleep(2)
            df_response['response'] = df_response['response'].replace(response, correct_word)

    return df_response

### Lemmatization and accuracy functions

In [30]:
# from nltk import pos_tag
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
# from collections import defaultdict
# import re
# import spacy

# STOP_WORDS = stopwords.words('english')

# # Load the spaCy model globally
# SPACY_MODEL = spacy.load("en_core_web_sm")

# def get_pos_tags(text, strip_punc=True):
#     """
#     Creates POS tags of words in a text corpus. Before tagging,
#     performs case and white space normalization and punctuation
#     removal before tagging text.
    
#     Parameters
#     ----------
#     text : list of str
#         List of text samples (lecture transcript lines, quiz questions,
#         or quiz answers) to be processed.

#     Returns
#     -------
#     words_tags : list of tuples
#         The word-postag pairings for the list of text samples with 
#         preprocessing steps applied to each element.
#     """

#     # clean spacing, normalize case, strip puncutation
#     # (temporarily leave punctuation useful for POS tagging)
#     full_text = ' '.join(text) #.lower()
    
#     if strip_punc:
#         # TLB 10/26/22 - removing ignoring of apostrophe
#         # re.sub("[^a-zA-Z\s'-]+", '', full_text)
#         full_text = re.sub("[^a-zA-Z\s'-]+", '', full_text)
    
#     # POS tagging (works better on full transcript, more context provided)
#     words_tags = pos_tag(full_text.split())

#     #case normalize word -> case doesn't matter anymore
#     return [(word.lower(), tag) for word, tag in words_tags]

# def get_lemma(word, tag, remove_stopwords=True, backend='spacy'):
#     """
#     Handles lemmatization of words. Removes stopwords and alpha-numeric
#     words from the text.
    
#     Parameters
#     ----------
#     word_tag : tuple of str
#         Tuple containing the word to be lemmatized and the accompanying
#         WordNet POS.

#     Returns
#     -------
#     lemma : str
#         The word-postag pairings for the list of text samples with 
#         preprocessing steps applied to each element.
#     """

#     # if "'" in word:
#     #     word = word.split("'")[0]

#     if "'" in word and "n't" in word:
#         lemma = word #"not"
#         return lemma
        
#     # remove stop words & digits
#     if remove_stopwords and word in STOP_WORDS or any(c.isdigit() for c in word):
#         return None
    
#     if backend == 'spacy':

#         doc = SPACY_MODEL(word)
#         lemma = doc[0].lemma_

#     elif backend == 'nltk':

#         lemmatizer = WordNetLemmatizer()
        
#         # POS tag mapping, format: {Treebank tag (1st letter only): Wordnet}
#         tagset_mapping = defaultdict(
#             lambda: 'n',   # defaults to noun
#             {
#                 'N': 'n',  # noun types
#                 'P': 'n',  # pronoun types, predeterminers
#                 'V': 'v',  # verb types
#                 'J': 'a',  # adjective types
#                 'D': 'a',  # determiner
#                 'R': 'r'   # adverb types
#             })

#         # convert Treebank POS tags to WordNet POS tags; lemmatize
#         tag = tagset_mapping[tag[0]]
#         lemma = lemmatizer.lemmatize(word, tag)
    
#     return lemma

def make_transcript_context(word, df_transcript, word_index, range_display=10):
    start_index = (word_index - range_display) if (word_index - range_display) >= 0 else 0 
    end_index = (word_index + range_display) if (word_index + range_display) - len(df_transcript) <= 0 else None

    # grab the context    
    start_context = df_transcript['word'].iloc[start_index:word_index]
    end_context = df_transcript['word'].iloc[word_index + 1:end_index]

    context = " ".join(start_context) + " " + str(word) + " " + " ".join(end_context)

    index = word_index - start_index

    return context, index

def lemmatize_responses(df_results, df_transcript, response_column='response'):
    
    for i, df in df_results.iterrows():
        response = df[response_column]
        ground_truth = df['ground_truth']
        word_index = df['word_index']

        # find the lemma of the response
        context, index = make_transcript_context(response, df_transcript, word_index)
        response, response_tag = get_pos_tags([context])[index]
        response_lemma = get_lemma(response, response_tag, remove_stopwords=False)
        df_results.loc[i, response_column] = response_lemma

        print (f'Word: {response} \t Lemma: {response_lemma}')
        
        # find the lemma of the ground truth
        context, index = make_transcript_context(ground_truth, df_transcript, word_index)
        ground_truth, ground_truth_tag = get_pos_tags([context])[index]
        ground_truth_lemma = get_lemma(ground_truth, ground_truth_tag, remove_stopwords=False)
        df_results.loc[i, 'ground_truth'] = ground_truth_lemma

        print (f'GT: {ground_truth} \t GTLemma: {ground_truth_lemma}')


    return df_results

def calculate_results_accuracy(df_results):

    # compare response to ground truth
    df_results['accuracy'] = df_results['response'] == df_results['ground_truth']
    df_results['accuracy'] = df_results['accuracy'].astype(int)

    df_accuracy = df_results.groupby(['prolific_id', 'modality', 'subject'])['accuracy'].mean() \
    .reset_index() \
    .sort_values(by='accuracy', ascending=True)

    return df_results, df_accuracy

### Analysis of human data

In [18]:
from scipy.spatial.distance import cdist

def get_human_probs(responses):
    
    unique, counts = np.unique(responses, return_counts=True)
    probs = counts / sum(counts)
    
    return probs, unique

def strip_punctuation(text):
    
    full_text = re.sub('[^A-Za-z0-9]+', '', text)
    
    return full_text

def analyze_human_results(df_transcript, df_results, word_model_info, window_size=25, top_n=5, drop_rt=None):

    if drop_rt:
        print (f'Dropping trials with RTs longer than {drop_rt} seconds')
        drop_rt = drop_rt * 1000
    
    word_model_name, word_model = word_model_info
    
    # load the masked language model
    tokenizer, model = nlp.load_mlm_model(model_name='sentence-transformers/all-mpnet-base-v2', cache_dir=CACHE_DIR)
    
    df_collapse = pd.DataFrame(columns=[
        'modality',
        'word_index',
        'ground_truth',
        'n_predictions',
        'top_pred',
        'top_prob',
        'distribution_std',
        'entropy',
        'normalized_entropy',
        'all_response_entropy',
        'predictability',
        'accuracy',
        'bert_top_word_accuracy',
        f'{word_model_name}_top_word_accuracy',
        f'{word_model_name}_avg_accuracy',
        f'{word_model_name}_max_accuracy',
        f'{word_model_name}_weighted_pred-gt_accuracy',
        f'{word_model_name}_prediction_distances',
        f'{word_model_name}_weighted_prediction_distances',
        f'{word_model_name}_centroid_prediction_distances',
        'entropy_accuracy_group',
        'n_rt_drops',
        'average_rt',
        'std_rt',
    ])
    
    df_results['response'] = df_results['response'].apply(lambda x: strip_punctuation(x) if isinstance(x, str) else '')
    
    # get segments for BERT embedding
    segments = nlp.get_segment_indices(n_words=len(df_transcript), window_size=window_size, bidirectional=True)
    
    for index, df_index in df_results.groupby('word_index'):

        print (f'Word {index}')
    
        # get all responses regardless of modality
        all_responses = list(np.unique(df_index['response']))
        ground_truth = np.unique(df_index['ground_truth'])
    
        # ensure only one ground truth word
        assert (len(ground_truth) == 1)
    
        # get bert ground-truth embedding
        word_index = np.where(segments[index] == index)[0]
        inputs = nlp.transcript_to_input(df_transcript, idxs=segments[index])
        bert_gt_embedding = nlp.extract_word_embeddings([inputs], tokenizer, model, word_indices=word_index).squeeze()
        bert_gt_embedding = bert_gt_embedding[-1, :][np.newaxis]
    
        # go through each modality and calculate results
        for modality, df_modality in df_index.groupby('modality'):

            if drop_rt:
                rt_filter = df_modality['rt'] <= drop_rt
                df_modality = df_modality.loc[rt_filter, :]
            
                print (f'Kept {sum(rt_filter)} responses')
    
            human_responses = df_modality['response']
            rts = df_modality['rt']
    
            #######################################
            ### Get probabilities for each word ###
            #######################################
    
            # get probabilities and unique words for current modality
            human_probs, unique_words = get_human_probs(human_responses)
    
            # reconcile the modality responses with all human responses --> 
            # lets us calculate entropy over same number of words
            word_idxs = [all_responses.index(word) for word in unique_words]    
            temp = np.zeros(len(all_responses))
            temp[:] = 1e-100
            temp[word_idxs] = human_probs
            all_response_probs = temp / temp.sum()
    
            #######################################
            #### Calculate probability metrics ####
            #### 1. Predictability             ####
            #### 2. Entropy                    ####
            #### 3. Top probabiltity           ####
            #######################################
    
            # get predictability scores --> 
            # predictability is the count of number of accurate predictions over total predictions
            predictability =  sum(np.asarray(human_responses) == ground_truth) / len(human_responses)
            
            # calculate entropy
            entropy = stats.entropy(human_probs)
            norm_entropy = entropy / np.log(len(human_probs))

            # calculate entropy over all responses
            all_response_entropy = stats.entropy(all_response_probs)
    
            #######################################
            #### Calculate accuracy metrics    ####
            #### 1. Binary accuracy            ####
            #### 2. Continuous accuracy        ####
            ####     - fasttext/word-model     ####
            ####     - BERT                    ####
            #### 3. Top probabiltity           ####
            #######################################
    
            # sort the probabilities from highest to lowest
            sorted_prob_idxs = np.argsort(human_probs)[::-1]
            sorted_probs = human_probs[sorted_prob_idxs]
    
            # first trim down to the number of top words to use
            if top_n is not None and len(unique_words) < top_n:
                # grab all the unique words
                all_word_idxs = sorted_prob_idxs[:len(unique_words)]
                top_n_words = unique_words[all_word_idxs]
            else:
                # grab the top words
                top_word_idxs = sorted_prob_idxs[:top_n]
                top_n_words = unique_words[top_word_idxs]
    
            top_word = top_n_words[0]
            top_prob = sorted_probs[0]
            
            # 1. Binary accuracy: top-1 accuracy 
            binary_accuracy = top_word == ground_truth

            # 2. Continuous accuracy (word-model):
            #    Given a word model (e.g., word2vec, fasttext), 
            #    get the cosine similarity b/w top prediction and ground-truth
            top_word_accuracy, _ = nlp.get_word_vector_metrics(word_model, [top_word], ground_truth[0])
            max_pred_similarity, _ = nlp.get_word_vector_metrics(word_model, top_n_words, ground_truth[0], method='max')
            
            # 3. Prediction density: density of word vectors produced from 
            pred_similarity, pred_distances = nlp.get_word_vector_metrics(word_model, top_n_words, ground_truth[0])
            
            # get average similarity and distance weighted by the respective proportion of 
            # participants that predicted it
            weighted_pred_distances = np.nanmean(pred_distances * sorted_probs)
            weighted_pred_similarity = np.nanmean(pred_similarity * sorted_probs)

            # get distance to centroid
            predicted_vectors = [word_model[word] for word in unique_words if word in word_model]
            predicted_vectors = np.stack(predicted_vectors)

            centroid_distance = cdist(predicted_vectors.mean(0)[np.newaxis], predicted_vectors, metric='cosine')
            centroid_distance = np.nanmean(centroid_distance)

            if np.isnan(pred_distances):
                pred_distances = 0
    
            # create a copy and substitute word in
            df_substitute = df_transcript.copy()
            df_substitute.loc[index, 'word'] = top_word
    
            # embed the word into BERT space and get similarity
            inputs = nlp.transcript_to_input(df_substitute, idxs=segments[index])
            bert_top_word_embedding = nlp.extract_word_embeddings([inputs], tokenizer, model, word_indices=word_index).squeeze()
            bert_top_word_embedding = bert_top_word_embedding[-1, :][np.newaxis]
            
            bert_similarity = 1 - cdist(bert_gt_embedding, bert_top_word_embedding, metric='cosine').squeeze()
    
            #######################################
            ##### Add to the overall dataframe ####
            #######################################
    
            # binary accuracy of 1 or 0
            accuracy = (top_word == ground_truth).astype(int).squeeze()
            entropy_accuracy_group = df_modality['entropy_group'].iloc[0] + '-' + df_modality['accuracy_group'].iloc[0]
    
            df_collapse.loc[len(df_collapse)] = {
                'modality': modality,
                'word_index': index,
                'ground_truth': ground_truth[0],
                'n_predictions': len(unique_words),
                'top_pred': top_word,
                'top_prob': top_prob,
                'distribution_std': np.std(human_probs),
                'entropy': entropy,
                'normalized_entropy': np.nan_to_num(norm_entropy), 
                'all_response_entropy': all_response_entropy,
                'predictability': predictability,
                'accuracy': accuracy,
                'bert_top_word_accuracy': bert_similarity,
                f'{word_model_name}_top_word_accuracy': top_word_accuracy,
                f'{word_model_name}_avg_accuracy': np.nanmean(pred_similarity),
                f'{word_model_name}_max_accuracy': max_pred_similarity,
                f'{word_model_name}_weighted_pred-gt_accuracy': weighted_pred_similarity,
                f'{word_model_name}_prediction_distances': pred_distances,
                f'{word_model_name}_weighted_prediction_distances': weighted_pred_distances,
                f'{word_model_name}_centroid_prediction_distances': centroid_distance,
                'entropy_accuracy_group': entropy_accuracy_group,
                'n_rt_drops': sum(rt_filter) if drop_rt is not None else 0,
                'average_rt': rts.mean(),
                'std_rt': rts.std(),
            }
    
    return df_collapse

### Aggregation of human and LLM data

In [8]:
def get_model_word_quadrants(model_name, task, selected_idxs=None, accuracy_type='word2vec_avg_accuracy'):
    
    # FOR DIVIDING THE MODEL RESULTS INTO QUADRANTS
    ACCURACY_TYPE = accuracy_type
    ACCURACY_PERCENTILE = 50
    WINDOW_SIZE = 100
    TOP_N = 5
    
    preproc_dir = os.path.join(BASE_DIR, 'stimuli', 'preprocessed')
    
    # load our preprocessed file --> get the indices of the prediction words
    df_preproc = pd.read_csv(os.path.join(preproc_dir, task, f'{task}_transcript-preprocessed.csv'))
    nwp_idxs = np.where(df_preproc['NWP_Candidate'])[0]
    
    # select based on model quadrants --> trim down to only the words of interest
    model_results = load_model_results(models_dir, model_name=model_name, task=task, window_size=WINDOW_SIZE, top_n=TOP_N)
    model_results.loc[:, 'binary_accuracy'] = model_results['binary_accuracy'].astype(bool)
    model_results = model_results.iloc[nwp_idxs]
    
    # now grab the current model divided over the 50th percentile
    # while we originally divided words on the 45th percentile of gpt2, we want to see patterns across models
    df_divide = divide_nwp_dataframe(model_results, accuracy_type=ACCURACY_TYPE, percentile=ACCURACY_PERCENTILE, drop=False)
    
    return df_divide.loc[selected_idxs, ['entropy_group', 'accuracy_group']]
    
def compare_human_model_accuracy(human_results, model_names, word_model_info, task, top_n=1, window_size=25, lemmatize=False):
    
    # set the directories we need
    models_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions')
    preproc_dir = os.path.join(BASE_DIR, 'stimuli/preprocessed', task)

    # load our preprocessed file --> get the indices of the prediction words
    df_transcript = pd.read_csv(os.path.join(preproc_dir, f'{task}_transcript-preprocessed.csv'))
    df_transcript = df_transcript.rename(columns={'Word_Written': 'word', 'Punctuation': 'punctuation'})

    df_selected = pd.read_csv(os.path.join(preproc_dir, f'{task}_transcript-selected.csv'))
    selected_idxs = np.where(df_selected['NWP_Candidate'])[0]
    
    # first get top 1 prediction for humans within each modality
    df_collapsed_results = analyze_human_results(df_transcript, human_results, word_model_info, top_n=top_n)
    word_model_name, word_model = word_model_info
    
    ## TLB THIS IS A HACK CHANG IT
    ### YOU KNOW IT IS CHANGE IT
    all_ground_truth_words = df_collapsed_results[df_collapsed_results['modality'] == 'audio']['ground_truth'].tolist()

    # start the list with the human results
    human_model_combined = [df_collapsed_results]
    
    for model_name in model_names:
        
        # load the results for the current model
        model_results = load_model_results(models_dir, model_name=model_name, task=task, top_n=top_n, window_size=window_size)
        model_results['top_pred'] = model_results['top_n_predictions'].str[0]
        model_results = model_results.rename(columns={'ground_truth_prob': 'predictability'})
        # model_results_trimmed = model_results.loc[selected_idxs, :]
        
        # trim down to only predicted word accuracies
        model_results_trimmed = model_results.loc[selected_idxs, ['top_pred', 'top_prob', 'predictability', f'{word_model_name}_avg_accuracy', 'entropy']]
        model_results_trimmed = model_results_trimmed.reset_index().rename(columns={'index': 'word_index'})
        model_results_trimmed['modality'] = model_name

        # get the quadrants and determine entropy accuracy groups
        model_quadrants = get_model_word_quadrants(model_name, task, selected_idxs, accuracy_type=f'{word_model_name}_max_accuracy').reset_index(drop=True)
        entropy_accuracy_groups = model_quadrants['entropy_group'] + '-' +  model_quadrants['accuracy_group']

        # add in entropy accuracy quadrant organization to the dataframe
        model_results_trimmed['entropy_accuracy_group'] = entropy_accuracy_groups    
        model_results_trimmed['ground_truth'] = all_ground_truth_words

        # lemmatize if wanted
        if lemmatize:
            model_results_trimmed = lemmatize_responses(model_results_trimmed, df_transcript, response_column='top_pred')
        
        human_model_combined.append(model_results_trimmed)
    
    # concatenate all dataframes 
    human_model_combined = pd.concat(human_model_combined).reset_index(drop=True)
    
    # use gpt2-xl as our model of organizing human results
    if 'prosody' not in model_names[0]:
        entropy_accuracy = human_model_combined[human_model_combined['modality'] == 'gpt2-xl']['entropy_accuracy_group'].tolist()   
        human_model_combined.loc[human_model_combined['modality'] == 'audio', 'entropy_accuracy_group'] = entropy_accuracy
        human_model_combined.loc[human_model_combined['modality'] == 'text', 'entropy_accuracy_group'] = entropy_accuracy

    # calculate accuracy
    human_model_combined['accuracy'] = human_model_combined['top_pred'] == human_model_combined['ground_truth']
    human_model_combined['accuracy'] = human_model_combined['accuracy'].astype(int) * 100

    print (f"Total missing values: {human_model_combined[f'{word_model_name}_avg_accuracy'].isna().sum()}")

    # select the columns that we want to save out for gross-comparison
    selected_columns = ['modality', 'word_index', 'top_pred', 'ground_truth', 'accuracy', f'{word_model_name}_avg_accuracy', 
                        'top_prob', 'predictability', 'entropy', 'entropy_accuracy_group']
    
    human_model_combined = human_model_combined.loc[:, selected_columns]
    
    return human_model_combined


### Comparison of probability distributions

In [9]:
import torch
from torch.nn import functional as F
from scipy.special import kl_div, rel_entr
from scipy import stats
from scipy.spatial import distance

def load_logits(model_dir, model_name, task, window_size, word_index):
    '''
    Loads model data from directory
    '''

    if 'prosody' in model_name:
        model_dir = os.path.join(model_dir, task, 'prosody-models', model_name, f'window-size-{window_size}')
    else:
        model_dir = os.path.join(model_dir, task, model_name, f'window-size-{window_size}')

    logits_fns = natsorted(glob.glob(os.path.join(model_dir, 'logits', f'*{str(word_index).zfill(5)}*.pt')))
    
    assert (len(logits_fns) == 1)
    
    return torch.load(logits_fns[0])

def compare_human_model_distributions(tokenizer, word_model, human_responses, all_responses, model_logits, ground_truth):
    
    df = pd.DataFrame(columns=[
        'top_word_human', 
        'top_word_model',
        'top_word_model_adjusted',
        'prob_human',
        'prob_model',
        'prob_model_adjusted', 
        'prob_model_human_pred',
        'predictability_model',
        'predictability_human',
        'continuous_predictability_human',
        'log_odds_predictability_model',
        'log_odds_predictability_human',
        'log_odds_continuous_predictability_human',
        'entropy',
        'kl_divergence',
        'relative_entropy',
        'wasserstein_dist',
        'jensenshannon_dist',
        'ks_stat'
    ])
    
    pre_filter = len(human_responses)
    human_responses = list(filter(None, human_responses))
    post_filter = len(human_responses)
    
    if pre_filter != post_filter:
        print (f'Removed {pre_filter - post_filter} empty responses')
    
    model_probs = F.softmax(model_logits, dim=-1).squeeze()
    prob_model = model_probs.max().item()
    top_word_model = tokenizer.decode(model_probs.argmax())

    entropy = stats.entropy(model_probs)
    
    ## get ground truth word prob
    gt_token = tokenizer.encode(ground_truth)
    gt_predictability_model = model_probs[gt_token].mean(0).item()
   
    # continuous predictability - average semantic distance of words from ground truth word
    human_predictability = sum(np.asarray(human_responses) == ground_truth) / len(human_responses)
    continuous_predictability = (1 - distance.cdist(word_model[ground_truth][np.newaxis], word_model[human_responses], metric='cosine')).mean()

    if human_predictability == 0:
        log_odds_human_predictability = statistics.log_odds(1e-2)
    else:
        log_odds_human_predictability = statistics.log_odds(human_predictability)

    log_odds_model_predictability = statistics.log_odds(gt_predictability_model)
    log_odds_continuous_predictability = statistics.log_odds(continuous_predictability)
        
    # get the probability distribution of the human responses --> also return the unique words
    human_probs, unique_words = get_human_probs(human_responses)
    prob_human = human_probs.max()
    
    # get the words indices in the overall array then add in the human probs
    word_idxs = [all_responses.index(word) for word in unique_words]    
    temp = np.zeros(len(all_responses))
    temp[word_idxs] = human_probs
    human_probs = temp
    
    # get probability of the words humans chose within the model distribution
    # then normalize to the number of samples
    model_adjusted_probs = np.asarray([nlp.get_word_prob(tokenizer, word, model_logits) for word in all_responses])
    model_adjusted_probs = model_adjusted_probs / model_adjusted_probs.sum()

    # select the probability of the top word that humans chose
    prob_model_adjusted = model_adjusted_probs[model_adjusted_probs.argmax()]
    prob_model_human_pred = model_adjusted_probs[human_probs.argmax()]

    # grab the human and model top words
    top_word_human = all_responses[human_probs.argmax()]
    top_word_model_adjusted = all_responses[model_adjusted_probs.argmax()]

    # now calculate kl divergence between the human and adjusted model distribution
    # measures how different P (human) is from Q (model) distribution
    #  KL divergence of P from Q is the expected excess surprise from 
    #  using Q as a model when the actual distribution is P
    kl_divergence = kl_div(human_probs, model_adjusted_probs)
    kl_divergence[np.isinf(kl_divergence)] = 0
    kl_divergence = kl_divergence.sum().item()
    
    relative_entropy = rel_entr(human_probs, model_adjusted_probs).sum().item()
    
    # earth movers distance between adjusted probs
    wasserstein_dist = stats.wasserstein_distance(human_probs, model_adjusted_probs)
    
    jensenshannon_dist = distance.jensenshannon(human_probs, model_adjusted_probs)
    
    ks_stats = stats.kstest(human_probs, model_adjusted_probs)
    
    df.loc[len(df)] = {
        'top_word_human': top_word_human,
        'top_word_model': top_word_model,
        'top_word_model_adjusted': top_word_model,
        'prob_human': prob_human,
        'prob_model': prob_model,
        'prob_model_adjusted': prob_model_adjusted, 
        'prob_model_human_pred': prob_model_human_pred,
        'predictability_model': gt_predictability_model,
        'predictability_human': human_predictability,
        'continuous_predictability_human': continuous_predictability,
        'log_odds_predictability_human': log_odds_human_predictability.astype(float),
        'log_odds_predictability_model': log_odds_model_predictability.astype(float),
        'log_odds_continuous_predictability_human': log_odds_continuous_predictability.astype(float),
        'entropy': entropy,
        'kl_divergence': kl_divergence,
        'relative_entropy': relative_entropy,
        'wasserstein_dist': wasserstein_dist,
        'jensenshannon_dist': jensenshannon_dist,
        'ks_stat': ks_stats[0]
    }
    
    return df

## Data cleaning

In [2]:
EXPERIMENT_NAME = 'next-word-prediction'
EXPERIMENT_VERSION = 'final-multimodal-01'
TASK = 'wheretheressmoke'

# set the directories we need
gentle_dir = os.path.join(BASE_DIR, 'stimuli/gentle')
results_dir = os.path.join(BASE_DIR, 'experiments',  EXPERIMENT_NAME, 'results', EXPERIMENT_VERSION)
preproc_dir = os.path.join(BASE_DIR, 'stimuli/preprocessed')
models_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions')

# make directories
cleaned_results_dir = os.path.join(BASE_DIR, 'experiments',  EXPERIMENT_NAME, 'cleaned-results', EXPERIMENT_VERSION)
behavioral_dir = os.path.join(BASE_DIR, 'derivatives/results/behavioral/')
stim_dir = os.path.join(BASE_DIR, f'stimuli/cut_audio/{EXPERIMENT_VERSION}')

# utils.attempt_makedirs(behavioral_dir)

In [3]:
word_model_name = 'fasttext'
word_model = nlp.load_word_model(model_name=word_model_name, cache_dir=CACHE_DIR)
word_model_info = (word_model_name, word_model)

Loading fasttext from saved .bin file.


In [19]:
# load transcript
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-selected_prosody.csv'))
# df_transcript = df_transcript.rename(columns={'Word_Written': 'word', 'Punctuation': 'punctuation'})

prosody_columns = [
    'prominence', 'prominence_mean', 'prominence_std', 
    'relative_prominence', 'relative_prominence_norm',
    'boundary', 'boundary_mean', 'boundary_std', 
]

In [20]:
df_transcript.loc[df_aggregated_results['word_index'], prosody_columns]

Unnamed: 0,prominence,prominence_mean,prominence_std,relative_prominence,relative_prominence_norm,boundary,boundary_mean,boundary_std
4,1.210,0.4808,0.537948,0.7292,1.355520,1.091,0.5164,0.514090
38,1.147,0.1800,0.312102,0.9670,3.098347,1.383,0.1868,0.297223
61,0.870,0.2798,0.303629,0.5902,1.943820,0.710,0.0676,0.096442
83,1.718,0.3008,0.547868,1.4172,2.586756,0.034,0.4524,0.561306
130,1.056,0.5346,0.703242,0.5214,0.741423,1.285,0.4748,0.609704
...,...,...,...,...,...,...,...,...
1683,1.754,0.6616,0.965895,1.0924,1.130972,0.000,1.0280,0.851370
1701,1.772,0.4068,0.620155,1.3652,2.201386,2.250,0.5082,0.604410
1732,0.004,0.2988,0.313628,-0.2948,-0.939967,0.214,0.2392,0.305504
1745,2.668,0.2258,0.475963,2.4422,5.131069,1.571,0.1776,0.355200


In [6]:
logits_dir = models_dir.replace(BASE_DIR, SCRATCH_DIR)

df_distribution_comparison = analysis.compare_human_model_distributions(
    df_aggregated_results, word_model_info, models_dir=logits_dir, model_name='gpt2', task=TASK)

Index(['word', 'Case', 'POS', 'POS_Definition', 'punctuation', 'Stop_Word',
       'Digit', 'Word_Vocab', 'Onset', 'Offset', 'Duration', 'Named_Entity',
       'NWP_Candidate', 'entropy_group', 'accuracy_group', 'prominence',
       'boundary', 'prosody_mean', 'prosody_std', 'relative_prosody',
       'relative_norm', 'boundary_mean', 'boundary_std'],
      dtype='object')

In [7]:
df_aggregated_results = []

for modality in ['audio', 'text']:
    df = analysis.aggregate_participant_responses(cleaned_results_dir, stim_dir, task=TASK, modality=modality, n_orders=3)
    df_aggregated_results.append(df)

df_aggregated_results = pd.concat(df_aggregated_results).reset_index(drop=True)
df_analyzed_results = analysis.analyze_human_results(df_transcript, df_aggregated_results, word_model_info)

Aggregating wheretheressmoke - audio
Total of 150 subjects


100%|██████████| 150/150 [00:59<00:00,  2.51it/s]


Aggregating wheretheressmoke - text
Total of 150 subjects


100%|██████████| 150/150 [00:58<00:00,  2.55it/s]


In [139]:
# load transcript
df_preproc_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))
candidate_rows = np.where(df_preproc_transcript['NWP_Candidate'])[0]

df_model_results = analysis.analyze_model_accuracy(df_transcript, word_model_info=word_model_info, models_dir=models_dir, model_name='gpt2', task=TASK, candidate_rows=candidate_rows, lemmatize=True)

Total missing values: 1
Lemmatizing column: top_pred


237it [00:01, 179.54it/s]


Lemmatizing column: ground_truth


237it [00:01, 182.13it/s]
237it [00:00, 635.15it/s]


In [226]:
df_distribution_comparison

Unnamed: 0,model_name,modality,word_index,ground_truth,entropy_group,accuracy_group,human_top_word,human_prob,human_predictability,human_log_odds_predictability,...,model_prob,model_predictability,model_log_odds_predictability,model_entropy,model_prob_adjusted,model_prob_human_prediction,kl_divergence,earthmovers_dist,jensenshannon_dist,ks_stat
0,gpt2,audio,4,secretly,high,low,grabbed,0.42,0.00,-4.595120,...,0.081928,1.601153e-07,-15.647372,5.113877,0.043796,0.012390,2.874845,0.014823,0.737965,0.85
1,gpt2,audio,11,foot,high,low,hand,0.44,0.08,-2.442347,...,0.099520,3.467208e-06,-12.572158,6.071649,0.022090,0.022090,3.511974,0.013749,0.783414,0.85
2,gpt2,audio,30,shoes,low,low,shoes,0.24,0.24,-1.152680,...,0.252326,7.632860e-08,-16.388218,4.969536,0.061560,0.000812,4.817594,0.010244,0.777128,0.86
3,gpt2,audio,34,crying,high,low,cold,0.30,0.02,-3.891820,...,0.074556,8.813146e-08,-16.244436,5.680669,0.109407,0.000909,4.066211,0.009090,0.739773,0.84
4,gpt2,audio,38,wallet,high,low,money,0.30,0.00,-4.595120,...,0.112758,8.223946e-09,-18.616216,5.458792,0.142400,0.005130,3.257522,0.009298,0.731761,0.81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,gpt2,text,1738,gas,low,low,money,0.44,0.28,-0.944462,...,0.284312,1.413005e-07,-15.772377,4.558704,0.113624,0.019861,4.114797,0.013755,0.776092,0.89
470,gpt2,text,1745,baby,high,low,baby,0.20,0.20,-1.386294,...,0.124569,2.310813e-09,-19.885666,5.940462,0.062317,0.000154,4.235922,0.008550,0.734537,0.82
471,gpt2,text,1758,seven,low,low,the,0.42,0.16,-1.658228,...,0.426694,7.775363e-09,-18.672306,2.373669,0.115283,0.024490,4.355853,0.010940,0.746320,0.88
472,gpt2,text,1783,held,high,low,know,0.26,0.02,-3.891820,...,0.179073,2.181768e-08,-17.640545,4.787670,0.072488,0.007105,3.794607,0.008117,0.760187,0.79


In [5]:

df_lemmatized_results = pd.read_csv(os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv'))

In [9]:
logits_dir = models_dir.replace(BASE_DIR, SCRATCH_DIR)

df_distribution_comparison = analysis.compare_human_model_distributions(
    df_lemmatized_results, word_model_info, models_dir=logits_dir, model_name='gpt2', task=TASK)

  0%|          | 0/474 [00:00<?, ?it/s]


In [43]:

from scipy.spatial import distance

human_word, model_word = df_distribution_comparison

human_word = human_word
model_word = model_word

human_vector = word_model[human_word]
model_vector = word_model[model_word]

1 - distance.cdist(human_vector[np.newaxis], model_vector[np.newaxis], metric='cosine')


array([[0.6583414]])

array([[0.07402906]])

### Clean meta file and remove errors

In [96]:
# check all the files
checker, approve_ids = check_used_files(EXPERIMENT_NAME, EXPERIMENT_VERSION, TASK, clean_errors=True)
approve_ids = list(map(str, approve_ids))

# find the subject list based on complete files
file_idx, modality, sub_list, prolific_ids = zip(*checker['complete'])
sub_mod_list = list(zip(sub_list, modality))

print (len(set(approve_ids)))
print ('\n'.join(approve_ids))

400
5a5679b58e6259000175bec5
604ceec4b2028fd4ddbbd1d4
60956679c5e66352f1aba6f1
60fd17aabf0cf899ab257db4
5f84e472052cf81ae5752252
5d75555c99bf5b0018864ed3
60af9bae67a5ae86b0c4bd68
63dab0380b8791f9560fd329
6071d0e406363ac1fd4bdc00
66d04b4f8c9b90e6012a510e
5e695632d40a492070942196
58497529bd87380001552f71
6483605a0d0e43c659c92801
543e5938fdf99b73557f2136
605ab9d5c01d0b0fc69930c5
6413603a62c72120012e05be
5483702ffdf99b24c654b810
60cb6eb4aa635f7ad59164e9
56f699e876348f000c883bba
6109a00f2f5ee8436e836a0c
59eb9c7a6a6c3a000154bb97
62fb7a00e90a516a23cfa123
6617a07b82321144ac5c41e0
650864268a31fd9d2b8f84ee
55a29d4dfdf99b602e6b069b
5a9ad69adbdb470001eee577
5877066b1ed9d10001050b9a
5d3e47ba29130a00015cd41d
5cc39aa64d7a7f0001817c43
60b1073cfd6d4ef92a3fed74
62861bcc5f0edf6340c914df
5a94d2e989de8200013ecab6
66588b59be1d4eb7055c8163
63f7bbc4fdef28a0923b7166
654d3a401d8c1ce5b38925b3
63d7b5321ace6099e3a83386
62e154b0682e270b4acdf5ee
59497c498fb2140001afda85
5e1e5c3c80e02e1c36679187
62fe6312269d0e4ae8942

#### Specific modification for stimulus black

In [120]:
import os
import datetime
def modification_date(filename):
    t = os.path.getmtime(filename)
    return datetime.datetime.fromtimestamp(t)


all_indices = []

for i, (sub, mod) in enumerate(sub_mod_list):
    sub_results_dir = os.path.join(results_dir, TASK, mod, sub)
    info = modification_date(os.path.join(sub_results_dir, f'{sub}_next-word-prediction.csv'))

    if info.year == 2024:
        all_indices.append(i)

current_ids = [approve_ids[idx] for idx in all_indices]
print ('\n'.join(current_ids))

63dab0380b8791f9560fd329
66d04b4f8c9b90e6012a510e
605ab9d5c01d0b0fc69930c5
56f699e876348f000c883bba
6617a07b82321144ac5c41e0
5cc39aa64d7a7f0001817c43
66588b59be1d4eb7055c8163
62fe6312269d0e4ae894297b
665a2de64db1af60265a9ddb
66b288cf788b139df178ce5e
6643c77e0c522dd6c4be1bd1
668ae77d36b52c2937a6599f
6438831795f1371e7745c477
66ae700d42e57234e691d55e
66f47421b650b6c39e80fa0d
5de01c3d5253b00996930a50
66435e2e5c5b5aceb2195908
5d53bdc9b18d590018dac171
5d3b784f80a6250001de7e29
66d6eca0724772a1edfe14f9
59a5d372c094d800013f1c47
5efe3d722e800832404ef5bb
628d14b52b5bd1d83ed1f7d2
5ca73fcf7067e70001f7a41f
66e464861f78048b57ab0f50
62da79cb2e28aefb55569ce1
6418dc6fd0f852a233a10350
5ebc2405690bf0025dad6a1a
5dcf0f73af16d3067b87757f
6596c3663bd4674351a26eb8
56b4fd68a9d33a000a891ff9
60e55bb59cfd0aa9803b4470
669e145ae371f3230bb7eb6a
5ac9e0299534ba0001c76e80
66cc188a299b596f5682ef8c
66c5d1accc092f147f94b363
5de3c83c2033793be6591e53
6616d9fdba3ae8945cba4a08
669bdbdfad2ba350a4c544b0
5d3ee89122ed84000172272a


### Manually spell-check participants data

In [97]:
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))

for i, (sub, modality) in enumerate(sub_mod_list):

    sub_cleaned_dir = os.path.join(cleaned_results_dir, TASK, modality, sub)
    out_fn = os.path.join(sub_cleaned_dir, f'{sub}_next-word-prediction.csv')
    
    utils.attempt_makedirs(sub_cleaned_dir)

    ############################
    #### Load subject data #####
    ############################

    if os.path.exists(out_fn):
        print (f'File exists: {modality} {sub}')
        df_results = pd.read_csv(out_fn)

        for k, v in AUTO_REPLACE.items():
            df_results.loc[df_results['response'] == k, 'response'] = v
        
        df_results.to_csv(out_fn, index=False)

        print (f'Saved file for {modality} {sub}')
        continue
    else:
        print (f'Correcting: {modality} {sub}')
        # load and filter down to response trials 
        sub_dir = os.path.join(results_dir, TASK, modality, sub)
        df_results = pd.read_csv(os.path.join(sub_dir, f'{sub}_next-word-prediction.csv')).fillna(False)

    ############################
    ###### Check responses #####
    ############################

    df_results['word_index'] = df_results['word_index'].astype(int)
    df_results = clean_participant_responses(df_results, df_transcript)
    ############################
    #### Save cleaned data #####
    ############################
    
    df_results.to_csv(out_fn, index=False)

    print (f'Saved file for {modality} {sub}')
    # if os.path.exists(sub_dir):
    #     current_id, demographics, experience, responses = load_participant_results(sub_dir, sub)
    #     responses['response'] = responses['response'].fillna('')

File exists: audio sub-00001
Saved file for audio sub-00001
File exists: text sub-00001
Saved file for text sub-00001
File exists: audio sub-00002
Saved file for audio sub-00002
File exists: text sub-00002
Saved file for text sub-00002
File exists: audio sub-00003
Saved file for audio sub-00003
File exists: text sub-00003
Saved file for text sub-00003
File exists: audio sub-00004
Saved file for audio sub-00004
File exists: text sub-00004
Saved file for text sub-00004
File exists: audio sub-00005
Saved file for audio sub-00005
File exists: text sub-00005
Saved file for text sub-00005
File exists: audio sub-00006
Saved file for audio sub-00006
File exists: text sub-00006
Saved file for text sub-00006
File exists: audio sub-00007
Saved file for audio sub-00007
File exists: text sub-00007
Saved file for text sub-00007
File exists: audio sub-00008
Saved file for audio sub-00008
File exists: text sub-00008
Saved file for text sub-00008
File exists: audio sub-00009
Saved file for audio sub-00

## Compile data across participants

### Load word models for semantic comparisons

In [10]:
word_model_name = 'fasttext'
word_model = nlp.load_word_model(model_name=word_model_name, cache_dir=CACHE_DIR)
word_model_info = (word_model_name, word_model)

Loading fasttext from saved .bin file.


### Load all participants data and save file

In [29]:
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_clean = pd.read_csv(out_fn)

In [32]:
x

(('audio', 4),
                     prolific_id modality    subject  word_index response  \
 0      5df0932af9a01d0dabd9313d    audio  sub-00001           4  grabbed   
 237    62a999b8ed876efe5ffe9599    audio  sub-00004           4  grabbed   
 474    62682277645054f5802459b8    audio  sub-00007           4     held   
 711    5d8e154af2858200171fdb95    audio  sub-00010           4  grabbed   
 948    5dcddf9f4d51e40a5292727d    audio  sub-00013           4   picked   
 1185   5a1342b7f2e3460001edbfd2    audio  sub-00016           4  touched   
 1422   6147c5874b61952e42e9b2bd    audio  sub-00019           4  grabbed   
 1659   65562141ae0a6660324ce572    audio  sub-00022           4   picked   
 1896   6002e90144049f32edaf9ccf    audio  sub-00025           4  grabbed   
 2133   5c9623dd35599200175642e4    audio  sub-00028           4       to   
 2370   5ce9b0e4be5a6b00163f6870    audio  sub-00031           4   picked   
 2607   5fd8ea198512a801831f6b85    audio  sub-00034         

In [38]:
a

('audio', 4)

In [39]:
for a, b, c in df_clean.groupby(['modality', 'word_index']):
    sys.exit(0)
    # print (x[0])
    if i == 2:
        sys.exit(0)

ValueError: not enough values to unpack (expected 3, got 2)

In [98]:
stim_dir = os.path.join(BASE_DIR, f'stimuli/cut_audio/{EXPERIMENT_VERSION}')

# load transcript
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))
df_transcript = df_transcript.rename(columns={'Word_Written': 'word', 'Punctuation': 'punctuation'})

# load all results, calculate accuracy, then save
df_all_results = aggregate_participant_responses(cleaned_results_dir, stim_dir, TASK, sub_mod_list, n_orders=4 if TASK == 'black' else 3)
# df_all_results, df_all_accuracy = calculate_results_accuracy(df_all_results)

# save compiled cleaned results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_all_results.to_csv(out_fn, index=False)

Current order: 0
sub-00001 audio
Current order: 0
sub-00001 text
Current order: 1
sub-00002 audio
Current order: 1
sub-00002 text
Current order: 2
sub-00003 audio
Current order: 2
sub-00003 text
Current order: 3
sub-00004 audio
Current order: 3
sub-00004 text
Current order: 0
sub-00005 audio
Current order: 0
sub-00005 text
Current order: 1
sub-00006 audio
Current order: 1
sub-00006 text
Current order: 2
sub-00007 audio
Current order: 2
sub-00007 text
Current order: 3
sub-00008 audio
Current order: 3
sub-00008 text
Current order: 0
sub-00009 audio
Current order: 0
sub-00009 text
Current order: 1
sub-00010 audio
Current order: 1
sub-00010 text
Current order: 2
sub-00011 audio
Current order: 2
sub-00011 text
Current order: 3
sub-00012 audio
Current order: 3
sub-00012 text
Current order: 0
sub-00013 audio
Current order: 0
sub-00013 text
Current order: 1
sub-00014 audio
Current order: 1
sub-00014 text
Current order: 2
sub-00015 audio
Current order: 2
sub-00015 text
Current order: 3
sub-0001

### Lemmatize data (responses + ground truth) and save 

In [99]:
# use transcript to lemmatize responses, then calculate accuracy
df_lemmatized_results = lemmatize_responses(df_all_results.copy(), df_transcript)
df_lemmatized_results, df_lemmatized_accuracy = calculate_results_accuracy(df_lemmatized_results)

# save compiled lemmatized results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results.to_csv(out_fn, index=False)

Word: rocks 	 Lemma: rock
GT: radio 	 GTLemma: radio
Word: prepare 	 Lemma: prepare
GT: play 	 GTLemma: play
Word: sleeve 	 Lemma: sleeve
GT: sleeve 	 GTLemma: sleeve
Word: record 	 Lemma: record
GT: side 	 GTLemma: side
Word: one 	 Lemma: one
GT: largest 	 GTLemma: large
Word: get 	 Lemma: get
GT: get 	 GTLemma: get
Word: events 	 Lemma: event
GT: events 	 GTLemma: event
Word: wanting 	 Lemma: want
GT: black 	 GTLemma: black
Word: were 	 Lemma: be
GT: sound 	 GTLemma: sound
Word: and 	 Lemma: and
GT: people 	 GTLemma: people
Word: wanted 	 Lemma: want
GT: convinced 	 GTLemma: convince
Word: and 	 Lemma: and
GT: dreams 	 GTLemma: dream
Word: written 	 Lemma: write
GT: read 	 GTLemma: read
Word: know 	 Lemma: know
GT: know 	 GTLemma: know
Word: getting 	 Lemma: get
GT: town 	 GTLemma: town
Word: friend 	 Lemma: friend
GT: kids 	 GTLemma: kid
Word: kids 	 Lemma: kid
GT: voice 	 GTLemma: voice
Word: what 	 Lemma: what
GT: trying 	 GTLemma: try
Word: and 	 Lemma: and
GT: know 	 GTLemma: kn

### Analyze human data and save

In [100]:
# load transcript
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))
df_transcript = df_transcript.rename(columns={'Word_Written': 'word', 'Punctuation': 'punctuation'})

# load lemmatized file and analyze human results --> use all words
lemmatized_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results = pd.read_csv(lemmatized_fn)

# combine the data and lemmatize model results
df_analyzed_lemmatized = analyze_human_results(df_transcript, df_lemmatized_results, word_model_info, top_n=None, drop_rt=None)

# save lemmatized human-model results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_human-lemmatized.csv')
df_analyzed_lemmatized.to_csv(out_fn, index=False)

Word 6

audio
['college' 'high' 'school' 'work' 'intern' 'collage' 'class' 'chemistry']
[0.48 0.26 0.16 0.02 0.02 0.02 0.02 0.02]
1.3869596872523586

text
['school' 'high' 'college' 'a' 'team' 'middle' 'medicine' 'construction']
[0.48 0.22 0.18 0.04 0.02 0.02 0.02 0.02]
1.435793895702573
Word 12

audio
['job' 'assignment' 'boyfriend' 'car' 'degree' 'taste' 'period' 'kiss' 'f'
 'crush' 'traumatic' 'award' 'book' 'challenge' 'experience' 'diploma'
 'exam' 'tattoo' 'fail' 'injury' 'lesson' 'qualification' 'a']
[0.18 0.12 0.1  0.08 0.06 0.04 0.04 0.04 0.04 0.04 0.02 0.02 0.02 0.02
 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02]
2.825117931635182

text
['job' 'kiss' 'girlfriend' 'car' 'boyfriend' 'tattoo' 'ticket' 'crush'
 'assignment' 'bike' 'certificate' 'dog' 'degree' 'drink' 'f'
 'scholarship' 'std' 'apartment']
[0.3  0.14 0.12 0.1  0.06 0.04 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02
 0.02 0.02 0.02 0.02]
2.357582972136478
Word 19

audio
['road' 'campus' 'job' 'right' 'radio' 'role' 'farm'

## Analyze and compile human and LLM data

### Set LLM model names

In [17]:
PROSODY_MODELS = [
    'helsinki-prosody_scratch-gpt2_joint-loss_prosody-embed',
    'helsinki-prosody_scratch-gpt2_clm-loss_prosody-embed',
    'helsinki-prosody_scratch-gpt2_clm-loss_no-prosody-embed',

    ################### GIGASPEECH ############
    
    'gigaspeech-prosody_scratch-gpt2_joint-loss_prosody-embed',
    'gigaspeech-prosody_scratch-gpt2_clm-loss_prosody-embed',
    'gigaspeech-prosody_scratch-gpt2_clm-loss_no-prosody-embed',
]

model_names = PROSODY_MODELS


# # get all MLM models except BERT
# MLM_MODELS = list(nlp.MLM_MODELS_DICT.keys())[1:]
# CLM_MODELS = list(nlp.CLM_MODELS_DICT.keys()) 
# model_names = CLM_MODELS + MLM_MODELS

# print (f'Loading the following models')
# print (f'MLM models: {MLM_MODELS}')
# print (f'CLM models: {CLM_MODELS}')

### Cleaned responses - Load and merge human and LLM data 

In [128]:
# load lemmatized file and analyze human results --> use all words
cleaned_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_cleaned_results = pd.read_csv(cleaned_fn)

# combine the data and lemmatize model results
df_human_model = compare_human_model_accuracy(df_cleaned_results, model_names, word_model_info, task=TASK, top_n=1, window_size=25, lemmatize=False)

# save combined human-model results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_human-model.csv')
df_human_model.to_csv(out_fn, index=False)

Total missing values: 21


### Lemmatization - Load and merge human and LLM data

In [40]:
WINDOW_SIZE = 100

# load lemmatized file and analyze human results --> use all words
lemmatized_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results = pd.read_csv(lemmatized_fn)

# combine the data and lemmatize model results
df_human_model_lemmatized = compare_human_model_accuracy(df_lemmatized_results, model_names, word_model_info, task=TASK, top_n=1, window_size=WINDOW_SIZE, lemmatize=True)

# save lemmatized human-model results
if 'prosody' in model_names[0]:
    out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_window-size-{WINDOW_SIZE}_human-prosody-model-lemmatized.csv')
else:
    out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_window-size-{WINDOW_SIZE}_human-model-lemmatized.csv')

df_human_model_lemmatized.to_csv(out_fn, index=False)

Word 4
Word 11
Word 30
Word 34
Word 38
Word 47
Word 51
Word 61
Word 64
Word 70
Word 78
Word 83
Word 87
Word 99
Word 125
Word 130
Word 136
Word 151
Word 165
Word 182
Word 198
Word 202
Word 213
Word 217
Word 227
Word 242
Word 257
Word 261
Word 268
Word 272
Word 276
Word 282
Word 292
Word 306
Word 319
Word 323
Word 329
Word 341
Word 344
Word 351
Word 355
Word 359
Word 363
Word 367
Word 372
Word 376
Word 382
Word 389
Word 396
Word 406
Word 413
Word 425
Word 432
Word 435
Word 441
Word 445
Word 448
Word 453
Word 459
Word 469
Word 484
Word 490
Word 496
Word 504
Word 518
Word 525
Word 528
Word 531
Word 535
Word 539
Word 544
Word 548
Word 552
Word 562
Word 567
Word 582
Word 589
Word 595
Word 601
Word 606
Word 617
Word 621
Word 628
Word 632
Word 639
Word 646
Word 650
Word 661
Word 664
Word 670
Word 675
Word 685
Word 691
Word 696
Word 699
Word 711
Word 724
Word 727
Word 736
Word 742
Word 748
Word 756
Word 760
Word 769
Word 778
Word 781
Word 786
Word 793
Word 802
Word 806
Word 809
Word 817
Word 82

## Analyze human vs. GPT2-XL distributions

### Load file of cleaned data

In [12]:
# save lemmatized human-model results
human_cleaned_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_human_cleaned = pd.read_csv(human_cleaned_fn)

NameError: name 'behavioral_dir' is not defined

### Compare human and model prediction distributions and save

In [76]:
WINDOW_SIZE = 25

df_comparison = []
df_human_cleaned['response'] = df_human_cleaned['response'].apply(lambda x: strip_punctuation(x) if isinstance(x, str) else '')
    
for model_name in model_names:

    print (f'{model_name}')

    # now load a model to compare to 
    tokenizer, model = nlp.load_clm_model(
        model_name='gpt2' if 'prosody' in model_name else model_name, 
        cache_dir=CACHE_DIR)
    
    # go through the each modality word index
    for (modality, response_index), df in df_human_cleaned.groupby(['modality', 'word_index']):
        
        # get all responses for current index across both modalities
        all_responses = df_human_cleaned[df_human_cleaned['word_index'] == response_index]['response'].tolist()
        all_responses = list(filter(None, all_responses))
    
        # grab responses for the current modality
        modality_responses = df['response'].apply(strip_punctuation)
        ground_truth = df['ground_truth'].unique().tolist()[0]
        
        # prosody models can't be run with less than 4 tokens
        if 'prosody' in model_name and response_index < 4:
            continue

        # load the logits for the current response
        model_logits = load_logits(models_dir, model_name, TASK, WINDOW_SIZE, response_index) #response_index - 1)
        
        # now compare the two and add it to the dataframe
        df_compare = compare_human_model_distributions(tokenizer, word_model, modality_responses, all_responses, model_logits, ground_truth)
        df_compare['model_name'] = model_name
        df_compare['modality'] = modality
        df_compare['word_index'] = response_index
        df_compare[['entropy_group', 'accuracy_group', 'ground_truth']] = df[['entropy_group', 'accuracy_group', 'ground_truth']].iloc[0]
        
        df_comparison.append(df_compare)

# lastly 
df_comparison = pd.concat(df_comparison).reset_index(drop=True)

if 'prosody' in model_names[0]:
    out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_window-size-{WINDOW_SIZE}_human-prosody-model-distributions-lemmatized.csv')
else:
    out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_window-size-{WINDOW_SIZE}_human-model-distributions-lemmatized.csv')

df_comparison.to_csv(out_fn, index=False)

helsinki-prosody_scratch-gpt2_joint-loss_prosody-embed
helsinki-prosody_scratch-gpt2_clm-loss_prosody-embed
helsinki-prosody_scratch-gpt2_clm-loss_no-prosody-embed
gigaspeech-prosody_scratch-gpt2_joint-loss_prosody-embed
gigaspeech-prosody_scratch-gpt2_clm-loss_prosody-embed
gigaspeech-prosody_scratch-gpt2_clm-loss_no-prosody-embed
