In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")
    
import os, sys, glob
import json
import re
import numpy as np
import pandas as pd
from natsort import natsorted
import tqdm
from manual_spellchecker import spell_checker
import inflect
from scipy import stats

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')

from config import *
import dataset_utils as utils
from tommy_utils import nlp, statistics
from preproc_utils import load_model_results, divide_nwp_dataframe

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /dartfs/rc/lab/F/FinnLab/tommy/models/token
Login successful


In [78]:
# Which experiment, version, and stimulus (task) this notebook processes.
EXPERIMENT_NAME = 'next-word-prediction'
EXPERIMENT_VERSION = 'final-multimodal-01'
TASK = 'odetostepfather'

# Input directories, all rooted at BASE_DIR.
# NOTE(review): BASE_DIR is not defined in this notebook; presumably it comes from
# `from config import *` -- confirm.
gentle_dir = os.path.join(BASE_DIR, 'stimuli/gentle')
results_dir = os.path.join(BASE_DIR, 'experiments',  EXPERIMENT_NAME, 'results', EXPERIMENT_VERSION)
preproc_dir = os.path.join(BASE_DIR, 'stimuli/preprocessed')
# models_dir is also read as a global inside get_model_word_quadrants.
models_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions')

# Output directories. Only behavioral_dir is passed to attempt_makedirs below;
# cleaned_results_dir is just a path at this point.
cleaned_results_dir = os.path.join(BASE_DIR, 'experiments',  EXPERIMENT_NAME, 'cleaned-results', EXPERIMENT_VERSION)
behavioral_dir = os.path.join(BASE_DIR, 'derivatives/results/behavioral/')

# NOTE(review): attempt_makedirs presumably creates the directory when it is missing -- confirm in dataset_utils.
utils.attempt_makedirs(behavioral_dir)

## Functions -- cleaning, aggregation, and analysis

### Check and clean meta file

In [3]:
# importing shutil module  
import shutil
from pathlib import Path

def check_used_files(experiment_name, experiment_version, task, clean_errors=False, max_missing_responses=5):
    '''
    Cross-check the experiment meta file against the results directory.

    Every row of the meta file flagged as ``used`` is matched to its jsPsych
    parameter file and to the participant's results, then sorted into one of
    the ``checker`` buckets.

    Parameters
    ----------
    experiment_name, experiment_version, task : str
        Identify the meta file, the presentation orders, and the results tree.
    clean_errors : bool
        If True, hand the collected errors to ``clean_meta_errors`` and then
        re-run the check once (without cleaning) on the updated meta file.
    max_missing_responses : int
        Maximum number of null responses a participant may have and still
        count as complete.

    Returns
    -------
    checker : dict
        ``'complete'`` / ``'incomplete'`` hold ``(meta_index, modality, sub, prolific_id)``
        tuples; ``'error'`` holds ``(meta_index, modality, sub)`` tuples for
        participants whose results could not be loaded. ``'missing'`` is
        initialised but never filled by this function.
    approve_ids : list
        Prolific IDs of every participant whose results loaded successfully.
    '''

    checker = {
        'complete': [],
        'incomplete': [],
        'error': [],
        'missing': [],
    }

    meta_dir = os.path.join('/dartfs/rc/lab/F/FinnLab/tommy/jspsych_experiments/utils/experiment_meta/', experiment_name)
    meta_file = pd.read_csv(os.path.join(meta_dir, f'{experiment_version}-{task}.csv'))

    source_dir = os.path.join(BASE_DIR, 'stimuli',  'presentation_orders', experiment_version, task, 'jspsych')
    results_dir = os.path.join(BASE_DIR, 'experiments',  experiment_name, 'results', experiment_version)

    # grab the used files (a missing flag counts as "not used")
    used_fns = meta_file[meta_file['used'].fillna(0).astype(bool)]

    approve_ids = []

    # go through each used file
    for i, fn in used_fns.iterrows():
        # subject name comes from the file stem, modality from its parent folder
        curr_path = Path(fn['subject_fns'])
        sub = curr_path.stem.split('_')[0]
        modality = curr_path.parents[0].stem

        # find the corresponding parameter file for the current subject
        parameter_fn = glob.glob(os.path.join(source_dir, f'{sub}*.json'))
        assert (len(parameter_fn) == 1)

        # load the parameter file to compare to the subject's results
        df_parameters = pd.read_json(parameter_fn[0], orient='records')
        df_parameters = df_parameters.dropna()
        df_parameters['word_index'] = df_parameters['word_index'].astype(int)

        sub_results_dir = os.path.join(results_dir, task, modality, sub)

        # Loading also validates the number of demographics (4) and experience (2)
        # answers; any failure marks the subject as an error instead of aborting.
        try:
            current_id, demographics, experience, responses = load_participant_results(sub_results_dir, sub)
        except Exception:
            checker['error'].append((i, modality, sub))
            continue

        approve_ids.append(current_id)

        # Compare trial order positionally. A label-aligned Series comparison raises
        # when the two indices differ (e.g. a participant with fewer trials), which
        # is exactly the case that must be reported as incomplete.
        all_trials_complete = responses['word_index'].tolist() == df_parameters['word_index'].tolist()
        missing_response_threshold = responses['response'].isnull().sum() <= max_missing_responses

        bucket = 'complete' if (all_trials_complete and missing_response_threshold) else 'incomplete'
        checker[bucket].append((i, modality, sub, current_id))

    if clean_errors:
        clean_meta_errors(checker, experiment_name, experiment_version, task)

        # run again on the updated meta file, keeping the caller's threshold
        return check_used_files(
            experiment_name, experiment_version, task,
            clean_errors=False, max_missing_responses=max_missing_responses,
        )

    return checker, approve_ids

def clean_meta_errors(checker, experiment_name, experiment_version, task):
    '''
    Quarantine results that failed to load and un-flag them in the meta file.

    For each ``(meta_index, modality, sub)`` tuple in ``checker['error']`` the
    subject's results folder (if it exists) is moved into
    ``<results>/<task>/<modality>/error/batch_<n>``, and the ``used`` flag of the
    corresponding meta-file row is cleared. The meta file is rewritten in place.
    Does nothing when there are no errors.
    '''

    meta_dir = os.path.join('/dartfs/rc/lab/F/FinnLab/tommy/jspsych_experiments/utils/experiment_meta/', experiment_name)
    meta_fn = os.path.join(meta_dir, f'{experiment_version}-{task}.csv')

    meta_file = pd.read_csv(meta_fn)

    results_dir = os.path.join(BASE_DIR, 'experiments',  experiment_name, 'results', experiment_version)

    errors = checker['error']

    if not errors:
        return

    remove_idxs = [error[0] for error in errors]

    for modality in ['text', 'audio']:

        # get errors for the current modality; skip so no empty batch folder is created
        modality_errors = [error for error in errors if error[1] == modality]
        if not modality_errors:
            continue

        errors_dir = os.path.join(results_dir, task, modality, 'error')

        # natural sort so that batch_10 comes after batch_2
        batch_errors = natsorted(glob.glob(os.path.join(errors_dir, '*')))
        last_error_dir = Path(batch_errors[-1]).stem if batch_errors else 'batch_1'

        # start a new batch only if the latest one already holds files
        if glob.glob(os.path.join(errors_dir, last_error_dir, '*')):
            curr_batch_num = int(last_error_dir.split('_')[-1]) + 1
            curr_error_dir = os.path.join(errors_dir, f'batch_{curr_batch_num}')
        else:
            curr_error_dir = os.path.join(errors_dir, last_error_dir)

        # The destination must exist before moving: shutil.move renames the source
        # to the destination path when the destination does not exist, which would
        # turn the first subject folder into the batch folder itself.
        os.makedirs(curr_error_dir, exist_ok=True)

        for _, _, sub in modality_errors:
            sub_results_dir = os.path.join(results_dir, task, modality, sub)

            if os.path.exists(sub_results_dir):
                shutil.move(sub_results_dir, curr_error_dir)

    # clear the flag (nullable integer column so the cleared rows stay empty in the CSV)
    meta_file.loc[remove_idxs, 'used'] = None
    meta_file['used'] = meta_file['used'].astype('Int64')
    meta_file.to_csv(meta_fn, index=False)
    print (f'Cleaned meta file!')


### Loading data 

In [4]:
def load_participant_results(sub_dir, sub):
    '''
    Load one participant's results CSV and split it into its parts.

    Reads ``<sub_dir>/<sub>_next-word-prediction.csv``.

    Returns
    -------
    prolific_id
        First non-missing value of the ``prolific_id`` column in file order
        (``False`` if the column has no value at all).
    demographics : DataFrame
        ``experiment_phase`` / ``response`` rows whose phase contains
        ``'demographics'``; exactly 4 rows are required.
    experience : DataFrame
        ``experiment_phase`` / ``response`` rows whose phase contains
        ``'experience'``; exactly 2 rows are required.
    responses : DataFrame
        Test-phase trials with a lower-cased ``response`` column. Missing
        responses are NaN in this frame.

    Raises
    ------
    AssertionError
        If the number of demographics or experience rows is not as expected.
    '''

    df_results = pd.read_csv(os.path.join(sub_dir, f'{sub}_next-word-prediction.csv'))

    # Resolve the ID before missing values are filled: taking element 0 of a set
    # built from the filled column has arbitrary order and can yield the False
    # filler whenever any row lacks an ID.
    recorded_ids = df_results['prolific_id'].dropna().unique()
    prolific_id = recorded_ids[0] if len(recorded_ids) else False

    # missing cells become False (so word_index becomes 0 on non-trial rows)
    df_results = df_results.fillna(False)
    df_results['word_index'] = df_results['word_index'].astype(int)

    # filter down demographics
    demographics = df_results[df_results['experiment_phase'].str.contains('demographics').fillna(False)]
    demographics = demographics[['experiment_phase', 'response']].reset_index(drop=True)

    # age, race, ethnicity, gender
    assert (len(demographics) == 4)

    # filter down to questions about moth/story experience
    experience = df_results[df_results['experiment_phase'].str.contains('experience').fillna(False)]
    experience = experience[['experiment_phase', 'response']].reset_index(drop=True)

    # moth experience + story experience
    assert (len(experience) == 2)

    # filter down to get the responses; copy so the assignment below does not
    # write into a view of df_results
    responses = df_results[df_results['experiment_phase'] == 'test'].copy()
    responses['response'] = responses['response'].str.lower()
    responses = responses[['critical_word', 'word_index', 'entropy_group', 'accuracy_group', 'response']].reset_index(drop=True)

    return prolific_id, demographics, experience, responses

def add_word_response(dict, key, value):
    '''
    Append ``value`` to the list stored under ``key``, creating the list on
    first use. The mapping is modified in place and also returned.
    '''
    dict.setdefault(key, []).append(value)
    return dict

def aggregate_participant_responses(results_dir, task, sub_mod_list):
    '''
    Stack the test-phase responses of several participants into one long frame.

    Parameters
    ----------
    results_dir : str
        Root of the results tree; participant folders are expected at
        ``<results_dir>/<task>/<modality>/<subject>``.
    task : str
    sub_mod_list : iterable of (subject, modality) pairs

    Returns
    -------
    DataFrame
        One row per participant x trial with columns ``prolific_id``,
        ``modality``, ``subject``, ``word_index``, ``response``,
        ``ground_truth`` (lower-cased critical word), ``entropy_group`` and
        ``accuracy_group``. Missing responses are stored as ``''``.
        Participants whose folder does not exist are reported and skipped.
    '''

    columns = ['prolific_id', 'modality', 'subject',  'word_index', 'response', 'ground_truth', 'entropy_group', 'accuracy_group']

    # collect plain records and build the frame once at the end; appending to a
    # DataFrame row by row copies the whole frame on every insertion
    records = []

    for sub, mod in sub_mod_list:
        sub_dir = os.path.join(results_dir, task, mod, sub)
        print (sub, mod)

        if not os.path.exists(sub_dir):
            print (f'File not exists: {mod}, {sub}')
            continue

        # for right now only focus on responses
        current_id, _, _, responses = load_participant_results(sub_dir, sub)
        responses['response'] = responses['response'].fillna('')

        for trial in responses.itertuples(index=False):
            records.append({
                'prolific_id': current_id,
                'modality': mod,
                'subject': sub,
                'word_index': trial.word_index,
                'response': trial.response,
                'ground_truth': trial.critical_word.lower(),
                'entropy_group': trial.entropy_group,
                'accuracy_group': trial.accuracy_group,
            })

    return pd.DataFrame(records, columns=columns)

def strip_punctuation(text):
    '''
    Return ``text`` with every character that is not an ASCII letter or digit
    removed (this includes whitespace).
    '''
    return re.sub('[^A-Za-z0-9]+', '', text)

### Interactive data cleaning

In [5]:
import enchant, string, time
from IPython.display import clear_output

# Whole-response substitutions applied by clean_participant_responses before any
# interactive checking: a response exactly equal to a key is replaced by its value.
AUTO_REPLACE = {
    'mum': 'mom',
}

def clean_participant_responses(df_results, df_transcript):
    '''
    Interactively clean the test-phase responses of one participant.

    The responses are stripped and lower-cased, then passed through four
    stages. A row handled by one stage is not shown again by a later one.

    1. exact-match substitutions from ``AUTO_REPLACE``
    2. all-digit responses are spelled out as words
    3. responses failing the spell check (or made of punctuation) are shown
       for manual correction
    4. multi-word responses are shown for manual correction
    5. every remaining row is shown with a "needs correction?" prompt

    Parameters
    ----------
    df_results : DataFrame
        Raw participant results; must contain ``experiment_phase`` and
        ``response``. Modified in place and returned.
    df_transcript : DataFrame
        Transcript used to display the context around each response.

    Returns
    -------
    DataFrame
        ``df_results`` with the test-phase rows replaced by their cleaned versions.
    '''

    # positional indices of the response trials --> used to write the cleaned rows back
    response_indices = np.where(df_results['experiment_phase'] == 'test')[0]

    # labels (in df_responses) of the rows that have already been handled
    checked_indices = []

    # filter down to get the responses; False marks a missing response
    df_responses = df_results.iloc[response_indices, :].reset_index(drop=True)
    df_responses.loc[df_responses['response'] == False, 'response'] = ""
    df_responses['response'] = df_responses['response'].apply(lambda x: x.strip().lower())

    inflect_engine = inflect.engine()
    enc_dict = enchant.Dict("en_US")

    ##############################
    ###### Run autoreplace #######
    ##############################

    for k, v in AUTO_REPLACE.items():
        df_responses.loc[df_responses['response'] == k, 'response'] = v

    ##############################
    #### Run numbers replace #####
    ##############################

    for index, df in df_responses.iterrows():
        response = df['response']

        if response.isdigit():
            df['response'] = inflect_engine.number_to_words(response)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)

    ##############################
    ###### Run spell-check #######
    ##############################

    print (f'##########################\n' +
           f'### Running spellcheck ###\n' +
           f'##########################\n\n')

    time.sleep(2)

    for index, df in df_responses.iterrows():

        response = df['response']

        if response == '':
            continue

        # Parenthesised explicitly: `and` binds tighter than `or`, so without the
        # grouping any response failing the spell check was prompted again even if
        # an earlier stage had already handled it.
        needs_review = (not enc_dict.check(response)) or (response in string.punctuation)

        if needs_review and index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=False)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)

    ##############################
    ######## Find phrases ########
    ##############################

    clear_output(wait=False)
    print (f'##########################\n' +
           f'####### Find phrases #####\n' +
           f'##########################\n\n')

    time.sleep(2)

    # go through each row and show multi-word responses
    for index, df in df_responses.iterrows():
        response = df['response'].split()

        if len(response) > 1 and index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=False)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)

    ##############################
    ######## Final check #########
    ##############################

    clear_output(wait=False)
    print (f'##########################\n' +
           f'####### Final check ######\n' +
           f'##########################\n\n')

    time.sleep(2)

    # go through each remaining row and ask whether it needs correction
    for index, df in df_responses.iterrows():

        if index not in checked_indices:
            df = prompt_correct_response(df, df_transcript, enc_dict, prompt_correction=True)
            df_responses.iloc[df.name] = df
            checked_indices.append(index)

    # write the cleaned rows back by position
    df_results.iloc[response_indices] = df_responses

    return df_results

def prompt_correct_response(df_response, df_transcript, enc_dict, range_display=7, prompt_correction=False):
    '''
    Show one response in its transcript context and ask for a correction.

    Parameters
    ----------
    df_response : Series
        One trial row with ``word_index``, ``response`` and ``critical_word``.
        Its ``response`` entry is overwritten with the correction.
    df_transcript : DataFrame
        Transcript with a ``Word_Written`` column.
    enc_dict
        Spell-check dictionary providing ``suggest(word)``.
    range_display : int
        Number of transcript words of context taken on each side.
    prompt_correction : bool
        If True, first ask whether the response needs correcting; if False, go
        straight to the correction prompt.

    At the correction prompt the user may enter

    * a number ``k`` between 1 and the number of suggestions: use the k-th suggestion
    * nothing: keep the response unchanged
    * ``''`` or ``""``: blank the response
    * ``-999``: abort via ``sys.exit(0)``
    * anything else: use the typed text as the response

    Returns
    -------
    Series
        ``df_response``, possibly with an updated ``response``.
    '''

    word_index = df_response['word_index']
    response = df_response['response']
    ground_truth = df_response['critical_word']

    # clamp the context window to the transcript; None means "to the end"
    start_index = max(word_index - range_display, 0)
    end_index = (word_index + range_display) if (word_index + range_display) <= len(df_transcript) else None

    # grab the context
    start_context = df_transcript['Word_Written'].iloc[start_index:word_index]
    end_context = df_transcript['Word_Written'].iloc[word_index + 1:end_index]

    # build the display string, highlighting the response with ANSI colours
    string_to_print = ""

    if start_index > 0:
        string_to_print += ".... "

    string_to_print += " ".join(start_context) + " " + "\033[43;30m" + response + "\033[m" + " " + " ".join(end_context)

    if end_index is not None and end_index < len(df_transcript):
        string_to_print += " ...."

    clear_output(wait=False)

    print("\n\nCurrent Word: " + string_to_print)
    print ("Ground Truth: ", ground_truth)

    if prompt_correction:
        prompt_correction = input('\nNeeds correction? [y/n]: ')

    if prompt_correction == 'y' or prompt_correction == False:
        suggestions = enc_dict.suggest(response)
        print("\nSuggestions: ", suggestions)
        correct_word = input("\nCorrect Version: ")

        if correct_word == "-999":
            sys.exit(0)
        elif correct_word.isdigit() and 1 <= int(correct_word) <= len(suggestions):
            # 1-based pick from the suggestion list; the lower bound keeps "0"
            # from silently selecting the last suggestion via index -1
            chosen_word = suggestions[int(correct_word) - 1]
            print (f'Using word: {chosen_word}')
            df_response['response'] = chosen_word
            time.sleep(2)
        elif len(correct_word) == 0: # User wants to Skip
            return df_response
        elif correct_word == "''" or correct_word == '""': # User wants to remove the word
            df_response['response'] = ""
        else:
            time.sleep(2)
            df_response['response'] = correct_word

    return df_response

### Lemmatization and accuracy functions

In [63]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
import re

# NLTK's English stop-word list; get_lemma checks membership in it when remove_stopwords is set.
STOP_WORDS = stopwords.words('english')

def get_pos_tags(text, strip_punc=True):
    """
    Part-of-speech tag the words of a text.

    The samples are joined with spaces, optionally stripped of punctuation,
    split on whitespace and tagged as a single sequence. Words are lower-cased
    only after tagging, so the tagger still sees the original capitalisation.

    Parameters
    ----------
    text : list of str
        Text samples to be joined and tagged together.
    strip_punc : bool
        If True, remove every character that is not a letter, whitespace,
        an apostrophe or a hyphen before tagging.

    Returns
    -------
    words_tags : list of tuples
        ``(lower-cased word, POS tag)`` pairs, one per whitespace-separated
        token that remains after the optional stripping.
    """

    # tagging works better on the full text (more context provided)
    full_text = ' '.join(text)

    if strip_punc:
        # TLB 10/26/22 - apostrophes and hyphens are kept.
        # Raw string: the whitespace class in a plain literal is an invalid escape sequence.
        full_text = re.sub(r"[^a-zA-Z\s'-]+", '', full_text)

    words_tags = pos_tag(full_text.split())

    # case normalize word -> case doesn't matter anymore
    return [(word.lower(), tag) for word, tag in words_tags]

def get_lemma(word, tag, remove_stopwords=True):
    """
    Lemmatize a single word given its Treebank POS tag.

    Parameters
    ----------
    word : str
        Word to lemmatize. Only the part before the first apostrophe is used.
    tag : str
        Treebank POS tag; its first letter selects the WordNet POS
        (unlisted letters fall back to noun).
    remove_stopwords : bool
        If True, words found in ``STOP_WORDS`` yield ``None``.

    Returns
    -------
    lemma : str or None
        The lemma, or ``None`` for a removed stop word or for any word that
        contains a digit (digits are rejected regardless of ``remove_stopwords``).
    """

    lemmatizer = WordNetLemmatizer()

    # first letter of the Treebank tag -> WordNet POS
    treebank_to_wordnet = {
        'N': 'n',  # noun types
        'P': 'n',  # pronoun types, predeterminers
        'V': 'v',  # verb types
        'J': 'a',  # adjective types
        'D': 'a',  # determiner
        'R': 'r',  # adverb types
    }

    # drop everything from the first apostrophe onwards
    word = word.split("'")[0]

    is_removed_stopword = remove_stopwords and word in STOP_WORDS
    contains_digit = any(c.isdigit() for c in word)

    if is_removed_stopword or contains_digit:
        return None

    wordnet_tag = treebank_to_wordnet.get(tag[0], 'n')

    return lemmatizer.lemmatize(word, wordnet_tag)

def make_transcript_context(word, df_transcript, word_index, range_display=10):
    """
    Embed ``word`` in the transcript words surrounding position ``word_index``.

    Up to ``range_display`` words before the position are used. After it, the
    window stops before position ``word_index + range_display``, or runs to the
    end of the transcript when that position lies beyond it. The transcript
    word at ``word_index`` itself is replaced by ``word``.

    Returns
    -------
    context : str
        Space-joined context with ``word`` in place.
    index : int
        Position of ``word`` among the context words.
    """
    window_start = max(word_index - range_display, 0)

    window_end = word_index + range_display
    if window_end > len(df_transcript):
        window_end = None

    written_words = df_transcript['Word_Written']
    preceding = " ".join(written_words.iloc[window_start:word_index])
    following = " ".join(written_words.iloc[word_index + 1:window_end])

    return preceding + " " + word + " " + following, word_index - window_start

def lemmatize_responses(df_results, df_transcript, response_column='response'):
    """
    Replace each response and ground-truth word with its lemma.

    Each word is placed into its transcript context, POS-tagged there, and
    lemmatized with that tag (stop words are kept). ``df_results`` is modified
    in place -- both ``response_column`` and ``'ground_truth'`` are
    overwritten -- and returned. A line is printed for every word processed.

    NOTE(review): the tag is looked up by position in the tagged context, which
    presumably assumes every context word survives punctuation stripping and
    that the inserted word is a single token -- confirm.
    """

    def tag_and_lemmatize(word, word_index):
        # returns the (lower-cased) token as tagged in context, and its lemma
        context, position = make_transcript_context(word, df_transcript, word_index)
        token, treebank_tag = get_pos_tags([context])[position]
        return token, get_lemma(token, treebank_tag, remove_stopwords=False)

    for row_label, row in df_results.iterrows():
        word_index = row['word_index']

        # find the lemma of the response
        response, response_lemma = tag_and_lemmatize(row[response_column], word_index)
        df_results.loc[row_label, response_column] = response_lemma
        print (f'Word: {response} \t Lemma: {response_lemma}')

        # find the lemma of the ground truth
        ground_truth, ground_truth_lemma = tag_and_lemmatize(row['ground_truth'], word_index)
        df_results.loc[row_label, 'ground_truth'] = ground_truth_lemma
        print (f'GT: {ground_truth} \t GTLemma: {ground_truth_lemma}')

    return df_results

def calculate_results_accuracy(df_results):
    """
    Score every response and summarise accuracy per participant.

    Adds an integer ``accuracy`` column to ``df_results`` in place (1 when
    ``response`` equals ``ground_truth``, else 0).

    Returns
    -------
    df_results : DataFrame
        The input frame with the ``accuracy`` column added.
    df_accuracy : DataFrame
        Mean accuracy per ``(prolific_id, modality, subject)``, sorted from
        lowest to highest.
    """
    is_correct = df_results['response'] == df_results['ground_truth']
    df_results['accuracy'] = is_correct.astype(int)

    participant_keys = ['prolific_id', 'modality', 'subject']
    mean_accuracy = df_results.groupby(participant_keys)['accuracy'].mean()
    df_accuracy = mean_accuracy.reset_index().sort_values(by='accuracy', ascending=True)

    return df_results, df_accuracy

### Analysis of human data

In [69]:
def get_human_probs(responses):
    """
    Empirical distribution over the distinct responses.

    Returns
    -------
    probs : ndarray
        Relative frequency of each distinct response.
    unique : ndarray
        The distinct responses in sorted order, aligned with ``probs``.
    """
    distinct_responses, occurrence_counts = np.unique(responses, return_counts=True)
    n_responses = sum(occurrence_counts)

    return occurrence_counts / n_responses, distinct_responses

def strip_punctuation(text):
    """
    Drop every run of characters that are not ASCII letters or digits
    (whitespace included) and return what is left.

    NOTE: an identical ``strip_punctuation`` is also defined in the
    data-loading cell earlier in this notebook.
    """
    non_alphanumeric_run = re.compile('[^A-Za-z0-9]+')

    return non_alphanumeric_run.sub('', text)

def analyze_human_results(df_results, word_model_info, top_n=5):
    '''
    Collapse per-participant responses into one row per (modality, word_index).

    For each group the distribution over distinct responses is computed, along
    with its entropy, the fraction of responses equal to the ground truth, and
    several word-vector metrics relative to the ground-truth word.

    Parameters
    ----------
    df_results : DataFrame
        Long-format responses. The code reads the columns ``modality``,
        ``word_index``, ``response``, ``ground_truth``, ``accuracy``,
        ``entropy_group`` and ``accuracy_group``. The ``response`` column is
        overwritten in place with punctuation-stripped text (non-strings
        become ``''``).
    word_model_info : tuple
        ``(word_model_name, word_model)``; the name prefixes the word-vector
        columns, the model is passed to ``nlp.get_word_vector_metrics``.
    top_n : int or None
        Number of most frequent responses passed to the word-vector metrics.

    Returns
    -------
    DataFrame
        One row per (modality, word_index), plus an ``accuracy`` column that is
        1 when the most frequent response equals the ground truth.
    '''
    
    word_model_name, word_model = word_model_info
    
    df_collapse = pd.DataFrame(columns=[
        'modality',
        'word_index',
        'ground_truth',
        'binary_accuracy',
        'top_pred',
        'top_prob',
        'distribution_std',
        'entropy',
        'normalized_entropy',
        'predictability',
        f'{word_model_name}_top_word_accuracy',
        f'{word_model_name}_avg_accuracy',
        f'{word_model_name}_max_accuracy',
        f'{word_model_name}_prediction_density',
        f'{word_model_name}_weighted_prediction_density',
        'entropy_accuracy_group'
    ])

    # NOTE: this writes into the caller's frame
    df_results['response'] = df_results['response'].apply(lambda x: strip_punctuation(x) if isinstance(x, str) else '')
    
    for (modality, response_index), df in df_results.groupby(['modality', 'word_index']):
        
        # responses were already stripped above; this repeats it on the group
        df['response'] = df['response'].apply(strip_punctuation)
        human_responses = df['response']
        ground_truth = df['ground_truth']
        
        # get the probability distribution of the human responses --> also return the unique words
        human_probs, unique_words = get_human_probs(df['response'])
        # fraction of responses that exactly match the ground truth
        predictability = sum(np.asarray(human_responses) == ground_truth) / len(human_responses)
        
        # sort the probabilities (descending) --> then choose the top_n words
        sorted_prob_idxs = np.argsort(human_probs)[::-1]
        sorted_probs = human_probs[sorted_prob_idxs]

        if top_n is not None and len(unique_words) < top_n:
            # fewer distinct words than top_n: grab all the unique words
            all_word_idxs = sorted_prob_idxs[:len(unique_words)]
            top_n_words = unique_words[all_word_idxs]
        else:
            # grab the top words (a top_n of None keeps every word)
            top_word_idxs = sorted_prob_idxs[:top_n]
            top_n_words = unique_words[top_word_idxs]
            
            # human_probs = human_probs[:top_n] / sum(human_probs[:top_n])
            # sorted_probs = np.argsort(human_probs)[::-1]
        
        # calculate entropy; with a single distinct response the normaliser is log(1) = 0,
        # and the resulting NaN is mapped to 0 by nan_to_num when the row is stored
        entropy = stats.entropy(human_probs)
        norm_entropy = entropy / np.log(len(human_probs))
        
        ############################################
        ### Get word distances from ground truth ###
        ############################################

        ground_truth_word = np.unique(ground_truth)[0]
        
        # use the same form as how we calculated language model metrics
        # NOTE(review): get_word_vector_metrics is defined elsewhere; the shape of
        # pred_distances is not visible here. sorted_probs covers ALL distinct
        # responses while top_n_words may be truncated to top_n -- confirm that the
        # product below has matching lengths.
        avg_pred_similarity, pred_distances = nlp.get_word_vector_metrics(word_model, top_n_words, ground_truth_word)
        weighted_pred_distances = np.nanmean(pred_distances * sorted_probs)
        
        # NOTE(review): if pred_distances is an array with more than one element,
        # using np.isnan(...) directly in an `if` raises ValueError -- confirm it is a scalar.
        if np.isnan(pred_distances):
            pred_distances = 0
        
        # use the same form as how we calculated language model metrics
        top_word_accuracy, _ = nlp.get_word_vector_metrics(word_model, top_n_words[:1], ground_truth_word)
        max_pred_similarity, _ = nlp.get_word_vector_metrics(word_model, top_n_words, ground_truth_word, method='max')
        
        # append one summary row for this (modality, word_index) group
        df_collapse.loc[len(df_collapse)] = {
            'modality': modality,
            'word_index': response_index,
            'ground_truth': ground_truth_word,
            'binary_accuracy': df['accuracy'].mean(),
            'top_pred': top_n_words[0],
            'top_prob': sorted_probs[0],
            'distribution_std': np.std(human_probs),
            'entropy': entropy,
            'normalized_entropy': np.nan_to_num(norm_entropy), 
            'predictability': predictability,
            f'{word_model_name}_top_word_accuracy': top_word_accuracy,
            f'{word_model_name}_avg_accuracy': avg_pred_similarity,
            f'{word_model_name}_max_accuracy': max_pred_similarity,
            f'{word_model_name}_prediction_density': 1 - pred_distances,
            f'{word_model_name}_weighted_prediction_density': 1 - weighted_pred_distances,
            'entropy_accuracy_group': df['entropy_group'].iloc[0] + '-' + df['accuracy_group'].iloc[0]
        }

    # compare the most frequent response to ground truth
    df_collapse['accuracy'] = df_collapse['top_pred'] == df_collapse['ground_truth']
    df_collapse['accuracy'] = df_collapse['accuracy'].astype(int)

    return df_collapse

### Aggregation of human and LLM data

In [70]:
def get_model_word_quadrants(model_name, task, selected_idxs=None, accuracy_type='word2vec_avg_accuracy'):
    '''
    Entropy/accuracy group labels of a model's next-word-prediction candidates.

    The model results (window size 100, top 5) are restricted to the rows
    flagged ``NWP_Candidate`` in the preprocessed transcript and split with
    ``divide_nwp_dataframe`` at the 50th percentile of ``accuracy_type``.

    Parameters
    ----------
    model_name, task : str
    selected_idxs : index labels or None
        Rows to return; ``None`` returns every candidate row.
    accuracy_type : str
        Column of the model results used for the accuracy split.

    Returns
    -------
    DataFrame
        The ``entropy_group`` and ``accuracy_group`` columns for the requested rows.
    '''

    # FOR DIVIDING THE MODEL RESULTS INTO QUADRANTS
    ACCURACY_PERCENTILE = 50
    WINDOW_SIZE = 100
    TOP_N = 5

    # Resolve both directories locally (same locations the rest of the notebook
    # uses) so the function does not depend on a global set in another cell.
    preproc_dir = os.path.join(BASE_DIR, 'stimuli', 'preprocessed')
    models_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions')

    # load our preprocessed file --> get the indices of the prediction words
    df_preproc = pd.read_csv(os.path.join(preproc_dir, task, f'{task}_transcript-preprocessed.csv'))
    nwp_idxs = np.where(df_preproc['NWP_Candidate'])[0]

    # trim the model results down to only the words of interest
    model_results = load_model_results(models_dir, model_name=model_name, task=task, window_size=WINDOW_SIZE, top_n=TOP_N)
    model_results.loc[:, 'binary_accuracy'] = model_results['binary_accuracy'].astype(bool)
    model_results = model_results.iloc[nwp_idxs]

    # now grab the current model divided over the 50th percentile
    # while we originally divided words on the 45th percentile of gpt2, we want to see patterns across models
    df_divide = divide_nwp_dataframe(model_results, accuracy_type=accuracy_type, percentile=ACCURACY_PERCENTILE, drop=False)

    group_columns = ['entropy_group', 'accuracy_group']

    # the default of None cannot be used as a .loc row selector, so treat it as "all rows"
    if selected_idxs is None:
        return df_divide.loc[:, group_columns]

    return df_divide.loc[selected_idxs, group_columns]
    
def compare_human_model_accuracy(human_results, model_names, word_model_info, task, top_n=1, window_size=25, lemmatize=False, reference_model='gpt2-xl'):
    '''
    Combine human and language-model top predictions into one long frame.

    Human responses are collapsed per (modality, word) with
    ``analyze_human_results``; each model contributes one block of rows whose
    ``modality`` is the model name.

    Parameters
    ----------
    human_results : DataFrame
        Long-format human responses (modified in place by ``analyze_human_results``).
    model_names : iterable of str
        Models to load; must include ``reference_model``.
    word_model_info : tuple
        ``(word_model_name, word_model)``.
    task : str
    top_n, window_size : int
        Passed to ``load_model_results``; ``top_n`` is also used for the human analysis.
    lemmatize : bool
        If True, lemmatize each model's top prediction and ground truth.
    reference_model : str
        Model whose entropy/accuracy groups are copied onto the human
        (``audio`` / ``text``) rows.

    Returns
    -------
    DataFrame
        Columns ``modality``, ``word_index``, ``top_pred``, ``ground_truth``,
        ``accuracy`` (0 or 100), ``<word_model_name>_avg_accuracy``,
        ``predictability`` and ``entropy_accuracy_group``.

    Raises
    ------
    ValueError
        If ``reference_model`` is not among ``model_names``.
    '''

    # fail early and clearly instead of with a length mismatch at the very end
    model_names = list(model_names)
    if reference_model not in model_names:
        raise ValueError(f"reference_model '{reference_model}' must be one of model_names: {model_names}")

    # first get top prediction(s) for humans within each modality
    df_collapsed_results = analyze_human_results(human_results, word_model_info, top_n=top_n)
    word_model_name, _ = word_model_info

    # TODO (TLB): hack -- ground-truth words are taken from the 'audio' rows and assigned
    # to every model by position.
    # NOTE(review): this presumably relies on the audio rows being in the same order as
    # the selected transcript indices -- confirm.
    all_ground_truth_words = df_collapsed_results[df_collapsed_results['modality'] == 'audio']['ground_truth'].tolist()

    # set the directories we need
    models_dir = os.path.join(BASE_DIR, 'derivatives/model-predictions')
    preproc_dir = os.path.join(BASE_DIR, 'stimuli/preprocessed', task)

    # load our preprocessed file --> get the indices of the prediction words
    df_transcript = pd.read_csv(os.path.join(preproc_dir, f'{task}_transcript-preprocessed.csv'))
    df_selected = pd.read_csv(os.path.join(preproc_dir, f'{task}_transcript-selected.csv'))
    selected_idxs = np.where(df_selected['NWP_Candidate'])[0]

    # start the list with the human results
    human_model_combined = [df_collapsed_results]

    for model_name in model_names:

        # load the results for the current model
        model_results = load_model_results(models_dir, model_name=model_name, task=task, top_n=top_n, window_size=window_size)
        model_results['top_pred'] = model_results['top_n_predictions'].str[0]
        model_results = model_results.rename(columns={'top_prob': 'predictability'})

        # trim down to only predicted word accuracies
        model_results_trimmed = model_results.loc[selected_idxs, ['top_pred', 'predictability', f'{word_model_name}_avg_accuracy']]
        model_results_trimmed = model_results_trimmed.reset_index().rename(columns={'index': 'word_index'})
        model_results_trimmed['modality'] = model_name

        # get the quadrants and determine entropy accuracy groups
        model_quadrants = get_model_word_quadrants(model_name, task, selected_idxs, accuracy_type=f'{word_model_name}_max_accuracy').reset_index(drop=True)
        entropy_accuracy_groups = model_quadrants['entropy_group'] + '-' +  model_quadrants['accuracy_group']

        # add in entropy accuracy quadrant organization to the dataframe
        model_results_trimmed['entropy_accuracy_group'] = entropy_accuracy_groups
        model_results_trimmed['ground_truth'] = all_ground_truth_words

        # lemmatize if wanted
        if lemmatize:
            model_results_trimmed = lemmatize_responses(model_results_trimmed, df_transcript, response_column='top_pred')

        human_model_combined.append(model_results_trimmed)

    # concatenate all dataframes
    human_model_combined = pd.concat(human_model_combined).reset_index(drop=True)

    # use the reference model as our model of organizing human results
    entropy_accuracy = human_model_combined[human_model_combined['modality'] == reference_model]['entropy_accuracy_group'].tolist()
    for human_modality in ['audio', 'text']:
        human_model_combined.loc[human_model_combined['modality'] == human_modality, 'entropy_accuracy_group'] = entropy_accuracy

    # calculate accuracy (as a percentage: 0 or 100)
    human_model_combined['accuracy'] = human_model_combined['top_pred'] == human_model_combined['ground_truth']
    human_model_combined['accuracy'] = human_model_combined['accuracy'].astype(int) * 100

    print (f"Total missing values: {human_model_combined[f'{word_model_name}_avg_accuracy'].isna().sum()}")

    # select the columns that we want to save out for gross-comparison
    selected_columns = ['modality', 'word_index', 'top_pred', 'ground_truth', 'accuracy', f'{word_model_name}_avg_accuracy',
                        'predictability', 'entropy_accuracy_group']

    return human_model_combined.loc[:, selected_columns]


### Comparison of probability distributions

In [66]:
import torch
from torch.nn import functional as F
from scipy.special import kl_div, rel_entr
from scipy import stats
from scipy.spatial import distance

def load_logits(model_dir, model_name, task, window_size, word_index):
    '''
    Load the saved logits of one word for a given model and window size.

    Looks in ``<model_dir>/<task>/<model_name>/window-size-<window_size>/logits``
    for ``.pt`` files whose name contains the word index zero-padded to five
    digits; exactly one match is required (enforced with an assert). The file
    is read with ``torch.load``.
    '''

    logits_dir = os.path.join(model_dir, task, model_name, f'window-size-{window_size}', 'logits')
    padded_index = str(word_index).zfill(5)

    matching_fns = natsorted(glob.glob(os.path.join(logits_dir, f'*{padded_index}*.pt')))

    assert (len(matching_fns) == 1)

    return torch.load(matching_fns[0])

def compare_human_model_distributions(tokenizer, word_model, human_responses, all_responses, model_logits, ground_truth):
    '''
    Compares human next-word responses to a model's prediction for one word.

    The human distribution is built from `human_responses`. The "adjusted"
    model distribution is the model probability of every entry of
    `all_responses`, renormalized to sum to 1. Both are laid out over
    `all_responses` so they can be compared entry by entry.

    Parameters
    ----------
    tokenizer : object exposing `encode` / `decode`; also passed to `nlp.get_word_prob`
    word_model : embedding lookup indexable by a word or by a list of words
    human_responses : iterable of response strings; empty/falsy entries are dropped
    all_responses : list of words defining the shared support; must contain
        every non-empty entry of `human_responses`
    model_logits : torch tensor of logits; softmax is taken over the last dimension
    ground_truth : the word that actually occurred at this position

    Returns
    -------
    pd.DataFrame with a single row holding the top words, their probabilities,
    the predictability measures (raw and log-odds) and the distribution
    distances (KL divergence, relative entropy, Wasserstein, Jensen-Shannon, KS).

    Raises
    ------
    ValueError if no non-empty human responses remain, or (from `list.index`)
    if a human response is not present in `all_responses`.
    '''

    df = pd.DataFrame(columns=[
        'top_word_human', 
        'top_word_model',
        'top_word_model_adjusted',
        'prob_human',
        'prob_model',
        'prob_model_adjusted', 
        'prob_model_human_pred',
        'predictability_model',
        'predictability_human',
        'continuous_predictability_human',
        'log_odds_predictability_model',
        'log_odds_predictability_human',
        'log_odds_continuous_predictability_human',
        'kl_divergence',
        'relative_entropy',
        'wasserstein_dist',
        'jensenshannon_dist',
        'ks_stat'
    ])

    # drop empty responses and report how many were removed
    pre_filter = len(human_responses)
    human_responses = list(filter(None, human_responses))
    post_filter = len(human_responses)

    if pre_filter != post_filter:
        print (f'Removed {pre_filter - post_filter} empty responses')

    # without any response the proportions below would divide by zero
    if not human_responses:
        raise ValueError('No non-empty human responses to compare against the model')

    # full-vocabulary model distribution and its most likely token
    model_probs = F.softmax(model_logits, dim=-1).squeeze()
    prob_model = model_probs.max().item()
    top_word_model = tokenizer.decode(model_probs.argmax())

    ## get ground truth word prob (averaged over its tokens if it spans several)
    gt_token = tokenizer.encode(ground_truth)
    gt_predictability_model = model_probs[gt_token].mean(0).item()

    # binary predictability - proportion of responses that exactly match the ground truth
    human_predictability = sum(np.asarray(human_responses) == ground_truth) / len(human_responses)

    # continuous predictability - average cosine similarity of responses to the ground truth word
    continuous_predictability = (1 - distance.cdist(word_model[ground_truth][np.newaxis], word_model[human_responses], metric='cosine')).mean()

    # a proportion of 0 is replaced with 1e-2 before taking log odds
    if human_predictability == 0:
        log_odds_human_predictability = statistics.log_odds(1e-2)
    else:
        log_odds_human_predictability = statistics.log_odds(human_predictability)

    log_odds_model_predictability = statistics.log_odds(gt_predictability_model)
    log_odds_continuous_predictability = statistics.log_odds(continuous_predictability)

    # get the probability distribution of the human responses --> also return the unique words
    human_probs, unique_words = get_human_probs(human_responses)
    prob_human = human_probs.max()

    # place the human probabilities at the position of each word within all_responses
    # NOTE(review): list.index returns the FIRST occurrence, while the model
    # distribution below is evaluated once per entry of all_responses. If
    # all_responses contains repeated words, repeated entries carry model mass
    # but zero human mass -- confirm whether callers should pass unique words.
    word_idxs = [all_responses.index(word) for word in unique_words]
    temp = np.zeros(len(all_responses))
    temp[word_idxs] = human_probs
    human_probs = temp

    # get probability of the words humans chose within the model distribution
    # then renormalize so the adjusted distribution sums to 1
    model_adjusted_probs = np.asarray([nlp.get_word_prob(tokenizer, word, model_logits) for word in all_responses])
    model_adjusted_probs = model_adjusted_probs / model_adjusted_probs.sum()

    # probability of the model's top word within the adjusted distribution,
    # and the adjusted model probability of the word humans chose most often
    prob_model_adjusted = model_adjusted_probs.max()
    prob_model_human_pred = model_adjusted_probs[human_probs.argmax()]

    # grab the human and (adjusted) model top words
    top_word_human = all_responses[human_probs.argmax()]
    top_word_model_adjusted = all_responses[model_adjusted_probs.argmax()]

    # now calculate kl divergence between the human and adjusted model distribution
    # measures how different P (human) is from Q (model) distribution
    #  KL divergence of P from Q is the expected excess surprise from 
    #  using Q as a model when the actual distribution is P
    # infinite terms are zeroed before summing
    kl_divergence = kl_div(human_probs, model_adjusted_probs)
    kl_divergence[np.isinf(kl_divergence)] = 0
    kl_divergence = kl_divergence.sum().item()

    relative_entropy = rel_entr(human_probs, model_adjusted_probs).sum().item()

    # earth movers distance between adjusted probs
    wasserstein_dist = stats.wasserstein_distance(human_probs, model_adjusted_probs)

    jensenshannon_dist = distance.jensenshannon(human_probs, model_adjusted_probs)

    ks_stats = stats.kstest(human_probs, model_adjusted_probs)

    df.loc[len(df)] = {
        'top_word_human': top_word_human,
        'top_word_model': top_word_model,
        # fixed: previously stored top_word_model here, duplicating the column above
        'top_word_model_adjusted': top_word_model_adjusted,
        'prob_human': prob_human,
        'prob_model': prob_model,
        'prob_model_adjusted': prob_model_adjusted, 
        'prob_model_human_pred': prob_model_human_pred,
        'predictability_model': gt_predictability_model,
        'predictability_human': human_predictability,
        'continuous_predictability_human': continuous_predictability,
        'log_odds_predictability_human': log_odds_human_predictability.astype(float),
        'log_odds_predictability_model': log_odds_model_predictability.astype(float),
        'log_odds_continuous_predictability_human': log_odds_continuous_predictability.astype(float),
        'kl_divergence': kl_divergence,
        'relative_entropy': relative_entropy,
        'wasserstein_dist': wasserstein_dist,
        'jensenshannon_dist': jensenshannon_dist,
        'ks_stat': ks_stats[0]
    }

    return df

## Data cleaning

### Clean meta file and remove errors

In [42]:
# check all the files: compare the meta file with the results directory for this
# experiment/task. `checker` groups entries under keys such as 'complete';
# `approve_ids` is the second return value (presumably the participant IDs to
# approve -- confirm in check_used_files). clean_errors=True is forwarded as-is.
checker, approve_ids = check_used_files(EXPERIMENT_NAME, EXPERIMENT_VERSION, TASK, clean_errors=True)
approve_ids = list(map(str, approve_ids))

# find the subject list based on complete files
# each entry of checker['complete'] is unpacked as (file index, modality, subject, prolific id)
file_idx, modality, sub_list, prolific_ids = zip(*checker['complete'])
# (subject, modality) pairs drive every per-participant loop in later cells
sub_mod_list = list(zip(sub_list, modality))

# number of unique IDs, followed by one ID per line
# NOTE(review): this writes every participant ID into the saved notebook output;
# consider clearing the output before sharing or committing the notebook.
print (len(set(approve_ids)))
print ('\n'.join(approve_ids))

300
6156e5a04232a8cff11724d6
594dd3de0dd7710001aeb5e9
652ecef35eef03ac46a87d1c
60cb50c2bdc488cd99eeb3f0
59451bd536b8f40001de0a83
629e61efbf70f47f72934a10
5ec4a2cfc14ec21da0bc4a46
61449811f485875be5126622
5f10912e263294096adf23a7
5681739dc5767f00051de427
59d4bfd1719adb0001235bcc
5815767b0643a600016f0e72
604d4ae2216aad5c3b21e5a7
5ec69d34afab190997018f91
6522ca2204cc95f0b36facef
5e3c407472a17401e4754bd0
605dcb4e4e9a3d6d35a10f62
5dbeff1694f30b3a8f57dfd9
62851c0ec6dc62b777daacec
5d53df2b0299370019db7b92
5ca345b1506fca0016b358bb
5a8dbced1904200001564992
5cb4bee5ffdc1d0016bbd019
59d264e2faf42e00012b6cba
6100906e671b8294f9ea93d8
6093bbe2d2c5537241164571
5ff2ec49cb7697815e8c23a3
60147144ea1597492e5c6698
631a01c239b12c7b5008247d
5efcbb97df2a840990b0da94
659599a35a45f68375c3f06a
63da6f2e309a94d0e03079b0
60b014e8fd54e70643da606c
66520da15457a5e65d2ec66f
5afcbf18fabc8900018854d9
5a00e6da120fb300019480b3
63f93b47bb0c1a1aca131480
5e110747dc265f7d25341439
6683f5704a78cd288835b4a8
61bb381140db417c1a138

### Manually spell-check participants data

In [55]:
# transcript is handed to clean_participant_responses together with each participant's data
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))

for sub, modality in sub_mod_list:

    sub_cleaned_dir = os.path.join(cleaned_results_dir, TASK, modality, sub)
    out_fn = os.path.join(sub_cleaned_dir, f'{sub}_next-word-prediction.csv')
    
    utils.attempt_makedirs(sub_cleaned_dir)

    # an existing cleaned file means this participant was already corrected,
    # so the cell can be re-run to resume where it stopped
    if os.path.exists(out_fn):
        print (f'File exists: {modality} {sub}')
        continue

    print (f'Correcting: {modality} {sub}')

    ############################
    #### Load subject data #####
    ############################
    
    sub_dir = os.path.join(results_dir, TASK, modality, sub)

    # load the raw results; missing values in every column become False
    df_results = pd.read_csv(os.path.join(sub_dir, f'{sub}_next-word-prediction.csv')).fillna(False)
    df_results['word_index'] = df_results['word_index'].astype(int)

    ############################
    ###### Check responses #####
    ############################

    df_results = clean_participant_responses(df_results, df_transcript)

    ############################
    #### Save cleaned data #####
    ############################
    
    df_results.to_csv(out_fn, index=False)

    print (f'Saved file for {modality} {sub}')



Current Word: .... one to my family home that's been [43;30mdestroyed[m down and one to my Jeep ....
Ground Truth:  torn



Needs correction? [y/n]:  


Saved file for text sub-00150


## Compile data across participants

### Load word models for semantic comparisons

In [79]:
# name of the word-embedding model; also used to build column names such as
# f'{word_model_name}_avg_accuracy' in the analysis functions
word_model_name = 'fasttext'
word_model = nlp.load_word_model(model_name=word_model_name, cache_dir=CACHE_DIR)
# (name, model) pair passed as `word_model_info` to the analysis functions below
word_model_info = (word_model_name, word_model)

Loading fasttext from saved .bin file.


### Load all participants data and save file

In [80]:
# load transcript
df_transcript = pd.read_csv(os.path.join(preproc_dir, TASK, f'{TASK}_transcript-preprocessed.csv'))

# load all results, calculate accuracy, then save
df_all_results = aggregate_participant_responses(cleaned_results_dir, TASK, sub_mod_list)
df_all_results, df_all_accuracy = calculate_results_accuracy(df_all_results)

# save compiled cleaned results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_all_results.to_csv(out_fn, index=False)

sub-00001 audio
sub-00001 text
sub-00002 audio
sub-00002 text
sub-00003 audio
sub-00003 text
sub-00004 audio
sub-00004 text
sub-00005 audio
sub-00005 text
sub-00006 audio
sub-00006 text
sub-00007 audio
sub-00007 text
sub-00008 audio
sub-00008 text
sub-00009 audio
sub-00009 text
sub-00010 audio
sub-00010 text
sub-00011 audio
sub-00011 text
sub-00012 audio
sub-00012 text
sub-00013 audio
sub-00013 text
sub-00014 audio
sub-00014 text
sub-00015 audio
sub-00015 text
sub-00016 audio
sub-00016 text
sub-00017 audio
sub-00017 text
sub-00018 audio
sub-00018 text
sub-00019 audio
sub-00019 text
sub-00020 audio
sub-00020 text
sub-00021 audio
sub-00021 text
sub-00022 audio
sub-00022 text
sub-00023 audio
sub-00023 text
sub-00024 audio
sub-00024 text
sub-00025 audio
sub-00025 text
sub-00026 audio
sub-00026 text
sub-00027 audio
sub-00027 text
sub-00028 audio
sub-00028 text
sub-00029 audio
sub-00029 text
sub-00030 audio
sub-00030 text
sub-00031 audio
sub-00031 text
sub-00032 audio
sub-00032 text
sub-0003

### Lemmatize data (responses + ground truth) and save 

In [81]:
# use transcript to lemmatize responses, then calculate accuracy
df_lemmatized_results = lemmatize_responses(df_all_results.copy(), df_transcript)
df_lemmatized_results, df_lemmatized_accuracy = calculate_results_accuracy(df_lemmatized_results)

# save compiled lemmatized results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results.to_csv(out_fn, index=False)

Word: is 	 Lemma: be
GT: influence 	 GTLemma: influence
Word: one 	 Lemma: one
GT: one 	 GTLemma: one
Word: many 	 Lemma: many
GT: people 	 GTLemma: people
Word: important 	 Lemma: important
GT: important 	 GTLemma: important
Word: family 	 Lemma: family
GT: mother 	 GTLemma: mother
Word: big 	 Lemma: big
GT: professional 	 GTLemma: professional
Word: athletic 	 Lemma: athletic
GT: ski 	 GTLemma: ski
Word: lot 	 Lemma: lot
GT: lab 	 GTLemma: lab
Word: was 	 Lemma: be
GT: was 	 GTLemma: be
Word: connect 	 Lemma: connect
GT: tap 	 GTLemma: tap
Word: sure 	 Lemma: sure
GT: sure 	 GTLemma: sure
Word: wasn't 	 Lemma: wasn
GT: lived 	 GTLemma: live
Word: boots 	 Lemma: boot
GT: flower 	 GTLemma: flower
Word: never 	 Lemma: never
GT: mean 	 GTLemma: mean
Word: showed 	 Lemma: show
GT: met 	 GTLemma: meet
Word: boy 	 Lemma: boy
GT: boy 	 GTLemma: boy
Word: to 	 Lemma: to
GT: since 	 GTLemma: since
Word: so 	 Lemma: so
GT: football 	 GTLemma: football
Word: football 	 Lemma: football
GT: footba

### Analyze human data and save

In [82]:
# load lemmatized file and analyze human results --> use all words
lemmatized_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results = pd.read_csv(lemmatized_fn)

# combine the data and lemmatize model results
df_analyzed_lemmatized = analyze_human_results(df_lemmatized_results, word_model_info, top_n=None)

# save lemmatized human-model results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_human-lemmatized.csv')
df_analyzed_lemmatized.to_csv(out_fn, index=False)

## Analyze and compile human and LLM data

### Set LLM model names

In [83]:
# get all MLM models except BERT
MLM_MODELS = list(nlp.MLM_MODELS_DICT.keys())[1:]
CLM_MODELS = list(nlp.CLM_MODELS_DICT.keys()) 
model_names = CLM_MODELS + MLM_MODELS

print (f'Loading the following models')
print (f'MLM models: {MLM_MODELS}')
print (f'CLM models: {CLM_MODELS}')

Loading the following models
MLM models: ['roberta', 'electra', 'xlm-prophetnet']
CLM models: ['bloom', 'gpt2', 'gpt2-xl', 'gpt-neo-x', 'llama2', 'mistral']


### Cleaned responses - Load and merge human and LLM data 

In [84]:
# load lemmatized file and analyze human results --> use all words
cleaned_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_cleaned_results = pd.read_csv(cleaned_fn)

# combine the data and lemmatize model results
df_human_model = compare_human_model_accuracy(df_cleaned_results, model_names, word_model_info, task=TASK, top_n=1, window_size=25, lemmatize=False)

# save combined human-model results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_human-model.csv')
df_human_model.to_csv(out_fn, index=False)

Total missing values: 46


### Lemmatization - Load and merge human and LLM data

In [85]:
# load lemmatized file and analyze human results --> use all words
lemmatized_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior_lemmatized.csv')
df_lemmatized_results = pd.read_csv(lemmatized_fn)

# combine the data and lemmatize model results
df_human_model_lemmatized = compare_human_model_accuracy(df_lemmatized_results, model_names, word_model_info, task=TASK, top_n=1, window_size=25, lemmatize=True)

# save lemmatized human-model results
out_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-analyzed-behavior_human-model-lemmatized.csv')
df_human_model_lemmatized.to_csv(out_fn, index=False)

Word: terms 	 Lemma: term
GT: influence 	 GTLemma: influence
Word: first 	 Lemma: first
GT: one 	 GTLemma: one
Word: favorite 	 Lemma: favorite
GT: secret 	 GTLemma: secret
Word: a 	 Lemma: a
GT: people 	 GTLemma: people
Word: influential 	 Lemma: influential
GT: important 	 GTLemma: important
Word: like 	 Lemma: like
GT: even 	 GTLemma: even
Word: important 	 Lemma: important
GT: important 	 GTLemma: important
Word: life 	 Lemma: life
GT: life 	 GTLemma: life
Word: real 	 Lemma: real
GT: violin 	 GTLemma: violin
Word: don't 	 Lemma: don
GT: meet 	 GTLemma: meet
Word: in 	 Lemma: in
GT: seven 	 GTLemma: seven
Word: very 	 Lemma: very
GT: grow 	 GTLemma: grow
Word: father 	 Lemma: father
GT: mother 	 GTLemma: mother
Word: father 	 Lemma: father
GT: father 	 GTLemma: father
Word: three 	 Lemma: three
GT: three 	 GTLemma: three
Word: time 	 Lemma: time
GT: man 	 GTLemma: man
Word: see 	 Lemma: see
GT: meet 	 GTLemma: meet
Word: vet 	 Lemma: vet
GT: professional 	 GTLemma: professional
Wor

## Analyze human vs. GPT2-XL distributions

### Load file of cleaned data

In [86]:
# save lemmatized human-model results
human_cleaned_fn = os.path.join(behavioral_dir, f'task-{TASK}_group-cleaned-behavior.csv')
df_human_cleaned = pd.read_csv(human_cleaned_fn)

### Set model and window size

In [87]:
# context window size and model whose saved logits are compared to human responses
WINDOW_SIZE = 25
MODEL_NAME = 'gpt2-xl'

# load the tokenizer/model for MODEL_NAME so the tokenizer always matches the
# logits that load_logits reads for the same MODEL_NAME
tokenizer, model = nlp.load_clm_model(model_name=MODEL_NAME, cache_dir=CACHE_DIR)

### Compare human and model prediction distributions and save

In [88]:
# collects one single-row frame per (modality, word index); concatenated at the end
df_comparison = []
# run strip_punctuation on string responses in place; non-string values (e.g. NaN) become ''
df_human_cleaned['response'] = df_human_cleaned['response'].apply(lambda x: strip_punctuation(x) if isinstance(x, str) else '')
    
# go through each (modality, word index) pair
for (modality, response_index), df in df_human_cleaned.groupby(['modality', 'word_index']):
    
    # get all non-empty responses for current index across both modalities
    # NOTE(review): this keeps one entry per participant response, so repeated words
    # appear several times; compare_human_model_distributions looks words up with
    # all_responses.index(...) and scores the model once per entry -- confirm
    # whether this list should be de-duplicated.
    all_responses = df_human_cleaned[df_human_cleaned['word_index'] == response_index]['response'].tolist()
    all_responses = list(filter(None, all_responses))

    # grab responses for the current modality
    # (strip_punctuation was already applied to the whole column above)
    modality_responses = df['response'].apply(strip_punctuation)
    ground_truth = df['ground_truth'].unique().tolist()[0]
    
    # load the logits for the current response
    # NOTE(review): an earlier variant passed response_index - 1; confirm the index matches the saved logits
    model_logits = load_logits(models_dir, MODEL_NAME, TASK, WINDOW_SIZE, response_index) #response_index - 1)
    
    # now compare the two and add it to the dataframe
    df_compare = compare_human_model_distributions(tokenizer, word_model, modality_responses, all_responses, model_logits, ground_truth)
    df_compare['modality'] = modality
    df_compare['word_index'] = response_index
    # copy the group labels and ground truth from the first row of this group
    df_compare[['entropy_group', 'accuracy_group', 'ground_truth']] = df[['entropy_group', 'accuracy_group', 'ground_truth']].iloc[0]
    
    df_comparison.append(df_compare)

# lastly, stack the per-word rows into a single dataframe
df_comparison = pd.concat(df_comparison).reset_index(drop=True)

Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
Removed 1 empty responses
