In [1]:
%load_ext autoreload
import os, sys, glob
import json
import re
import numpy as np
import pandas as pd
from natsort import natsorted

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/code/utils/')
sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/utils/gentle')

import gentle
from config import *
from preproc_utils import create_balanced_orders, get_consecutive_list_idxs, sort_consecutive_constraint, check_consecutive_spacing

# from text_utils import get_pos_tags, get_lemma

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /dartfs-hpc/rc/home/w/f003rjw/.cache/huggingface/token
Login successful


# Set directories 

In [2]:
base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavior/'
stim_dir = os.path.join(base_dir, 'stimuli')
cache_dir = os.path.join('/dartfs/rc/lab/F/FinnLab/tommy/', 'models')

gentle_dir = os.path.join(stim_dir, 'gentle')

In [550]:
def load_model_data(model_dir, model_name, task, window_size, top_n):
    '''
    Loads model data from directory
    '''

    model_dir = os.path.join(model_dir, task, model_name, f'window-size-{window_size}')
    results_fn = natsorted(glob.glob(os.path.join(model_dir, f'*top-{top_n}*')))[0]

    # load the data, remove nans
    model_results = pd.read_csv(results_fn)
    model_results['glove_continuous_accuracy'] = model_results['glove_continuous_accuracy'].apply(np.nan_to_num)
    model_results['word2vec_continuous_accuracy'] = model_results['word2vec_continuous_accuracy'].apply(np.nan_to_num)

    return model_results

def get_stim_candidate_idxs(task):
    '''
    Find the NWP candidate indices of a preprocessed transcript
    '''

    preproc_fn = os.path.join(STIM_DIR, 'preprocessed', task, f'{task}_transcript-preprocessed.csv')
    df_preproc = pd.read_csv(preproc_fn)
    nwp_idxs = np.where(df_preproc['NWP_Candidate'])[0]

    return df_preproc, nwp_idxs

def divide_nwp_dataframe(df, accuracy_type, percentile):

    df_divide = df.copy()

    # first find the lowest and highest percentile for entropy
    low_entropy_idxs = df['entropy'] < np.nanpercentile(df['entropy'], percentile)
    high_entropy_idxs = df['entropy'] >= np.nanpercentile(df['entropy'], 100-percentile)

    ## set names for entropy group
    df_divide.loc[low_entropy_idxs, 'entropy_group'] = 'low'
    df_divide.loc[high_entropy_idxs, 'entropy_group'] = 'high'

    # repeat for continuous accuracy
    low_accuracy_idxs = df[accuracy_type] < np.nanpercentile(df[accuracy_type], percentile)
    high_accuracy_idxs = df[accuracy_type] >= np.nanpercentile(df[accuracy_type], 100-percentile)

    ## set names for accuracy group
    df_divide.loc[low_accuracy_idxs, 'accuracy_group'] = 'low'
    df_divide.loc[high_accuracy_idxs, 'accuracy_group'] = 'high'

    return df_divide.dropna()

def get_quadrant_distributions(df_divide, indices):
    
    df_idx = df_divide.loc[indices]
    
    # get the items as a dictionary for passing out to aggregate
    quadrant_dist = {f'{labels[0]}-entropy_{labels[1]}-accuracy': round(len(df)/len(df_idx), 2) 
                 for labels, df in df_idx.groupby(['entropy_group', 'accuracy_group'])}

    df_quadrants = pd.DataFrame.from_dict(quadrant_dist, orient='index').T
    
    return df_quadrants

def select_prediction_words(df_divide, remove_perc, select_perc, min_spacing_thresh=3):
    '''
    
    df_divide: candidate words divided into quartiles based on entropy and accuracy
    
    remove_perc: percentage of words to remove based on proximity to other words
        helps ensure decent spacing between presented words
        
    select_perc: percentage of words to select for presentation    
    
    '''
    
    df_divide['spacing'] = np.hstack([np.nan, np.diff(df_divide.index)])
    
    quadrant_distributions = get_quadrant_distributions(df_divide, df_divide.index).to_numpy()
    
    updated = []

    for i, df in df_divide.groupby(['entropy_group', 'accuracy_group']):
        # find how many words to remove in the quadrant based on the percent
        n_words = round(remove_perc * len(df))
        df = df.sort_values(by='spacing').iloc[n_words:]
        updated.append(df.sort_index())

    updated = pd.concat(updated).sort_index()
    updated_distributions = get_quadrant_distributions(updated, updated.index).to_numpy()
    assert (np.isclose(quadrant_distributions, updated_distributions, atol=0.01).all())
    
    # make sure it is scaled to the original dataframe
    select_perc = select_perc/(1-remove_perc)
    min_spacing = 0
    RANDOM_STATE = 0
    
    print (f'Selecting {select_perc*100:.2f}% of remaining items')
    
    while (min_spacing < min_spacing_thresh):
        # now sample the words from each quadrant
        sampled = []

        for i, df in updated.groupby(['entropy_group', 'accuracy_group']):

            df_sampled = df.sample(frac=select_perc, random_state=RANDOM_STATE).sort_index()
            sampled.append((len(df_sampled), df_sampled))

        n_sampled, sampled = zip(*sampled)
        sampled = pd.concat(sampled).sort_index()

        min_spacing = np.diff(sampled.index).min()
        
        RANDOM_STATE += 1
    
    print (f'Min spacing of {min_spacing}')
    print (f'{len(sampled)} total words')

    return sampled
    

In [675]:
all_tasks_quadrants = []

models_dir = os.path.join(DERIVATIVES_DIR, 'model-predictions')
model_name = 'gpt2-xl'
task = 'wheretheressmoke'

df_preproc, candidate_idxs = get_stim_candidate_idxs(task)

model_results = load_model_data(models_dir, model_name=model_name, task=task, top_n=5, window_size=100)
model_results.loc[:, 'binary_accuracy'] = model_results['binary_accuracy'].astype(bool)
model_results = model_results.iloc[candidate_idxs]

df_divide = divide_nwp_dataframe(model_results, accuracy_type='word2vec_continuous_accuracy', percentile=45)


df_selected = select_prediction_words(df_divide, remove_perc=0.5, select_perc=0.4, min_spacing_thresh=2)

# fig, axes, df_quadrants = plot_quadrant_distributions(model_results.dropna(), 'word2vec_continuous_accuracy', 45)

# plt.suptitle(f'{model_name} - task {task}')
# out_fn = os.path.join(out_dir, f'{model_name}-{task}_quadrant-distributions.jpg')

# plt.savefig(out_fn, dpi=300)
# plt.close('all')

# df_quadrants['model_name'] = model_name
# all_tasks_quadrants.append(df_quadrants)


Selecting 80.00% of remaining items
Min spacing of 3
216 total words


  df_divide.loc[low_entropy_idxs, 'entropy_group'] = 'low'
  df_divide.loc[low_accuracy_idxs, 'accuracy_group'] = 'low'


In [686]:
selected_idxs

Index([   4,   11,   23,   30,   34,   38,   47,   51,   61,   64,
       ...
       1710, 1722, 1732, 1738, 1745, 1758, 1762, 1772, 1783, 1817],
      dtype='int64', length=216)

In [690]:
df_preproc

Unnamed: 0,Word_Written,Case,POS,POS_Definition,Punctuation,Stop_Word,Word_Vocab,Onset,Offset,Duration,Named_Entity,NWP_Candidate,entropy_group,accuracy_group
0,I,success,PRP,"pronoun, personal",,True,I,0.012472,0.127781,0.115309,False,False,,
1,reached,success,VBD,"verb, past tense",,False,reached,0.127781,0.493847,0.366067,False,True,,
2,over,success,RB,adverb,,True,over,0.493847,0.960317,0.466470,False,False,,
3,and,success,CC,"conjunction, coordinating",,True,and,1.539002,1.661162,0.122160,False,False,,
4,secretly,success,RB,adverb,,False,secretly,1.664915,2.377098,0.712183,False,True,high,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1822,I,success,PRP,"pronoun, personal",,True,I,590.470522,590.611418,0.140897,False,False,,
1823,still,success,RB,adverb,,False,still,590.611418,590.999320,0.387902,False,True,,
1824,miss,success,VBP,"verb, present tense, not 3rd person singular",,False,miss,590.999320,591.188889,0.189569,False,True,,
1825,the,success,DT,determiner,,True,the,591.188889,591.265763,0.076874,False,False,,


In [692]:
selected_idxs = df_selected.index

df_preproc.loc[selected_idxs, ['entropy_group', 'accuracy_group']] = df_selected[['entropy_group', 'accuracy_group']]
df_preproc.loc[selected_idxs, 'Selected_CaNndidate'] = 

True

In [694]:
df_preproc.dropna()

Unnamed: 0,Word_Written,Case,POS,POS_Definition,Punctuation,Stop_Word,Word_Vocab,Onset,Offset,Duration,Named_Entity,NWP_Candidate,entropy_group,accuracy_group
4,secretly,success,RB,adverb,,False,secretly,1.664915,2.377098,0.712183,False,True,high,low
11,foot,success,NN,"noun, common, singular or mass",,False,foot,5.061272,5.370295,0.309022,False,True,high,low
23,door,success,NN,"noun, common, singular or mass",,False,door,8.094104,8.453288,0.359184,False,True,low,high
30,shoes,success,NNS,"noun, common, plural",,False,shoes,11.675964,12.006014,0.330050,False,True,low,low
34,crying,success,VBG,"verb, present participle or gerund",",",False,crying,12.693651,13.252381,0.558730,False,True,high,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,seven,success,CD,"numeral, cardinal",,False,seven,561.777047,562.144223,0.367176,False,True,low,low
1762,way,success,NN,"noun, common, singular or mass",,False,way,563.132653,563.302268,0.169615,False,True,low,high
1772,think,success,VBP,"verb, present tense, not 3rd person singular",", """,False,think,567.482766,567.745516,0.262750,False,True,high,high
1783,held,success,VBD,"verb, past tense",,False,held,573.289569,573.588889,0.299320,False,True,high,low


In [471]:
import random

def random_chunks(lst, n, shuffle=False):
    """Created randomized n-sized chunks from lst."""
    
    tmp_lst = lst.copy()
    n_total = len(lst)
    
    if shuffle:
        random.shuffle(tmp_lst)
    
    all_chunks = []
    
    for i in range(0, len(tmp_lst), n):
        all_chunks.append(tmp_lst[i:i + n])
    
    # distribute remaining items across orders
    if len(all_chunks) != n_total//n:
        remainder = all_chunks.pop()
        
        for i, item in enumerate(remainder):      
            all_chunks[i%n].append(item)
    
    # lastly sort for ordered indices
    all_chunks = [sorted(chunk) for chunk in all_chunks]
    
    return all_chunks

In [680]:
for order in test_orders:
    print (len(order)/len(df_preproc))

# len(test_orders[0])/len(df_preproc)

0.029556650246305417
0.029556650246305417
0.029556650246305417
0.029556650246305417


In [698]:
quadrant_distribution

array([[0.17, 0.31, 0.34, 0.18]])

In [702]:
quadrant_distribution = get_quadrant_distributions(df_selected, df_selected.index).to_numpy()
quadrant_distribution

array([[0.17, 0.31, 0.34, 0.18]])

In [697]:
# percent_sampled = 0.
n_orders = 4
n_participants_per_item = 50
consecutive_spacing = 10

# find distribution of selected words from the divided quadrants
quadrant_distribution = get_quadrant_distributions(df_divide, df_selected.index).to_numpy()
deviation_threshold = 0.05
order_distributions = np.zeros((n_orders, 4))

# find indices for presentation and set number of items each subject sees
nwp_indices = sorted(df_selected.index)

# # Find lists with consecutive items violating our constraint

while not (np.allclose(quadrant_distribution, order_distributions, atol=deviation_threshold)):
    
    subject_experiment_orders = random_chunks(nwp_indices, len(nwp_indices)//n_orders, shuffle=True)
    
    print ('starting')
#     test_orders = subject_experiment_orders.copy()
    idxs = get_consecutive_list_idxs(subject_experiment_orders, consecutive_spacing=consecutive_spacing)
    subject_experiment_orders = sort_consecutive_constraint(subject_experiment_orders, consecutive_spacing=consecutive_spacing)
    
    
    order_distributions = [get_quadrant_distributions(df_divide, order).to_numpy() for order in subject_experiment_orders]
    
    # sometimes the randomized order makes a quadrant be dropped --> reset and try again
    if not all([order.shape[-1] == 4 for order in order_distributions]):
        order_distributions = np.zeros((n_orders, 4))
# # Test again once we have completed resorting
# idxs = get_consecutive_list_idxs(subject_experiment_orders, consecutive_spacing=p.consecutive_spacing)
# print (f'Lists violating consecutive index constraint: {100*(len(idxs))/len(subject_experiment_orders)}%')

# uniq, counts = np.unique(subject_experiment_orders, return_counts=True)
# print (f'All counts per word: {np.sum(counts >= p.n_participants_per_item) / len(counts)*100}%')

# counts = Counter(tuple(o) for o in subject_experiment_orders)
# unique_orders = np.sum([v for k, v in counts.items()]) / len(counts)

# print (f'Unique orders: {unique_orders*100}%')

# orders_meeting_consecutive = np.sum([check_consecutive_spacing(order, consecutive_spacing=p.consecutive_spacing) for order in subject_experiment_orders]) / len(subject_experiment_orders)
# print (f'Consecutive constraint: {orders_meeting_consecutive*100}%'

starting
Starting pass #1
Number of lists w/ violation: 3
Starting pass #2
Number of lists w/ violation: 2
Starting pass #3
Number of lists w/ violation: 2
Starting pass #4
Number of lists w/ violation: 3
Starting pass #5
Number of lists w/ violation: 3
Starting pass #6
Number of lists w/ violation: 1
Starting pass #7
Number of lists w/ violation: 0
starting
Starting pass #1
Number of lists w/ violation: 2
Starting pass #2
Number of lists w/ violation: 3
Starting pass #3
Number of lists w/ violation: 3
Starting pass #4
Number of lists w/ violation: 1
Starting pass #5
Number of lists w/ violation: 0
starting
Starting pass #1
Number of lists w/ violation: 3
Starting pass #2
Number of lists w/ violation: 3
Starting pass #3
Number of lists w/ violation: 2
Starting pass #4
Number of lists w/ violation: 2
Starting pass #5
Number of lists w/ violation: 1
Starting pass #6
Number of lists w/ violation: 1
Starting pass #7
Number of lists w/ violation: 1
Starting pass #8
Number of lists w/ violat

In [None]:
# get the items as a dictionary for passing out to aggregate
quadrant_dist = {f'{labels[0]}-entropy_{labels[1]}-accuracy': round(len(df)/len(df_divide), 2) 
             for labels, df in df_divide.groupby(['entropy_group', 'accuracy_group'])}

df_quadrants = pd.DataFrame.from_dict(quadrant_dist, orient='index').T

In [343]:
order_distributions

[array([[0.15, 0.3 , 0.4 , 0.14]]),
 array([[0.13, 0.4 , 0.33, 0.14]]),
 array([[0.15, 0.34, 0.38, 0.12]]),
 array([[0.13, 0.33, 0.36, 0.18]])]

In [None]:
def create_n_random_orders(n_orders, n_participants_per_item, consecutive_spacing)

In [125]:
get_quadrant_distributions(df_divide, order).to_numpy().min()

0.15

In [207]:
test_orders

[[12,
  89,
  95,
  158,
  169,
  202,
  282,
  355,
  390,
  408,
  425,
  463,
  496,
  504,
  535,
  582,
  597,
  608,
  691,
  696,
  742,
  822,
  904,
  997,
  1024,
  1032,
  1065,
  1081,
  1105,
  1110,
  1222,
  1227,
  1235,
  1245,
  1282,
  1302,
  1307,
  1345,
  1447,
  1486,
  1565,
  1617,
  1637,
  1722,
  1798],
 [17,
  73,
  99,
  114,
  204,
  253,
  261,
  323,
  329,
  351,
  531,
  605,
  610,
  646,
  661,
  701,
  738,
  762,
  795,
  802,
  813,
  839,
  896,
  902,
  955,
  1103,
  1136,
  1162,
  1259,
  1290,
  1371,
  1411,
  1443,
  1448,
  1459,
  1484,
  1513,
  1551,
  1607,
  1701,
  1710,
  1735,
  1747,
  1754],
 [144,
  151,
  187,
  198,
  217,
  251,
  292,
  324,
  376,
  475,
  536,
  548,
  578,
  606,
  681,
  713,
  749,
  786,
  806,
  873,
  933,
  940,
  980,
  987,
  1044,
  1087,
  1102,
  1154,
  1172,
  1231,
  1255,
  1278,
  1327,
  1354,
  1367,
  1409,
  1438,
  1451,
  1571,
  1579,
  1610,
  1629,
  1655,
  1765],
 [11,
  23,


In [245]:
order_distributions

[array([[0.13, 0.28, 0.38, 0.2 ]]), array([[0.2 , 0.35, 0.3 , 0.15]])]

In [257]:
order_distributions

[array([[0.17, 0.35, 0.29, 0.19]]), array([[0.17, 0.28, 0.39, 0.16]])]

In [295]:
np.diff(test_orders[0]).shape

(91,)