## Load modules 

In [1]:
%load_ext autoreload
%autoreload 2
from IPython.display import clear_output
import os, sys, glob
import json
from operator import itemgetter
import re
import numpy as np
import pandas as pd

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/code/utils/')
sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/utils/gentle')

import gentle
from narratives_utils import *
from text_utils import get_pos_tags, get_lemma

# Create function for loading transcripts

In [128]:
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from collections import defaultdict
  
lemmatizer = WordNetLemmatizer()

# generate the explained tags --> we will use these to make more sense of the outputs
tags_explained = nltk.data.load('help/tagsets/upenn_tagset.pickle')
STOP_WORDS = stopwords.words('english')

STOP_UTTERANCES = ['yes', 'well', 'oh', 'mhm', 'um', 'boom']

STOP_WORDS.extend(STOP_UTTERANCES)

# POS tag mapping, format: {Treebank tag (1st letter only): Wordnet}
tagset_mapping = defaultdict(
    lambda: 'n',   # defaults to noun
    {
        'N': 'n',  # noun types
        'P': 'n',  # pronoun types, predeterminers
        'V': 'v',  # verb types
        'J': 'a',  # adjective types
        'D': 'a',  # determiner
        'R': 'r'   # adverb types
    })

def gentle_to_dataframe(align_fn, interpolate_missing_times=True, shift_onset=True):
    
    # load the alignment file
    with open(align_fn, encoding='utf-8') as f:
        data = json.load(f)
    
    # grab the original transcript
    transcript = data['transcript']
    words_list = data['words']

    # go and extract each word --> pos tagging here incorporates context
    all_words = [word['word'] for word in words_list]
    _, pos_tags = map(list,zip(*pos_tag(all_words)))

    # go through each word
    df_stack = []

    for i, current_word in enumerate(words_list):
        
        tokens = word_tokenize(current_word['word'])
        tag = tagset_mapping[pos_tags[i][0]]
        lemmas = [lemmatizer.lemmatize(re.sub("[^a-zA-Z\s-]+", '', token.lower()), pos=tag) for token in tokens]
        stop_word = all([lemma in STOP_WORDS for lemma in lemmas if lemma]) # evaluate if not empty string
        
        word_dict = {
            'Word-Written': current_word['word'],
            'POS': pos_tags[i], # extract pos_tag for the word
            'POS-Definition': tags_explained[pos_tags[i]][0], # get the explained tag
            'Punctuation': transcript[current_word['endOffset']:words_list[i+1]['startOffset']]
                                      if i+1 < len(words_list) else transcript[current_word['endOffset']:], # punctuation following the word (use subsequent word)
            'Stop-Word': stop_word, # true or false if a stopword
        }

        # make sure that we've aligned the word --> could also check its a word in the vocabulary
        if 'alignedWord' in current_word:
            aligned_dict = {
                'Word-Vocab': current_word['alignedWord'],
                'Onset': current_word['start']
                'Offset': current_word['end'],
                'Duration': current_word['end'] - current_word['start'], # calculate duration of the current word
            }
            word_dict.update(aligned_dict)
        
        df_stack.append(
            pd.DataFrame(
                word_dict,
                index=[i],
            )
        )
    
    df_stack = pd.concat(df_stack)
    
    if interpolate_missing_times:
        df_stack['Onset'].interpolate()
    
    return df_stack

# Preprocessing steps to get punctuation aligned

Currently, gentle removes punctuation (e.g., hyphens, quotations) from the transcript. We want to perform the following operations:
1. Collapse over hyphenated words that make sense together
2. Ignore named-entities (e.g., cities, people)
3. Retain punctuation following words in the transcript for presentation

**Tory** run the following cells for these task names:
- black
- bronx
- forgot
- piemanpni

In [3]:
task = 'black' # replace this string with those above

# set directories
base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/'
# narratives_dir = '/dartfs/rc/lab/D/DBIC/DBIC/archive/narratives'

gentle_dir = os.path.join(base_dir, 'stimuli', 'gentle')
preproc_dir = os.path.join(base_dir, 'stimuli', 'preprocessed')

task_out_dir = os.path.join(preproc_dir, task)

if not os.path.exists(task_out_dir):
    os.makedirs(task_out_dir)

# Create the dataframe for the current task

In [129]:
# loads the alignment file and parses variables of interest into dataframe
df_task = gentle_to_dataframe(os.path.join(gentle_dir, task, 'align.json'), shift_onset=True)

#save to file
df_task_raw_fn = os.path.join(task_out_dir, f'{task}_transcript_raw-onset-shifted.csv')
df_task.to_csv(df_task_raw_fn, index=False)

In [130]:
df_task.head(50)

Unnamed: 0,Word-Written,POS,POS-Definition,Punctuation,Stop-Word,Word-Vocab,Onset,Offset,Duration
0,So,RB,adverb,,True,so,0.228,0.6237,0.39
1,I,PRP,"pronoun, personal",,True,i,0.633083,1.2474,0.58
2,was,VBD,"verb, past tense",,True,was,1.366167,2.277,0.34
3,a,DT,determiner,,True,a,2.280833,2.4255,0.15
4,junior,JJ,"adjective or numeral, ordinal",,False,junior,2.43125,3.1086,0.68
5,in,IN,"preposition or conjunction, subordinating",,True,in,3.113833,3.3759,0.27
6,college,NN,"noun, common, singular or mass",,False,college,3.381583,4.158,0.79
7,when,WRB,Wh-adverb,,True,when,4.263333,4.9698,0.23
8,I,PRP,"pronoun, personal",,True,i,4.978167,5.0391,0.07
9,got,VBD,"verb, past tense",,False,got,5.047583,5.2767,0.24


# Clean hyphens

Run this cell. You will be presented with a hyphenated word and its place within a sentence. You will need to enter "y" or "n" depending on if the word is meant to be hyphenated.

In [None]:
def clean_hyphenated_words(df_task):
    
    print ("You will see a hyphenated word. Enter \'y' if the word is meant to be hyphenated or \'n' if not.\n")
    
    hyphenated = df_task['Punctuation'].str.contains('-')
    hyphenated_idxs = np.where(hyphenated)[0]
    
    for idx in hyphenated_idxs:
        # grab the current row and following row from the dataframe
        df_rows = df_task.iloc[idx:idx+2]
        hyphenated_word = '-'.join(df_rows['Word-Written'])
        
        # establish some context for the word
        precontext = ' '.join(df_task.iloc[idx-10:idx]['Word-Written'])
        postcontext = ' '.join(df_task.iloc[idx+2:idx+10]['Word-Written'])
        print (f'\nContext: {precontext} ___ {postcontext}')
        print (f'Word: {hyphenated_word}')

        response = input()
    
        if response == 'y':
            
            hyphenated_entry = {
                'Word-Written': hyphenated_word,
                'POS': df_rows['POS'].to_list()[-1],
                'POS-Definition': df_rows['POS-Definition'].to_list()[-1],
                'Punctuation': df_rows['Punctuation'].to_list()[-1] ,
                'Stop-Word': hyphenated_word.lower() in STOP_WORDS,
                'Word-Vocab': hyphenated_word,
                'Onset': df_rows['Onset'].to_list()[0],
                'Offset': df_rows['Offset'].to_list()[-1],
                'Duration': df_rows['Offset'].to_list()[-1] - df_rows['Onset'].to_list()[0]
            }
            
            df_task.at[idx, :] = hyphenated_entry
            df_task = df_task.drop(idx+1).reset_index(drop=True)

            # we've dropped an index
            hyphenated_idxs -= 1 
            
            print (f'Word updated to: {hyphenated_word}')
        else:
            # otherwise add padding on each side to ensure it's not hyphenated
            df_task.at[idx, 'Punctuation'] =  ' - '
            
            hyphenated_word = ' - '.join(df_rows['Word-Written'])
            print (f'Words separated to: {hyphenated_word}')
        
#     clear_output(wait=True)
        
df_task = df_task.reset_index(drop=True)

In [123]:

hyphenated = df_task['Punctuation'].str.contains('-')
hyphenated_idxs = np.where(hyphenated)[0]

for idx in hyphenated_idxs:
    # grab the current row and following row from the dataframe
    df_rows = df_task.iloc[idx:idx+2]
    hyphenated_word = '-'.join(df_rows['Word-Written'])
    precontext = ' '.join(df_task.iloc[idx-10:idx]['Word-Written'])
    postcontext = ' '.join(df_task.iloc[idx+2:idx+10]['Word-Written'])
    print (f'\nContext: {precontext} ___ {postcontext}')
    print (f'Word: {hyphenated_word}')
    
    response = input()
    
    if response == 'y':
        
        hyphenated_entry = {
            'Word-Written': hyphenated_word,
            'POS': df_rows['POS'].to_list()[-1],
            'POS-Definition': df_rows['POS-Definition'].to_list()[-1],
            'Punctuation': df_rows['Punctuation'].to_list()[-1] ,
            'Stop-Word': hyphenated_word.lower() in STOP_WORDS,
            'Word-Vocab': hyphenated_word,
            'Onset': df_rows['Onset'].to_list()[0],
            'Offset': df_rows['Offset'].to_list()[-1],
            'Duration': df_rows['Offset'].to_list()[-1] - df_rows['Onset'].to_list()[0]
        }
        print (df_rows['Punctuation'].to_list()[-1] )
#         print (hyphenated_entry['Punctuation'])
        df_task.at[idx, :] = hyphenated_entry
        df_task = df_task.drop(idx+1).reset_index(drop=True)
        
        # we've dropped an index
        hyphenated_idxs -= 1 
    else:
        # otherwise add padding on each side to ensure it's not hyphenated
        df_task.at[idx, 'Punctuation'] =  ' - '
        
#     clear_output(wait=True)
        
df_task = df_task.reset_index(drop=True)

You will see a hyphenated word. Enter 'y' if the word is meant to be hyphenated or 'n' if not.


Context: the week each Sunday It actually came on an album ___ so I had to take it out of
Word: pre-recorded


Context: of my dreams and people still don't know who I ___ of who I am So I hatch the
Word: am-all


Context: who I am So I hatch the secret plan This ___ didn't tell anybody about it I am going
Word: mission-I


Context: Essence Yes you get it Essence Magazine of course targets ___ women and so I think surely now they
Word: African-American


Context: and they're from Mississippi My siblings all speak this way ___ talent no training no classes none of that
Word: God-given


Context: the time I lived in Kansas City Missouri but my ___ at the time had never heard of them
Word: co-host



# Clean named entities

In [124]:
print ("You will see a potential named entity (e.g., person, place). Enter \'y' if the word is or refers to a named entity and \'n' otherwise.\n")
    # fix instructions?
named_entities = pd.Series(df_task['POS'] == 'NNP') & pd.Series(df_task['Stop-Word'] == False)
named_entity_idxs = np.where(named_entities)[0]
df_task['Named-Entity'] = False

for idx in named_entity_idxs:
    # grab the current row and following row from the dataframe
    df_rows = df_task.iloc[idx]
    ne_word = df_rows['Word-Written']
    precontext = ' '.join(df_task.iloc[idx-10:idx]['Word-Written'])
    postcontext = ' '.join(df_task.iloc[idx+1:idx+10]['Word-Written'])
    print (f'\nContext: {precontext} ___ {postcontext}')
    print (f'Word: {ne_word}')
    
    response = input()
    
    if response == 'y':
        df_task.at[idx, 'Named-Entity'] = True
    
#     clear_output(wait=True)

df_task = df_task.reset_index(drop=True)


You will see a potential named entity (e.g., person, place). Enter 'y' if the word is or refers to a named entity and 'n' otherwise.


Context: to play the top country hits of the week each ___ It actually came on an album pre recorded so
Word: Sunday


Context: my moment I get to read the weather live for ___ City Missouri Then I played commercials and while the
Word: Jefferson


Context: moment I get to read the weather live for Jefferson ___ Missouri Then I played commercials and while the commercials
Word: City


Context: I get to read the weather live for Jefferson City ___ Then I played commercials and while the commercials were
Word: Missouri


Context: break ends My career would eventually bring me here to ___ Louis It's the largest market I had ever been
Word: St


Context: ends My career would eventually bring me here to St ___ It's the largest market I had ever been in
Word: Louis


Context: events I promise you it was actually here at the ___ in St Louis Missouri a young ma

# Add a column indiciating what words to have people predict

In [125]:
# candidates are those that aren't named entities or stop-words

if task == 'nwp_practice_trial':
    practice_indices = df_task['Word-Written'].isin(['practice', 'recording', 'word'])
    df_task['NWP-Candidate'] =  practice_indices
elif task == 'example_trial':
    example_indices = df_task['Word-Written'].isin(['fox', 'lazy'])
    df_task['NWP-Candidate'] =  example_indices
else:
    df_task['NWP-Candidate'] = pd.Series(df_task['Named-Entity'] == False) & pd.Series(df_task['Stop-Word'] == False)

In [126]:
for word in df_task[df_task['NWP-Candidate']]['Word-Written']:
    print (word)

junior
college
got
first
paying
job
field
radio
internship
getting
check
country
western
radio
station
job
though
weekends
play
top
country
hits
week
Sunday
actually
came
album
pre
recorded
take
sleeve
put
turntable
put
needle
side
one
caring
scratch
let
part
one
play
moment
coming
training
coming
side
one
would
end
lift
needle
moment
get
read
weather
live
Jefferson
City
Missouri
played
commercials
commercials
playing
flip
album
put
back
turntable
put
needle
back
side
two
let
part
two
top
country
hits
week
play
know
getting
degree
communications
broadcasting
teach
kinds
stuff
teach
flip
album
record
time
commercial
break
ends
career
would
eventually
bring
St
Louis
largest
market
ever
excited
nervous
trying
get
know
community
going
every
fundraiser
every
event
think
one
events
promise
actually
Sheldon
St
Louis
Missouri
young
man
approached
outside
sidewalk
black
said
knew
black
keep
mind
heard
pretty
much
whole
adult
life
sound
white
heard
heard
sound
white
sound
like
white
girl
heard
n

# Save the file as both CSV and JSON

In [127]:
#save to file
df_task_preproc_fn = os.path.join(task_out_dir, f'{task}_transcript_preprocessed-shifted-onset')

df_task.columns = df_task.columns.str.replace("-", "_")
df_task.to_csv(f'{df_task_preproc_fn}.csv', index=False)
df_task.to_json(f'{df_task_preproc_fn}.json', orient='records') #, lines=True) #, index=False)

# Organize into randomized sections

In [961]:
import sys, os
import numpy as np
import pandas as pd

sys.path.append('/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/code/utils/')

from randomization_utils import create_balanced_orders, sort_consecutive_constraint



In [958]:
import math, random
import numpy as np

## functions modified from https://stackoverflow.com/questions/93353/create-many-constrained-random-permutation-of-a-list
def get_pool(items, n_elements_per_subject, use_each_times):
    pool = {}
    for n in items:
        pool[n] = use_each_times
    
    return pool

def rebalance(ret, pool, n_elements_per_subject):
    max_item = None
    max_times = None
    
    for item, times in pool.items():
        if max_times is None:
            max_item = item
            max_times = times
        elif times > max_times:
            max_item = item
            max_times = times
    
    next_item, times = max_item, max_times

    candidates = []
    for i in range(len(ret)):
        item = ret[i]

        if next_item not in item:
            candidates.append( (item, i) )
    
    swap, swap_index = random.choice(candidates)

    swapi = []
    for i in range(len(swap)):
        if swap[i] not in pool:
            swapi.append( (swap[i], i) )
    
    which, i = random.choice(swapi)
    
    pool[next_item] -= 1
    pool[swap[i]] = 1
    swap[i] = next_item

    ret[swap_index] = swap

def create_balanced_orders(items, n_elements_per_subject, use_each_times, consecutive_limit=2,  error=1):
    '''
    Returns a set of unique lists under the constraints of 
    - n_elements_per_subject (must be less than items)
    - use_each_times: number of times each item should be seen across subjects

    Together these define the number of subjects

    '''

    n_subjects = math.ceil((use_each_times * len(items)) / n_elements_per_subject)

    print (f'Creating orders for {n_subjects} subjects')

    pool = get_pool(items, n_elements_per_subject, use_each_times)
    
    ret = []
    while len(pool.keys()) > 0:
        while len(pool.keys()) < n_elements_per_subject:
            rebalance(ret, pool, n_elements_per_subject)
        
        selections = sorted(random.sample(pool.keys(), n_elements_per_subject))
        
        for i in selections:
            pool[i] -= 1
            if pool[i] == 0:
                del pool[i]

        ret.append( selections )
        
        unique, counts = np.unique(ret, return_counts=True)
        
        if all(np.logical_and(counts <= use_each_times + error, counts >= use_each_times)):
               break
    return ret

def consecutive(data, stepsize=1):
    return np.split(data, np.where(np.diff(data) != stepsize)[0]+1)

def get_consecutive_list_idxs(orders, consecutive_length):
    
    # Find lists with consecutive items violating our constraint
    idxs = np.where([np.any(np.asarray(list(map(len, consecutive(order)))) >= consecutive_length) for order in orders])[0]
    
    return idxs

def sort_consecutive_constraint(orders, consecutive_length=3):
    
    # Get sets of all orders
    all_order_idxs = np.arange(len(orders))
    
    # Find lists with consecutive items violating our constraint
    consecutive_order_idxs = get_consecutive_list_idxs(orders, consecutive_length)
    
    while len(consecutive_order_idxs):

        for order_idx in consecutive_order_idxs:
            # Select the current list violating the constraint
            current_list = np.asarray(orders[order_idx])

            random_list_options = np.setdiff1d(all_order_idxs, order_idx)

            # Find all sets of consecutive items in the current list --> find their lengths
            consecutive_items = consecutive(current_list)
            consecutive_lengths = np.asarray(list(map(len, consecutive_items)))

            # Find sets of slices that violate the constraint
            violations = np.where(consecutive_lengths >= consecutive_length)[0]

            for violation in violations:
                # Select items that need to be swapped --> these will be swapped into a randomly selected list
                swap_items = consecutive_items[violation][1::2]

                for item in swap_items:
                    swap_idx = np.where(current_list == item)[0]

                    # Select a random other list
                    random_list_idx = random.choice(random_list_options)
                    random_list = np.asarray(orders[random_list_idx])

                    # Find choices not within our current list
                    swap_choices = np.setdiff1d(random_list, current_list)

                    # Select a random choice
                    choice = random.choice(swap_choices)
                    
                    # Make sure we didn't violate our constraint again with either list
                    while (
                        np.isin(choice,current_list) or 
                        np.isin(item,random_list)
                    ):
                        
                        # Select a random other list
                        random_list_idx = random.choice(random_list_options)
                        random_list = np.asarray(orders[random_list_idx])

                        # Find choices not within our current list
                        swap_choices = np.setdiff1d(random_list, current_list)

                        # Select a random choice
                        choice = random.choice(swap_choices)
                    
                    # Find the index to swap to
                    choice_idx = np.where(random_list == choice)[0]

                    # Swap the two items
                    current_list[swap_idx] = choice
                    random_list[choice_idx] = item

                    # Set them in the overall orders
                    orders[order_idx] = sorted(current_list)
                    orders[random_list_idx] = sorted(random_list)
                    
        # Find lists with consecutive items violating our constraint
        consecutive_order_idxs = get_consecutive_list_idxs(orders, consecutive_length)
    return orders


In [962]:
task = 'black'

EXPERIMENT_NAME = 'pilot_version02'

percent_sampled = 0.25 # number of items to sample for each subject
n_counts_per_item = 25 # number of times items are seen across subjects

# set directories
base_dir = '/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/'
preproc_dir = os.path.join(base_dir, 'stimuli', 'preprocessed')
task_out_dir = os.path.join(base_dir, 'stimuli', 'presentation_orders', EXPERIMENT_NAME, task)

if not os.path.exists(task_out_dir):
    os.makedirs(task_out_dir)

# load preprocessed transcript
df_task_preproc_fn = os.path.join(preproc_dir, task, f'{task}_transcript_preprocessed')
df_preproc = pd.read_csv(f'{df_task_preproc_fn}.csv')

# find indices for presentation and set number of items each subject sees
nwp_indices = np.where(df_preproc['NWP_Candidate'])[0]
n_items_per_subject = round(len(nwp_indices) * percent_sampled)

Calculate percent consecutive

In [963]:
np.sum(np.diff(nwp_indices) == 1) / len(nwp_indices)

0.2903752039151713

create experiment structure for subjects --> sort the indices

In [964]:
subject_experiment_orders = create_balanced_orders(items=nwp_indices, n_elements_per_subject=n_items_per_subject, use_each_times=n_counts_per_item, error=1)
subject_experiment_orders = list(map(sorted, subject_experiment_orders))

# Find lists with consecutive items violating our constraint
idxs = get_consecutive_list_idxs(subject_experiment_orders, consecutive_length=2)
print (len(idxs))

orders = sort_consecutive_constraint(subject_experiment_orders, consecutive_length=2)

# Find lists with consecutive items violating our constraint
idxs = get_consecutive_list_idxs(orders, consecutive_length=2)
print (len(idxs))
# test = sort_consecutive_constraint(orders)

Creating orders for 101 subjects
101
0


Make sure no counts are less than the number we wanted

In [984]:
uniq, counts = np.unique(orders, return_counts=True)
counts_per_word = np.sum(counts < n_counts_per_item)

print (f'All counts per word: {np.sum(counts >= n_counts_per_item) / len(counts)*100}%')

All counts per word: 61300%


Then check that all orders are unique

In [982]:
from collections import Counter

counts = Counter(tuple(o) for o in orders)

unique_orders = np.sum([v for k, v in counts.items()]) / len(counts)

print (f'Unique orders: {unique_orders*100}%')

Unique orders: 100.0%


Then make sure all items are at least 2 apart

In [980]:
orders_meeting_consecutive = np.sum([(np.all(np.diff(order) >= 2)) for order in orders]) / len(orders)
print (f'Consecutive constraint: {orders_meeting_consecutive*100}%')

Consecutive constraint: 100.0%


Lastly write files out

In [838]:
task_out_dir

'/dartfs/rc/lab/F/FinnLab/tommy/isc_asynchrony_behavioral/stimuli/presentation_orders/black'

In [840]:
for i, order in enumerate(orders):
    # find indices not selected for the current subject and set to false
    df_subject = df_preproc.copy()
    unselected = np.setdiff1d(nwp_indices, order)
    df_subject['NWP_Candidate'].loc[unselected] = False

    sub_fn = os.path.join(task_out_dir, f'sub-{str(i+1).zfill(5)}_task-{task}.json')
    df_subject.to_json(sub_fn, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
