# Transcript-to-TR Processing

This notebook transforms data from the timestamped-transcript format (`align.json`) into a list-of-TRs format that can be used with Transformer models.

In [1]:
# Name of the story to process; it is substituted into the stimulus paths and output file names.
# Options noted so far: black, slumlordreach
STORY = "slumlordreach"


In [16]:
import json
import pandas as pd
import itertools

# Load the word-level alignment JSON for the selected story.
# The context manager guarantees the file handle is closed once parsing finishes.
with open("./data/stimuli/{}/align.json".format(STORY)) as align_file:
    original_json = json.load(align_file)

In [17]:
# Build one DataFrame row per entry of the JSON's "words" list.
original_transcript = pd.DataFrame.from_records(original_json['words'])

In [18]:
# Map the raw JSON field names onto the column names used in the rest of this notebook.
column_names = {'start': 'start_ts', 'end': 'end_ts', 'word': 'cased', 'alignedWord': 'uncased'}
original_transcript = original_transcript.rename(columns=column_names)

## First pass: discovering / correcting some timestamp errors.

In [19]:
# NOTE(review): disabled alternative loader that reads align.csv instead of align.json;
# presumably superseded by the JSON load — confirm before deleting.
# original_transcript = pd.read_csv("./data/stimuli/{}/align.csv".format(STORY), header=None, 
#                                   names=["cased", "uncased", "start_ts", "end_ts"])
# original_transcript.head()

In [20]:
# NB: a handful of words have null timestamps. Each is filled from the next
# word that does have one (backfill).
for ts_column in ("end_ts", "start_ts"):
    original_transcript[ts_column] = original_transcript[ts_column].bfill()

In [21]:
def seconds_to_tr(seconds, tr_length=1.5):
    """
    Convert a timestamp in seconds into a zero-based TR index.

    The timestamp is divided by the TR length and truncated toward zero.
    With the default 1.5s TR:
        1.0s = TR0
        1.6s = TR1
        3.1s = TR2
        etc

    Parameters:
        seconds: timestamp in seconds (this notebook passes each word's end TS).
        tr_length: duration of one TR in seconds; defaults to 1.5.

    Returns:
        int: index of the TR the timestamp falls into.
    """
    return int(seconds / tr_length)

In [22]:
# Assign each word to a TR based on its end timestamp.
original_transcript["tr"] = original_transcript["end_ts"].apply(seconds_to_tr)

In [23]:
# Transform NaN phonemes into empty lists
original_transcript['phones'] = original_transcript['phones'].apply(lambda d: d if isinstance(d, list) else [])

In [24]:
# Eyeball a slice of the word-level table (timestamps, phones, assigned TR).
original_transcript.iloc[4:20]

Unnamed: 0,uncased,case,end_ts,endOffset,phones,start_ts,startOffset,cased,tr
4,of,success,25.44,20,"[{'duration': 0.05, 'phone': 'ah_B'}, {'durati...",25.32,18,of,16
5,my,success,25.59,23,"[{'duration': 0.06, 'phone': 'm_B'}, {'duratio...",25.44,21,my,17
6,first,success,25.84,29,"[{'duration': 0.07, 'phone': 'f_B'}, {'duratio...",25.59,24,first,17
7,landlord,success,26.52,38,"[{'duration': 0.08, 'phone': 'l_B'}, {'duratio...",25.87,30,landlord,17
8,,not-found-in-audio,26.92,43,[],26.8,40,and,17
9,my,success,26.92,46,"[{'duration': 0.05, 'phone': 'm_B'}, {'duratio...",26.8,44,my,17
10,first,success,27.2,52,"[{'duration': 0.08, 'phone': 'f_B'}, {'duratio...",26.92,47,first,18
11,apartment,success,27.6,62,"[{'duration': 0.05, 'phone': 'ah_B'}, {'durati...",27.2,53,apartment,18
12,in,success,27.689999,65,"[{'duration': 0.06, 'phone': 'ih_B'}, {'durati...",27.599999,63,in,18
13,new,success,27.77,69,"[{'duration': 0.01, 'phone': 'n_B'}, {'duratio...",27.69,66,New,18


In [25]:
def n_phonemes(tr_group):
    """Return the total number of phoneme entries across all rows of `tr_group`.

    `tr_group` must expose a `phones` column whose cells are sized
    containers (lists of phoneme records in this notebook).
    """
    total = 0
    for word_phones in tr_group.phones:
        total += len(word_phones)
    return total

def phoneme_set(tr_group):
    """Return the set of distinct base phonemes occurring in `tr_group`.

    Each cell of `tr_group.phones` is expected to be a list of dicts with a
    "phone" key such as "ah_B"; only the part before the first "_" is kept.

    Returns:
        set of str. If the phones data is not iterable or subscriptable as
        expected (e.g. a NaN cell), an empty set is returned, so callers
        always receive a set.
    """
    try:
        all_phonemes = itertools.chain.from_iterable(tr_group.phones)
        return {p["phone"].split("_")[0] for p in all_phonemes}
    except TypeError:
        # Previously `{}` (an empty dict); return a real empty set for a consistent type.
        return set()

In [26]:
# Distinct phoneme labels (text before the first "_") found anywhere in the transcript.
derived_phoneme_list = list({label.split("_")[0] for label in phoneme_set(original_transcript)})
print(len(derived_phoneme_list))
json.dumps(derived_phoneme_list)

40


'["ey", "ow", "oov", "ao", "y", "th", "sh", "r", "ih", "oy", "m", "k", "hh", "uh", "n", "er", "w", "ay", "d", "b", "dh", "uw", "zh", "jh", "ah", "aa", "s", "z", "l", "ae", "f", "ng", "aw", "g", "t", "iy", "eh", "ch", "p", "v"]'

In [27]:
# Fixed phoneme ordering: position i of every phoneme vector corresponds to PHONEME_LIST_FROZEN[i].
# NOTE(review): presumably hard-coded so the ordering stays stable across runs/stories — confirm.
PHONEME_LIST_FROZEN = ["ao", "iy", "m", "dh", "ow", "k", "w", "ey", "s", "ch", "sh", "aw", "ay", "l", "jh", "v", "g", "r", "oy", "er", "ae", "d", "hh", "th", "ih", "uw", "aa", "z", "zh", "oov", "ng", "p", "f", "ah", "n", "b", "uh", "y", "t", "eh"]

def in_set(p, phoneme_set):
    """Return 1 if `p` is a member of `phoneme_set`, otherwise 0."""
    return int(p in phoneme_set)

def phoneme_vector(tr_group):
    """Encode the phonemes present in `tr_group` as a list of 0/1 flags.

    The result has one entry per item of PHONEME_LIST_FROZEN, in that order:
    1 when the phoneme occurs in the group, 0 otherwise.
    """
    present = phoneme_set(tr_group)
    return [1 if phoneme in present else 0 for phoneme in PHONEME_LIST_FROZEN]

In [28]:
# NOTE(review): leftover debugging scratch, entirely commented out — candidate for deletion.
# print(original_transcript.phones[4:10])
# phoneme_vector(original_transcript[4:10])
# all_phonemes = list(itertools.chain.from_iterable(original_transcript.phones))
# unique_phonemes = set([p["phone"] for p in all_phonemes])
# unique_phonemes

In [29]:
# Spot-check a single TR: the phoneme vector must have one slot per frozen phoneme.
TR_TO_CHECK = 1

tr_x = original_transcript[original_transcript["tr"] == TR_TO_CHECK]
print(len(phoneme_vector(tr_x)))

# Show the words assigned to that TR.
tr_x


40


Unnamed: 0,uncased,case,end_ts,endOffset,phones,start_ts,startOffset,cased,tr


In [30]:
# Collapse the word-level table into one record per TR: time span, phoneme
# features, and the space-joined cased tokens.
tr_grouped = [
    {
        "start_ts": group.start_ts.min(),
        "end_ts": group.end_ts.max(),
        "tr": tr_index,
        'phoneme_vector': phoneme_vector(group),
        'n_phonemes': n_phonemes(group),
        "tokens": " ".join(group.cased.values),
        "n_tokens": len(group),
    }
    for tr_index, group in original_transcript.groupby("tr")
]

df = pd.DataFrame.from_records(tr_grouped)
df.head()

Unnamed: 0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens
0,24.55,25.44,16,"[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",14,This is the story of,5
1,25.44,26.92,17,"[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ...",16,my first landlord and my,5
2,26.92,28.19,18,"[1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",21,first apartment in New York,5
3,29.0,29.87,19,"[1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",16,I think I'm going to call,6
4,29.87,30.86,20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,it Rent,2


In [31]:
# Distribution of tokens per TR, ordered by token count.
df.n_tokens.value_counts().sort_index()

1      55
2     102
3     149
4     179
5     186
6     164
7     120
8      73
9      33
10     17
11      5
12      4
14      1
Name: n_tokens, dtype: int64

In [32]:
# Suspicious...
df[df.n_tokens == 12]

Unnamed: 0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens
237,404.98,406.41,270,"[1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, ...",31,know we're all going to go to court It's going...,12
408,680.99,682.149999,454,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",18,and everything and I said Have you been doing ...,12
409,682.15,683.969999,455,"[0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, ...",23,financial crime You know and he said No no I u...,12
982,1627.11,1628.94,1085,"[0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, ...",41,experiments where I'd be at the window and got...,12


## Pad missing TRs

In [33]:
# For each populated TR, record the previous populated TR and the distance to it
# (1 = consecutive, >1 = at least one TR with no words in between).
previous_tr = df["tr"].shift(1)
df["tr_shift"] = df["tr"] - previous_tr
df["prev_tr"] = previous_tr
df.tr_shift.value_counts()

1.0     1037
2.0       39
3.0        7
4.0        2
27.0       1
5.0        1
Name: tr_shift, dtype: int64

In [34]:
# Rows preceded by a gap of two or more empty TRs.
df[df["tr_shift"] > 2]

Unnamed: 0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens,tr_shift,prev_tr
5,34.379999,35.71,23,"[0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...",16,I first came up here from,6,3.0,20.0
87,167.91,168.92,112,"[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",16,He kind of looked at me,6,4.0,108.0
149,266.28,266.79,177,"[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",11,then there was my,4,3.0,174.0
218,377.389999,377.96,251,"[1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",8,So of course,3,3.0,248.0
288,489.47,490.45,326,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",3,So I,2,4.0,322.0
340,571.66,572.72,381,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, ...",16,Alan did not want to,5,3.0,378.0
453,755.41,755.82,503,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",2,So,1,3.0,500.0
498,834.79,835.34,556,"[0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",4,No way,2,5.0,551.0
555,965.78,966.86,644,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, ...",14,Well when I was about,5,27.0,617.0
737,1247.03,1247.86,831,"[0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, ...",16,So I actually never took a,6,3.0,828.0


In [35]:
import itertools

def generate_missing_trs(row):
    """Build placeholder records for the TRs skipped immediately before `row`.

    `row` must provide "tr_shift" (distance to the previous populated TR) and
    "prev_tr" (that previous TR). Returns a list of
    {"tokens": "", "tr": <int>} dicts, one per skipped TR, or None when
    nothing was skipped (including when "tr_shift" is NaN).
    """
    gap = row["tr_shift"]
    if not gap > 1:
        return None

    previous = row["prev_tr"]
    return [
        {"tokens": "", "tr": int(previous + offset)}
        for offset in range(1, int(gap))
    ]

def pad_missing_trs(df):
    """Return a DataFrame of placeholder rows for every TR missing from `df`.

    Applies `generate_missing_trs` to each row, keeps only the rows that
    produced placeholders, and flattens the per-row lists into a single
    frame of "tokens" / "tr" records.
    """
    per_row_padding = df.apply(generate_missing_trs, axis=1)

    # Element-wise comparison against None on the underlying array
    # (intentionally not `is not None`, which would test the array itself).
    has_padding = per_row_padding.values != None
    padding_records = itertools.chain.from_iterable(per_row_padding[has_padding].values)

    return pd.DataFrame.from_records(padding_records)

# Append the empty placeholder TRs, then sort by TR so each one lands in its gap.
padding_trs = pad_missing_trs(df)
final_df = pd.concat([df, padding_trs]).sort_values(by="tr")

final_df.tail(10)

Unnamed: 0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens,tr_shift,prev_tr
1078,1774.12,1775.71,1183,"[1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...",18.0,approved for all astronaut,4.0,1.0,1182.0
1079,1775.719999,1777.479999,1184,"[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, ...",19.0,candidates laser corrective,3.0,1.0,1183.0
1080,1777.52,1778.26,1185,"[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, ...",6.0,eye surgery,2.0,1.0,1184.0
1081,1779.29,1779.77,1186,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",2.0,So,1.0,1.0,1185.0
1082,1780.58,1781.62,1187,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9.0,the thing that uh,4.0,1.0,1186.0
1083,1782.99,1783.47,1188,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",6.0,NASA of,2.0,1.0,1187.0
1084,1783.47,1784.88,1189,"[1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",15.0,course uh you know banned me,6.0,1.0,1188.0
1085,1784.88,1786.45,1190,"[1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, ...",22.0,for has kind of become my one,7.0,1.0,1189.0
1086,1786.47,1787.94,1191,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",15.0,contribution back,2.0,1.0,1190.0
1087,1787.97,1789.39,1192,"[0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, ...",16.0,to the fold So thank you,6.0,1.0,1191.0


In [37]:
# Set index to TR
final_df.index = final_df.tr

# Make sure no duplicates
final_df.tr.value_counts()

1192    1
419     1
403     1
404     1
405     1
       ..
800     1
801     1
802     1
803     1
16      1
Name: tr, Length: 1177, dtype: int64

In [38]:
# First 20 rows, including the empty placeholder TRs inserted in the gaps.
final_df[:20]

Unnamed: 0_level_0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens,tr_shift,prev_tr
tr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16,24.55,25.44,16,"[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",14.0,This is the story of,5.0,,
17,25.44,26.92,17,"[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ...",16.0,my first landlord and my,5.0,1.0,16.0
18,26.92,28.19,18,"[1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",21.0,first apartment in New York,5.0,1.0,17.0
19,29.0,29.87,19,"[1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",16.0,I think I'm going to call,6.0,1.0,18.0
20,29.87,30.86,20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.0,it Rent,2.0,1.0,19.0
21,,,21,,,,,,
22,,,22,,,,,,
23,34.379999,35.71,23,"[0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...",16.0,I first came up here from,6.0,3.0,20.0
24,35.879999,36.94,24,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, ...",11.0,South Carolina,2.0,1.0,23.0
25,37.61,38.94,25,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",18.0,like apparently or North,4.0,1.0,24.0


In [243]:
# Persist the TR-level table for this story.
# NOTE(review): this is written before the placeholder rows' NaN phoneme_vector / count
# values are filled in by a later cell, so the CSV presumably contains NaNs — confirm intended.
final_df.to_csv("data/stimuli/{}/tr_tokens.csv".format(STORY))

In [245]:
# Total number of TR rows (populated + placeholders).
len(final_df)

1177

In [246]:
# Quick look at the first rows of the final table.
final_df.head()

Unnamed: 0_level_0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens,tr_shift,prev_tr
tr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
16,24.55,25.44,16,"[1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",14.0,This is the story of,5.0,,
17,25.44,26.92,17,"[1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, ...",16.0,my first landlord and my,5.0,1.0,16.0
18,26.92,28.19,18,"[1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",21.0,first apartment in New York,5.0,1.0,17.0
19,29.0,29.87,19,"[1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",16.0,I think I'm going to call,6.0,1.0,18.0
20,29.87,30.86,20,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.0,it Rent,2.0,1.0,19.0


In [247]:
# Placeholder TRs have NaN features: give them an all-zero phoneme vector
# and zero phoneme / token counts.
final_df['phoneme_vector'] = final_df['phoneme_vector'].apply(lambda d: d if isinstance(d, list) else [0] * len(PHONEME_LIST_FROZEN))
# Assign the filled columns back explicitly. Calling fillna(inplace=True) on a
# column selection is chained assignment, which warns in recent pandas and has
# no effect on the parent frame under Copy-on-Write.
final_df['n_phonemes'] = final_df['n_phonemes'].fillna(0)
final_df['n_tokens'] = final_df['n_tokens'].fillna(0)
final_df[598:630]

Unnamed: 0_level_0,start_ts,end_ts,tr,phoneme_vector,n_phonemes,tokens,n_tokens,tr_shift,prev_tr
tr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
614,921.01,921.91,614,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4.0,Bob DeSalvo,2.0,1.0,613.0
615,922.58,923.81,615,"[0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, ...",20.0,my crazy superintendent,3.0,1.0,614.0
616,924.37,925.48,616,"[0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, ...",11.0,may actually have,3.0,1.0,615.0
617,925.48,926.25,617,"[0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, ...",9.0,saved my life,3.0,1.0,616.0
618,,,618,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,
619,,,619,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,
620,,,620,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,
621,,,621,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,
622,,,622,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,
623,,,623,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0,,0.0,,


In [248]:
import numpy as np

# Stack the per-TR phoneme vectors into a single 2-D array (one row per TR).
# The array gets its own name so it does not shadow the phoneme_vector()
# function, which would break re-running the earlier grouping cell.
phoneme_vectors = np.stack(final_df.phoneme_vector)

# Write the three feature arrays to the current working directory.
np.save("{}_phoneme_vectors.npy".format(STORY), phoneme_vectors)
np.save("{}_phoneme_counts.npy".format(STORY), final_df.n_phonemes)
np.save("{}_word_counts.npy".format(STORY), final_df.n_tokens)

In [249]:
# Print (not run) the scp command that copies this story's .npy outputs to the cluster.
scp_template = "scp {story}_phoneme*.npy {story}_word*.npy tsumers@apps.pni.princeton.edu:/jukebox/griffiths/bert-brains/code/bert-brains/data/{story}"
print(scp_template.format(story=STORY))

scp slumlordreach_phoneme*.npy slumlordreach_word*.npy tsumers@apps.pni.princeton.edu:/jukebox/griffiths/bert-brains/code/bert-brains/data/slumlordreach
