# Transcript-to-TR Processing

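This notebook transforms a timestamped, word-level transcript into a list-of-TRs format (one row per 1.5-second TR) that can be used with Transformer models. Note: the transcript format is described as `.xlsx`, but the loading cell below reads it from `align.csv`.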

In [2]:
import pandas as pd

## First pass: discovering / correcting some timestamp errors.

In [3]:
STORY = "black"

# Read with header=None: the first line of the file is treated as data and the
# column names are supplied here instead.
original_transcript = pd.read_csv(
    "./data/stimuli/{}/align.csv".format(STORY),
    names=["cased", "uncased", "start_ts", "end_ts"],
    header=None,
)
original_transcript.head()

Unnamed: 0,cased,uncased,start_ts,end_ts
0,So,so,0.24,0.63
1,I,i,0.68,1.26
2,was,was,1.96,2.3
3,a,a,2.3,2.45
4,junior,junior,2.46,3.14


In [4]:
# NB: a handful of rows have null timestamps. They are backfilled, i.e. each
# null takes the value of the next non-null row below it in the same column.
original_transcript[["end_ts", "start_ts"]] = original_transcript[["end_ts", "start_ts"]].bfill()

In [5]:
def seconds_to_tr(seconds, tr_length=1.5):
    """
    Map a timestamp in seconds to a zero-based TR index.

    TRs are consecutive windows of `tr_length` seconds starting at TR=0, so TR k
    covers [k * tr_length, (k + 1) * tr_length); a timestamp exactly on a
    boundary falls in the later TR. This notebook applies the function to each
    word's end timestamp. With the default 1.5s TR:
        1.0s = TR0
        1.6s = TR1
        3.1s = TR2
        etc

    Parameters
    ----------
    seconds : float
        Timestamp in seconds. The result is truncated toward zero, so the
        windows described above only hold for non-negative timestamps.
    tr_length : float, default 1.5
        Duration of a single TR in seconds. Must be positive.

    Returns
    -------
    int
        Index of the TR containing `seconds`.

    Raises
    ------
    ValueError
        If `tr_length` is not positive, or if `seconds` is NaN (raised by
        `int()`).
    """
    if tr_length <= 0:
        raise ValueError("tr_length must be positive, got {!r}".format(tr_length))
    return int(seconds / tr_length)

In [6]:
# Assign every word to a TR according to the time at which the word ends.
original_transcript["tr"] = original_transcript["end_ts"].apply(seconds_to_tr)

In [7]:
# Spot-check a slice of word rows alongside the TR each one was assigned to.
original_transcript.iloc[875:890]

Unnamed: 0,cased,uncased,start_ts,end_ts,tr
875,I,i,452.55,452.64,301
876,took,took,452.64,452.9,301
877,care,care,452.9,453.34,302
878,on,on,453.36,453.53,302
879,the,the,453.53,453.69,302
880,air,air,453.7,453.98,302
881,to,to,453.98,454.16,302
882,say,say,454.16,454.49,302
883,not,not,454.49,454.8,303
884,bread,bread,454.81,455.21,303


In [8]:
# Collapse the word-level rows into one record per TR that contains at least
# one word: the earliest start, the latest end, the words joined by spaces, and
# the word count.
tr_grouped = [
    {
        "start_ts": words.start_ts.min(),
        "end_ts": words.end_ts.max(),
        "tr": tr_index,
        "tokens": " ".join(words.cased.values),
        "n_tokens": len(words),
    }
    for tr_index, words in original_transcript.groupby("tr")
]

df = pd.DataFrame.from_records(tr_grouped)
df.head()

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens
0,0.24,1.26,0,So I,2
1,1.96,2.45,1,was a,2
2,2.46,4.2,2,junior in college,3
3,4.79,5.59,3,when I got my,4
4,5.61,6.34,4,first,1


In [9]:
# Distribution of the number of words per TR, ordered by word count.
df.n_tokens.value_counts().sort_index()

1     60
2    111
3    129
4     89
5     54
6     23
7     11
8      2
9      2
Name: n_tokens, dtype: int64

In [10]:
# Suspicious... list any TR that holds exactly 12 words; presumably such a
# dense TR would point at a timestamp error.
# NOTE(review): 12 is a hard-coded value -- confirm it is still meaningful for
# the current STORY, or replace it with a threshold such as `n_tokens >= N`.
df[df.n_tokens == 12]

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens


## Pad missing TRs

In [11]:
# Distance, in TRs, between each row and the row before it (NaN for the first
# row). A value of 1 means no TR was skipped.
df["tr_shift"] = df["tr"].diff()
# TR index of the preceding row (NaN for the first row).
df["prev_tr"] = df["tr"].shift()
df["tr_shift"].value_counts()

1.0    446
2.0     25
3.0      5
5.0      1
6.0      1
7.0      1
4.0      1
Name: tr_shift, dtype: int64

In [12]:
# Rows whose TR is more than 2 past the previous row's TR, i.e. rows preceded
# by at least two TRs that contain no words.
df[df["tr_shift"] > 2]

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens,tr_shift,prev_tr
46,75.13,76.34,50,Then I played commercials,4,4.0,46.0
99,164.37,164.86,109,Now keep in,3,7.0,102.0
187,306.23,306.77,204,Boom,1,3.0,201.0
202,337.099999,337.42,224,I could,2,3.0,221.0
243,403.96,404.56,269,Well I,2,3.0,266.0
245,409.889999,410.85,273,My father speaks,3,3.0,270.0
256,434.109999,434.89,289,But I'm still,3,6.0,283.0
421,695.719999,696.79,464,And I said,3,3.0,461.0
480,799.81,799.83,533,you,1,5.0,528.0


In [13]:
import itertools

def generate_missing_trs(row):
    """
    Build placeholder records for the TRs skipped immediately before `row`.

    `row` must provide "tr_shift" (distance in TRs from the previous row) and
    "prev_tr" (the previous row's TR index).

    Returns a list of {"tokens": "", "tr": <index>} dicts, one for each TR
    strictly between "prev_tr" and this row's TR, in increasing order. Returns
    None when no TR was skipped, including when "tr_shift" is NaN.
    """
    # Written as `not (x > 1)` rather than `x <= 1` so a NaN shift also counts
    # as "no gap".
    if not (row["tr_shift"] > 1):
        return None

    n_missing = int(row["tr_shift"] - 1)
    return [
        {"tokens": "", "tr": int(row["prev_tr"] + step + 1)}
        for step in range(n_missing)
    ]

def pad_missing_trs(df):
    """
    Collect placeholder rows for the TRs skipped between consecutive rows of `df`.

    Runs `generate_missing_trs` on every row of `df` (which therefore needs the
    "tr_shift" and "prev_tr" columns), discards the rows that reported no gap,
    and returns the remaining placeholder records flattened into a new
    DataFrame. `df` itself is not modified.
    """
    per_row_records = df.apply(generate_missing_trs, axis=1)
    # Rows without a gap yield None; keep only the actual lists of records.
    gap_records = [records for records in per_row_records.values if records is not None]
    return pd.DataFrame.from_records(itertools.chain.from_iterable(gap_records))

# Append the placeholder rows, then order everything by TR so that each
# empty-space TR is slotted into the gap it fills.
final_df = (
    pd.concat([df, pad_missing_trs(df)])
    .sort_values(by="tr")
)

final_df.tail(10)

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens,tr_shift,prev_tr
475,785.44,787.419999,524,white I,2.0,1.0,523.0
476,787.42,788.519999,525,have this job,3.0,1.0,524.0
477,789.059999,790.28,526,because I am good,4.0,1.0,525.0
478,790.57,791.469999,527,at what I do,4.0,1.0,526.0
479,792.35,792.61,528,Thank,1.0,1.0,527.0
49,,,529,,,,
50,,,530,,,,
51,,,531,,,,
52,,,532,,,,
480,799.81,799.83,533,you,1.0,5.0,528.0


In [14]:
# Index the rows by TR; the "tr" column itself is kept.
final_df = final_df.set_index("tr", drop=False)

# Make sure there are no duplicates: every TR should have a count of 1.
final_df["tr"].value_counts()

533    1
182    1
168    1
169    1
170    1
      ..
361    1
362    1
363    1
364    1
0      1
Name: tr, Length: 534, dtype: int64

In [15]:
# Write the per-TR table into the same per-story directory that the alignment
# file was read from.
final_df.to_csv("data/stimuli/{}/tr_tokens.csv".format(STORY))

In [16]:
# Sanity check: highest TR index present.
final_df.tr.max()

533

In [17]:
# Sanity check: number of TR rows. If TRs are unique and run from 0 to the
# maximum without gaps, this equals max TR + 1.
len(final_df)

534

In [18]:
# Sanity check: lowest TR index present.
final_df.tr.min()

0