# Transcript-to-TR Processing

This notebook transforms data from the timestamped-transcript format (loaded below from `align.csv`) into a list-of-TRs format that can be used with Transformer models.

In [437]:
import pandas as pd

## First pass: discovering / correcting some timestamp errors.

In [438]:
STORY = "pieman"

# Word-level alignment for STORY. header=None makes pandas read the first line
# of the file as data, so the four columns are named explicitly here.
original_transcript = pd.read_csv(
    "./data/stimuli/{}/align.csv".format(STORY),
    header=None,
    names=["cased", "uncased", "start_ts", "end_ts"],
)
original_transcript.head()

Unnamed: 0,cased,uncased,start_ts,end_ts
0,I,i,15.089999,15.169999
1,began,began,15.17,15.51
2,my,my,15.509999,15.699999
3,illustrious,<unk>,15.71,16.31
4,career,career,16.33,16.94


In [439]:
# NB: a handful of rows have null timestamps. Each null is replaced with the
# next non-null value further down the same column (backfill).
original_transcript["start_ts"] = original_transcript["start_ts"].bfill()
original_transcript["end_ts"] = original_transcript["end_ts"].bfill()

In [440]:
def seconds_to_tr(seconds, tr_length=1.5):
    """
    Map a timestamp in seconds to a TR index, with TRs numbered from 0.

    The timestamp is divided by the TR length and truncated to an int, so for
    non-negative timestamps TR k covers [k * tr_length, (k + 1) * tr_length).
    With the default 1.5 s TR:
        1.0s = TR0
        1.6s = TR1
        3.0s = TR2
        3.1s = TR2
        4.5s = TR3
        etc

    Args:
        seconds: timestamp in seconds (this notebook passes each word's end TS).
        tr_length: duration of a single TR in seconds. Defaults to 1.5.

    Returns:
        The TR index as an int.

    Raises:
        ValueError: if `seconds` is NaN, because int() cannot convert NaN.
    """
    return int(seconds / tr_length)

In [441]:
# Each word is assigned to the TR that contains its end timestamp.
original_transcript["tr"] = original_transcript["end_ts"].apply(seconds_to_tr)

In [442]:
# Spot-check a window of rows (selected by position) to see how words were
# assigned to TRs.
original_transcript.iloc[875:890]

Unnamed: 0,cased,uncased,start_ts,end_ts,tr
875,I,i,393.15,393.33,262
876,really,really,393.38,393.93,262
877,Pie,pie,393.94,394.25,262
878,Man,man,394.37,394.81,263
879,For,for,398.76,398.99,265
880,having,having,399.0,399.35,266
881,brought,,399.75,399.91,266
882,him,him,399.75,399.91,266
883,into,into,399.91,400.19,266
884,being,being,400.2,400.82,267


In [443]:
# Collapse the word-level rows into one record per TR: the time span covered
# by the TR's words, the words joined into a single string, and the word count.
tr_grouped = [
    {
        "start_ts": words.start_ts.min(),
        "end_ts": words.end_ts.max(),
        "tr": tr_index,
        "tokens": " ".join(words.cased.values),
        "n_tokens": len(words),
    }
    for tr_index, words in original_transcript.groupby("tr")
]

df = pd.DataFrame.from_records(tr_grouped)
df.head()

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens
0,15.089999,16.31,10,I began my illustrious,4
1,16.33,17.929999,11,career in journalism,3
2,18.469999,19.39,12,in the Bronx where,4
3,19.39,20.89,13,I toiled as a hard,5
4,20.89,22.36,14,boiled reporter for the,4


In [444]:
# Distribution of words per TR; used to look for implausibly dense TRs.
df.n_tokens.value_counts()

3     59
2     48
4     44
5     41
1     23
6     18
8     12
9      4
7      4
12     1
Name: n_tokens, dtype: int64

In [445]:
# Suspicious: one TR holds 12 tokens, more than any other TR. Inspect it.
df[df.n_tokens == 12]

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens
194,329.91,331.41,220,And toward the end of this run I was out at a,12


## Pad missing TRs

In [446]:
# tr_shift: how far this row's TR is from the previous row's TR
# (1 = consecutive, >1 = one or more TRs with no words in between).
df["tr_shift"] = df["tr"].diff()
# prev_tr: the TR index of the previous row (NaN for the first row).
df["prev_tr"] = df["tr"].shift(1)
df["tr_shift"].value_counts()

1.0    231
2.0     21
3.0      1
Name: tr_shift, dtype: int64

In [447]:
# Rows preceded by a gap of more than one missing TR.
df[df["tr_shift"] > 2]

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens,tr_shift,prev_tr
99,175.67,176.83,117,I said that he cried out,6,3.0,114.0


In [448]:
def generate_missing_trs(row):
    """
    Build placeholder records for the TRs skipped immediately before `row`.

    Args:
        row: a mapping/Series with "tr_shift" (this row's TR minus the previous
            row's TR) and "prev_tr" (the previous row's TR).

    Returns:
        A list of {"tokens": "", "tr": <int>} dicts, one per skipped TR, or
        None when no TR was skipped (tr_shift is 1 or NaN).
    """
    gap = row["tr_shift"]

    # NaN (the first row has no predecessor) compares False here as well.
    if not gap > 1:
        return None

    return [
        {"tokens": "", "tr": int(row["prev_tr"] + offset + 1)}
        for offset in range(int(gap - 1))
    ]


def pad_missing_trs(df):
    """
    Return a DataFrame of empty-token rows for every TR missing from `df`.

    Args:
        df: one row per TR, with "tr_shift" and "prev_tr" columns as expected
            by `generate_missing_trs`.

    Returns:
        A DataFrame with "tokens" and "tr" columns, one row per missing TR.
        It is an empty DataFrame when there are no gaps.
    """
    records = []
    for _, row in df.iterrows():
        padding = generate_missing_trs(row)
        if padding is not None:
            records.extend(padding)

    return pd.DataFrame.from_records(records)

# Append the empty padding TRs, then sort by TR index so the rows are stored in
# TR order (without the sort, the padding rows stay at the end of the frame).
# Padding rows are NaN in every column other than `tokens` and `tr`.
final_df = pd.concat([df, pad_missing_trs(df)]).sort_values("tr")

final_df.head()

Unnamed: 0,start_ts,end_ts,tr,tokens,n_tokens,tr_shift,prev_tr
0,15.089999,16.31,10,I began my illustrious,4.0,,
1,16.33,17.929999,11,career in journalism,3.0,1.0,10.0
2,18.469999,19.39,12,in the Bronx where,4.0,1.0,11.0
3,19.39,20.89,13,I toiled as a hard,5.0,1.0,12.0
4,20.89,22.36,14,boiled reporter for the,4.0,1.0,13.0


In [449]:
# Index the rows by TR number (the `tr` column itself is kept as well).
final_df.index = final_df["tr"]

# Each TR should occur exactly once, i.e. every count below should be 1.
final_df["tr"].value_counts()

286    1
103    1
97     1
98     1
99     1
      ..
188    1
187    1
186    1
185    1
10     1
Name: tr, Length: 277, dtype: int64

In [452]:
# Write the per-TR table next to the story's other stimulus files.
final_df.to_csv("data/stimuli/{}/tr_tokens.csv".format(STORY))

In [453]:
# Highest TR index present in the table.
final_df.tr.max()

286