# Transcript-to-TR Processing

This notebook transforms data from the `.xlsx` timestamped-transcript format into a list-of-TRs format that can be used with Transformer models.

In [1]:
import pandas as pd

## First pass: discovering / correcting some timestamp errors.

In [230]:
original_transcript = pd.read_csv("./data/21st_year/21st_year.csv", header=1).fillna("")
original_transcript.head()

Unnamed: 0,Index,Sentence,start_timestamp,end_timestamp,start_sec_rounded,end_sec_rounded,from_sec,to_sec,from_tr,to_tr,unknown_1,unknown_2,unknown_3,unknown_4
0,A1.1,This is Los Angeles.,0:21:02,0:23:00,21,23,1,2,1,1,1.5,1.5,0.5,-0.5
1,A1.2,And it's the height of summer.,0:23:00,0:24:03,23,24,2,3,1,2,1.5,3.0,-0.5,0.0
2,A1.3,"In a small bungalow off of La Cienega, Clara s...",0:24:04,0:32:07,24,33,3,12,2,8,3.0,12.0,0.0,0.0
3,A1.4,The colors don't match and the cups are too b...,0:32:08,0:36:03,33,36,12,15,8,10,12.0,15.0,0.0,0.0
4,A1.5,"“You didn’t get half the things on my list,” s...",0:36:06,0:40:06,36,40,15,19,10,13,15.0,19.5,0.0,0.5


In [231]:
original_transcript["tr_duration"] = original_transcript.to_tr - original_transcript.from_tr
original_transcript.tr_duration.value_counts().sort_index()

-37      1
-5       3
-2       1
 0      55
 1     334
 2     425
 3     137
 4      55
 5       7
 6       5
 8       2
 9       2
 10      1
Name: tr_duration, dtype: int64

In [232]:
original_transcript[original_transcript.tr_duration<0]

Unnamed: 0,Index,Sentence,start_timestamp,end_timestamp,start_sec_rounded,end_sec_rounded,from_sec,to_sec,from_tr,to_tr,unknown_1,unknown_2,unknown_3,unknown_4,tr_duration
362,B8.21,Then he waits.,19:46:01,19:44:01,1186,1184,1165,1163,777,775,1165.5,1162.5,0.5,-0.5,-2
609,A14.8,"We had the party a week ago,” she says.",1:8:33:06.000,1:8:25,1953,1945,1932,1924,1288,1283,1932.0,1924.5,0.0,0.5,-5
741,C3.6,"Alexander sighs, he looks so much older.",1:15:56:07,1:16:00:03,2396,2340,2375,2319,1583,1546,2374.5,2319.0,-0.5,0.0,-37
845,C8.5,“Nothing? You have nothing?!”,1:21:50:01,1:21:42:02.000,2750,2742,2729,2721,1819,1814,2728.5,2721.0,-0.5,0.0,-5
991,C14.12,"“He thought you were our son,” says his wife.",2:5:56:03,2:5:48:07,3236,3228,3215,3207,2143,2138,3214.5,3207.0,-0.5,0.0,-5


## Segmenting text to TRs

Load the hand-corrected CSV and segment / align text to individual TR windows.

In [276]:
df = pd.read_csv("./data/21st_year/21st_year_edited.csv", header=1).fillna("")

df["tr_duration"] = df.to_tr - df.from_tr
df.tr_duration.value_counts().sort_index()

0     55
1    335
2    429
3    139
4     55
5      7
6      5
8      2
9      1
Name: tr_duration, dtype: int64

In [277]:
df = df[["Index", "Sentence", "from_tr", "to_tr"]]
df.head()

Unnamed: 0,Index,Sentence,from_tr,to_tr
0,A1.1,This is Los Angeles.,1,1
1,A1.2,And it's the height of summer.,1,2
2,A1.3,"In a small bungalow off of La Cienega, Clara s...",2,8
3,A1.4,The colors don't match and the cups are too b...,8,10
4,A1.5,"“You didn’t get half the things on my list,” s...",10,13


In [278]:
import numpy as np

def split_sentence_to_trs(sentence, start_tr, end_tr, index=""):
    
    whitespace_tokens = sentence.split()
    
    num_tokens = len(whitespace_tokens)
    num_trs = end_tr - start_tr
    
    if num_trs == 0:
        return [{"Sentence": sentence, "tr": start_tr, "index":index}]
    
    # https://numpy.org/doc/stable/reference/generated/numpy.array_split.html
    arrayed_tokens = np.array_split(whitespace_tokens, num_trs)
    trs = [start_tr + i for i in range(0, num_trs)]
    
    return [{"Sentence": " ".join(tokens), "tr": tr, "index":index} for tokens, tr in zip(arrayed_tokens, trs)]

In [279]:
# test cases

zero_span_tr = split_sentence_to_trs("one two three four five six", 0, 0)
print("Expect to get a single sentence back: {}".format(zero_span_tr))

one_span_tr = split_sentence_to_trs("one two three four five six", 0, 1)
print("Expect to get a single sentence back: {}".format(zero_span_tr))

two_span_tr = split_sentence_to_trs("one two three four five six", 0, 2)
print("Expect to get two sentences back:")
for d in two_span_tr:
    print("\t{}".format(d))

mismatch_span_tr = split_sentence_to_trs("one two three four five six", 0, 5)
print("Expect to get four sentences back:")
for d in mismatch_span_tr:
    print("\t{}".format(d))
    
mismatch_span_tr = split_sentence_to_trs("one two three four five six", 1, 6)
print("Expect to get four sentences back:")
for d in mismatch_span_tr:
    print("\t{}".format(d))

Expect to get a single sentence back: [{'Sentence': 'one two three four five six', 'tr': 0, 'index': ''}]
Expect to get a single sentence back: [{'Sentence': 'one two three four five six', 'tr': 0, 'index': ''}]
Expect to get two sentences back:
	{'Sentence': 'one two three', 'tr': 0, 'index': ''}
	{'Sentence': 'four five six', 'tr': 1, 'index': ''}
Expect to get four sentences back:
	{'Sentence': 'one two', 'tr': 0, 'index': ''}
	{'Sentence': 'three', 'tr': 1, 'index': ''}
	{'Sentence': 'four', 'tr': 2, 'index': ''}
	{'Sentence': 'five', 'tr': 3, 'index': ''}
	{'Sentence': 'six', 'tr': 4, 'index': ''}
Expect to get four sentences back:
	{'Sentence': 'one two', 'tr': 1, 'index': ''}
	{'Sentence': 'three', 'tr': 2, 'index': ''}
	{'Sentence': 'four', 'tr': 3, 'index': ''}
	{'Sentence': 'five', 'tr': 4, 'index': ''}
	{'Sentence': 'six', 'tr': 5, 'index': ''}


In [280]:
def lambda_wrapper(x):
    return split_sentence_to_trs(x["Sentence"], x["from_tr"], x["to_tr"], x["Index"])

In [281]:
foo = df.apply(lambda x: lambda_wrapper(x), axis=1).values

In [282]:
import itertools

sentence_trs = pd.DataFrame.from_records(itertools.chain.from_iterable(foo))
sentence_trs.head()

Unnamed: 0,Sentence,tr,index
0,This is Los Angeles.,1,A1.1
1,And it's the height of summer.,1,A1.2
2,In a small bungalow,2,A1.3
3,"off of La Cienega,",3,A1.3
4,Clara serves homemade chili,4,A1.3


In [283]:
sentence_trs[sentence_trs.tr==66]

Unnamed: 0,Sentence,tr,index
62,"""Steven, are you listening?""",66,B1.4


In [284]:
# Check for duplicate TRs: sentences that have been assigned to multiple TRs.
sentence_trs.tr.value_counts().value_counts() 

1    1907
2      34
Name: tr, dtype: int64

In [285]:
tr_value_counts = sentence_trs.tr.value_counts()
duplicate_trs = sentence_trs[sentence_trs.tr.isin(tr_value_counts[tr_value_counts > 1].index)]

In [286]:
for i, r in duplicate_trs.iterrows():
    print("Index {} TR {}: {}".format(r["index"], r.tr, r.Sentence))

Index A1.1 TR 1: This is Los Angeles.  
Index A1.2 TR 1: And it's the height of summer.
Index A1.6 TR 13: He shrugs.
Index A1.7 TR 13: No one seems to
Index B1.18 TR 93: I think you should do it.”
Index B1.19 TR 93: She stops.
Index A2.4 TR 111: Son of friends.
Index A2.5 TR 111: His friends,
Index A3.11 TR 223: “Can't afford it for one.”  
Index A3.12 TR 223: Jeannie looks up, expecting
Index A5.19 TR 439: But he is 35.
Index A5.20 TR 439: "What's good for me Clara,
Index A6.8 TR 511: Gary's business account.
Index A6.9 TR 511: She takes his lunch and
Index B6.3 TR 551: Steven yells at a man.
Index B6.4 TR 551: At 10 o’clock at
Index A7.5 TR 601: “You want me to send you some money?”
Index A7.6 TR 601: “No mama,
Index A7.12 TR 613: It's August 18th.
Index A7.13 TR 613: She still remembers the
Index A7.15 TR 617: "Almost mama."
Index A7.16 TR 617: "You hold onto that job.
Index B7.1 TR 637: Steven can hear their agitated voices full of anger.
Index B7.2 TR 637: He tries
Index B7.11 TR 

## Hand-correct duplicates again...

In [306]:
df = pd.read_csv("./data/21st_year/21st_year_edited_two.csv", header=1).fillna("")
df.head()

Unnamed: 0,Index,Sentence,start_timestamp,end_timestamp,start_sec_rounded,end_sec_rounded,from_sec,to_sec,from_tr,to_tr,unknown_1,unknown_2,unknown_3,unknown_4
0,A1.1,This is Los Angeles.,0:21:02,0:23:00,21,23,1,2,0,1,1.5,1.5,0.5,-0.5
1,A1.2,And it's the height of summer.,0:23:00,0:24:03,23,24,2,3,1,2,1.5,3.0,-0.5,0.0
2,A1.3,"In a small bungalow off of La Cienega, Clara s...",0:24:04,0:32:07,24,33,3,12,2,8,3.0,12.0,0.0,0.0
3,A1.4,The colors don't match and the cups are too b...,0:32:08,0:36:03,33,36,12,15,8,10,12.0,15.0,0.0,0.0
4,A1.5,"“You didn’t get half the things on my list,” s...",0:36:06,0:40:06,36,40,15,19,10,13,15.0,19.5,0.0,0.5


In [307]:
sentence_trs = df.apply(lambda x: lambda_wrapper(x), axis=1).values
sentence_trs = pd.DataFrame.from_records(itertools.chain.from_iterable(sentence_trs))

tr_value_counts = sentence_trs.tr.value_counts()
duplicate_trs = tr_value_counts[tr_value_counts > 1].index

duplicate_tr_rows = sentence_trs[sentence_trs.tr.isin(duplicate_trs)]

for i, r in duplicate_tr_rows.iterrows():
    print("Index {} TR {}: {}".format(r["index"], r.tr, r.Sentence))

Index A1.6 TR 13: He shrugs.
Index A1.7 TR 13: No one seems to
Index B1.18 TR 93: I think you should do it.”
Index B1.19 TR 93: She stops.
Index A2.4 TR 111: Son of friends.
Index A2.5 TR 111: His friends,
Index A3.11 TR 223: “Can't afford it for one.”  
Index A3.12 TR 223: Jeannie looks up, expecting
Index A5.19 TR 439: But he is 35.
Index A5.20 TR 439: "What's good for me Clara,
Index A6.8 TR 511: Gary's business account.
Index A6.9 TR 511: She takes his lunch and
Index B6.3 TR 551: Steven yells at a man.
Index B6.4 TR 551: At 10 o’clock at
Index A7.5 TR 601: “You want me to send you some money?”
Index A7.6 TR 601: “No mama,
Index A7.12 TR 613: It's August 18th.
Index A7.13 TR 613: She still remembers the
Index A7.15 TR 617: "Almost mama."
Index A7.16 TR 617: "You hold onto that job.
Index B7.1 TR 637: Steven can hear their agitated voices full of anger.
Index B7.2 TR 637: He tries
Index B10.7 TR 935:  “So when do I start?”
Index B10.8 TR 935: “It’s funny, how you
Index B11.8 TR 1037

### "Duplicate" TRs
OK-- so what to do about these remaining duplicates?

What's actually happening is that very short sentences are that fall inside a single TR are resulting in duplicates. Ideally, we'd want to merge them either to the prior or subsequent TR *before* splitting it out. This could be accomplished by:  
(1) Finding all sentences in the original with duration == 0  
(2) Looking at previous and next TR to figure out which stimulus to associate with (whichever has the same `from_tr`/`to_tr` as the zero-duration  
(3) Concatenate this sentence to that one  
(4) Proceed with running sentence-split algorithm as before.  

Instead, as a first pass (since there are only a handful of these records anyway) I'm just going to concatenate the duplicates together.

In [308]:
concatenated_records = []
for k, g in duplicate_tr_rows.groupby("tr"):
    new_record = g.iloc[0]
    new_record["Sentence"] += " {}".format(g.iloc[1].Sentence)
    
    concatenated_records.append(new_record)

concated = pd.DataFrame.from_records(concatenated_records)
concated.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_record["Sentence"] += " {}".format(g.iloc[1].Sentence)


Unnamed: 0,Sentence,tr,index
0,He shrugs. No one seems to,13,A1.6
1,I think you should do it.” She stops.,93,B1.18
2,"Son of friends. His friends,",111,A2.4
3,"“Can't afford it for one.” Jeannie looks up,...",223,A3.11
4,"But he is 35. ""What's good for me Clara,",439,A5.19


In [316]:
deduped = sentence_trs[~sentence_trs.tr.isin(duplicate_trs)]
recatted = pd.concat([concated, deduped]).sort_values("tr")

recatted.head()

Unnamed: 0,Sentence,tr,index
0,This is Los Angeles.,0,A1.1
1,And it's the height of summer.,1,A1.2
2,In a small bungalow,2,A1.3
3,"off of La Cienega,",3,A1.3
4,Clara serves homemade chili,4,A1.3


## Pad missing TRs

In [320]:
recatted["tr_shift"] = recatted.tr - recatted.tr.shift(1)
recatted["prev_tr"] = recatted.tr.shift(1)
recatted.tr_shift.value_counts()

1.0    1653
2.0     284
3.0       1
Name: tr_shift, dtype: int64

In [321]:
recatted[recatted["tr_shift"] > 2]

Unnamed: 0,Sentence,tr,index,tr_shift,prev_tr
1452,"When she’s done throwing up,",1641,C4.12,3.0,1638.0


In [333]:
def generate_missing_trs(row):
    
    if row["tr_shift"] > 1:
        return [{"Sentence": "", "tr": int(row["prev_tr"] + i + 1)} for i in range(0, int(row["tr_shift"] - 1))]

def pad_missing_trs(df):
    
    missing = df.apply(lambda x: generate_missing_trs(x), axis=1)
    missing = missing[missing.values != None].values

    missing_tr_df = pd.DataFrame.from_records(itertools.chain.from_iterable(missing))
    
    return missing_tr_df

missing_trs = pad_missing_trs(recatted)

final_df = pd.concat([missing_trs, recatted]).fillna("").sort_values("tr")
final_df.head()

Unnamed: 0,Sentence,tr,index,tr_shift,prev_tr
0,This is Los Angeles.,0,A1.1,,
1,And it's the height of summer.,1,A1.2,1.0,0.0
2,In a small bungalow,2,A1.3,1.0,1.0
3,"off of La Cienega,",3,A1.3,1.0,2.0
4,Clara serves homemade chili,4,A1.3,1.0,3.0


In [334]:
# Set index to TR
final_df.index = final_df.tr

# Make sure no duplicates
final_df.tr.value_counts()

In [341]:
final_df["story_section"] = final_df.apply(lambda x: x["index"].split(".")[0], axis=1)

In [343]:
final_df.to_csv("data/21st_year/tr_tokens.csv")