### Extract linguistic features

In [1]:
import numpy as np
import csv
import pandas as pd
import spacy

In [2]:
# Load the spaCy pipeline named 'en_core_web_lg'. The cells below use it for
# tokenisation, lemmas, POS tags and dependency parses.
nlp = spacy.load('en_core_web_lg')

In [39]:
# Name of the stimulus to process. It selects both the branch taken in the
# cells below and the data/<STIMULUS>/ directory that is read and written.
# Values handled by this notebook: '21st_year', 'slumlordreach', 'black'.
STIMULUS = 'black'

In [40]:
def _token_features(group_id, token):
    """Build one feature row for a spaCy token.

    Parameters
    ----------
    group_id : identifier stored in the first field (the 'Index' value of the
        source row for '21st_year', the running sentence number otherwise).
    token : spaCy token taken from a parsed document.

    Returns
    -------
    list
        [group_id, text, lemma, coarse POS, fine-grained tag, dependency
        relation, text of the syntactic head, absolute difference between the
        token's position and its head's position within the document].
    """
    return [group_id, token.text, token.lemma_, token.pos_, token.tag_,
            token.dep_, token.head.text, abs(token.i - token.head.i)]


# Start from an empty list so a failed or unsupported run can never leave
# results from a previous stimulus behind in `ling_features`.
ling_features = []

if STIMULUS == '21st_year':
    with open('data/21st_year/21st_year.csv', 'r') as f:
        reader = csv.reader(f)
        file = [row for row in reader]
    # The column names are taken from the second row; data start on the third.
    head = file[1]
    text = file[2:]
    # Resolve the column positions once instead of once per row.
    sentence_col = head.index('Sentence')
    index_col = head.index('Index')
    # Non-breaking spaces are replaced by plain spaces before parsing.
    sentences = [row[sentence_col].replace('\xa0', ' ') for row in text]
    TR_ids = [row[index_col] for row in text]

    # Each CSV row is parsed as its own document and keeps its 'Index' value.
    doc = nlp.pipe(sentences)
    for TR, line in zip(TR_ids, doc):
        for token in line:
            # Whitespace-only tokens carry no linguistic information.
            if token.pos_ != 'SPACE':
                ling_features.append(_token_features(TR, token))

elif STIMULUS in ['slumlordreach','black']:
    with open(f'data/{STIMULUS}/align.csv','r') as f:
        reader = csv.reader(f)
        # Only the first field of each row is used as the token.
        token_list = [row[0] for row in reader]
    # The whole transcript is parsed as a single document; spaCy's sentence
    # segmentation provides the running sentence number used as the row id.
    doc = nlp(' '.join(token_list))
    for sent_id, sent in enumerate(doc.sents):
        for token in sent:
            if token.pos_ != 'SPACE':
                ling_features.append(_token_features(sent_id, token))

else:
    # Previously an unknown stimulus fell through silently, leaving
    # `ling_features` undefined (or stale from an earlier run).
    raise ValueError(
        f"Unsupported STIMULUS {STIMULUS!r}; "
        "expected '21st_year', 'slumlordreach' or 'black'"
    )

In [41]:
# Turn the list of per-token rows into a table with one named column per field.
ling_features = pd.DataFrame(
    data=ling_features,
    columns=[
        "index",
        "token",
        "lemma",
        "pos",
        "tag",
        "dep_rel",
        "dep_head",
        "dep_distance",
    ],
)

In [42]:
# Display the token-level feature table for a visual check.
ling_features

Unnamed: 0,index,token,lemma,pos,tag,dep_rel,dep_head,dep_distance
0,0,So,so,ADV,RB,advmod,was,2
1,0,I,-PRON-,PRON,PRP,nsubj,was,1
2,0,was,be,AUX,VBD,ROOT,was,0
3,0,a,a,DET,DT,det,junior,1
4,0,junior,junior,NOUN,NN,attr,was,2
...,...,...,...,...,...,...,...,...
1604,170,what,what,PRON,WP,dobj,Thank,3
1605,170,I,-PRON-,PRON,PRP,nsubj,Thank,2
1606,170,do,do,AUX,VBP,aux,Thank,1
1607,170,Thank,thank,VERB,VBP,pcomp,at,4


In [43]:
# Write the token-level features next to the stimulus data. The alignment
# section below reads this file back with csv.reader and looks columns up by
# header name.
ling_features.to_csv(f'data/{STIMULUS}/ling_features.csv')

### Align with the TRs

In [44]:
# Read the saved token-level features back as raw rows of strings.
with open(f'data/{STIMULUS}/ling_features.csv') as f:
    reader = csv.reader(f)
    file = list(reader)
# The first row holds the column names; every following row is one token.
head, text = file[0], file[1:]

In [45]:
# Read the per-TR token table as raw rows of strings.
with open(f'data/{STIMULUS}/tr_tokens.csv') as f:
    reader = csv.reader(f)
    TR_file = list(reader)
# Split the header row from the data rows (one row per TR).
TR_head, TR_text = TR_file[0], TR_file[1:]

In [46]:
# Hand-curated exception tables used only by the '21st_year' alignment cell.
if STIMULUS == '21st_year':
    # Maps a TR number (as a string, matched against the 'tr' column) to a token
    # that occurs twice while that TR is being aligned. The alignment loop counts
    # occurrences of this token and stops consuming feature rows at the second one.
    double_count_dict = {'182': 'the','288':'the','399':'to','413':'do','439':'is','461':'a','502':'the','515':'it','525':'Every','630':'she','687':'the',\
                        '693':'her','707':'of','815':'go','828':'to','869':'we','898':'ticket','983':'in','1310':'her','1377':'the','1390':'her','1398':'"','1439':'“',\
                         '1461':'there','1528':'you','1579':'the','1800':'of','1821':'his','1982':'close','2039':'angry','2183':'the','2185':'than'}
    # 'index' values of feature rows that are partly covered by the previous TR.
    # The alignment loop accepts rows carrying these ids even when the id differs
    # from the current TR's own 'index' value.
    bridging_TRs = ['A1.7','B1.19','A2.5','A3.12','A5.20','A6.9','B6.4','A7.6','A7.13','A7.16',\
                    'B7.2','B10.8','B11.9','B11.16','A12.5','B13.24','B14.12','C1.2','C2.5','C12.3']

In [47]:
# Align the token-level features (`text`, with header `head`) with the TR rows
# (`TR_text`, with header `TR_head`) for the '21st_year' stimulus.
#
# `current_id` is a cursor into `text` that only moves forward. For each TR with
# tokens, feature rows are consumed from the cursor for as long as they appear
# to belong to that TR, and their features are collected into one list per
# feature. The six lists are appended to the TR row.
#
# NOTE(review): the TR rows and `TR_head` are extended in place, so running this
# cell twice without reloading tr_tokens.csv appends the feature columns twice.
if STIMULUS == '21st_year':
    current_id = 0
    new_data = []
    for TR_line in TR_text:
        TR_tokens = TR_line[TR_head.index('tokens')]
        TR_index = TR_line[TR_head.index('index')]
        if TR_tokens != "":
            # Same normalisation as in the extraction step, then re-tokenise the
            # TR's text so membership can be tested token by token.
            TR_tokens = TR_tokens.replace('\xa0', ' ')
            token_list = [token.text for token in nlp(TR_tokens)]
            lemma_list = []
            pos_list = []
            tag_list = []
            dep_rel_list = []
            dep_head_list = []
            dep_dist_list = []
            # Tokens actually consumed for this TR; only used by the sanity check.
            check_list = []
            # Occurrence counters for the hand-curated special cases below.
            double_count = 0
            triple_count = 0
            # Keep consuming feature rows while all of the following hold:
            #   * the row's token occurs somewhere in this TR's token list,
            #   * the row's 'index' equals this TR's 'index' or is a bridging id,
            #   * neither special-case counter has reached its limit.
            while text[current_id][head.index('token')] in token_list\
            and (text[current_id][head.index('index')] == TR_index\
            or text[current_id][head.index('index')] in bridging_TRs)\
            and double_count != 2 and triple_count != 3:
                # Count occurrences of the token registered for this TR in
                # double_count_dict.
                if TR_line[TR_head.index('tr')] in list(double_count_dict.keys()):
                    if text[current_id][head.index('token')] == double_count_dict[TR_line[TR_head.index('tr')]]:
                        double_count += 1
                # TR '534' is handled separately: its '"' tokens are counted up to three.
                if TR_line[TR_head.index('tr')] == '534' and text[current_id][head.index('token')] == '"':
                    triple_count += 1
                line = text[current_id]
                lemma_list.append(line[head.index('lemma')])
                pos_list.append(line[head.index('pos')])
                tag_list.append(line[head.index('tag')])
                dep_rel_list.append(line[head.index('dep_rel')])
                dep_head_list.append(line[head.index('dep_head')])
                dep_dist_list.append(line[head.index('dep_distance')])
                check_list.append(line[head.index('token')])
                # Stop at the last feature row so the cursor never runs past the end.
                if current_id == len(text)-1:
                    break
                current_id += 1
                # A counter just reached its limit: the row appended above is
                # un-consumed again (cursor moved back, features removed) and the
                # loop condition ends the loop.
                # NOTE(review): check_list is not rolled back here; it is only
                # read at position 0 below.
                if double_count == 2 or triple_count == 3:
                    current_id -= 1
                    del lemma_list[-1], pos_list[-1], tag_list[-1], dep_rel_list[-1], dep_head_list[-1], dep_dist_list[-1]
            TR_line.extend([lemma_list,pos_list,tag_list,dep_rel_list,dep_head_list,dep_dist_list])

            # Sanity check: the first consumed feature token must equal the first
            # token of the TR. TRs '539', '935' and '2161' are exempt.
            # NOTE(review): check_list[0] raises IndexError if no row was consumed
            # for a non-exempt TR - confirm this cannot happen with the real data.
            if TR_line[TR_head.index('tr')] not in ['539','935','2161']:
                if check_list[0] != token_list[0]:
                    print(f'Error in {TR_line}')

            new_data.append(TR_line)
        else:
            # TRs without tokens get six empty feature lists.
            TR_line.extend([[],[],[],[],[],[]])
            new_data.append(TR_line)
    TR_head.extend(["lemma","pos","tag","dep_rel","dep_head","dep_distance"])
    new_data = pd.DataFrame(new_data,columns=TR_head)

In [48]:
# Align the token-level features (`text`, with header `head`) with the TR rows
# (`TR_text`, with header `TR_head`) for the 'slumlordreach' and 'black' stimuli.
#
# Each TR's text is re-tokenised with spaCy and one feature row is consumed
# from `text`, in order, for every resulting token. The result `new_data` has
# one row per TR: the original TR fields followed by six list-valued columns.
#
# Unlike the previous version of this cell, neither the rows of `TR_text` nor
# `TR_head` are modified, so the cell can be re-run without duplicating the
# feature columns.
if STIMULUS in ['slumlordreach','black']:
    # Feature columns copied from ling_features.csv, in output order.
    feature_names = ["lemma","pos","tag","dep_rel","dep_head","dep_distance"]
    # Resolve all column positions once instead of once per token.
    feature_cols = [head.index(name) for name in feature_names]
    token_col = head.index('token')
    tokens_col = TR_head.index('tokens')

    current_id = 0       # forward-only cursor into `text`
    new_data = []
    mismatches = []      # (TR row number, TR token, feature-row token)
    for tr_row, TR_line in enumerate(TR_text):
        TR_tokens = TR_line[tokens_col]
        feature_lists = [[] for _ in feature_names]
        if TR_tokens != "":
            token_list = [token.text for token in nlp(TR_tokens)]
            for token in token_list:
                if current_id >= len(text):
                    # Same exception type as the bare list lookup would raise,
                    # but with enough context to locate the problem.
                    raise IndexError(
                        f'Ran out of rows in ling_features.csv while aligning TR row '
                        f'{tr_row} ({TR_tokens!r}); the tokenisation of tr_tokens.csv '
                        f'and ling_features.csv disagree.'
                    )
                line = text[current_id]
                # Rows are matched by position only, so record any disagreement
                # instead of letting a drift go unnoticed.
                if line[token_col] != token:
                    mismatches.append((tr_row, token, line[token_col]))
                for values, col in zip(feature_lists, feature_cols):
                    values.append(line[col])
                current_id += 1
        # Copy the TR row so the loaded `TR_text` stays untouched.
        new_data.append(list(TR_line) + feature_lists)

    # Sanity check, reported once to avoid flooding the output.
    if mismatches:
        first_row, first_expected, first_found = mismatches[0]
        print(f'Warning: {len(mismatches)} token(s) differ between tr_tokens.csv and '
              f'ling_features.csv; first at TR row {first_row}: '
              f'{first_expected!r} vs {first_found!r}')

    new_data = pd.DataFrame(new_data,columns=TR_head + feature_names)

In [49]:
# Display the TR table with the aligned per-token feature lists.
new_data

Unnamed: 0,tr,start_ts,end_ts,tr.1,tokens,n_tokens,tr_shift,prev_tr,lemma,pos,tag,dep_rel,dep_head,dep_distance
0,0,0.24,1.26,0,So I,2.0,,,"[so, -PRON-]","[ADV, PRON]","[RB, PRP]","[advmod, nsubj]","[was, was]","[2, 1]"
1,1,1.96,2.45,1,was a,2.0,1.0,0.0,"[be, a]","[AUX, DET]","[VBD, DT]","[ROOT, det]","[was, junior]","[0, 1]"
2,2,2.46,4.2,2,junior in college,3.0,1.0,1.0,"[junior, in, college]","[NOUN, ADP, NOUN]","[NN, IN, NN]","[attr, prep, pobj]","[was, junior, in]","[2, 1, 1]"
3,3,4.79,5.59,3,when I got my,4.0,1.0,2.0,"[when, -PRON-, get, -PRON-]","[ADV, PRON, VERB, DET]","[WRB, PRP, VBD, PRP$]","[advmod, nsubj, advcl, poss]","[got, got, was, paying]","[2, 1, 7, 2]"
4,4,5.61,6.34,4,first,1.0,1.0,3.0,[first],[ADJ],[JJ],[amod],[paying],[1]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,529,,,529,,,,,[],[],[],[],[],[]
530,530,,,530,,,,,[],[],[],[],[],[]
531,531,,,531,,,,,[],[],[],[],[],[]
532,532,,,532,,,,,[],[],[],[],[],[]


In [50]:
# Save the aligned table twice: as CSV for inspection and as a pickle.
# The feature columns hold Python lists; the pickle keeps them as lists,
# whereas the CSV stores them in their text form.
new_data.to_csv(f'data/{STIMULUS}/tr_tokens_new.csv')
new_data.to_pickle(f'data/{STIMULUS}/tr_tokens_new.pkl')