### Extract linguistic features

In [1]:
import numpy as np
import csv
import pandas as pd
import spacy

In [2]:
# Load the spaCy pipeline once; every parse in this notebook goes through `nlp`.
SPACY_MODEL = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL)

In [3]:
# Read the sentence-level table and pull out the sentence text and its id.
# newline='' lets the csv module do its own line-ending handling, so quoted
# fields that contain line breaks are parsed correctly.
with open('data/21st_year/21st_year.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    file = list(reader)

# NOTE(review): the column names are read from the SECOND row (file[1]) and the
# data starts at the third; presumably the first row is a title line - confirm
# against the CSV.
head = file[1]
text = file[2:]

# Look the column positions up once instead of once per row.
sentence_col = head.index('Sentence')
index_col = head.index('Index')

# Non-breaking spaces are normalised to plain spaces before parsing.
sentences = [row[sentence_col].replace('\xa0', ' ') for row in text]
TR_ids = [row[index_col] for row in text]

In [4]:
# Parse every sentence with spaCy and flatten the result into one row per token:
# [sentence id, token text, lemma, pos, tag, dependency relation, head text].
# Tokens whose part of speech is 'SPACE' are dropped.
doc = nlp.pipe(sentences)
ling_features = [
    [TR, token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.head.text]
    for TR, parsed in zip(TR_ids, doc)
    for token in parsed
    if token.pos_ != 'SPACE'
]

In [5]:
# Turn the per-token rows into a DataFrame, naming one column per extracted attribute.
feature_columns = ["index", "token", "lemma", "pos", "tag", "dep_rel", "dep_head"]
ling_features = pd.DataFrame(ling_features, columns=feature_columns)

In [6]:
# Preview the token-level feature table (one row per token, keyed by sentence id).
ling_features

Unnamed: 0,index,token,lemma,pos,tag,dep_rel,dep_head
0,A1.1,This,this,DET,DT,nsubj,is
1,A1.1,is,be,AUX,VBZ,ROOT,is
2,A1.1,Los,Los,PROPN,NNP,compound,Angeles
3,A1.1,Angeles,Angeles,PROPN,NNP,attr,is
4,A1.1,.,.,PUNCT,.,punct,is
...,...,...,...,...,...,...,...
11069,C15.24,birth,birth,NOUN,NN,pobj,Like
11070,C15.24,.,.,PUNCT,.,punct,Like
11071,C15.24,Each,each,DET,DT,det,day
11072,C15.24,day,day,NOUN,NN,ROOT,day


In [7]:
# Persist the token-level features. With pandas' default `index=True` the row index
# is written as an extra, unnamed first column; the alignment step below looks
# columns up by name, so that extra column is harmless there.
ling_features.to_csv('data/21st_year/ling_features.csv')

### Align with the TRs

In [8]:
# Reload the token-level features written above as plain lists of strings.
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain line breaks are parsed correctly.
with open('data/21st_year/ling_features.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    file = list(reader)

# First row is the header; every following row describes one token.
head = file[0]
text = file[1:]

In [9]:
# Load the TR-level table; each row holds the text segment assigned to one TR.
# newline='' lets the csv module handle line endings itself, so quoted fields
# that contain line breaks are parsed correctly.
with open('data/21st_year/tr_tokens.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    TR_file = list(reader)

# First row is the header; the remaining rows are the TRs in file order.
TR_head = TR_file[0]
TR_text = TR_file[1:]

In [10]:
# TRs that include the same token twice: maps the TR number (as a string, matching
# the 'tr' column) to the token whose second occurrence ends that TR's alignment.
double_count_dict = {
    '182': 'the', '288': 'the', '399': 'to', '413': 'do',
    '439': 'is', '461': 'a', '502': 'the', '515': 'it',
    '525': 'Every', '630': 'she', '687': 'the', '693': 'her',
    '707': 'of', '815': 'go', '828': 'to', '869': 'we',
    '898': 'ticket', '983': 'in', '1310': 'her', '1377': 'the',
    '1390': 'her', '1398': '"', '1439': '“', '1461': 'there',
    '1528': 'you', '1579': 'the', '1800': 'of', '1821': 'his',
    '1982': 'close', '2039': 'angry', '2183': 'the', '2185': 'than',
}

# Sentence ids whose tokens may also be consumed by a TR carrying a different id
# (part of the sentence appears in the previous TR).
bridging_TRs = [
    'A1.7', 'B1.19', 'A2.5', 'A3.12', 'A5.20',
    'A6.9', 'B6.4', 'A7.6', 'A7.13', 'A7.16',
    'B7.2', 'B10.8', 'B11.9', 'B11.16', 'A12.5',
    'B13.24', 'B14.12', 'C1.2', 'C2.5', 'C12.3',
]

In [11]:
# Align the token-level spaCy features (`text`, one row per token) with the TR table
# (`TR_text`, one row per TR). A single cursor (`current_id`) walks `text` in order;
# each TR consumes consecutive token rows for as long as they plausibly belong to it.
#
# The cell is written so it can be re-run safely: the rows of `TR_text` and the
# `TR_head` list are never modified in place; new rows and a new header are built.

# Column positions are resolved once instead of on every loop iteration.
token_col = head.index('token')
sent_id_col = head.index('index')
feature_names = ["lemma", "pos", "tag", "dep_rel", "dep_head"]
feature_cols = [head.index(name) for name in feature_names]
tr_sentence_col = TR_head.index('Sentence')
tr_sent_id_col = TR_head.index('index')
tr_number_col = TR_head.index('tr')
n_token_rows = len(text)

current_id = 0
new_data = []
for TR_line in TR_text:
    TR_tokens = TR_line[tr_sentence_col]
    TR_index = TR_line[tr_sent_id_col]
    TR_number = TR_line[tr_number_col]
    # One fresh list per feature column; they stay empty for TRs without text.
    features = [[] for _ in feature_names]
    new_row = TR_line + features

    if TR_tokens != "":
        TR_tokens = TR_tokens.replace('\xa0', ' ')
        token_list = [token.text for token in nlp(TR_tokens)]
        check_list = []
        double_count = 0
        triple_count = 0

        # Bounding the cursor by the table length means that once the last token
        # row has been consumed, later TRs cannot consume it a second time.
        while current_id < n_token_rows:
            line = text[current_id]
            word = line[token_col]

            # Stop at the first token row that is not one of this TR's tokens ...
            if word not in token_list:
                break
            # ... or that belongs to another sentence, unless that sentence is
            # listed as bridging into a neighbouring TR.
            if line[sent_id_col] != TR_index and line[sent_id_col] not in bridging_TRs:
                break

            # For the TRs listed in double_count_dict, the second occurrence of the
            # listed token ends this TR and is left for the next one.
            if double_count_dict.get(TR_number) == word:
                double_count += 1
            # TR '534' is special-cased the same way for the third '"'.
            # NOTE(review): presumably this TR contains two quote marks - confirm.
            if TR_number == '534' and word == '"':
                triple_count += 1
            if double_count == 2 or triple_count == 3:
                # Break before appending or advancing, so the repeated token stays
                # available to the next TR even when it is the last row of `text`.
                break

            for values, col in zip(features, feature_cols):
                values.append(line[col])
            check_list.append(word)
            current_id += 1

        # Sanity check: the first aligned token should be the TR's first token.
        # A TR that matched no token rows at all is reported too (previously this
        # raised IndexError on the empty check_list).
        # NOTE(review): TRs '539', '935' and '2161' are exempt - presumably known
        # tokenisation mismatches; confirm.
        if TR_number not in ['539','935','2161']:
            if not check_list or check_list[0] != token_list[0]:
                print(f'Error in {new_row}')

    new_data.append(new_row)

new_data = pd.DataFrame(new_data, columns=TR_head + feature_names)

In [12]:
# Preview the TR table with the aligned per-token feature lists attached.
new_data

Unnamed: 0,tr,Sentence,tr.1,index,tr_shift,prev_tr,story_section,n_tokens,lemma,pos,tag,dep_rel,dep_head
0,0,This is Los Angeles.,0,A1.1,,,A1,2225,"[this, be, Los, Angeles, .]","[DET, AUX, PROPN, PROPN, PUNCT]","[DT, VBZ, NNP, NNP, .]","[nsubj, ROOT, compound, attr, punct]","[is, is, Angeles, is, is]"
1,1,And it's the height of summer.,1,A1.2,1.0,0.0,A1,2225,"[and, -PRON-, be, the, height, of, summer, .]","[CCONJ, PRON, AUX, DET, NOUN, ADP, NOUN, PUNCT]","[CC, PRP, VBZ, DT, NN, IN, NN, .]","[cc, nsubj, ROOT, det, attr, prep, pobj, punct]","['s, 's, 's, height, 's, height, of, 's]"
2,2,In a small bungalow,2,A1.3,1.0,1.0,A1,2225,"[in, a, small, bungalow]","[ADP, DET, ADJ, NOUN]","[IN, DT, JJ, NN]","[prep, det, amod, pobj]","[serves, bungalow, bungalow, In]"
3,3,"off of La Cienega,",3,A1.3,1.0,2.0,A1,2225,"[off, of, La, Cienega, ,]","[ADP, ADP, PROPN, PROPN, PUNCT]","[IN, IN, NNP, NNP, ,]","[prep, prep, compound, pobj, punct]","[bungalow, off, Cienega, of, serves]"
4,4,Clara serves homemade chili,4,A1.3,1.0,3.0,A1,2225,"[Clara, serve, homemade, chili]","[PROPN, VERB, ADJ, NOUN]","[NNP, VBZ, JJ, NNS]","[nsubj, ROOT, amod, dobj]","[serves, serves, chili, serves]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2220,2220,"day, she thinks.",2220,C15.22,1.0,2219.0,C15,2225,"[day, ,, -PRON-, think, .]","[NOUN, PUNCT, PRON, VERB, PUNCT]","[NN, ,, PRP, VBZ, .]","[npadvmod, punct, nsubj, ROOT, punct]","[comes, thinks, thinks, thinks, thinks]"
2221,2221,"The sun is white,",2221,C15.23,1.0,2220.0,C15,2225,"[the, sun, be, white, ,]","[DET, NOUN, AUX, ADJ, PUNCT]","[DT, NN, VBZ, JJ, ,]","[det, nsubj, ROOT, acomp, punct]","[sun, is, is, is, is]"
2222,2222,floods Clara's eyes.,2222,C15.23,1.0,2221.0,C15,2225,"[flood, Clara, 's, eye, .]","[NOUN, PROPN, PART, NOUN, PUNCT]","[NNS, NNP, POS, NNS, .]","[npadvmod, poss, case, appos, punct]","[is, eyes, Clara, floods, is]"
2223,2223,Like birth.,2223,C15.24,1.0,2222.0,C15,2225,"[like, birth, .]","[SCONJ, NOUN, PUNCT]","[IN, NN, .]","[ROOT, pobj, punct]","[Like, Like, Like]"


In [13]:
# Save the aligned table in two formats. The feature columns hold Python lists:
# the CSV stores them as their string representation (convenient for inspection),
# while the pickle round-trips them as real lists.
new_data.to_csv('data/21st_year/tr_tokens_new.csv')
new_data.to_pickle('data/21st_year/tr_tokens_new.pkl')