### Extract linguistic features

In [1]:
import numpy as np
import csv
import pandas as pd
import spacy

In [2]:
# Load the spaCy 'en_core_web_lg' pipeline once; `nlp` is reused by the later
# cells that parse the sentences and tokenize the TR text.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Read the sentence table as plain lists of strings.
# The column names are taken from the SECOND row of the file (file[1]) and the
# data from the third row onward, so the first row is skipped
# (presumably a title/metadata line -- TODO confirm against the CSV).
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly.
with open('data/21st_year/21st_year.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    file = list(reader)
head = file[1]
text = file[2:]

# Resolve the column positions once instead of once per row.
sentence_col = head.index('Sentence')
index_col = head.index('Index')

# Non-breaking spaces (\xa0) are replaced by ordinary spaces in each sentence.
sentences = [row[sentence_col].replace('\xa0', ' ') for row in text]
# Identifier stored in the 'Index' column of each sentence row.
TR_ids = [row[index_col] for row in text]

In [4]:
# Parse every sentence; the parsed results are paired positionally with TR_ids.
doc = nlp.pipe(sentences)

# One row per token whose part-of-speech is not 'SPACE':
# [TR id, token text, lemma, pos, tag, dependency label, text of the head token]
ling_features = [
    [tr_id, tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_, tok.head.text]
    for tr_id, parsed in zip(TR_ids, doc)
    for tok in parsed
    if tok.pos_ != 'SPACE'
]

In [5]:
# Turn the per-token rows into a table with one named column per feature.
ling_features = pd.DataFrame(
    data=ling_features,
    columns=["index", "token", "lemma", "pos", "tag", "dep_rel", "dep_head"],
)

In [6]:
# Display the per-token feature table as the cell output.
ling_features

Unnamed: 0,index,token,lemma,pos,tag,dep_rel,dep_head
0,A1.1,This,this,DET,DT,nsubj,is
1,A1.1,is,be,AUX,VBZ,ROOT,is
2,A1.1,Los,Los,PROPN,NNP,compound,Angeles
3,A1.1,Angeles,Angeles,PROPN,NNP,attr,is
4,A1.1,.,.,PUNCT,.,punct,is
...,...,...,...,...,...,...,...
11069,C15.24,birth,birth,NOUN,NN,pobj,Like
11070,C15.24,.,.,PUNCT,.,punct,Like
11071,C15.24,Each,each,DET,DT,det,day
11072,C15.24,day,day,NOUN,NN,ROOT,day


In [7]:
# Save the feature table; the "Align with the TRs" section reads this file back.
# With to_csv's default index=True the row index is also written, as an
# unnamed first column.
ling_features.to_csv('data/21st_year/ling_features.csv')

### Align with the TRs

In [8]:
# Load the saved per-token feature table back as plain lists of strings:
# `head` is the header row and `text` holds the remaining rows.
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly.
with open('data/21st_year/ling_features.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    file = list(reader)
head = file[0]
text = file[1:]

In [9]:
# Load the TR token table as plain lists of strings:
# `TR_head` is the header row and `TR_text` holds the remaining rows.
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly.
with open('data/21st_year/tr_tokens.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    TR_file = list(reader)
TR_head = TR_file[0]
TR_text = TR_file[1:]

In [10]:
# Feature rows whose id is in this list may be attached to a TR row even when
# that TR row's own id is different (see the matching loop below).
# NOTE(review): presumably these are sentences that span two TRs -- confirm.
bridging_TRs = ['A1.7','B1.19','A2.5','A3.12','A5.20','A6.9','B6.4','A7.6','A7.13','A7.16',\
                'B7.2','B10.8','B11.9','B11.16','A12.5','B13.24','B14.12','C1.2','C2.5','C12.3']

# Feature columns copied from the per-token table onto each TR row.
feature_cols = ["lemma","pos","tag","dep_rel","dep_head"]

# Column positions in the per-token feature table (`head` / `text`),
# looked up once rather than on every loop iteration.
feat_token_col = head.index('token')
feat_index_col = head.index('index')
feat_value_cols = [head.index(name) for name in feature_cols]

# newline='' stops csv.writer from producing blank lines between rows on
# platforms that translate '\n' on write.
with open('data/21st_year/tr_tokens_new.csv','w', newline='') as f:
    writer = csv.writer(f)
    # NOTE: TR_head (and every TR_line below) is extended IN PLACE. Re-running
    # this cell without first re-running the cell that loads tr_tokens.csv
    # appends the feature columns a second time.
    TR_head.extend(feature_cols)
    writer.writerow(TR_head)
    tr_sentence_col = TR_head.index('Sentence')
    tr_index_col = TR_head.index('index')

    # Cursor into `text`. It only moves forward, so each feature row is
    # consumed at most once, in file order.
    current_id = 0
    for TR_line in TR_text:
        TR_tokens = TR_line[tr_sentence_col]
        TR_index = TR_line[tr_index_col]
        # One (initially empty) list per feature column; rows with an empty
        # 'Sentence' field keep these lists empty.
        collected = [[] for _ in feature_cols]
        if TR_tokens != "":
            TR_tokens = TR_tokens.replace('\xa0', ' ')
            token_list = [token.text for token in nlp(TR_tokens)]
            # Consume consecutive feature rows while (a) the row's token occurs
            # in this TR's tokens and (b) the row's id equals this TR's id or is
            # one of the bridging ids. The explicit bounds check replaces the
            # old "break on the last row without advancing" logic, which left
            # the cursor on the final row so it could be appended again to
            # later TR rows (and raised IndexError when `text` was empty).
            while current_id < len(text):
                line = text[current_id]
                if line[feat_token_col] not in token_list:
                    break
                if line[feat_index_col] != TR_index and line[feat_index_col] not in bridging_TRs:
                    break
                for values, col in zip(collected, feat_value_cols):
                    values.append(line[col])
                current_id += 1
        # Each feature list is written into a single CSV cell (csv.writer
        # stringifies it), exactly as before.
        TR_line.extend(collected)
        writer.writerow(TR_line)