In [39]:
import numpy as np
import pandas as pd
import pickle
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import collections

In [40]:
# Load the attention-vector table. newline='' is what the csv module asks for,
# so that line breaks inside quoted fields are handled by the csv parser itself.
with open('21st_year_bert-base-uncased_syntactic_complexity_L-inf.csv','r',newline='') as f:
    reader = csv.reader(f)
    vec_file = [row for row in reader]
    vec_head = vec_file[0]
    vec_text = vec_file[1:]
# Look the column position up once instead of re-scanning the header for every row.
vec_col = vec_head.index('attention_heads_Linf_vector')
# Each non-empty cell is parsed by dropping its first and last character and
# splitting on ', '; an empty cell becomes an empty vector so that list
# positions keep matching the rows of the file.
vecs = [[float(element) for element in row[vec_col][1:-1].split(', ')]
        if row[vec_col] != '' else []
        for row in vec_text]
# Row mask: True only for vectors with exactly 144 entries
# (presumably 12 x 12, matching the divisor in ConvertHeadNotation -- confirm).
vec_ids = [len(vec)==144 for vec in vecs]

In [41]:
# Load the token-level annotation table. newline='' is what the csv module asks
# for, so that line breaks inside quoted fields are handled by the csv parser.
with open('data/21st_year/tr_tokens_new.csv','r',newline='') as f:
    reader = csv.reader(f)
    ling_file = [row for row in reader]
    ling_head = ling_file[0]
    ling_text = ling_file[1:]
# Look the column position up once instead of re-scanning the header for every row.
dep_rel_col = ling_head.index('dep_rel')
# Each non-empty cell is parsed by dropping its first and last character,
# splitting on ', ', and then dropping the first and last character of every
# piece (presumably the quotes around each label -- confirm against the file).
# An empty cell becomes an empty list so that positions keep matching the rows.
features = [[element[1:-1] for element in row[dep_rel_col][1:-1].split(', ')]
            if len(row[dep_rel_col]) > 0 else []
            for row in ling_text]
# Row mask: True when the first parsed label is non-empty. The length check
# comes first because rows with an empty cell yield [], and indexing [0] on
# those would raise IndexError; such rows are simply marked unusable.
feature_ids = [len(feature) > 0 and feature[0] != '' for feature in features]

In [42]:
# Combined row mask: a row is kept only when both the vector mask and the
# feature mask are true at that position (pairs are formed with zip).
ids = [
    has_vec and has_feature
    for has_vec, has_feature in zip(vec_ids, feature_ids)
]

In [43]:
# Apply the combined mask to both sources so that vectors and features stay
# aligned row by row.
kept_positions = [position for position, keep in enumerate(ids) if keep]
new_vecs = np.array([vecs[position] for position in kept_positions])
new_features = [features[position] for position in kept_positions]
# Both filtered collections must describe the same number of rows.
assert new_vecs.shape[0] == len(new_features)

In [44]:
def CreateLabels(new_features,label):
    """Build a binary target vector for one label.

    Returns an ``np.array`` with one entry per item of ``new_features``:
    1 when ``label`` is contained in that item (``in`` test), 0 otherwise.
    """
    flags = []
    for feature in new_features:
        flags.append(int(label in feature))
    return np.array(flags)
def MyCrossValidation(X,y,model,verbose,cv,random_state=None):
    """Run shuffled k-fold cross-validation and collect per-fold coefficients.

    Parameters
    ----------
    X : np.ndarray
        Sample matrix, indexed along its first axis.
    y : np.ndarray
        Targets with the same first dimension as ``X``.
    model : object
        Estimator exposing ``fit``, ``score``, ``predict`` and, after fitting,
        ``coef_``. The same object is refit in place on every fold.
    verbose : bool
        When true, print the five positions with the largest coefficients of
        each fold, formatted with ``ConvertHeadNotation``.
    cv : int
        Number of folds. Folds have ``n_samples // cv`` test samples; the last
        fold additionally takes the remainder.
    random_state : int or None, optional
        Seed for the shuffle. ``None`` (the default) draws from NumPy's global
        random state, which is the previous behaviour; any other value makes
        the split reproducible.

    Returns
    -------
    scores : list
        ``model.score`` on the held-out part of each fold.
    params : np.ndarray
        ``model.coef_[0]`` of each fold, stacked along the first axis.

    Raises
    ------
    ValueError
        If ``cv`` is smaller than 2 or larger than the number of samples,
        since that would leave a fold with no training or no test data.
    """
    assert X.shape[0] == y.shape[0], 'Shape mismatch between X and y; make sure both are np arrays'
    n_samples = X.shape[0]
    if not 2 <= cv <= n_samples:
        raise ValueError(f'cv must be between 2 and the number of samples ({n_samples}), got {cv}')
    if random_state is None:
        random_ids = np.random.permutation(n_samples)
    else:
        random_ids = np.random.default_rng(random_state).permutation(n_samples)
    batch_size = n_samples // cv
    scores = []
    params = []
    for i in range(cv):
        start = batch_size*i
        # The last fold runs to the end so leftover samples are still tested once.
        stop = n_samples if i == cv-1 else batch_size*(i+1)
        test_ids = random_ids[start:stop]
        # Everything outside the test slice, in shuffled order. Slicing avoids
        # the per-element membership scan over the test indices.
        train_ids = np.concatenate([random_ids[:start], random_ids[stop:]])
        assert len(test_ids) + len(train_ids) == n_samples
        train_X = X[train_ids]
        test_X = X[test_ids]
        train_y = y[train_ids]
        test_y = y[test_ids]
        model.fit(train_X,train_y)
        fold_score = model.score(test_X,test_y)
        # Sanity check that the reported score is plain accuracy; compared with
        # a tolerance because both sides are floating-point means.
        assert np.isclose(fold_score, np.mean(model.predict(test_X)==test_y))
        scores.append(fold_score)
        params.append(model.coef_[0])
        if verbose:
            print([ConvertHeadNotation(head_num) for head_num in params[-1].argsort()[::-1][:5]])
    return scores,np.array(params)
def ConvertHeadNotation(head_num,heads_per_layer=12):
    """Format a flat, zero-based index as a one-based ``'<group>-<offset>'`` string.

    ``head_num`` is split by ``heads_per_layer``: the quotient plus one becomes
    the first number, the remainder plus one the second (presumably layer and
    head within the layer -- confirm against how the vectors were flattened).

    Parameters
    ----------
    head_num : int
        Zero-based flat index.
    heads_per_layer : int, optional
        Size of each group; defaults to 12, the value previously hard-coded.

    Returns
    -------
    str
        For example ``0 -> '1-1'`` and ``13 -> '2-2'`` with the default size.
    """
    layer, head = divmod(head_num, heads_per_layer)
    return f'{layer+1}-{head+1}'

In [45]:
# Sweep regularization strengths and dependency relations; for every pair, fit
# the classifier with cross-validation and record which position carries the
# largest coefficient.
out_list = []
out_head = ['alpha','dep_rel','max_head','score','max_head for each split in CV']
cv=5
verbose = False
for alpha in [0.01,0.1,1,10,100]:
    # alpha is passed as C below, i.e. the inverse regularization strength:
    # a smaller value means a stronger L1 penalty.
    for dep_rel in ['prep','pobj','det','nsubj','amod','dobj','advmod','aux','poss','ccomp','mark','prt']:
        labels = CreateLabels(new_features,dep_rel)
        model = LogisticRegression(penalty='l1', solver='liblinear',C=alpha)
        scores,params = MyCrossValidation(new_vecs,labels,model,verbose,cv=cv)
        # NOTE(review): argmax returns the first position when all coefficients
        # tie (e.g. all zero under a strong L1 penalty), which is then reported
        # as '1-1' -- verify before interpreting such rows as a real finding.
        split_heads = [param.argmax() for param in params]
        important_head = collections.Counter(split_heads).most_common()[0]
        # Report the count against the actual number of folds rather than a fixed 5.
        print(f'Head for {dep_rel}　for alpha = {alpha}: {ConvertHeadNotation(important_head[0])} for {important_head[1]}/{cv} splits')
        out_list.append([alpha,dep_rel,ConvertHeadNotation(important_head[0]), np.mean(scores),\
                          ', '.join([ConvertHeadNotation(head) for head in split_heads])])
pd.DataFrame(out_list,columns=out_head).to_csv('DepRelDecoding.csv')

Head for prep　for alpha = 0.01: 1-1 for 5/5 splits
Head for pobj　for alpha = 0.01: 1-1 for 5/5 splits
Head for det　for alpha = 0.01: 1-1 for 5/5 splits
Head for nsubj　for alpha = 0.01: 8-2 for 2/5 splits
Head for amod　for alpha = 0.01: 1-1 for 5/5 splits
Head for dobj　for alpha = 0.01: 1-1 for 5/5 splits
Head for advmod　for alpha = 0.01: 1-1 for 5/5 splits
Head for aux　for alpha = 0.01: 1-1 for 5/5 splits
Head for poss　for alpha = 0.01: 1-1 for 5/5 splits
Head for ccomp　for alpha = 0.01: 1-1 for 5/5 splits
Head for mark　for alpha = 0.01: 1-1 for 5/5 splits
Head for prt　for alpha = 0.01: 1-1 for 5/5 splits
Head for prep　for alpha = 0.1: 6-2 for 5/5 splits
Head for pobj　for alpha = 0.1: 8-11 for 3/5 splits
Head for det　for alpha = 0.1: 8-11 for 5/5 splits
Head for nsubj　for alpha = 0.1: 8-2 for 5/5 splits
Head for amod　for alpha = 0.1: 1-1 for 5/5 splits
Head for dobj　for alpha = 0.1: 8-10 for 4/5 splits
Head for advmod　for alpha = 0.1: 8-4 for 4/5 splits
Head for aux　for alpha = 0.1: 8-