In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Enable inline Plotly rendering for the iplot() call later in this notebook.
# Per the Plotly offline docs, connected=True loads plotly.js from a CDN
# instead of embedding it, so viewing the figures needs network access.
init_notebook_mode(connected=True)

In [2]:
import pandas

# Directory that holds the four "paths per relation" CSV exports.
# NOTE(review): this is a machine-specific absolute path; it is kept in one
# place so it only has to be changed once when running the notebook elsewhere.
RESULTS_DIR = r'C:\Users\JYellin\re_1\tacred\results\general'


def _load_paths(split, scheme):
    """Read the paths-per-relation CSV for one data split and one scheme.

    Args:
        split: data split used in the file name, 'train' or 'test'.
        scheme: representation used in the file name, 'ud' or 'ucca'.

    Returns:
        The DataFrame produced by pandas.read_csv for the file
        '<split>-<scheme>_paths_per_relation_no_relation.csv' in RESULTS_DIR.
    """
    filename = '{}-{}_paths_per_relation_no_relation.csv'.format(split, scheme)
    # An explicit backslash separator reproduces the original Windows paths exactly.
    return pandas.read_csv(RESULTS_DIR + '\\' + filename)


train_ud = _load_paths('train', 'ud')
test_ud = _load_paths('test', 'ud')

train_ucca = _load_paths('train', 'ucca')
test_ucca = _load_paths('test', 'ucca')

In [3]:
def transform_row(r):
    """Wrap a row's path with the abbreviated types of its two entities.

    Args:
        r: a row exposing string fields `type1` and `type2` and a `path` field.

    Returns:
        '<first 3 chars of type1> <path> <first 3 chars of type2>', for example
        'PER >amod REL'. When `path` is missing (NaN/None) the missing value is
        returned unchanged.
    """
    # Formatting a missing path would produce the literal text 'PER nan REL',
    # which a later dropna() cannot detect and which would compare equal across
    # frames. Keep missing values missing instead.
    if pandas.isna(r.path):
        return r.path
    return '{entity1} {org_path} {entity2}'.format(entity1=r.type1[0:3], org_path=r.path, entity2=r.type2[0:3])

# Rewrite the `path` column of each loaded frame in place so every path is
# wrapped with its abbreviated entity types (see transform_row).
for frame in (train_ud, test_ud, train_ucca, test_ucca):
    frame['path'] = frame.apply(transform_row, axis=1)

In [4]:
# Give the path column a scheme-specific name so the UD and UCCA paths can
# later live side by side in one frame.
train_ud, test_ud = (
    frame.rename(columns={'path' : 'ud_path'}) for frame in (train_ud, test_ud)
)

train_ucca, test_ucca = (
    frame.rename(columns={'path' : 'ucca_path'}) for frame in (train_ucca, test_ucca)
)

In [5]:
# Build the evaluation frame: attach each test example's UCCA path to its UD
# row (joined on `id`), drop rows with any missing value, then exclude the
# relations listed below (a broader exclusion than 'no_relation' alone).
# NOTE(review): the reason for excluding the listed per:* relations is not
# stated in this notebook -- confirm with the author.
test = (
    pandas.merge(test_ud, test_ucca[['id', 'ucca_path']], on='id')
    .dropna()
    .loc[lambda merged: ~merged['relation'].isin([
        'no_relation',
        'per:spouse',
        'per:other_family',
        'per:siblings',
        'per:children',
        'per:parents',
        'per:alternate_names',
    ])]
)


In [6]:
# Count, for every (relation, path) pair in the training data, how many
# examples use that path. Rows are ordered by relation (ascending) and then by
# count (descending), with a fresh 0..n-1 index. The same relations that are
# excluded from the test frame are excluded here (rather than 'no_relation' only).
train_ud_total_by_r_and_p = (
    train_ud
    .loc[lambda frame: ~frame['relation'].isin([
        'no_relation',
        'per:spouse',
        'per:other_family',
        'per:siblings',
        'per:children',
        'per:parents',
        'per:alternate_names',
    ])]
    .groupby(['relation', 'ud_path'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'total'})
    .sort_values(['relation', 'total'], ascending=[True, False])
    .reset_index(drop=True)
)

train_ucca_total_by_r_and_p = (
    train_ucca
    .loc[lambda frame: ~frame['relation'].isin([
        'no_relation',
        'per:spouse',
        'per:other_family',
        'per:siblings',
        'per:children',
        'per:parents',
        'per:alternate_names',
    ])]
    .groupby(['relation', 'ucca_path'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'total'})
    .sort_values(['relation', 'total'], ascending=[True, False])
    .reset_index(drop=True)
)

In [7]:
def label_matches(test_row):
    """Flag whether a test example's paths occur in training for its relation.

    The row's (relation, ud_path) pair is looked up in
    train_ud_total_by_r_and_p and its (relation, ucca_path) pair in
    train_ucca_total_by_r_and_p.

    Args:
        test_row: a row providing 'relation', 'ud_path' and 'ucca_path'.

    Returns:
        pandas.Series with boolean entries 'match_any', 'match_ud' and
        'match_ucca'; 'match_any' is true when either lookup found a row.
    """
    relation = test_row['relation']

    ud_table = train_ud_total_by_r_and_p
    ud_hits = ud_table[
        (ud_table['relation'] == relation)
        & (ud_table['ud_path'] == test_row['ud_path'])
    ]

    ucca_table = train_ucca_total_by_r_and_p
    ucca_hits = ucca_table[
        (ucca_table['relation'] == relation)
        & (ucca_table['ucca_path'] == test_row['ucca_path'])
    ]

    # A lookup "matches" when at least one training row survived the filter.
    match_ud = len(ud_hits) > 0
    match_ucca = len(ucca_hits) > 0

    return pandas.Series({
        'match_any': match_ud or match_ucca,
        'match_ud': match_ud,
        'match_ucca': match_ucca,
    })


# Evaluate label_matches once per test row; the result has one boolean column
# per flag (match_any, match_ud, match_ucca), aligned with `test` by index.
matching_results = test.apply(label_matches, axis='columns')

In [8]:
# Attach the match flags to the test frame.
# Any match_* columns left over from a previous run of this cell are dropped
# first, so re-running the cell replaces the flags instead of appending
# duplicate columns (which would make test['match_any'] a DataFrame and break
# the recall computation that follows).
test = pandas.concat(
    (test.drop(columns=matching_results.columns, errors='ignore'), matching_results),
    axis=1,
)

In [9]:
# Per-relation recall: the share of test examples of a relation whose UD or
# UCCA path was also seen for that relation in training (match_any).
# `stats` is a list of (label, recall) tuples, sorted by recall, highest first.
stats = []
relations = train_ud_total_by_r_and_p['relation'].unique().tolist()

for relation in relations:
    df = test[test['relation'] == relation]
    total = df.shape[0]

    if total == 0:
        # The relation occurs in training but has no rows in the filtered test
        # frame: recall is undefined (0/0 would yield NaN, and NaN keys make
        # the sort below unreliable), so no entry is produced for it.
        continue

    matched = df[df['match_any']]['id'].count()
    stats.append(('{} ({})'.format(relation, total), round(matched / total, 4)))

matched = test[test['match_any']]['id'].count()
total = test.shape[0]
# The 'overall' entry is always emitted because the plotting cell looks it up
# by label; an empty test frame is reported as 0.0 instead of NaN.
stats.append(('overall', round(matched / total, 4) if total else 0.0))

stats = sorted(stats, key=lambda x: x[1], reverse=True)

In [10]:
import plotly.graph_objs as go

# Bar chart of recall per relation, with the 'overall' bar highlighted.

# Position of the aggregate entry inside the (recall-sorted) stats list.
overall_index = next(i for (i, stat) in enumerate(stats) if stat[0] == 'overall')

# Same colour for every relation; a distinct one for the 'overall' bar.
colors = ['rgb(58,200,225)'] * len(stats)
colors[overall_index] = 'rgb(181,59,89)'

trace = go.Bar(
    x=[stat[0] for stat in stats],
    y=[stat[1] for stat in stats],
    name='Recall',
    width=0.5,
    marker=dict(
        color=colors,
        line=dict(
            color='rgb(8,48,107)',
            width=0.5),
        ),
    opacity=0.6
)

data = [trace]

# Each axis is configured exactly once. Previously the x-axis was given both
# as `xaxis_title=` and as a separate `xaxis=` object; supplying both leaves
# the outcome dependent on how Plotly combines them (the explicit object can
# replace the axis that carried the title). Title, tick angle and automargin
# now live in a single axis object.
layout = go.Layout(
    title='UD + UCCA Path - Recall',
    barmode='overlay',
    width=1200,
    xaxis=go.layout.XAxis(
        title='Relation',
        tickangle=45,
        automargin=True
    ),
    yaxis=go.layout.YAxis(
        title='Recall'
    )
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [11]:
# Inspect the training UD path counts recorded for the 'per:religion' relation.
train_ud_total_by_r_and_p.loc[train_ud_total_by_r_and_p['relation'] == 'per:religion']

Unnamed: 0,relation,ud_path,total
4289,per:religion,PER <nsubj REL,3
4290,per:religion,PER >amod REL,2
4291,per:religion,PER >appos >compound REL,2
4292,per:religion,PER <appos <nsubj <ccomp <advcl:since >dobj >a...,1
4293,per:religion,PER <appos <nsubj REL,1
4294,per:religion,PER <appos >appos >nmod:of >appos >amod REL,1
4295,per:religion,PER <appos >compound REL,1
4296,per:religion,PER <compound <dobj >dobj >amod REL,1
4297,per:religion,PER <compound <dobj >nsubj:xsubj >amod REL,1
4298,per:religion,PER <conj:and >amod REL,1


In [12]:
# Inspect the individual training examples labelled 'per:religion'.
train_ud.loc[train_ud['relation'] == 'per:religion']

Unnamed: 0,id,docid,relation,ud_path,type1,type2
44,61b3af097586be8f7177,eng-WL-11-174611-12976276,per:religion,PER >amod REL,PERSON,RELIGION
1867,61b3af09753019aba9a5,eng-WL-11-174611-12969239,per:religion,PER >appos >compound REL,PERSON,RELIGION
2360,61b3af09759bb3e0a65e,XIN_ENG_20070110.0345.LDC2009T13,per:religion,PER >nmod:as REL,PERSON,RELIGION
4779,61b3af097527a0a926ec,APW_ENG_20070102.0540.LDC2009T13,per:religion,PER <nsubj <nmod:poss >amod REL,PERSON,RELIGION
4955,61b3af09754b4faf0873,APW_ENG_20020402.1472.LDC2007T07,per:religion,PER <nsubjpass >nmod:as >dep <nmod:poss >conj:...,PERSON,RELIGION
7134,61b3af0975783c51f418,eng-NG-31-142885-10105592,per:religion,PER <conj:and >amod REL,PERSON,RELIGION
7208,61b3af097549ea94e093,NYT_ENG_20101210.0131,per:religion,PER <nsubj <parataxis >nsubj >appos >compound REL,PERSON,RELIGION
7691,61b3af0975258189c4e6,bolt-eng-DF-170-181104-8724758,per:religion,PER <appos <nsubj <ccomp <advcl:since >dobj >a...,PERSON,RELIGION
8541,61b3af0975bcf9437493,APW_ENG_20080229.0949.LDC2009T13,per:religion,PER <nsubj >ccomp >dobj >nmod:on REL,PERSON,RELIGION
9522,61b3af09752740c11953,eng-NG-31-142817-10100243,per:religion,PER <nmod:poss <nsubj >nsubj >acl:of >nmod:in ...,PERSON,RELIGION
