In [2]:
# Initialise plotly's offline notebook mode before any figure is drawn with `iplot`.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook -- confirm against the plotly offline docs.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [1]:
import os

import pandas

# Single place to point the notebook at the extracted-path CSVs. Edit these two
# constants (rather than four separate string literals) to run against another
# checkout or another version of the path files.
RESULTS_DIR = r'C:\Users\JYellin\re_1\tacred\results\general'
PATHS_VERSION = 'v0.0.5'


def load_paths(split, scheme):
    """Read one path CSV from RESULTS_DIR.

    Parameters
    ----------
    split : str
        Dataset split used in the file name, e.g. 'train' or 'test'.
    scheme : str
        Parsing scheme used in the file name, e.g. 'ud' or 'ucca'.

    Returns
    -------
    pandas.DataFrame
        Contents of '<split>-<scheme>_paths_<PATHS_VERSION>.csv'.
    """
    filename = '{}-{}_paths_{}.csv'.format(split, scheme, PATHS_VERSION)
    return pandas.read_csv(os.path.join(RESULTS_DIR, filename))


train_ud = load_paths('train', 'ud')
test_ud = load_paths('test', 'ud')

train_ucca = load_paths('train', 'ucca')
test_ucca = load_paths('test', 'ucca')

In [4]:
# Give each scheme's generic 'path' column a scheme-specific name so that the
# UD and UCCA paths can sit side by side in one frame later on.
train_ud, test_ud = (
    frame.rename(columns={'path': 'ud_path'}) for frame in (train_ud, test_ud)
)

train_ucca, test_ucca = (
    frame.rename(columns={'path': 'ucca_path'}) for frame in (train_ucca, test_ucca)
)

In [5]:
# Build the evaluation frame: attach each example's UCCA path to its UD row
# (default inner join on 'id'), drop rows with a missing value in any column,
# then keep only examples whose relation is not 'no_relation'.
test = (
    test_ud
    .merge(test_ucca[['id', 'ucca_path']], on='id')
    .dropna()
    .loc[lambda merged: merged['relation'] != 'no_relation']
)



In [6]:
def _count_examples_per_relation_path(paths_df, path_column):
    """Count examples for every (relation, path) pair, ignoring 'no_relation'.

    Returns a frame with the columns 'relation', ``path_column`` and 'total',
    ordered by relation (ascending) and then by total (descending), with a
    fresh 0..n-1 index.
    """
    labelled = paths_df[paths_df['relation'] != 'no_relation']
    totals = (
        labelled
        .groupby(['relation', path_column])['id']
        .count()
        .reset_index(name='total')
    )
    ordered = totals.sort_values(['relation', 'total'], ascending=[True, False])
    return ordered.reset_index(drop=True)


train_ud_total_by_r_and_p = _count_examples_per_relation_path(train_ud, 'ud_path')
train_ucca_total_by_r_and_p = _count_examples_per_relation_path(train_ucca, 'ucca_path')



In [7]:
def label_matches(test_row, ud_totals=None, ucca_totals=None):
    """Flag whether a test example's (relation, path) pairs occur in training.

    Parameters
    ----------
    test_row : mapping / pandas.Series
        Must provide 'relation', 'ud_path' and 'ucca_path'.
    ud_totals : pandas.DataFrame, optional
        Table with 'relation' and 'ud_path' columns to look the UD pair up in.
        Defaults to the notebook-level ``train_ud_total_by_r_and_p``.
    ucca_totals : pandas.DataFrame, optional
        Table with 'relation' and 'ucca_path' columns to look the UCCA pair up
        in. Defaults to the notebook-level ``train_ucca_total_by_r_and_p``.

    Returns
    -------
    pandas.Series
        Booleans under 'match_any', 'match_ud' and 'match_ucca'; 'match_any'
        is true when either scheme's pair was found.
    """
    # Resolve the defaults at call time so that `test.apply(label_matches, axis=1)`
    # keeps working unchanged, while callers can still inject their own tables.
    if ud_totals is None:
        ud_totals = train_ud_total_by_r_and_p
    if ucca_totals is None:
        ucca_totals = train_ucca_total_by_r_and_p

    relation = test_row['relation']

    # Only existence matters, so test the boolean mask directly instead of
    # materialising the matching sub-frame.
    match_ud = bool(
        ((ud_totals['relation'] == relation) &
         (ud_totals['ud_path'] == test_row['ud_path'])).any()
    )
    match_ucca = bool(
        ((ucca_totals['relation'] == relation) &
         (ucca_totals['ucca_path'] == test_row['ucca_path'])).any()
    )

    return pandas.Series({
        'match_any': match_ud or match_ucca,
        'match_ud': match_ud,
        'match_ucca': match_ucca,
    })


# Evaluate every test example row by row; the result has one row per example
# and the three boolean columns produced by label_matches.
matching_results = test.apply(label_matches, axis='columns')

In [8]:
# Attach the match flags to the test rows. Flag columns left over from an
# earlier run of this cell are dropped first: without that, re-running the cell
# would add a second set of 'match_*' columns, test['match_any'] would become a
# DataFrame, and the boolean filters that use it would break.
test = pandas.concat(
    (
        test.drop(columns=['match_any', 'match_ud', 'match_ucca'], errors='ignore'),
        matching_results,
    ),
    axis=1,
)

In [9]:
# Per-relation recall: the share of test examples whose (relation, path) pair
# was seen in training under at least one of the two schemes.
# `stats` is a list of (label, recall) tuples, sorted by recall descending.
stats = []
relations = train_ud_total_by_r_and_p['relation'].unique().tolist()

for relation in relations:
    df = test[test['relation'] == relation]
    total = df.shape[0]

    if total == 0:
        # The relation occurs in training but has no test example left after
        # filtering. Its recall would be 0/0 = NaN, and a NaN in the sort key
        # makes the ordering below unreliable, so leave it out.
        continue

    matched = df[df['match_any']]['id'].count()
    stats.append(('{} ({})'.format(relation, total), round(matched / total, 4)))

matched = test[test['match_any']]['id'].count()
total = test.shape[0]
# Always emit the 'overall' entry (the plotting cell looks it up by label);
# fall back to NaN explicitly when there are no test rows at all.
overall_recall = round(matched / total, 4) if total else float('nan')
stats.append(('overall', overall_recall))

stats = sorted(stats, key=lambda x: x[1], reverse=True)

In [10]:
import plotly.graph_objs as go

# Locate the aggregate bar so it can be drawn in a contrasting colour.
overall_index = next(
    position for position, (label, _) in enumerate(stats) if label == 'overall'
)

colors = [
    'rgb(181,59,89)' if position == overall_index else 'rgb(58,200,225)'
    for position in range(len(stats))
]

# One bar per entry in `stats`: label on the x axis, recall on the y axis.
trace = go.Bar(
    x=[label for label, _ in stats],
    y=[recall for _, recall in stats],
    name='Recall',
    width=0.5,
    marker={
        'color': colors,
        'line': {'color': 'rgb(8,48,107)', 'width': 0.5},
    },
    opacity=0.6,
)

data = [trace]
layout = go.Layout(
    title='UD + UCCA Path - Recall',
    xaxis_title="Relation",
    yaxis_title="Recall",
    barmode='overlay',
    width=1200,
    # Tilt the tick labels and let plotly grow the margin to fit them.
    xaxis=go.layout.XAxis(tickangle=45, automargin=True),
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)