In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [2]:
import pandas

train_ud = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\train-ud_paths_v0.0.5.csv').set_index(['id'])
test_ud = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\test-ud_paths_v0.0.5.csv').set_index(['id'])

train_ucca = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\train-ucca_paths_v0.0.5.csv').set_index(['id'])
test_ucca = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\test-ucca_paths_v0.0.5.csv').set_index(['id'])

In [3]:
train_ner = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\train-ud_paths_with_ner_v0.0.6.csv').set_index(['id'])
test_ner = pandas.read_csv(r'C:\Users\JYellin\re_1\tacred\results\general\test-ud_paths_with_ner_v0.0.6.csv').set_index(['id'])

train_ids_with_proper_ner = train_ner[(train_ner['type1']==train_ner['type1_corenlp']) & (train_ner['type2']==train_ner['type2_corenlp'])].drop(['docid', 'tokens', 'relation', 'path', 'lemmas_on_path', 'type1', 'type2', 'ent1_head', 'ent2_head', 'type1_corenlp', 'type2_corenlp'], axis=1)
test_ids_with_proper_ner = test_ner[(test_ner['type1']==test_ner['type1_corenlp']) & (test_ner['type2']==test_ner['type2_corenlp'])].drop(['docid', 'tokens', 'relation', 'path', 'lemmas_on_path', 'type1', 'type2', 'ent1_head', 'ent2_head', 'type1_corenlp', 'type2_corenlp'], axis=1)

In [4]:
print('train_ids_with_proper_ner size:', train_ids_with_proper_ner.shape[0])
print('train_ner size:', train_ner.shape[0])
print('train_ud size:', train_ud.shape[0])
print('train_ucca size:', train_ucca.shape[0])


train_ids_with_proper_ner size: 56648
train_ner size: 68049
train_ud size: 68049
train_ucca size: 67617


In [5]:
train_ucca = train_ids_with_proper_ner.join(train_ucca).dropna(axis=0).reset_index()
test_ucca = test_ids_with_proper_ner.join(test_ucca).dropna(axis=0).reset_index()


train_ud = train_ids_with_proper_ner.join(train_ud).reset_index()
test_ud = test_ids_with_proper_ner.join(test_ud).reset_index()


In [6]:
def transform_row(r):
    return '{entity1} {org_path} {entity2}'.format(entity1=r.type1[0:3], org_path=r.path, entity2=r.type2[0:3])

train_ud['path'] = train_ud.apply(transform_row, axis=1)
test_ud['path'] = test_ud.apply(transform_row, axis=1)

train_ucca['path'] = train_ucca.apply(transform_row, axis=1)
test_ucca['path'] = test_ucca.apply(transform_row, axis=1)

In [7]:
train_ud = train_ud.rename(columns={'path' : 'ud_path'})
test_ud = test_ud.rename(columns={'path' : 'ud_path'})

train_ucca = train_ucca.rename(columns={'path' : 'ucca_path'})
test_ucca = test_ucca.rename(columns={'path' : 'ucca_path'})

In [8]:
test = pandas.merge(test_ud, test_ucca[['id', 'ucca_path']], on='id').dropna()

In [9]:
relation_to_paths_ud = train_ud.groupby(['relation'])['ud_path'].unique().to_dict()
relation_to_paths_ucca = train_ucca.groupby(['relation'])['ucca_path'].unique().to_dict()

In [10]:
def label_sentence_recall(test_row, relation):
    
    ud_path = test_row['ud_path']
    ucca_path = test_row['ucca_path']
    true_relation = test_row['relation']
    
    #if ud_path in relation_to_paths_ud[relation] or ucca_path in relation_to_paths_ucca[relation]:
    if ud_path in relation_to_paths_ud[relation]:
        if true_relation == relation:
            return 'TP'
        
        # the next two lines are artificial and should be removed
        # they are designed to remove the affect of all 'no_relation' sentences
        #elif true_relation == 'no_relation':
        #    return 'N'
        
        
        else:
            return 'FP'
    else:
        return 'N'

In [11]:
for relation in filter(lambda relation: relation != 'no_relation', relation_to_paths_ud.keys()):
    test[relation] = test.apply(lambda row : label_sentence_recall(row, relation), axis=1)    

In [12]:
def get_adjusted_recall(test_row):
    
    true_relation = test_row['relation']
    fps = 0
    tps = 0
    for relation in filter(lambda relation: relation != 'no_relation', relation_to_paths_ud.keys()):
        if test_row[relation] == 'FP':
            fps += 1
        elif test_row[relation] == 'TP':
            tps = 1
            
    if (tps+fps) > 0:
        adjusted_recall = tps / (tps+fps)
    else:
        adjusted_recall = 0
    
    
    return adjusted_recall

In [13]:
test['adjusted_recall'] = test.apply(lambda row: get_adjusted_recall(row), axis=1 )

In [14]:
stats = []

overall_matched = 0
overall_total = 0

for relation in filter(lambda relation: relation != 'no_relation', relation_to_paths_ud.keys()):
    df = test[test['relation']==relation]

    matched = df['adjusted_recall'].sum()
    total = df.shape[0]
    
    overall_matched += matched
    overall_total += total
    
    stats.append( ('{} ({})'.format(relation,total), round(matched/total,4)) )

stats.append( ('overall', round(overall_matched/overall_total,4)) )

stats = sorted(stats, key=lambda x: x[1], reverse=True)

In [15]:
import plotly.graph_objs as go

overall_index = next(i for (i, (relation, score)) in enumerate(stats) if relation == 'overall')

colors = ['rgb(58,200,225)'] * len(stats)
colors[overall_index] = 'rgb(181,59,89)'


trace1 = go.Bar(
    x=[stat[0] for stat in stats],
    y=[stat[1] for stat in stats],
    name='Precision',
    width=0.5,
    marker=dict(
        color=colors,
        line=dict(
            color='rgb(8,48,107)',
            width=0.5),
        ),
    opacity=0.6
)

data = [trace1]
layout = go.Layout(
    title='UD + UCCA Path - Recal (Playing)',
    xaxis_title="Relation",
    yaxis_title="Precision",        
    barmode='overlay',
    width=1200,
    xaxis = go.layout.XAxis(
        tickangle = 45,
        automargin = True
        
    )
    
        
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [16]:
def get_precision_contribution(test_row):
    
    matched = 0

    for relation in filter(lambda relation: relation != 'no_relation', relation_to_paths_ud.keys()):
        if test_row[relation] in ['FP', 'TP']:
            matched += 1

    if matched > 0:
        return 1/matched
    else:
        return 0

In [17]:
test['precision_contribution'] = test.apply(lambda row: get_precision_contribution(row), axis=1 )

In [18]:
stats = []
overall_tp = 0
overall_fp = 0

for relation in filter(lambda relation: relation != 'no_relation', relation_to_paths_ud.keys()):

    tp_contribution = test[test[relation]=='TP']['precision_contribution'].sum()
    fp_contribution = test[test[relation]=='FP']['precision_contribution'].sum()
    
    precision = tp_contribution / (tp_contribution+fp_contribution)
    
    #print('{}: total ={}, tp = {}, fp = {}'.format(relation, num_examples, tp, fp))

    stats.append( ( '{}'.format(relation), round(precision,4)) )
    
    overall_tp += tp_contribution
    overall_fp += fp_contribution




overall_precision = overall_tp / (overall_tp+overall_fp)
stats.append( ('overall', round(overall_precision,4)) )
stats = sorted(stats, key=lambda x: x[1], reverse=True)




invalid value encountered in double_scalars



In [19]:
import plotly.graph_objs as go

overall_index = next(i for (i, (relation, score)) in enumerate(stats) if relation == 'overall')

colors = ['rgb(58,200,225)'] * len(stats)
colors[overall_index] = 'rgb(181,59,89)'


trace1 = go.Bar(
    x=[stat[0] for stat in stats],
    y=[stat[1] for stat in stats],
    name='Precision',
    width=0.5,
    marker=dict(
        color=colors,
        line=dict(
            color='rgb(8,48,107)',
            width=0.5),
        ),
    opacity=0.6
)

data = [trace1]
layout = go.Layout(
    title='UD + UCCA Path - Precision (Play)',
    xaxis_title="Relation",
    yaxis_title="Precision",        
    barmode='overlay',
    width=1200,
    xaxis = go.layout.XAxis(
        tickangle = 45,
        automargin = True
        
    )
    
        
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)