In [1]:
from _plotly_future_ import v4_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots
import plotly.graph_objs as go

# Switch plotly's offline mode on for this notebook so figures render inline.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook file — confirm against the plotly.offline docs.
init_notebook_mode(connected=True)

In [2]:
import pandas

# Input CSVs with the extracted UD paths for the train and test splits.
# NOTE(review): absolute, machine-specific locations — consider deriving them
# from a configurable data directory.
TRAIN_PATHS_CSV = r'C:\Users\JYellin\re_1\tacred\results\general\train-ud_paths_v0.0.5.csv'
TEST_PATHS_CSV = r'C:\Users\JYellin\re_1\tacred\results\general\test-ud_paths_v0.0.5.csv'

train = pandas.read_csv(TRAIN_PATHS_CSV)
test = pandas.read_csv(TEST_PATHS_CSV)

# No point in considering 'no_relation' in the 'recall' scenario.
# The explicit .copy() detaches each filtered frame from the one it was sliced
# from, so columns added to it in later cells do not raise SettingWithCopyWarning.
train = train[train['relation'] != 'no_relation'].copy()

# 07/03/2020 - some test rows have no path at all; drop them here, without
# mutating a slice in place.
test = test[test['relation'] != 'no_relation'].dropna(subset=['path']).copy()

In [3]:
# Number of space-separated tokens in every test path.  This is computed on the
# raw path, i.e. before transform_row wraps it with the two entity-type prefixes.
# The vectorised .str accessor replaces a row-wise apply over the whole frame.
test['len-path'] = test['path'].str.split(' ').str.len()

In [4]:
#path_len_stats = test.groupby(['len-path'])['id'].count().reset_index().rename(columns={'id': 'total-for-path-len'}).sort_values(['total-for-path-len'], ascending=[False]).reset_index(drop=True)
#path_len_stats['per-for-path-len'] = path_len_stats.apply(lambda row : row['total-for-path-len'] / test.shape[0], axis = 1 )
#path_len_to_percent = path_len_stats.set_index('len-path').drop(columns=['total-for-path-len']).to_dict()['per-for-path-len']

In [5]:
# relation -> {path length -> share of that relation's test rows with that length}.
# This is the baseline the false-negative counts are later divided by.
path_len_to_percent_per_relation = {}

# One groupby pass instead of re-filtering the whole frame for every relation;
# sort=False keeps the relations in order of first appearance.  Rows without a
# relation label are skipped by groupby rather than yielding an empty group.
for relation, relation_rows in test.groupby('relation', sort=False):

    # Rows per path length (counted through the 'id' column), most frequent first.
    rows_per_len = relation_rows.groupby('len-path')['id'].count()
    rows_per_len = rows_per_len.sort_values(ascending=False)

    # Vectorised division: count -> fraction of this relation's rows.
    share_per_len = rows_per_len / len(relation_rows)

    path_len_to_percent_per_relation[relation] = share_per_len.to_dict()
    


In [6]:
def transform_row(r):
    """Wrap a row's path with the 3-character prefixes of its two entity types.

    `r` must expose the attributes `type1`, `path` and `type2`.  The result is
    the single-space-joined string '<type1[:3]> <path> <type2[:3]>'.
    """
    head = r.type1[:3]
    middle = r.path
    tail = r.type2[:3]
    return '{} {} {}'.format(head, middle, tail)

# Rewrite 'path' in both splits so it also carries the two entity-type prefixes.
# Not idempotent: running this cell twice wraps the path twice.
for split_frame in (train, test):
    split_frame['path'] = split_frame.apply(transform_row, axis=1)

In [7]:
# How often each (relation, path) pair occurs in the training split, listed per
# relation with the most frequent paths first.
train_total_by_r_and_p = (
    train
    .groupby(['relation', 'path'])['id']
    .count()
    .reset_index(name='total')
    .sort_values(['relation', 'total'], ascending=[True, False])
    .reset_index(drop=True)
)

In [8]:
def is_false_negative(test_row):
    """Tell whether a test row's (relation, path) pair is absent from training.

    Looks the pair up in the module-level `train_total_by_r_and_p` table.
    Returns True when no training row has the same relation and path, and
    None (not False) when one does, so callers can drop the matches with dropna.
    """
    same_relation = train_total_by_r_and_p['relation'] == test_row['relation']
    same_path = train_total_by_r_and_p['path'] == test_row['path']

    seen_in_train = (same_relation & same_path).any()
    return None if seen_in_train else True

# Flag every test row whose (relation, path) pair never occurs in training ...
false_negative_flags = test.apply(is_false_negative, axis=1)
test['false-negative'] = false_negative_flags

# ... and keep only those rows: is_false_negative yields None for the rest.
test.dropna(subset=['false-negative'], inplace=True)

In [9]:
# `test` now holds only false negatives (rows whose path was never seen in training).
relations = test['relation'].unique().tolist()

# relation -> {path length -> normalised, baseline-weighted false-negative share}
relation_to_fn_stats = {}

for relation in relations:

    fn_rows = test[test['relation'] == relation]
    path_len_to_percent = path_len_to_percent_per_relation[relation]

    # False negatives per path length; groupby returns the lengths in ascending order.
    fn_counts = fn_rows.groupby('len-path')['id'].count()

    # Share of this relation's test rows having each length.  Plain dict lookup on
    # purpose: a length missing from the baseline raises KeyError instead of
    # silently turning into NaN.
    baseline_share = [path_len_to_percent[length] for length in fn_counts.index]

    # Dividing by the baseline share compensates for some lengths simply being
    # more common; the weighted counts are then normalised to sum to 1.
    weighted_counts = fn_counts / baseline_share
    weighted_share = weighted_counts / weighted_counts.sum()

    relation_to_fn_stats[relation] = weighted_share.to_dict()


In [10]:
# Number of relations to plot; the figure below gets one subplot per relation.
num_relations = len(relations)

In [11]:
import math
# Subplot grid: fixed number of columns, as many rows as needed to fit every relation.
num_columns = 5
num_rows = math.ceil(num_relations / num_columns)

# One titled subplot per relation.
fig = make_subplots(rows=num_rows, cols=num_columns, subplot_titles=relations)

# Figure-wide settings, merged into the layout make_subplots generated.
layout = go.Layout(
    **{
        'title': 'UD: Path length to weighted number of false negatives',
        'width': 2000,
        'height': 2000,
        'font': {'family': "Arial", 'size': 12},
    }
)

fig['layout'].update(layout)

# NOTE(review): the bare `True` expressions below do not touch `fig`; presumably
# they only replace the value the cell would otherwise echo (the result of the
# layout update above) — confirm before removing them.
True

# Alternative, smaller figure size kept for reference (currently disabled).
#fig.update_layout(height=600, width=800)




True

In [12]:
# Draw one line per relation, filling the subplot grid row by row.
for position, relation in enumerate(relations):

    # plotly's subplot coordinates are 1-based.
    grid_row = position // num_columns + 1
    grid_col = position % num_columns + 1

    fn_share_by_len = relation_to_fn_stats[relation]

    trace = go.Scatter(
        x=list(fn_share_by_len.keys()),
        y=list(fn_share_by_len.values()),
        name=relation,
    )
    fig.add_trace(trace, row=grid_row, col=grid_col)


# The subplot titles are stored as layout annotations; give them a smaller font.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(family="Arial", size=10)


fig.show()
    