In [1]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Set up plotly for inline notebook rendering. NOTE(review): per the plotly docs,
# connected=True loads plotly.js from a CDN instead of embedding it, so rendering
# presumably needs internet access — confirm this is intended.
init_notebook_mode(connected=True)

In [2]:
import pandas

In [3]:
# Load the two miss tables being compared (the "nofilter" run and the "filter9" run,
# going by the file names). In each, the 'predicted' column is renamed so the two
# predictions remain distinguishable after the frames are merged on 'id'.
# NOTE(review): both paths are absolute, machine-specific Windows paths.
unfiltered_misses = pandas.read_csv(
    r'C:\Users\JYellin\re_1\tacred\analysis\filtering-by-path-length\comparing misses\ucca\comparison-2\ucca-misses-nofilter.csv'
)
unfiltered_misses = unfiltered_misses.rename(columns={'predicted': 'no-filter-prediction'})

filtered_misses = pandas.read_csv(
    r'C:\Users\JYellin\re_1\tacred\analysis\filtering-by-path-length\comparing misses\ucca\comparison-2\ucca-misses-filter9.csv'
)
filtered_misses = filtered_misses.rename(columns={'predicted': 'filter-prediction'})

In [4]:
# Outer-join the two miss tables on example id (left = filtered, right = unfiltered).
# indicator=True adds a '_merge' column ('both' / 'left_only' / 'right_only') that
# records which of the two tables each id was found in.
merged_misses = filtered_misses.merge(
    unfiltered_misses,
    how='outer',
    on=['id'],
    indicator=True,
)

In [5]:
def get_gold(r):
    """Return the gold label for one merged row.

    Uses ``r['gold_x']`` when it is not null; otherwise falls back to
    ``r['gold_y']``. ``r`` can be anything indexable by those two keys
    (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    """
    left_gold = r['gold_x']
    return r['gold_y'] if pandas.isnull(left_gold) else left_gold

# Collapse the two gold columns into a single 'gold' column (row by row via get_gold),
# then discard the originals. The '_x' / '_y' names are presumably the default suffixes
# pandas added when both merged frames carried a 'gold' column.
merged_misses['gold'] = merged_misses.apply(get_gold, axis=1)
merged_misses = merged_misses.drop(columns=['gold_x', 'gold_y'])

In [6]:
# Read the three dataset splits and stack them into one frame, which is later joined
# onto the miss table by 'id'. No ignore_index is passed, so each split keeps its own
# row labels in `data`.
# NOTE(review): absolute, machine-specific paths.
train = pandas.read_json(
    r'C:\Users\JYellin\re_1\tacred\data\json-enhanced\train.json'
)
dev = pandas.read_json(
    r'C:\Users\JYellin\re_1\tacred\data\json-enhanced\dev.json'
)
test = pandas.read_json(
    r'C:\Users\JYellin\re_1\tacred\data\json-enhanced\test.json'
)
data = pandas.concat(objs=(train, dev, test))

In [7]:
# Attach the dataset columns to every miss. A left join keeps every miss row even if
# its id has no match in `data`.
merged_misses = merged_misses.merge(
    data,
    how='left',
    on=['id'],
)

In [8]:
# Partition the merged table using the '_merge' indicator:
#   'both'       -> present in both miss tables
#   'left_only'  -> present only in the filtered table
#   'right_only' -> present only in the unfiltered table
# The indicator is dropped afterwards, along with the prediction column belonging to
# the table the rows were absent from.
common_misses = (
    merged_misses
    .loc[merged_misses['_merge'] == 'both']
    .drop(columns=['_merge'])
)
filtered_only_misses = (
    merged_misses
    .loc[merged_misses['_merge'] == 'left_only']
    .drop(columns=['_merge', 'no-filter-prediction'])
)
unfiltered_only_misses = (
    merged_misses
    .loc[merged_misses['_merge'] == 'right_only']
    .drop(columns=['_merge', 'filter-prediction'])
)

In [9]:
# Print the row count of each miss table, one line per table, right-aligned to 4 digits.
for template, frame in (
    ('filtered miss count:       {:4d}', filtered_misses),
    ('unfiltered miss count:     {:4d}', unfiltered_misses),
    ('common miss count:         {:4d}', common_misses),
    ('filtered only miss count:  {:4d}', filtered_only_misses),
    ('unfiltered only miss count:{:4d}', unfiltered_only_misses),
):
    print(template.format(len(frame)))

filtered miss count:       1967
unfiltered miss count:     1971
common miss count:         1630
filtered only miss count:   337
unfiltered only miss count: 341


In [10]:
# Count filtered-only misses per gold label, largest count first, with a fresh 0..n-1 index.
filtered_only_misses_by_gold = (
    filtered_only_misses
    .groupby(['gold'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}, so both 'gold' and 'total' are
# keyed by row position, not by label.
filtered_only_misses_by_gold_dict = filtered_only_misses_by_gold.to_dict()

# Reverse map: gold label -> row position, used to find a label's entry in ['total'].
filtered_lookup_index = {
    gold: position
    for position, gold in filtered_only_misses_by_gold_dict['gold'].items()
}

In [11]:
# Count unfiltered-only misses per gold label, largest count first, with a fresh 0..n-1 index.
unfiltered_only_misses_by_gold = (
    unfiltered_only_misses
    .groupby(['gold'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}, so both 'gold' and 'total' are
# keyed by row position, not by label.
unfiltered_only_misses_by_gold_dict = unfiltered_only_misses_by_gold.to_dict()

# Reverse map: gold label -> row position, used to find a label's entry in ['total'].
unfiltered_lookup_index = {
    gold: position
    for position, gold in unfiltered_only_misses_by_gold_dict['gold'].items()
}

In [12]:
# Relations shown on the per-relation plot: every gold label appearing in the
# filtered-only table (in that table's order, i.e. descending miss count), followed by
# labels that appear only in the unfiltered-only table. A list is used rather than a
# set so that this ordering is preserved on the x-axis.
#
# dict.fromkeys() de-duplicates while keeping first-seen order. 'no_relation' is
# filtered out instead of list.remove()'d, so this cell no longer raises ValueError
# when neither table happens to contain that label.
relations = [
    relation
    for relation in dict.fromkeys(
        list(filtered_only_misses_by_gold_dict['gold'].values())
        + list(unfiltered_only_misses_by_gold_dict['gold'].values())
    )
    if relation != 'no_relation'
]

In [13]:
# Line plot of miss counts per gold relation: one trace for misses unique to the
# filtered table, one for misses unique to the unfiltered table. A relation that is
# absent from a table contributes 0 to that table's trace.
scatter1 = go.Scatter(
    x=list(relations),
    y=[
        # ['total'] is keyed by row position, so go through the label -> position map.
        filtered_only_misses_by_gold_dict['total'][filtered_lookup_index[relation]]
        if relation in filtered_lookup_index else 0
        for relation in relations
    ],
    mode='lines',
    name='filtered',
    marker=dict(color='#BC80BD'),
)

scatter2 = go.Scatter(
    x=list(relations),
    y=[
        unfiltered_only_misses_by_gold_dict['total'][unfiltered_lookup_index[relation]]
        if relation in unfiltered_lookup_index else 0
        for relation in relations
    ],
    mode='lines',
    name='unfiltered',
    marker=dict(color='#CCEBC5'),
)

layout = go.Layout(
    title='Miss Per Relation',
    # The x values are relation names; the previous label "Path Length" was a
    # copy-paste from the path-length plots.
    xaxis_title="Relation",
    yaxis_title="Number of Misses",
    barmode='overlay',
    width=1050,
    xaxis=go.layout.XAxis(
        tickangle=45,
        automargin=True,
    ),
)

fig = go.Figure(data=[scatter1, scatter2], layout=layout)

fig.show()

In [14]:
# Count filtered-only misses per 'ucca_path_len' value, largest count first, with a
# fresh 0..n-1 index.
filtered_only_misses_by_ucca = (
    filtered_only_misses
    .groupby(['ucca_path_len'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}: ['total'] is keyed by row position
# in the sorted table, NOT by path length.
filtered_only_misses_by_ucca_dict = filtered_only_misses_by_ucca.to_dict()

In [15]:
# Count unfiltered-only misses per 'ucca_path_len' value, largest count first, with a
# fresh 0..n-1 index.
unfiltered_only_misses_by_ucca = (
    unfiltered_only_misses
    .groupby(['ucca_path_len'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}: ['total'] is keyed by row position
# in the sorted table, NOT by path length.
unfiltered_only_misses_by_ucca_dict = unfiltered_only_misses_by_ucca.to_dict()

In [16]:
# Line plot of miss counts per UCCA path length (filtered-only vs. unfiltered-only).
#
# The *_by_ucca_dict['total'] dicts are keyed by row position in a table sorted by
# descending count, so indexing them with a path length returns the i-th largest count
# rather than the count for that length. Build explicit path-length -> count maps
# first; the two inner dicts share the same row-position keys in the same order, so
# zipping their values pairs each length with its own count.
filtered_ucca_totals = dict(zip(
    filtered_only_misses_by_ucca_dict['ucca_path_len'].values(),
    filtered_only_misses_by_ucca_dict['total'].values(),
))
unfiltered_ucca_totals = dict(zip(
    unfiltered_only_misses_by_ucca_dict['ucca_path_len'].values(),
    unfiltered_only_misses_by_ucca_dict['total'].values(),
))

# One shared range for x and y so both always have the same number of points
# (previously x had 11 values and y had 10).
# NOTE(review): -1 presumably marks examples with no UCCA path — confirm. Path lengths
# above 9 are not plotted.
ucca_path_lengths = list(range(-1, 10))

scatter1 = go.Scatter(
    x=ucca_path_lengths,
    y=[filtered_ucca_totals.get(length, 0) for length in ucca_path_lengths],
    mode='lines',
    name='filtered',
    marker=dict(color='#BC80BD'),
)

scatter2 = go.Scatter(
    x=ucca_path_lengths,
    y=[unfiltered_ucca_totals.get(length, 0) for length in ucca_path_lengths],
    mode='lines',
    name='unfiltered',
    marker=dict(color='#CCEBC5'),
)

layout = go.Layout(
    title='UCCA Path Length Comparison',
    xaxis_title="Path Length",
    yaxis_title="Number of Misses",
    barmode='overlay',
    width=900,
    xaxis=go.layout.XAxis(
        tickangle=45,
        automargin=True,
    ),
)

fig = go.Figure(data=[scatter1, scatter2], layout=layout)

fig.show()

In [17]:
# Count filtered-only misses per 'ud_path_len' value, largest count first, with a
# fresh 0..n-1 index.
filtered_only_misses_by_ud = (
    filtered_only_misses
    .groupby(['ud_path_len'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}: ['total'] is keyed by row position
# in the sorted table, NOT by path length.
filtered_only_misses_by_ud_dict = filtered_only_misses_by_ud.to_dict()

In [18]:
# Count unfiltered-only misses per 'ud_path_len' value, largest count first, with a
# fresh 0..n-1 index.
unfiltered_only_misses_by_ud = (
    unfiltered_only_misses
    .groupby(['ud_path_len'])['id']
    .count()
    .reset_index(name='total')
    .sort_values('total', ascending=False)
    .reset_index(drop=True)
)

# to_dict() yields {column: {row_position: value}}: ['total'] is keyed by row position
# in the sorted table, NOT by path length.
unfiltered_only_misses_by_ud_dict = unfiltered_only_misses_by_ud.to_dict()

In [19]:
# Line plot of miss counts per UD path length (filtered-only vs. unfiltered-only).
#
# The *_by_ud_dict['total'] dicts are keyed by row position in a table sorted by
# descending count, so indexing them with a path length returns the i-th largest count
# rather than the count for that length. Build explicit path-length -> count maps
# first; the two inner dicts share the same row-position keys in the same order, so
# zipping their values pairs each length with its own count.
filtered_ud_totals = dict(zip(
    filtered_only_misses_by_ud_dict['ud_path_len'].values(),
    filtered_only_misses_by_ud_dict['total'].values(),
))
unfiltered_ud_totals = dict(zip(
    unfiltered_only_misses_by_ud_dict['ud_path_len'].values(),
    unfiltered_only_misses_by_ud_dict['total'].values(),
))

# Path lengths shown on the x-axis; lengths outside 0..9 are not plotted.
ud_path_lengths = list(range(10))

scatter1 = go.Scatter(
    x=ud_path_lengths,
    y=[filtered_ud_totals.get(length, 0) for length in ud_path_lengths],
    mode='lines',
    name='filtered',
    marker=dict(color='#BC80BD'),
)

scatter2 = go.Scatter(
    x=ud_path_lengths,
    y=[unfiltered_ud_totals.get(length, 0) for length in ud_path_lengths],
    mode='lines',
    name='unfiltered',
    marker=dict(color='#CCEBC5'),
)

layout = go.Layout(
    title='UD Path Length Comparison',
    xaxis_title="Path Length",
    yaxis_title="Number of Misses",
    barmode='overlay',
    width=1200,
    xaxis=go.layout.XAxis(
        tickangle=45,
        automargin=True,
    ),
)

fig = go.Figure(data=[scatter1, scatter2], layout=layout)

fig.show()