## Data Collection

This notebook was used to analyze the errors made by the best features

In [1]:
import pandas as pd
from collections import defaultdict
from pprint import pprint
pd.options.display.max_rows = 1500

In [2]:
conf = pd.read_csv('../../code/data/data.csv')[['id','most_frequent_percentage']]
conf.set_index('id')
df_cols = ['id','caption','sentence','object_a','object_b','predicted','gold', 'most_frequent_percentage']
paths = pd.read_csv('../../code/data/middle_paths_unrestricted_16.csv')
paths['sentence'] = paths.apply(lambda row: row['sentence'][2:-2],axis=1)
data = pd.read_csv('../../all_data_files/experiments/results_2_classes/missclassified_binary_True.csv')
pd.options.display.max_colwidth = 100000

In [3]:
shared_3 = pd.read_csv('../../all_data_files/experiments/results_3_classes/errors/3_shared.csv')
se_3 = pd.read_csv('../../all_data_files/experiments/results_3_classes/errors/3_se.csv')
lex_3 = pd.read_csv('../../all_data_files/experiments/results_3_classes/errors/3_lex.csv')

In [4]:
paths[paths.paths.str.len() < 20].paths.value_counts()

[['NOPATH']]    399
Name: paths, dtype: int64

In [5]:
# all errors made by sentence embeddings
se = data[data.caption == "InferSent"]
se = se.join(conf,rsuffix="_r")[df_cols]
se['repeated'] = se.apply(lambda x: True if x.sentence in se_3.sentence.values.tolist() else False,axis=1)

In [6]:
# all errors for the lexnet feature
lex = data[data.caption == 'middle_paths_unrestricted_16']
#lex.set_index('id')
lex = lex.join(conf,rsuffix="_r")[df_cols]
lex['repeated'] = lex.apply(lambda x: True if x.sentence in lex_3.sentence.values.tolist() else False,axis=1)

In [7]:
# all errors exclusivly made by se
se_unique = se[~se.sentence.isin(lex.sentence.values.tolist())].copy()
se_unique['origin'] = 'se (unique)'

In [12]:
p = paths[['sentence','paths','id']].copy()
# errors exclusivly made by lexnet
lex_unique = lex[~lex.sentence.isin(se.sentence.values.tolist())].copy()
lex_unique['origin'] = 'lex (unique)'
lex_unique = lex_unique.join(p,rsuffix='r')
lex_unique = lex_unique.drop_duplicates(subset=['sentence'])

In [13]:
# all errors made by both classifiers
se_multi = se[~se.sentence.isin(se_unique.sentence.values.tolist())].copy()
lex_multi = lex[~lex.sentence.isin(lex_unique.sentence.values.tolist())].copy()
se_multi['origin'] = 'both'
lex_multi['origin'] = 'both'
assert sorted(se_multi.id.values.tolist()) == sorted(lex_multi.id.values.tolist())
shared = se_multi.copy()
shared['caption'] = 'shared'
shared['repeated'] = shared.apply(lambda x: True if x.sentence in shared_3.sentence.values.tolist() else False,axis=1)
shared['pred_if'] = shared.apply(lambda x: se_multi[se_multi.sentence == x['sentence']].predicted.values.tolist()[0],axis=1)
shared['pred_lex'] = shared.apply(lambda x: lex_multi[lex_multi.sentence == x['sentence']].predicted.values.tolist()[0],axis=1)
shared = shared.set_index('id')
shared = shared.join(conf,rsuffix="_r")[['sentence','object_a','object_b','pred_if', 'pred_lex', 'gold', 'most_frequent_percentage','repeated']]

In [14]:
print('Shared {}'.format(len(shared)))
print('SE Unique {} | Lex Unique {}'.format(len(se_unique),len(lex_unique)))
print('SE {} | Lex {}'.format(len(se),len(lex)))
assert len(shared) + len(se_unique) == len(se)
assert len(shared) + len(lex_unique) == len(lex)

Shared 380
SE Unique 283 | Lex Unique 520
SE 663 | Lex 900


In [15]:
def class_confusion(df):
    d2 = defaultdict(int)
    d = defaultdict(int)
    for i, row in df.iterrows():
        k = 'Pred: {} - Gold: {}'.format(row['predicted'],row['gold'])
        d[k] += 1
    print('\n')
    pprint(sorted(d.items(),key=lambda x: x[1],reverse=True))

In [16]:
def analyze_errors(df,conf=True):
    df['s_len'] = df.apply(lambda x: len(x['sentence']),axis=1)
    print('### Confidence')
    print(df.most_frequent_percentage.value_counts())
    print('### Repeated Errors')
    print(df.repeated.value_counts())
    if conf:
        print('\n### Confusion')
        class_confusion(df)
    print('\n### Average sentence length')
   
    print(df.s_len.describe())
    print(df.groupby('most_frequent_percentage').mean().sort_values('s_len',ascending=False))
    

## Sentence Embedding Errors

In [17]:
analyze_errors(se_unique)

### Confidence
1.0    198
0.8     51
0.6     27
0.4      6
0.5      1
Name: most_frequent_percentage, dtype: int64
### Repeated Errors
False    146
True     137
Name: repeated, dtype: int64

### Confusion


[('Pred: ARG - Gold: NONE', 164), ('Pred: NONE - Gold: ARG', 119)]

### Average sentence length
count    283.000000
mean     131.872792
std       71.705303
min       24.000000
25%       82.500000
50%      116.000000
75%      161.000000
max      498.000000
Name: s_len, dtype: float64
                                   id  repeated       s_len
most_frequent_percentage                                   
0.4                       1318.333333  0.500000  168.333333
1.0                       1257.691919  0.489899  132.772727
0.8                       1242.705882  0.450980  129.137255
0.6                       1258.481481  0.518519  124.555556
0.5                       1140.000000  0.000000   72.000000


In [18]:
se_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','id'])

Unnamed: 0,sentence,object_a,object_b,predicted,gold,most_frequent_percentage,repeated,s_len
2919,"I haven't tried it with this modem, but on my ADSL line the superior performance of Ethernet over USB modems is striking, particularly in terms of responsiveness.",Ethernet,USB,NONE,ARG,1.0,True,162
3360,"Talking about the apps, the OS have some of the integrated apps by Sony which might have a better replacement in the Google Play Store, but they are well mixed with the present UI.",Sony,Google,ARG,NONE,1.0,False,180
3290,"A further advantage of perl or Python (or ruby) would be that these should all have libraries for interfacing with matlab, and much better data structures than Bash for tables.",Python,Bash,NONE,ARG,1.0,True,176
3291,Glass Fiber reinforced plastic technology is an innovative technology that is rapidly replacing the conventional materials such as concrete due to its superior properties.,plastic,concrete,NONE,ARG,1.0,False,171
3292,"I actually use my own custom-rolled flavor of Haml that is less Ruby-like, and I think it maps a bit better to the stylesheet and JavaScript.",Ruby,JavaScript,NONE,ARG,1.0,False,141
3293,"and dont even start with the ""Microsoft greed monopoly"" thing, Apple is 10X worse""",Microsoft,Apple,NONE,ARG,1.0,False,82
3294,"Learned Perl, more specifically the AWK subset, at the same job to make my life easier.",Perl,AWK,ARG,NONE,1.0,False,87
3296,The new Windows 7 is said to be so much better than the problems ridden Windows Vista and is a considerable improvement on Windows XP.,Windows Vista,Windows XP,NONE,ARG,1.0,False,134
3298,"Robert Kubica was second fastest in his BMW Sauber, just four-tenths shy of pace-setter Glock while Nico Rosberg posted the third best time just a fraction slower in his Williams Toyota.",BMW,Toyota,ARG,NONE,1.0,True,186
3303,"I chose a BMW over a Ford Mondeo because it's better put together, more powerful, more comfortable and is more attractive (to me), for the same price (second hand, at the time).",BMW,Ford,NONE,ARG,1.0,True,177


## LexNet Errors

In [19]:
analyze_errors(lex_unique)

### Confidence
1.000000    360
0.800000     84
0.600000     52
0.400000     12
0.833333      6
0.500000      3
0.666667      2
0.714286      1
Name: most_frequent_percentage, dtype: int64
### Repeated Errors
True     281
False    239
Name: repeated, dtype: int64

### Confusion


[('Pred: NONE - Gold: ARG', 317), ('Pred: ARG - Gold: NONE', 203)]

### Average sentence length
count    520.000000
mean     128.096154
std       75.299239
min       17.000000
25%       81.000000
50%      109.000000
75%      158.000000
max      520.000000
Name: s_len, dtype: float64
                                   id  repeated       s_len
most_frequent_percentage                                   
0.833333                  1156.000000  0.666667  178.500000
0.800000                  1150.035714  0.595238  140.583333
1.000000                  1141.180556  0.525000  127.950000
0.600000                  1143.961538  0.557692  114.961538
0.714286                   869.000000  1.000000  114.000000
0.666667        

In [20]:
temp = lex_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','sentencer','s_len'])
np = temp[temp.paths.str.startswith("[['NOP")]
print('Sentences without path: {}'.format(len(np)))

Sentences without path: 37


In [21]:
temp[~temp.repeated]


Unnamed: 0,id,sentence,object_a,object_b,predicted,gold,most_frequent_percentage,repeated,paths,idr
1788,1511,"missouri is much easier, since you just throw them in with Illinois and Northwestern and put them in the michigan pod.",missouri,michigan,ARG,NONE,1.0,False,[['X/PROPN/ROOT/^ Y/PROPN/conj/<']],L1754036133
2089,1529,The coffee really improved a mediocre base beer.,coffee,beer,NONE,ARG,1.0,False,"[['X/PROPN/ROOT/^ Y/PROPN/conj/<'], ['X/PROPN/ROOT/^ and/CONJ/cc/V Y/PROPN/conj/<']]",A47388539
1802,704,"All my friends who have computers was still running Windows 98, and I recall that Windows XP ran muuuuuuuuuuuch faster than 98 in my (still outdated) AMD Athlon 700MHz.",Windows 98,Windows XP,NONE,ARG,1.0,False,"[['X/ADJ/amod/> Y/PROPN/ROOT/^'], ['X/ADJ/amod/> bunting/NOUN/compound/V Y/PROPN/ROOT/^'], ['X/ADJ/amod/> byd/PROPN/compound/V Y/PROPN/ROOT/^']]",N2155322514
1800,229,harvard Head Coach Tim Murphy opening statement: Obviously it as a very solid win over princeton last week.,harvard,princeton,ARG,NONE,1.0,False,[['NOPATH']],R2936176963
2093,2298,"If REAL football turns your stomach, better watch tennis... Oakland vs. Denver: An AFC West pillowfight.",football,tennis,NONE,ARG,1.0,False,"[['X/PROPN/npadvmod/> base/VERB/amod/> be/VERB/ROOT/^ than/ADP/prep/<', 'liquid/NOUN/pobj/< as/ADP/prep/< juice/NOUN/pobj/< Y/PROPN/conj/<']]",E613641206
1796,2276,"Google is the main player now, Microsoft are just plain inferior in Mobile.",Google,Microsoft,NONE,ARG,1.0,False,"[['X/PROPN/ROOT/^ Y/PROPN/conj/<'], ['X/PROPN/ROOT/^ and/CONJ/cc/V Y/PROPN/conj/<']]",O2345744622
2094,1346,"Compare the Cloud released new rankings indicating Amazon Web Services, though a perennial cloud services frontrunner, is experiencing slower growth and is losing the ""mind share"" battle with competitors such as IBM and Microsoft.",Amazon,Microsoft,NONE,ARG,1.0,False,"[['X/PROPN/nsubj/> be/VERB/ROOT/^ slow/ADJ/acomp/< than/ADP/prep/<', 'Y/PROPN/pobj/<']]",T3190503368
2102,1648,Search using Google for V300_525fw_Win to find the download page on the Kingston web site.,Google,Kingston,ARG,NONE,1.0,False,"[['X/PROPN/ROOT/^ Y/PROPN/conj/<'], ['X/PROPN/ROOT/^ and/CONJ/cc/V Y/PROPN/conj/<']]",S2981371421
1808,1,"Note, that because of my own terrible experiences with Windows Vista, I had dutifully remained on Windows XP, accepting the fact that my environment continued to get slower and slower in between more frequent cleaning reinstall cycles.",Windows Vista,Windows XP,NONE,ARG,1.0,False,"[['X/PROPN/compound/> include/VERB/ROOT/^ Y/PROPN/appos/<'], ['X/PROPN/compound/> include/VERB/ROOT/^ ex/PROPN/compound/V', 'Y/PROPN/appos/<']]",M1941541803
1782,239,Wi-SB BOX brings superior WiFi and Bluetooth signal quality and even provides 2 front USB 3.0 ports.,Bluetooth,USB,ARG,NONE,1.0,False,"[['X/PROPN/compound/> be/VERB/ROOT/^ slow/ADJ/advmod/< paper/NOUN/conj/<', 'Y/PROPN/conj/<']]",N2136561640


## Shared

In [22]:
analyze_errors(shared,conf=False)

### Confidence
1.000000    270
0.800000     55
0.600000     38
0.400000     11
0.833333      4
0.666667      1
0.500000      1
Name: most_frequent_percentage, dtype: int64
### Repeated Errors
True     321
False     59
Name: repeated, dtype: int64

### Average sentence length
count    380.000000
mean     122.015789
std       67.352182
min       16.000000
25%       77.000000
50%      105.000000
75%      151.250000
max      510.000000
Name: s_len, dtype: float64
                          repeated       s_len
most_frequent_percentage                      
0.666667                  1.000000  239.000000
0.400000                  0.909091  169.363636
0.600000                  0.815789  130.105263
0.800000                  0.854545  128.472727
0.833333                  1.000000  119.750000
1.000000                  0.840741  117.544444
0.500000                  1.000000   38.000000


In [23]:
shared[~shared.repeated].sample(20)

Unnamed: 0,sentence,object_a,object_b,pred_if,pred_lex,gold,most_frequent_percentage,repeated,s_len
154,But my biggest worry is that the tech press will continue to do what they always do and ignore a superior product from Microsoft and flog an inferior offering because it's from Google.,Microsoft,Google,NONE,NONE,ARG,1.0,False,184
581,"Now for some reason the more I learn Python, the faster I can do things in Java.",Python,Java,ARG,ARG,NONE,1.0,False,80
1234,"Is Sigma so technologically deprived that it lags so far behind the ""big boys"" (Canon, Nikon, Sony, etc.) in the business that they could not do better?",Sigma,Nikon,ARG,ARG,NONE,1.0,False,152
517,"Instead, they've been able to score against everyone except harvard, but Ashley Holt has had the better numbers in goal after Newell had a tough night versus cornell.",harvard,cornell,ARG,ARG,NONE,0.8,False,166
1334,"It'll get better, but it's baseball, not basketball.",baseball,basketball,ARG,ARG,NONE,0.6,False,52
576,"It's light as aluminum, but harder than steel,"" he said.",aluminum,steel,ARG,ARG,NONE,1.0,False,56
1438,Google Chrome is even worse on my old IBM ThinkPad.,Google,IBM,ARG,ARG,NONE,1.0,False,51
573,OO in Ruby or Smalltalk is very different than OO in C# or Java.,Smalltalk,Java,ARG,ARG,NONE,1.0,False,64
822,"In our panel on The Future of Code I observed that IBM was a company that had gone through a near-death experience yet had come through to the other side wiser and stronger, that Microsoft was a company undergoing a midlife crisis, and that Google was a company in serious need of adult supervision - at which point Vint Cerf piped up from the audience saying ""why do you think they hired me?"" Another highlight of these past months was my appointment to the board of trustees for the Computer History Museum .",IBM,Google,ARG,ARG,NONE,0.4,False,510
1879,"amazon is much better, as is netflix, and hulu.",amazon,hulu,ARG,ARG,NONE,1.0,False,47


## All

In [24]:
a = pd.concat([shared,lex_unique,se_unique])
analyze_errors(a)

### Confidence
1.000000    828
0.800000    190
0.600000    117
0.400000     29
0.833333     10
0.500000      5
0.666667      3
0.714286      1
Name: most_frequent_percentage, dtype: int64
### Repeated Errors
True     739
False    444
Name: repeated, dtype: int64

### Confusion


[('Pred: NONE - Gold: ARG', 436),
 ('Pred: ARG - Gold: NONE', 367),
 ('Pred: nan - Gold: ARG', 246),
 ('Pred: nan - Gold: NONE', 134)]

### Average sentence length
count    1183.000000
mean      127.046492
std        72.007170
min        16.000000
25%        80.000000
50%       110.000000
75%       158.000000
max       520.000000
Name: s_len, dtype: float64
                                   id  repeated       s_len
most_frequent_percentage                                   
0.833333                  1156.000000  0.800000  155.000000
0.666667                  1006.000000  0.666667  144.333333
0.400000                  1497.500000  0.586207  137.827586
0.800000                  1185.044444  0.631579  134.005263


## Misc

In [25]:
different = []
for i, row in shared.iterrows():
    se_s = se_multi[se_multi.sentence == row['sentence']].predicted.values.tolist()[0]
    lex_s = lex_multi[lex_multi.sentence == row['sentence']].predicted.values.tolist()[0]
    if se_s != lex_s:
        different.append(row['sentence'])
print('Different mistakes on same sentence: {}'.format(len(different)))


Different mistakes on same sentence: 0


In [26]:
pd.concat([shared,lex_unique,se_unique]).to_csv('2_classes_errors.csv')