## Data Collection

This notebook was used to analyze the errors made by the best features

In [129]:
import pandas as pd
from collections import defaultdict
from pprint import pprint
pd.options.display.max_rows = 1500

In [2]:
conf = pd.read_csv('../../code/data/data_if.csv')[['id','most_frequent_percentage']]
conf.set_index('id')
df_cols = ['id','caption','sentence','object_a','object_b','predicted','gold', 'most_frequent_percentage']
paths = pd.read_csv('middle_paths_unrestricted_16.csv')
paths['sentence'] = paths.apply(lambda row: row['sentence'][2:-2],axis=1)
data = pd.read_csv('binary_False.csv')
pd.options.display.max_colwidth = 100000

In [187]:
paths[paths.paths.str.len() < 20].paths.value_counts()

[['NOPATH']]    390
Name: paths, dtype: int64

In [3]:
# all errors made by sentence embeddings
se = data[data.caption == "InferSent"]
se = se.join(conf,rsuffix="_r")[df_cols]

In [184]:
# all errors for the lexnet feature
lex = data[data.caption == 'middle_paths_unrestricted_16']
#lex.set_index('id')
#lex = lex.join(conf,rsuffix="_r")[df_cols]
print(len(lex))

1055


In [5]:
# all errors exclusivly made by se
se_unique = se[~se.sentence.isin(lex.sentence.values.tolist())].copy()
se_unique['origin'] = 'se (unique)'

In [80]:
p = paths[['sentence','paths','id']].copy()
p = p.set_index('id')

# errors exclusivly made by lexnet
lex_unique = lex[~lex.sentence.isin(se.sentence.values.tolist())].copy()
lex_unique['origin'] = 'lex (unique)'
lex_unique = lex_unique.set_index('id')
lex_unique = lex_unique.join(p,rsuffix='r')
lex_unique = lex_unique.drop_duplicates(subset=['sentence'])

In [220]:
# all errors made by both classifiers
se_multi = se[~se.sentence.isin(se_unique.sentence.values.tolist())].copy()
lex_multi = lex[~lex.sentence.isin(lex_unique.sentence.values.tolist())].copy()
se_multi['origin'] = 'both'
lex_multi['origin'] = 'both'
assert sorted(se_multi.id.values.tolist()) == sorted(lex_multi.id.values.tolist())
shared = se_multi.copy()
shared['caption'] = 'shared'
shared['pred_if'] = shared.apply(lambda x: se_multi[se_multi.sentence == x['sentence']].predicted.values.tolist()[0],axis=1)
shared['pred_lex'] = shared.apply(lambda x: lex_multi[lex_multi.sentence == x['sentence']].predicted.values.tolist()[0],axis=1)
shared = shared.set_index('id')
shared = shared.join(conf,rsuffix="_r")[['sentence','object_a','object_b','pred_if', 'pred_lex', 'gold', 'most_frequent_percentage']]

In [218]:
print('Shared {}'.format(len(shared)))
print('SE Unique {} | Lex Unique {}'.format(len(se_unique),len(lex_unique)))
print('SE {} | Lex {}'.format(len(se),len(lex)))
assert len(shared) + len(se_unique) == len(se)
assert len(shared) + len(lex_unique) == len(lex)

Shared 570
SE Unique 219 | Lex Unique 485
SE 789 | Lex 1055


In [120]:
def class_confusion(df):
    d2 = defaultdict(int)
    d = defaultdict(int)
    for i, row in df.iterrows():
        k = 'Pred: {} - Gold: {}'.format(row['predicted'],row['gold'])
        k2 = '-'.join(sorted([row['predicted'],row['gold']]))
        d[k] += 1
        d2[k2]+=1
    pprint(sorted(d2.items(),key=lambda x: x[1],reverse=True))
    print('\n')
    pprint(sorted(d.items(),key=lambda x: x[1],reverse=True))

In [161]:
def analyze_errors(df):
    df['s_len'] = df.apply(lambda x: len(x['sentence']),axis=1)
    print('### Confidence')
    print(df.most_frequent_percentage.value_counts())
    print('\n### Confusion')
    class_confusion(df)
    print('\n### Average sentence length')
    print(df.s_len.describe())
    print(df.groupby('most_frequent_percentage').mean().sort_values('s_len',ascending=False))
    

## Sentence Embedding Errors

In [159]:
analyze_errors(se_unique)

Confidence
1.000000    156
0.800000     30
0.600000     26
0.400000      6
0.666667      1
Name: most_frequent_percentage, dtype: int64

Confusion
[('BETTER-NONE', 135), ('NONE-WORSE', 52), ('BETTER-WORSE', 32)]


[('Pred: BETTER - Gold: NONE', 84),
 ('Pred: NONE - Gold: BETTER', 51),
 ('Pred: WORSE - Gold: NONE', 40),
 ('Pred: WORSE - Gold: BETTER', 19),
 ('Pred: BETTER - Gold: WORSE', 13),
 ('Pred: NONE - Gold: WORSE', 12)]

Average sentence length
count    219.000000
mean     128.534247
std       71.279412
min       27.000000
25%       77.500000
50%      113.000000
75%      162.000000
max      498.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.666667                  187.000000
0.400000                  146.666667
1.000000                  129.019231
0.600000                  126.384615
0.800000                  122.300000


In [162]:
#se_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','id'])

## LexNet Errors

In [164]:
analyze_errors(lex_unique)

### Confidence
1.000000    336
0.800000     73
0.600000     62
0.400000      8
0.833333      3
0.666667      2
0.857143      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 314), ('BETTER-WORSE', 87), ('NONE-WORSE', 84)]


[('Pred: NONE - Gold: BETTER', 200),
 ('Pred: BETTER - Gold: NONE', 114),
 ('Pred: BETTER - Gold: WORSE', 63),
 ('Pred: NONE - Gold: WORSE', 51),
 ('Pred: WORSE - Gold: NONE', 33),
 ('Pred: WORSE - Gold: BETTER', 24)]

### Average sentence length
count    485.000000
mean     124.338144
std       72.187350
min       21.000000
25%       78.000000
50%      105.000000
75%      152.000000
max      536.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.833333                  142.000000
1.000000                  127.547619
0.600000                  120.951613
0.800000                  118.095890
0.666667                  106.000000
0.400000                   80.750000
0.857143       

In [175]:
temp = lex_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','sentencer','s_len'])
np = temp[temp.paths.str.startswith("[['NOP")]
print('Sentences without path: {}'.format(len(np)))

Sentences without path: 41


In [188]:
temp

Unnamed: 0_level_0,sentence,object_a,object_b,predicted,gold,most_frequent_percentage,paths
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A102026242,7. google Plus is evolving faster than facebook,google,facebook,NONE,BETTER,1.0,[['X/PROPN/compound/> evolve/VERB/ROOT/^ than/ADP/prep/< Y/PROPN/pobj/<']]
K1654161907,Love the plastic....so much better than the Kingdome's green concrete.,plastic,concrete,NONE,BETTER,1.0,"[['X/NOUN/ROOT/^ Y/PROPN/pobj/<'], ['X/NOUN/ROOT/^ good/ADJ/amod/V Y/PROPN/pobj/<'], ['X/NOUN/ROOT/^ kingdome/PROPN/poss/V Y/PROPN/pobj/<'], ['X/NOUN/ROOT/^ green/ADJ/amod/V Y/PROPN/pobj/<']]"
L1818525389,"No one is saying ""BMW makes a better one,"" or ""Honda makes a better one,"" we're JUST SAYING THAT THE VOLT SUCKS, PERIOD!!!",BMW,Honda,WORSE,NONE,1.0,[['X/PROPN/nsubj/> make/VERB/ROOT/^ Y/PROPN/conj/<']]
L1813335558,Nokia Lumia 920 has the best screen - 2.4x faster than Samsung S3's screen and 5.4x faster than iPhone 5's screen (tested with Lagom test).,Nokia,Samsung,WORSE,BETTER,1.0,[['X/PROPN/compound/> have/VERB/ROOT/^ than/ADP/prep/< Y/PROPN/pobj/<']]
L1806111729,"Nissan , which was able to return to normal production levels much faster that Honda and Toyota, reported a 19.2 percent gain.",Nissan,Toyota,NONE,BETTER,1.0,"[['X/PROPN/ROOT/^ Y/PROPN/conj/<'], ['X/PROPN/ROOT/^ be/VERB/relcl/V Y/PROPN/conj/<']]"
L1795713475,New Ferrari better than 2009 Renault - Alonso,Ferrari,Renault,NONE,BETTER,1.0,[['NOPATH']]
L1791534312,NetClipse is an Eclipse plugin for NetBeans.,Eclipse,NetBeans,BETTER,NONE,1.0,[['X/PROPN/compound/> plugin/NOUN/ROOT/^ for/ADP/prep/< Y/PROPN/pobj/<']]
L1790053271,"Neither is a clear winner on performance; Perl is faster for some things, Python for others.",Perl,Python,NONE,BETTER,1.0,[['X/PROPN/nsubj/> be/VERB/ROOT/^ Y/PROPN/npadvmod/<']]
L1789037833,"Need to buy the Sigma dock to unlock fastest autofocus, and even then, it feels slightly slower than my Nikon 70-200 VRII.",Sigma,Nikon,BETTER,WORSE,1.0,"[['X/ADJ/amod/> feel/VERB/ROOT/^ than/ADP/prep/< Y/PROPN/pobj/<'], ['X/ADJ/amod/> feel/VERB/ROOT/^ than/ADP/prep/< my/ADJ/poss/V', 'Y/PROPN/pobj/<']]"
L1784833999,Nat Pryce noticed that collections were processed easier (and with less code) in the Haskell and Smalltalk solutions than in Java.,Haskell,Java,NONE,BETTER,1.0,"[['X/PROPN/nsubj/> solution/NOUN/ROOT/^ than/ADP/prep/< in/ADP/prep/<', 'Y/PROPN/pobj/<'], ['X/PROPN/nsubj/> and/CONJ/cc/V solution/NOUN/ROOT/^ than/ADP/prep/<', 'in/ADP/prep/< Y/PROPN/pobj/<'], ['X/PROPN/nsubj/> smalltalk/PROPN/conj/V solution/NOUN/ROOT/^', 'than/ADP/prep/< in/ADP/prep/< Y/PROPN/pobj/<']]"


## Shared

In [178]:
analyze_errors(shared)

### Confidence
1.000000    400
0.800000     95
0.600000     55
0.400000     13
0.833333      4
0.500000      2
0.333333      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 258), ('NONE-WORSE', 227), ('BETTER-WORSE', 85)]


[('Pred: NONE - Gold: WORSE', 210),
 ('Pred: NONE - Gold: BETTER', 191),
 ('Pred: BETTER - Gold: WORSE', 76),
 ('Pred: BETTER - Gold: NONE', 67),
 ('Pred: WORSE - Gold: NONE', 17),
 ('Pred: WORSE - Gold: BETTER', 9)]

### Average sentence length
count    570.000000
mean     120.214035
std       64.024928
min       16.000000
25%       76.250000
50%      105.000000
75%      147.000000
max      368.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.333333                  150.000000
0.600000                  124.218182
0.800000                  122.000000
1.000000                  120.247500
0.400000                  115.846154
0.833333                   63.250000
0.500000       

In [228]:
shared[shared.most_frequent_percentage < 0.7].sample(20)

Unnamed: 0,sentence,object_a,object_b,pred_if,pred_lex,gold,most_frequent_percentage
M2054175207,"Python/Django/GeoDjango are a mature combination, with a somewhat slower, steadier development pace than Ruby/Rails/RGeo, which ...",Python,Ruby,WORSE,NONE,BETTER,0.6
I1260304340,"In fact, they look even worse than the ps2 visuals, since the psp doesn't have quite as much graphical firepower.",ps2,psp,WORSE,NONE,BETTER,0.666667
F835980058,"Groovy code often looks and feels like Java code, but is almost always simpler and easier to use.",Groovy,Java,NONE,NONE,BETTER,0.4
J1432575082,"Its 362 nit brightness level is one of the best we've seen, improving on the MacBook and Samsung, but the Lenovo's 646:1 contrast ratio is mediocre - the Toshiba Kira hit a mighty 2,326:1.",Samsung,Toshiba,WORSE,WORSE,NONE,0.4
M1988309762,Perl is slower and faster than Java,Perl,Java,WORSE,WORSE,BETTER,0.6
N2243612645,Solid beer but not my cup of tea.,beer,tea,NONE,BETTER,WORSE,0.6
S2969102354,"Unfortunately, the Bluetooth mouse goes through batteries much faster than ""regular"" USB wireless mice do.",Bluetooth,USB,BETTER,BETTER,WORSE,0.6
S3082464864,We say 'Java is the gateway to hell.' Lisp is just a better alternative to the intelligent Internet: It runs on 14 different platforms and doesn't need any type of virtual machine because it compiles directly to native machine instructions on all popular microprocessor architectures.',Java,Lisp,NONE,NONE,WORSE,0.6
J1513178281,I was also surprised at how robust the query planner and optimizers are compared to Oracle when our performance testing found PostgreSQL to be faster.,Oracle,PostgreSQL,NONE,NONE,WORSE,0.6
P2583885800,"The NP900X4C was decidedly mediocre at gaming tests, thanks to its Intel HD Graphics 4000 integrated graphics, but the 62Wh battery let it last a long time on the MobileMark battery test: just short of 7 hours, ahead of the Zenbook Prime UX32VD (5:26) and the Sony VAIO E15 (4:34) by a wide margin.",Intel,Sony,NONE,NONE,BETTER,0.6


## All

In [207]:
analyze_errors(pd.concat([shared,lex_unique,se_unique]))

### Confidence
1.000000    889
0.800000    198
0.600000    142
0.400000     27
0.833333     11
0.666667      4
0.857143      2
0.714286      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 677), ('NONE-WORSE', 369), ('BETTER-WORSE', 228)]


[('Pred: NONE - Gold: BETTER', 415),
 ('Pred: NONE - Gold: WORSE', 276),
 ('Pred: BETTER - Gold: NONE', 262),
 ('Pred: BETTER - Gold: WORSE', 149),
 ('Pred: WORSE - Gold: NONE', 93),
 ('Pred: WORSE - Gold: BETTER', 79)]

### Average sentence length
count    1274.000000
mean      123.214286
std        68.505059
min        16.000000
25%        77.000000
50%       106.000000
75%       153.000000
max       536.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.666667                  128.000000
0.714286                  127.000000
0.600000                  124.816901
1.000000                  124.349831
0.400000                  122.851852
0.833333                

## Misc

In [199]:
different = []
for i, row in shared.iterrows():
    se_s = se_multi[se_multi.sentence == row['sentence']].predicted.values.tolist()[0]
    lex_s = lex_multi[lex_multi.sentence == row['sentence']].predicted.values.tolist()[0]
    if se_s != lex_s:
        different.append(row['sentence'])
print('Different mistakes on same sentence: {}'.format(len(different)))


Different mistakes on same sentence: 113
