## Data Collection

This notebook was used to analyze the errors made by the best features

In [129]:
import pandas as pd
from collections import defaultdict
from pprint import pprint
pd.options.display.max_rows = 1500

In [2]:
conf = pd.read_csv('../../code/data/data_if.csv')[['id','most_frequent_percentage']]
conf.set_index('id')
df_cols = ['id','caption','sentence','object_a','object_b','predicted','gold', 'most_frequent_percentage']
paths = pd.read_csv('middle_paths_unrestricted_16.csv')
paths['sentence'] = paths.apply(lambda row: row['sentence'][2:-2],axis=1)
data = pd.read_csv('binary_False.csv')
pd.options.display.max_colwidth = 100000

In [3]:
# all errors made by sentence embeddings
se = data[data.caption == "InferSent"]
se = se.join(conf,rsuffix="_r")[df_cols]

In [67]:
# all errors for the lexnet feature
lex = data[data.caption == 'middle_paths_unrestricted_16']
lex.set_index('id')
lex = lex.join(conf,rsuffix="_r")[df_cols]
lex[lex.id == 'F801586602']

Unnamed: 0,id,caption,sentence,object_a,object_b,predicted,gold,most_frequent_percentage
1899,F801586602,middle_paths_unrestricted_16,"Generally, both Amazon and Google are superior to Apple in managing recommendations.",Amazon,Apple,NONE,BETTER,0.4


In [5]:
# all errors exclusivly made by se
se_unique = se[~se.sentence.isin(lex.sentence.values.tolist())].copy()
se_unique['origin'] = 'se (unique)'

In [80]:
p = paths[['sentence','paths','id']].copy()
p = p.set_index('id')

# errors exclusivly made by lexnet
lex_unique = lex[~lex.sentence.isin(se.sentence.values.tolist())].copy()
lex_unique['origin'] = 'lex (unique)'
lex_unique = lex_unique.set_index('id')
lex_unique = lex_unique.join(p,rsuffix='r')
lex_unique = lex_unique.drop_duplicates(subset=['sentence'])

In [81]:
# all errors made by both classifiers
se_multi = se[~se.sentence.isin(se_unique.sentence.values.tolist())].copy()
lex_multi = lex[~lex.sentence.isin(lex_unique.sentence.values.tolist())].copy()
se_multi['origin'] = 'both'
lex_multi['origin'] = 'both'
assert sorted(se_multi.id.values.tolist()) == sorted(lex_multi.id.values.tolist())
shared = lex_multi
shared = shared.set_index('id')

In [84]:
print('Shared {}'.format(len(shared)))
print('SE Unique {} | Lex Unique {}'.format(len(se_unique),len(lex_unique)))
print('SE {} | Lex {}'.format(len(se),len(lex)))
assert len(shared) + len(se_unique) == len(se)
assert len(shared) + len(lex_unique) == len(lex)

Shared 570
SE Unique 219 | Lex Unique 485
SE 789 | Lex 1055


In [120]:
def class_confusion(df):
    d2 = defaultdict(int)
    d = defaultdict(int)
    for i, row in df.iterrows():
        k = 'Pred: {} - Gold: {}'.format(row['predicted'],row['gold'])
        k2 = '-'.join(sorted([row['predicted'],row['gold']]))
        d[k] += 1
        d2[k2]+=1
    pprint(sorted(d2.items(),key=lambda x: x[1],reverse=True))
    print('\n')
    pprint(sorted(d.items(),key=lambda x: x[1],reverse=True))

In [161]:
def analyze_errors(df):
    df['s_len'] = df.apply(lambda x: len(x['sentence']),axis=1)
    print('### Confidence')
    print(df.most_frequent_percentage.value_counts())
    print('\n### Confusion')
    class_confusion(df)
    print('\n### Average sentence length')
    print(df.s_len.describe())
    print(df.groupby('most_frequent_percentage').mean().sort_values('s_len',ascending=False))
    

## Sentence Embedding Errors

In [159]:
analyze_errors(se_unique)

Confidence
1.000000    156
0.800000     30
0.600000     26
0.400000      6
0.666667      1
Name: most_frequent_percentage, dtype: int64

Confusion
[('BETTER-NONE', 135), ('NONE-WORSE', 52), ('BETTER-WORSE', 32)]


[('Pred: BETTER - Gold: NONE', 84),
 ('Pred: NONE - Gold: BETTER', 51),
 ('Pred: WORSE - Gold: NONE', 40),
 ('Pred: WORSE - Gold: BETTER', 19),
 ('Pred: BETTER - Gold: WORSE', 13),
 ('Pred: NONE - Gold: WORSE', 12)]

Average sentence length
count    219.000000
mean     128.534247
std       71.279412
min       27.000000
25%       77.500000
50%      113.000000
75%      162.000000
max      498.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.666667                  187.000000
0.400000                  146.666667
1.000000                  129.019231
0.600000                  126.384615
0.800000                  122.300000


In [162]:
#se_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','id'])

## LexNet Errors

In [164]:
analyze_errors(lex_unique)

### Confidence
1.000000    336
0.800000     73
0.600000     62
0.400000      8
0.833333      3
0.666667      2
0.857143      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 314), ('BETTER-WORSE', 87), ('NONE-WORSE', 84)]


[('Pred: NONE - Gold: BETTER', 200),
 ('Pred: BETTER - Gold: NONE', 114),
 ('Pred: BETTER - Gold: WORSE', 63),
 ('Pred: NONE - Gold: WORSE', 51),
 ('Pred: WORSE - Gold: NONE', 33),
 ('Pred: WORSE - Gold: BETTER', 24)]

### Average sentence length
count    485.000000
mean     124.338144
std       72.187350
min       21.000000
25%       78.000000
50%      105.000000
75%      152.000000
max      536.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.833333                  142.000000
1.000000                  127.547619
0.600000                  120.951613
0.800000                  118.095890
0.666667                  106.000000
0.400000                   80.750000
0.857143       

In [175]:
temp = lex_unique.sort_values('most_frequent_percentage',ascending=False).drop(columns=['caption','origin','sentencer','s_len'])
np = temp[temp.paths.str.startswith("[['NOP")]
print('Sentences without path: {}'.format(len(np)))

Sentences without path: 41


In [177]:
#temp

## Shared

In [178]:
analyze_errors(shared)

### Confidence
1.000000    400
0.800000     95
0.600000     55
0.400000     13
0.833333      4
0.500000      2
0.333333      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 258), ('NONE-WORSE', 227), ('BETTER-WORSE', 85)]


[('Pred: NONE - Gold: WORSE', 210),
 ('Pred: NONE - Gold: BETTER', 191),
 ('Pred: BETTER - Gold: WORSE', 76),
 ('Pred: BETTER - Gold: NONE', 67),
 ('Pred: WORSE - Gold: NONE', 17),
 ('Pred: WORSE - Gold: BETTER', 9)]

### Average sentence length
count    570.000000
mean     120.214035
std       64.024928
min       16.000000
25%       76.250000
50%      105.000000
75%      147.000000
max      368.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.333333                  150.000000
0.600000                  124.218182
0.800000                  122.000000
1.000000                  120.247500
0.400000                  115.846154
0.833333                   63.250000
0.500000       

In [183]:
shared

Unnamed: 0_level_0,caption,sentence,object_a,object_b,predicted,gold,most_frequent_percentage,origin,s_len
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
E594409476,middle_paths_unrestricted_16,Does this imply that milk is worse than soda as far as tooth decay is concerned??,milk,soda,WORSE,NONE,1.0,both,81
D402677286,middle_paths_unrestricted_16,But my biggest worry is that the tech press will continue to do what they always do and ignore a superior product from Microsoft and flog an inferior offering because it's from Google.,Microsoft,Google,NONE,BETTER,1.0,both,184
I1291586391,middle_paths_unrestricted_16,"Instead of pizza, it'll be easier to choose a salad or healthy sandwich .",pizza,sandwich,BETTER,WORSE,1.0,both,73
F824293615,middle_paths_unrestricted_16,Google Chrome is even worse on my old IBM ThinkPad.,Google,IBM,BETTER,NONE,1.0,both,51
L1842619513,middle_paths_unrestricted_16,No way is Python or TCL faster than Java.,Python,Java,BETTER,WORSE,1.0,both,41
F802055652,middle_paths_unrestricted_16,"Generally Toyota appears to build well solid if boring cars, but this is a Ford class failure, e.g.",Toyota,Ford,NONE,BETTER,1.0,both,99
S3113863522,middle_paths_unrestricted_16,"What puzzles him, though, is that with hundreds of millions of computers still running Windows XP-and Microsoft keener than ever to get their owners to migrate to Windows 7, if not Windows 8-the least the firm might have done is to make the whole upgrade path a good deal easier road to travel.",Windows XP,Windows 8,BETTER,NONE,1.0,both,294
S3090092782,middle_paths_unrestricted_16,We've all seen these benchmarks where Python is 10 or even 100 times slower than Java or C#.,Python,Java,BETTER,WORSE,1.0,both,92
H1203579210,middle_paths_unrestricted_16,"IMHO Lisp is way too heavy on the parenthesis, a Java program is easier to read.",Lisp,Java,NONE,WORSE,0.8,both,80
B326211513,middle_paths_unrestricted_16,"Apple has also add the ability to connect the Apple TV with a Bluetooth keyboard (?!), which allows users to enter text for search much faster that using the four-way remote control, and likely faster than the Remote app for iOS, once you factor in connection and navigation time.",Apple TV,iOS,NONE,BETTER,1.0,both,280


## All

In [181]:
analyze_errors(pd.concat([shared,lex_unique,se_unique]))

### Confidence
1.000000    892
0.800000    198
0.600000    143
0.400000     27
0.833333      7
0.666667      3
0.500000      2
0.333333      1
0.857143      1
Name: most_frequent_percentage, dtype: int64

### Confusion
[('BETTER-NONE', 707), ('NONE-WORSE', 363), ('BETTER-WORSE', 204)]


[('Pred: NONE - Gold: BETTER', 442),
 ('Pred: NONE - Gold: WORSE', 273),
 ('Pred: BETTER - Gold: NONE', 265),
 ('Pred: BETTER - Gold: WORSE', 152),
 ('Pred: WORSE - Gold: NONE', 90),
 ('Pred: WORSE - Gold: BETTER', 52)]

### Average sentence length
count    1274.000000
mean      123.214286
std        68.505059
min        16.000000
25%        77.000000
50%       106.000000
75%       153.000000
max       536.000000
Name: s_len, dtype: float64
                               s_len
most_frequent_percentage            
0.333333                  150.000000
0.666667                  133.000000
1.000000                  124.531390
0.600000                  123.195804
0.800000                  120.606061
0.400000

## Misc

In [182]:
different = []
for i, row in lex_multi.iterrows():
    predicted = row['predicted']
    uni_sentence = se_multi[se_multi.sentence == row['sentence']]
    u_p = row['predicted']
    if predicted != u_p:
        different.append(row['sentence'])
print('Different mistakes on same sentence: {}'.format(len(different)))

Different mistakes on same sentence: 0
