In [304]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import chisquare

In [267]:
# Experiment 1 preprocessing of data
def preprocess_experiment1_results(path):
    data = pd.read_csv(path)
    columns = ['HITId', 'WorkerId', 'Input.image_url', 'Answer.algorithms.alg1', 'Answer.algorithms.alg2', 'Answer.algorithms.none']
    data = data[columns]
    
    # Extract the img_id, image_class and algorithms of the image_url
    data['img_id'] = data['Input.image_url'].str.split('_').apply(lambda x: x[0])
    data['img_class'] = data['Input.image_url'].str.split('_').apply(lambda x: x[2])
    data['alg1'] = data['Input.image_url'].str.split('_').apply(lambda x: x[1])
    data['alg2'] = data['Input.image_url'].str.split('_').apply(lambda x: x[3].split('.')[0])
    
    # Get all answers, where nonpruned was voted as more reasonable and set a new column 'nonpruned' to 1 for those cases, the others get 0
    data.loc[((data['Answer.algorithms.alg1'] == True) & (data['alg1'] == 'nonpruned') |
              (data['Answer.algorithms.alg2'] == True) & (data['alg2'] == 'nonpruned')), 'nonpruned'] = 1
    data['nonpruned'].fillna(0, inplace=True)
    
    # Get all answers, where 2pruned was voted as more reasonable and set a new column '2pruned' to 1 for those cases, the others get 0
    data.loc[((data['Answer.algorithms.alg1'] == True) & (data['alg1'] == '2pruned') |
              (data['Answer.algorithms.alg2'] == True) & (data['alg2'] == '2pruned')), '2pruned'] = 1
    data['2pruned'].fillna(0, inplace=True)
    
    # Get all answers, where 32pruned was voted as more reasonable and set a new column '32pruned' to 1 for those cases, the others get 0
    data.loc[((data['Answer.algorithms.alg1'] == True) & (data['alg1'] == '32pruned') |
              (data['Answer.algorithms.alg2'] == True) & (data['alg2'] == '32pruned')), '32pruned'] = 1
    data['32pruned'].fillna(0, inplace=True)
    
    return data

# Experiment 1 - Class-Specific Image Parts 
This experiment investigates the reasonability of algorithms when they classify images. You will decide which of the algorithms makes a more reasonable decision based on the parts of the image it uses for its decision.

This experiment investigates the reasonability of algorithms when they classify images.
The image in the middle is the original image with its predicted class shown above the image.
The images on the left and right show only parts of the image that seem most important to the algorithm.
You will decide which of the algorithms makes a more reasonable decision based on the parts of the image it uses for its decision.
If you think that none of the algorithms seems more reasonable than the other, select "Both algorithms seem equally reasonable".

Some rows not filled with any '1' show that "Both algorithms equally reasonable" were chosen. These are ignored / left for now.
- Maybe they should be removed?
- Somehow put into the calculation / test later on?

In [342]:
# Load and pre-process CSV
occ_results = preprocess_experiment1_results('experiments/experiment1-5000-results.csv')
occ_results.head(5)

Unnamed: 0,HITId,WorkerId,Input.image_url,Answer.algorithms.alg1,Answer.algorithms.alg2,Answer.algorithms.none,img_id,img_class,alg1,alg2,nonpruned,2pruned,32pruned
0,3GITHABAD67H76AWTVB589IP4QCN29,A3A1PY8GLGYRR5,167_32pruned_2-cassetteplayer_nonpruned.png,False,False,True,167,2-cassetteplayer,32pruned,nonpruned,0.0,0.0,0.0
1,3D4BBDG70PIN1K9O85GWB8T5B0Y3CI,AV5TLLCULVF0P,8_2pruned_2-cassetteplayer_nonpruned.png,False,True,False,8,2-cassetteplayer,2pruned,nonpruned,1.0,0.0,0.0
2,3Z56AA6ELCMZ7P8X4W88L7A1F556MA,A250KKS1WKKUFY,360_2pruned_4-church_nonpruned.png,False,False,True,360,4-church,2pruned,nonpruned,0.0,0.0,0.0
3,3OKP4QVBQAJ02HOEJ6RPO7SQF2IAGV,A2U9OZCL9ULYLZ,235_32pruned_8-golfball_nonpruned.png,False,False,True,235,8-golfball,32pruned,nonpruned,0.0,0.0,0.0
4,36FQTHX30BD4RFHIU5K10ONNTFY3BB,A3NXNCQ5PD9LO2,62_32pruned_3-chainsaw_nonpruned.png,False,False,True,62,3-chainsaw,32pruned,nonpruned,0.0,0.0,0.0


In [283]:
# How many datapoints
len(occ_results)

5000

In [285]:
# How often did the models appear - expected: nonpruned twice as often as both others
occ_results['alg1'].value_counts(normalize=True) + occ_results['alg2'].value_counts(normalize=True)

2pruned      0.5
32pruned     0.5
nonpruned    1.0
dtype: float64

In [284]:
# 'Both algorithms equally reasonable'
occ_results['Answer.algorithms.none'].value_counts(normalize=True)

False    0.5994
True     0.4006
Name: Answer.algorithms.none, dtype: float64

### 2Pruned VS Non-Pruned

In [286]:
# Compare non-pruned VS 2pruned
occ_results_np_2 = occ_results.loc[(occ_results['alg1'] == '2pruned') | (occ_results['alg2'] == '2pruned')]
len(occ_results_np_2)

2500

In [287]:
# 'Both algorithms equally reasonable'
occ_results_np_2['Answer.algorithms.none'].value_counts(normalize=True), occ_results_np_2['Answer.algorithms.none'].value_counts()

(True     0.5372
 False    0.4628
 Name: Answer.algorithms.none, dtype: float64,
 True     1343
 False    1157
 Name: Answer.algorithms.none, dtype: int64)

In [288]:
# How often was non-pruned more reasonable than 2pruned?
occ_results_np_2.loc[occ_results_np_2['Answer.algorithms.none'] == 0]['nonpruned'].value_counts(), occ_results_np_2.loc[occ_results_np_2['Answer.algorithms.none'] == 0]['2pruned'].value_counts()

(0.0    636
 1.0    521
 Name: nonpruned, dtype: int64,
 1.0    636
 0.0    521
 Name: 2pruned, dtype: int64)

In [None]:
# observed values, opt: expected values (otherwise every category same likely)
chisquare([110, 129], [120, 120])

### 32Pruned VS Non-Pruned

In [289]:
# Compare non-pruned VS 32pruned
occ_results_np_32 = occ_results.loc[(occ_results['alg1'] == '32pruned') | (occ_results['alg2'] == '32pruned')]
len(occ_results_np_32)

2500

In [290]:
# Around 20% of 'Both algorithms equally reasonable'
occ_results_np_32['Answer.algorithms.none'].value_counts(), occ_results_np_32['Answer.algorithms.none'].value_counts(normalize=True)

(False    1840
 True      660
 Name: Answer.algorithms.none, dtype: int64,
 False    0.736
 True     0.264
 Name: Answer.algorithms.none, dtype: float64)

In [291]:
# How often was non-pruned more reasonable than 32pruned?
occ_results_np_32.loc[occ_results_np_32['Answer.algorithms.none'] == 0]['nonpruned'].value_counts(), occ_results_np_32.loc[occ_results_np_32['Answer.algorithms.none'] == 0]['32pruned'].value_counts()

(1.0    997
 0.0    843
 Name: nonpruned, dtype: int64,
 0.0    997
 1.0    843
 Name: 32pruned, dtype: int64)

In [None]:
chisquare([206, 195], [250, 250])

# Experiment 1 - Grad-CAM Heat-Maps
This experiment investigates the reasonability of algorithms when they classify images. You will decide which of the algorithms makes a more reasonable decision based on the parts of the image it uses for its decision.

The image in the middle is the original image with its predicted class shown above the image. The images on the left and right have a heatmap, ranging from blue to red, overlayed. The whole range of the heatmap is shown here.

The blue areas indicate the least important parts of the image to the algorithm, the red areas indicate the most important parts of the image to the algorithm. Therefore, more important (red) areas help the algorithm to make its correct decision more than less important (blue) areas. You will decide which of the algorithms makes a more reasonable decision based on the parts of the image it uses most for its decision. If you think that none of the algorithms seems more reasonable than the other, select "Both algorithms seem equally reasonable".

In [268]:
hm_results = preprocess_experiment1_results('experiments/experiment1-jet-1000-results.csv')
hm_results.head()

Unnamed: 0,HITId,WorkerId,Input.image_url,Answer.algorithms.alg1,Answer.algorithms.alg2,Answer.algorithms.none,img_id,img_class,alg1,alg2,nonpruned,2pruned,32pruned
0,3E9VAUV7C400O89EDCDVV7ZUK3VAYQ,A3U5D83UGFY23Y,388_2pruned_3-chainsaw_nonpruned.png,True,False,False,388,3-chainsaw,2pruned,nonpruned,0.0,1.0,0.0
1,3ZUE82NE1ING5TZ4BO1TAVMJO458FN,A2KLWSLNAXJUAZ,64_32pruned_7-gaspump_nonpruned.png,True,False,False,64,7-gaspump,32pruned,nonpruned,0.0,0.0,1.0
2,3CVDZS289PMVQI185PBCFDNE89UFMI,A3T1M3NZ5UIA56,221_2pruned_8-golfball_nonpruned.png,False,True,False,221,8-golfball,2pruned,nonpruned,1.0,0.0,0.0
3,38O9DZ0A7A990L44UEPO6YW395A62F,A1HDYTLK36Y0G5,219_2pruned_7-gaspump_nonpruned.png,True,False,False,219,7-gaspump,2pruned,nonpruned,0.0,1.0,0.0
4,3SD15I2WEAGYL30NYE3TP81VFJA63Q,A6B55HA085S5V,361_2pruned_5-frenchhorn_nonpruned.png,False,True,False,361,5-frenchhorn,2pruned,nonpruned,1.0,0.0,0.0


In [269]:
# How many datapoints
len(hm_results)

1000

In [274]:
# 'Both algorithms equally reasonable'
# A lot less equally reasonable than with occlusion maps on top!
hm_results['Answer.algorithms.none'].value_counts(normalize=True)

False    0.712
True     0.288
Name: Answer.algorithms.none, dtype: float64

### 2Pruned VS Non-Pruned

In [275]:
# Compare non-pruned VS 2pruned
hm_results_np_2 = hm_results.loc[(hm_results['alg1'] == '2pruned') | (hm_results['alg2'] == '2pruned')]
len(np_2)

2500

In [276]:
# 'Both algorithms equally reasonable'
hm_results_np_2['Answer.algorithms.none'].value_counts(normalize=True), hm_results_np_2['Answer.algorithms.none'].value_counts()

(False    0.562
 True     0.438
 Name: Answer.algorithms.none, dtype: float64,
 False    281
 True     219
 Name: Answer.algorithms.none, dtype: int64)

In [280]:
# How often was non-pruned more reasonable than 2pruned?
hm_results_np_2.loc[hm_results_np_2['Answer.algorithms.none'] == 0]['nonpruned'].value_counts(), hm_results_np_2.loc[hm_results_np_2['Answer.algorithms.none'] == 0]['2pruned'].value_counts()

(1.0    144
 0.0    137
 Name: nonpruned, dtype: int64,
 0.0    144
 1.0    137
 Name: 2pruned, dtype: int64)

### 32Pruned VS Non-Pruned

In [277]:
# Compare non-pruned VS 32pruned
hm_results_np_32 = hm_results.loc[(hm_results['alg1'] == '32pruned') | (hm_results['alg2'] == '32pruned')]
len(np_32)

2500

In [278]:
# 'Both algorithms equally reasonable'
hm_results_np_32['Answer.algorithms.none'].value_counts(), hm_results_np_32['Answer.algorithms.none'].value_counts(normalize=True)

(False    431
 True      69
 Name: Answer.algorithms.none, dtype: int64,
 False    0.862
 True     0.138
 Name: Answer.algorithms.none, dtype: float64)

In [281]:
# How often was non-pruned more reasonable than 32pruned?
hm_results_np_32.loc[hm_results_np_32['Answer.algorithms.none'] == 0]['nonpruned'].value_counts(), hm_results_np_32.loc[hm_results_np_32['Answer.algorithms.none'] == 0]['32pruned'].value_counts()

(1.0    234
 0.0    197
 Name: nonpruned, dtype: int64,
 0.0    234
 1.0    197
 Name: 32pruned, dtype: int64)

# Experiment 2 - What is on the picture?
Given Parts of an Image, choose what you see on the Image

You will be shown parts of an image. Based on what you see on the image, choose one of the ten categories.

You will be shown parts of an image. Based on what you see on the image, choose one of the ten categories below.
If you are not sure what is on the image, choose 'I don't know / None of the above'.

In [331]:
def preprocess_experiment2_results(path):
    exp2 = pd.read_csv(path)
    columns = ['HITId'
           , 'WorkerId'
           , 'Input.image_url'
           , 'Answer.classes.0-fish'
           , 'Answer.classes.1-dog'
           , 'Answer.classes.2-cassetteplayer'
           , 'Answer.classes.3-chainsaw'
           , 'Answer.classes.4-church'
           , 'Answer.classes.5-frenchhorn'
           , 'Answer.classes.6-garbagetruck'
           , 'Answer.classes.7-gaspump'
           , 'Answer.classes.8-golfball'
           , 'Answer.classes.9-parachute'
           , 'Answer.classes.none']
    exp2 = exp2[columns]
    
    # Extract the img_id, image_class and algorithms of the image_url
    exp2['img_id'] = exp2['Input.image_url'].str.split('_').apply(lambda x: x[0])
    exp2['img_class'] = exp2['Input.image_url'].str.split('_').apply(lambda x: x[1])
    exp2['algorithm'] = exp2['Input.image_url'].str.split('_').apply(lambda x: x[2])
    exp2['algorithm'] = exp2['algorithm'].str.split('.').apply(lambda x: x[0])
    
    # Create column with 1.0 for correct, 0.0 for wrong
    for label in classes:
        exp2.loc[(exp2['img_class'] == label) & (exp2['Answer.classes.{}'.format(label)] == True), 'correct'] = 1
        exp2['correct'].fillna(0, inplace=True)
    
    return exp2

In [332]:
# Imagenette Classes
classes = ['0-fish', 
           '1-dog', 
           '2-cassetteplayer', 
           '3-chainsaw',
           '4-church', 
           '5-frenchhorn', 
           '6-garbagetruck', 
           '7-gaspump', 
           '8-golfball', 
           '9-parachute']

In [333]:
# Load CSV
exp2 = preprocess_experiment2_results('experiments/experiment2-7500-results.csv')
exp2.head(5)

Unnamed: 0,HITId,WorkerId,Input.image_url,Answer.classes.0-fish,Answer.classes.1-dog,Answer.classes.2-cassetteplayer,Answer.classes.3-chainsaw,Answer.classes.4-church,Answer.classes.5-frenchhorn,Answer.classes.6-garbagetruck,Answer.classes.7-gaspump,Answer.classes.8-golfball,Answer.classes.9-parachute,Answer.classes.none,img_id,img_class,algorithm,correct
0,3X4Q1O9UCP86BS7E0MSDJPZ25G47OP,ARUXF7MD6PFIJ,379_2-cassetteplayer_nonpruned.png,False,False,True,False,False,False,False,False,False,False,False,379,2-cassetteplayer,nonpruned,1.0
1,3X4Q1O9UCP86BS7E0MSDJPZ25G47OP,A3OSGBQXGDDRT5,379_2-cassetteplayer_nonpruned.png,False,False,True,False,False,False,False,False,False,False,False,379,2-cassetteplayer,nonpruned,1.0
2,3X4Q1O9UCP86BS7E0MSDJPZ25G47OP,A18V8FRN40LJ4J,379_2-cassetteplayer_nonpruned.png,False,False,True,False,False,False,False,False,False,False,False,379,2-cassetteplayer,nonpruned,1.0
3,3X4Q1O9UCP86BS7E0MSDJPZ25G47OP,A3TYCJO2DG3RMH,379_2-cassetteplayer_nonpruned.png,False,False,True,False,False,False,False,False,False,False,False,379,2-cassetteplayer,nonpruned,1.0
4,3X4Q1O9UCP86BS7E0MSDJPZ25G47OP,A1BZNPQ0H7ZSER,379_2-cassetteplayer_nonpruned.png,False,False,True,False,False,False,False,False,False,False,False,379,2-cassetteplayer,nonpruned,1.0


In [320]:
# Total of 7500 answers
len(exp2)

7500

In [321]:
# Check if every class was answered 750 times
exp2['img_class'].value_counts()

5-frenchhorn        750
6-garbagetruck      750
7-gaspump           750
2-cassetteplayer    750
1-dog               750
4-church            750
8-golfball          750
9-parachute         750
3-chainsaw          750
0-fish              750
Name: img_class, dtype: int64

## Overall: Correct per Class and per Models

In [326]:
# Overview of how many correct per class for all three models, maximum = 750 per class
exp2['img_class'].loc[(exp2['correct'] == 1)].value_counts()

5-frenchhorn        706
1-dog               703
8-golfball          702
4-church            665
0-fish              650
6-garbagetruck      646
9-parachute         613
2-cassetteplayer    599
7-gaspump           594
3-chainsaw          461
Name: img_class, dtype: int64

In [327]:
# Overview of how many correct per model, per model maximum = 2500; total 7500
exp2['algorithm'].loc[(exp2['correct'] == 1)].value_counts()

2pruned.png      2160
nonpruned.png    2146
32pruned.png     2033
Name: algorithm, dtype: int64

In [339]:
# For each class, how many corrects differ between models:
for label in classes:
    print(label)
    print(exp2['algorithm'].loc[(exp2['correct'] == 1) & (exp2['img_class'] == label)].value_counts())

0-fish
2pruned      231
nonpruned    228
32pruned     191
Name: algorithm, dtype: int64
1-dog
32pruned     240
2pruned      238
nonpruned    225
Name: algorithm, dtype: int64
2-cassetteplayer
2pruned      206
nonpruned    199
32pruned     194
Name: algorithm, dtype: int64
3-chainsaw
nonpruned    165
2pruned      165
32pruned     131
Name: algorithm, dtype: int64
4-church
2pruned      231
32pruned     221
nonpruned    213
Name: algorithm, dtype: int64
5-frenchhorn
32pruned     236
nonpruned    235
2pruned      235
Name: algorithm, dtype: int64
6-garbagetruck
2pruned      220
nonpruned    215
32pruned     211
Name: algorithm, dtype: int64
7-gaspump
nonpruned    207
32pruned     197
2pruned      190
Name: algorithm, dtype: int64
8-golfball
2pruned      243
nonpruned    239
32pruned     220
Name: algorithm, dtype: int64
9-parachute
nonpruned    220
2pruned      201
32pruned     192
Name: algorithm, dtype: int64


## Non-Pruned

In [334]:
# Correct answers for non-pruned, splitted by every class, maximum 250 per class
exp2['img_class'].loc[(exp2['correct'] == 1) & (exp2['algorithm'] == 'nonpruned')].value_counts()

8-golfball          239
5-frenchhorn        235
0-fish              228
1-dog               225
9-parachute         220
6-garbagetruck      215
4-church            213
7-gaspump           207
2-cassetteplayer    199
3-chainsaw          165
Name: img_class, dtype: int64

## 2-Pruned

In [335]:
# Correct answers for 2pruned, splitted by every class, maximum 250 per class
exp2['img_class'].loc[(exp2['correct'] == 1) & (exp2['algorithm'] == '2pruned')].value_counts()

8-golfball          243
1-dog               238
5-frenchhorn        235
0-fish              231
4-church            231
6-garbagetruck      220
2-cassetteplayer    206
9-parachute         201
7-gaspump           190
3-chainsaw          165
Name: img_class, dtype: int64

## 32pruned

In [336]:
# Correct answers for 32pruned, splitted by every class, maximum 250 per class
exp2['img_class'].loc[(exp2['correct'] == 1) & (exp2['algorithm'] == '32pruned')].value_counts()

1-dog               240
5-frenchhorn        236
4-church            221
8-golfball          220
6-garbagetruck      211
7-gaspump           197
2-cassetteplayer    194
9-parachute         192
0-fish              191
3-chainsaw          131
Name: img_class, dtype: int64

In [None]:
chisquare([231, 191], [228, 228])