# EVALUATING MODELS ON HATECHECK
BERT models and commercial models

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np

# Evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

## Load Test Suite

In [2]:
# Read the HateCheck test suite from disk; the first CSV column is used as the index.
test_suite_path = './Data/Test Suite/hatecheck_final_ACL.csv'
hatecheck_df = pd.read_csv(test_suite_path, index_col=0)

## Load and Merge Model Results

In [3]:
# load results
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so only load result files that come from a trusted source.
results = {}
results['BERT'] = pd.read_pickle('./Data/Test Suite/results_BERT_weighted_ACL.pkl')
results['commercial'] = pd.read_pickle('./Data/Test Suite/results_commercial_models_ACL.pkl')


# merge with hatecheck df
# The merge overwrites hatecheck_df, so a plain re-run of this cell would merge the
# same prediction columns a second time and pandas would rename them with _x/_y
# suffixes, breaking every later 'pred_<model>' lookup. Only columns that are not
# in hatecheck_df yet are merged, which makes the cell safe to re-run.
# Assumes every entry of `results` is a DataFrame keyed by 'case_id' -- TODO confirm.
for model in results:
    model_preds = results[model]
    pred_cols = [col for col in model_preds.columns if col != 'case_id']
    new_cols = [col for col in pred_cols if col not in hatecheck_df.columns]
    if not new_cols:
        # everything from this result set has already been merged
        continue
    stale_cols = [col for col in pred_cols if col in hatecheck_df.columns]
    hatecheck_df = hatecheck_df.merge(model_preds.drop(columns=stale_cols),
                                      how = 'left', on = 'case_id')

hatecheck_df

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ,pred_BERT_davidson2017_weighted,pred_BERT_founta2018_weighted,pred_perspective,pred_siftninja
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful,hateful,non-hateful
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful,hateful,hateful
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful,hateful,hateful
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful,hateful,hateful
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].,hateful,hateful,hateful,non-hateful
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.,hateful,hateful,hateful,non-hateful
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.,hateful,hateful,hateful,non-hateful
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.,hateful,non-hateful,hateful,non-hateful
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.,hateful,hateful,hateful,non-hateful


## Compute Accuracy by Functionality across Models

In [4]:
# Models to evaluate. Each name is combined with the 'pred_' prefix
# to select that model's prediction column in hatecheck_df.
models_eval = [
    'BERT_davidson2017_weighted',
    'BERT_founta2018_weighted',
    'perspective',
    'siftninja',
]

In [5]:
# Per-functionality accuracy of every model, stored as formatted percentage strings.
# Result: {model name -> Series of strings, one entry per functionality, named after the model}.
func_accuracy_dict = {}
functionalities = pd.unique(hatecheck_df.functionality)

for m in models_eval:
    # True wherever the model's prediction equals the gold label
    is_correct = hatecheck_df['label_gold'] == hatecheck_df['pred_{}'.format(m)]

    shares = []
    for func in functionalities:
        in_func = hatecheck_df.functionality == func
        n_cases = int(in_func.sum())
        n_correct = int((in_func & is_correct).sum())
        shares.append('{:.1%}'.format(n_correct / n_cases))

    # one named Series per model, in the order the functionalities first appear
    func_accuracy_dict[m] = pd.Series(shares, name=m)

In [6]:
# Assemble the results table: first column lists the functionalities,
# followed by one accuracy column per evaluated model.
func_accuracy_df = pd.Series(pd.unique(hatecheck_df.functionality), name='functionality')

for accuracy_col in func_accuracy_dict.values():
    func_accuracy_df = pd.concat([func_accuracy_df, accuracy_col], axis=1)

In [7]:
# Display the accuracy-by-functionality table as the cell output.
func_accuracy_df

Unnamed: 0,functionality,BERT_davidson2017_weighted,BERT_founta2018_weighted,perspective,siftninja
0,derog_neg_emote_h,88.6%,90.7%,98.6%,7.9%
1,derog_neg_attrib_h,88.6%,84.3%,95.7%,0.0%
2,derog_dehum_h,91.4%,80.7%,98.6%,10.0%
3,derog_impl_h,71.4%,61.4%,85.0%,0.0%
4,threat_dir_h,87.2%,75.9%,100.0%,4.5%
5,threat_norm_h,91.4%,83.6%,100.0%,2.1%
6,slur_h,60.4%,41.0%,66.0%,50.0%
7,slur_homonym_nh,66.7%,70.0%,63.3%,33.3%
8,slur_reclaimed_nh,39.5%,33.3%,28.4%,18.5%
9,profanity_h,82.9%,72.9%,100.0%,1.4%


## Compute Classification Reports for Each Model 

In [10]:
# print classification reports for each classifier

# Integer encoding of the string labels; it matches the order of target_names
# below (index 0 = non-hateful, index 1 = hateful).
label_to_int = {'hateful': 1, 'non-hateful': 0}

# .map() builds the numeric Series directly. The previous .replace() call relied on
# pandas silently downcasting the object column to int, which is deprecated
# (FutureWarning) since pandas 2.2. Any value other than the two known labels
# becomes NaN here instead of being passed through unchanged.
gold_int = hatecheck_df.label_gold.map(label_to_int)

for m in models_eval:
    print(m.upper())
    print(classification_report(gold_int,
                                hatecheck_df['pred_{}'.format(m)].map(label_to_int),
                                target_names = ['non-hateful','hateful'],
                                digits=3))
    print()

BERT_DAVIDSON2017_WEIGHTED
              precision    recall  f1-score   support

 non-hateful      0.401     0.360     0.379      1165
     hateful      0.722     0.755     0.738      2563

    accuracy                          0.632      3728
   macro avg      0.561     0.558     0.559      3728
weighted avg      0.621     0.632     0.626      3728


BERT_FOUNTA2018_WEIGHTED
              precision    recall  f1-score   support

 non-hateful      0.390     0.485     0.432      1165
     hateful      0.737     0.655     0.694      2563

    accuracy                          0.602      3728
   macro avg      0.563     0.570     0.563      3728
weighted avg      0.628     0.602     0.612      3728


PERSPECTIVE
              precision    recall  f1-score   support

 non-hateful      0.677     0.482     0.563      1165
     hateful      0.792     0.895     0.841      2563

    accuracy                          0.766      3728
   macro avg      0.735     0.689     0.702      3728
weighted

In [38]:
# Accuracy of each model on the hateful cases, on the non-hateful cases, and overall.
print('ACCURACY \n')
for m in models_eval:
    print(m)
    # True wherever the model's prediction equals the gold label
    hits = hatecheck_df['label_gold'] == hatecheck_df['pred_{}'.format(m)]
    for lab in ['hateful', 'non-hateful']:
        has_label = hatecheck_df.label_gold == lab
        n_cases = int(has_label.sum())
        n_correct = int((has_label & hits).sum())
        print('{}: {:.1%}:'.format(lab, n_correct/n_cases))
    n_hits_total = int(hits.sum())
    print('TOTAL: {:.1%}'.format(n_hits_total/hatecheck_df.shape[0]))
    print()

ACCURACY 

BERT_davidson2017_weighted
hateful: 75.5%:
non-hateful: 36.0%:
TOTAL: 63.2%

BERT_founta2018_weighted
hateful: 65.5%:
non-hateful: 48.5%:
TOTAL: 60.2%

perspective
hateful: 89.5%:
non-hateful: 48.2%:
TOTAL: 76.6%

siftninja
hateful: 9.0%:
non-hateful: 86.6%:
TOTAL: 33.2%



## Performance for Reclaimed Slurs

In [43]:
# Accuracy on the reclaimed-slur cases ('slur_reclaimed_nh'), broken down by slur.
# The functionality filter is identical for every model and slur, so it is applied once.
reclaimed_df = hatecheck_df[hatecheck_df.functionality == 'slur_reclaimed_nh']

print('ACCURACY')
for m in models_eval:
    print(m.upper())
    # True wherever the model's prediction equals the gold label
    is_correct = reclaimed_df['pred_{}'.format(m)] == reclaimed_df.label_gold
    # hard-coded list of focus lemmas to report on, in display order
    for slur in ['nigga', 'fag', 'faggot', 'queer', 'bitch']:
        is_slur = reclaimed_df.focus_lemma == slur
        n_total = int(is_slur.sum())
        n_correct = int((is_slur & is_correct).sum())
        print(n_total)
        if n_total == 0:
            # no reclaimed-slur cases for this lemma: report it instead of
            # failing with ZeroDivisionError
            print(slur, ': n/a')
            continue
        print(slur, ': {:.1%}'.format(n_correct/n_total))
    print()
    

ACCURACY
BERT_DAVIDSON2017_WEIGHTED
19
nigga : 89.5%
16
fag : 0.0%
16
faggot : 0.0%
15
queer : 0.0%
15
bitch : 100.0%

BERT_FOUNTA2018_WEIGHTED
19
nigga : 0.0%
16
fag : 6.2%
16
faggot : 6.2%
15
queer : 73.3%
15
bitch : 93.3%

PERSPECTIVE
19
nigga : 0.0%
16
fag : 0.0%
16
faggot : 0.0%
15
queer : 80.0%
15
bitch : 73.3%

SIFTNINJA
19
nigga : 0.0%
16
fag : 0.0%
16
faggot : 0.0%
15
queer : 0.0%
15
bitch : 100.0%



## Performance Across Cases by Target Identity
Only uses cases generated from templates

In [50]:
# create df with only template cases --> number of cases for each identity should be balanced
# regex=False: 'IDENTITY' is matched as a literal substring, not as a pattern.
# na=False: rows with a missing case_templ count as non-template cases; without it
# the mask would contain NA and the boolean indexing below would raise.
is_template_case = hatecheck_df.case_templ.str.contains('IDENTITY', regex=False, na=False)
templ_cases_df = hatecheck_df[is_template_case].copy()

# number of template cases per target identity
templ_cases_df.groupby('target_ident').case_id.count()

target_ident
Muslims            421
black people       421
disabled people    421
gay people         421
immigrants         421
trans people       421
women              421
Name: case_id, dtype: int64

In [51]:
# Accuracy per target identity for every model, stored as formatted percentage strings.
# Result: {model name -> Series of strings, one entry per target identity, named after the model}.
ident_accuracy_dict = {}

for m in models_eval:
    # True wherever the model's prediction equals the gold label
    matches_gold = templ_cases_df['label_gold'] == templ_cases_df['pred_{}'.format(m)]

    per_ident = []
    for ident in pd.unique(templ_cases_df.target_ident):
        ident_mask = templ_cases_df.target_ident == ident
        n_cases = int(ident_mask.sum())
        n_correct = int((ident_mask & matches_gold).sum())
        per_ident.append('{:.1%}'.format(n_correct/n_cases))

    ident_accuracy_dict[m] = pd.Series(per_ident, name=m)

In [52]:
# Assemble the results table: first column lists the target identities,
# followed by one accuracy column per evaluated model.
ident_accuracy_df = pd.Series(pd.unique(templ_cases_df.target_ident), name='target_ident')

for accuracy_col in ident_accuracy_dict.values():
    ident_accuracy_df = pd.concat([ident_accuracy_df, accuracy_col], axis=1)

In [53]:
# Display the accuracy-by-target-identity table as the cell output.
ident_accuracy_df

Unnamed: 0,target_ident,BERT_davidson2017_weighted,BERT_founta2018_weighted,perspective,siftninja
0,women,34.9%,52.3%,80.5%,23.0%
1,trans people,69.1%,69.4%,80.8%,26.4%
2,gay people,73.9%,74.3%,80.8%,25.9%
3,black people,69.8%,72.2%,80.5%,26.6%
4,disabled people,71.0%,37.1%,79.8%,23.0%
5,Muslims,72.2%,73.6%,79.6%,27.6%
6,immigrants,70.5%,58.9%,80.5%,25.9%
