#### Load The Result of Models

##### Load the debias models train result

In [112]:
# load result file
import pandas as pd
import numpy as np
df = pd.read_csv('../result/csv/TrainByDev.csv')
# Remove rows with duplicated values
df = df.drop_duplicates()
# drop Fusion column
df = df.drop(columns=['Fusion'])
df_init = pd.read_csv('../result/csv/init_results.csv')
df = df.join(df_init.set_index(['Base_Model','Dataset_Name']), on=['Base_Model','Dataset_Name'], rsuffix='_init')
df['InitAcc'] = df['Init_ACC']
df['InitF1'] = df['Init_F1']
df['InitFairness'] = df['Init_Word_Fairness']
df.drop(columns=['Init_ACC','Init_F1','Init_Word_Fairness'], inplace=True)
df.loc[df['Ngram'] == 2, 'Stereotype'] = 'Bigram'


##### Load the init train result

In [113]:
# We join the init results with the final results to compare the difference
# Joining With Init Columns
df_trim = df[['Base_Model','Dataset_Name','Stereotype','FinalF1','FinalBaseF1','InitF1','InitFairness','FinalFairness','NEpoch']]
# sort by Dataset_Name and Base_Model and Stereotype then move Dataset_Name to first column
df_trim = df_trim.sort_values(by=['Dataset_Name','Base_Model','Stereotype'])
df_trim = df_trim[['Dataset_Name','Base_Model','Stereotype','FinalBaseF1','FinalF1','InitF1','InitFairness','FinalFairness','NEpoch']]
#put background color if InitF1 is greater than FinalF1

df_normal = df_trim[(df_trim['Stereotype'] == 'Normal') & (df_trim['NEpoch'] == 40)]
df_normal = df_normal.reset_index(drop=True)


##### Compare the Init Training and The Debias Result

We willl compare the Macro F1 score and the word fairness, and highlight the rows where init models is better than the final models. 

In [114]:

def highlight_larger(df):
    mask = df['InitF1'] > df['FinalF1']
    df_style = pd.DataFrame('', index=df.index, columns=df.columns)
    df_style.loc[mask, ['InitF1', 'FinalF1']] = 'background-color: brown'
    return df_style

df_normal.style.apply(highlight_larger, axis=None)


Unnamed: 0,Dataset_Name,Base_Model,Stereotype,FinalBaseF1,FinalF1,InitF1,InitFairness,FinalFairness,NEpoch
0,ARC,TextCNN,Normal,0.578714,0.504524,0.528883,42.03656,41.246891,40.0
1,ARC,TextRCNN,Normal,0.516946,0.516946,0.569049,41.446255,41.773226,40.0
2,Amazon,TextRCNN,Normal,0.738298,0.738298,0.743151,16.210424,16.149594,40.0
3,Economy,TextCNN,Normal,0.578216,0.551573,0.581171,18.323976,19.498599,40.0
4,Economy,TextRCNN,Normal,0.449775,0.537123,0.601323,17.857359,20.569344,40.0
5,HyperPartisan,TextCNN,Normal,0.759514,0.718615,0.725159,18.360448,18.262161,40.0
6,HyperPartisan,TextRCNN,Normal,0.761088,0.761088,0.763636,17.478048,17.699153,40.0
7,SCIERC,TextCNN,Normal,0.373544,0.395714,0.448077,37.65443,37.305413,40.0
8,SCIERC,TextRCNN,Normal,0.594942,0.605226,0.624815,37.93524,36.505452,40.0
9,Twitter,TextCNN,Normal,0.818077,0.822917,0.820453,19.458007,18.651174,40.0


Here we can see that the Init F1 and Final F1 score look very similiar. We will try to quantify the difference between the two scores, by summing them up.

(Admittedly, This might not be a very precise method.)


In [115]:
# average difference of Final F1 and Init F1
(sum(df_normal['FinalF1']) - sum(df_normal['InitF1']))/len(df_normal)

-0.022559504600939917

In [116]:
# average difference of Word Fairness for Final models and Init models and NORMALIZE 
(sum(df_normal['FinalFairness']) - sum(df_normal['InitFairness']))/len(df_normal)


0.05831974613159797

##### Compare the word fairness of the final model and the init model

In [117]:
def highlight_larger(df):
    mask = df['InitFairness'] > df['FinalFairness']
    df_style = pd.DataFrame('', index=df.index, columns=df.columns)
    df_style.loc[mask, ['InitFairness', 'FinalFairness']] = 'background-color: brown'
    return df_style

df_normal.style.apply(highlight_larger, axis=None)

Unnamed: 0,Dataset_Name,Base_Model,Stereotype,FinalBaseF1,FinalF1,InitF1,InitFairness,FinalFairness,NEpoch
0,ARC,TextCNN,Normal,0.578714,0.504524,0.528883,42.03656,41.246891,40.0
1,ARC,TextRCNN,Normal,0.516946,0.516946,0.569049,41.446255,41.773226,40.0
2,Amazon,TextRCNN,Normal,0.738298,0.738298,0.743151,16.210424,16.149594,40.0
3,Economy,TextCNN,Normal,0.578216,0.551573,0.581171,18.323976,19.498599,40.0
4,Economy,TextRCNN,Normal,0.449775,0.537123,0.601323,17.857359,20.569344,40.0
5,HyperPartisan,TextCNN,Normal,0.759514,0.718615,0.725159,18.360448,18.262161,40.0
6,HyperPartisan,TextRCNN,Normal,0.761088,0.761088,0.763636,17.478048,17.699153,40.0
7,SCIERC,TextCNN,Normal,0.373544,0.395714,0.448077,37.65443,37.305413,40.0
8,SCIERC,TextRCNN,Normal,0.594942,0.605226,0.624815,37.93524,36.505452,40.0
9,Twitter,TextCNN,Normal,0.818077,0.822917,0.820453,19.458007,18.651174,40.0


In [118]:
##### Compare each of the Stereotype to the Normal Stereotype

In [119]:
# Group data by Dataset_name and Base_Model, then for each Stereotype we compare the FinalF1 and FinalFairness with Stereotype = Normal
normal = df_trim[df_trim['Stereotype'] == 'Normal']
# drop column where NEpoch is 40
df_trim = df_trim[df_trim['NEpoch'] != 40]
# Merge the normal DataFrame back into the original DataFrame based on 'Dataset_Name' and 'Base_Model'
merged = pd.merge(df_trim, normal, on=['Dataset_Name', 'Base_Model'], suffixes=('', '_Normal'))

# Calculate the difference in 'Accuracy' compared to 'Normal' stereotype
merged['Accuracy_Difference'] = merged['FinalF1'] - merged['FinalF1_Normal']
merged['Fairness_Difference'] = merged['FinalFairness'] - merged['FinalFairness_Normal']
merged = merged[['Dataset_Name','Base_Model','Stereotype',"FinalF1","FinalFairness","Accuracy_Difference",'Fairness_Difference']] 
merged



Unnamed: 0,Dataset_Name,Base_Model,Stereotype,FinalF1,FinalFairness,Accuracy_Difference,Fairness_Difference
0,ARC,TextCNN,Bigram,0.5234,41.357744,0.004933,0.263218
1,ARC,TextCNN,Bigram,0.5234,41.357744,0.018876,0.110853
2,ARC,TextCNN,Idiom,0.515303,41.04194,-0.003163,-0.052586
3,ARC,TextCNN,Idiom,0.515303,41.04194,0.010779,-0.20495
4,ARC,TextCNN,Normal,0.518467,41.094526,0.0,0.0
5,ARC,TextCNN,Normal,0.518467,41.094526,0.013943,-0.152365
6,ARC,TextCNN,Noun,0.517699,40.976736,-0.000768,-0.117791
7,ARC,TextCNN,Noun,0.517699,40.976736,0.013175,-0.270155
8,ARC,TextCNN,RandomMask,0.510161,40.988152,-0.008306,-0.106374
9,ARC,TextCNN,RandomMask,0.510161,40.988152,0.005637,-0.258738


In [109]:
pd.set_option('display.max_rows', None)
#sort the merged dataframe by Stereotype then Dataset_Name then Base_Model
merged = merged.sort_values(by=['Stereotype','Dataset_Name','Base_Model'])
# remove duplicate rows
merged = merged.drop_duplicates()
merged





Unnamed: 0,Dataset_Name,Base_Model,Stereotype,FinalF1,FinalFairness,Accuracy_Difference,Fairness_Difference
0,ARC,TextCNN,Bigram,0.5234,41.357744,0.004933,0.263218
1,ARC,TextCNN,Bigram,0.5234,41.357744,0.018876,0.110853
12,ARC,TextRCNN,Bigram,0.535906,41.779726,-0.065096,-0.174198
13,ARC,TextRCNN,Bigram,0.535906,41.779726,0.018961,0.0065
24,Amazon,TextCNN,Bigram,0.70634,16.215899,-0.001811,0.108398
30,Amazon,TextRCNN,Bigram,0.748316,16.15077,0.018749,-0.029698
31,Amazon,TextRCNN,Bigram,0.748316,16.15077,0.010018,0.001176
44,ChemProt,TextCNN,Bigram,0.302311,48.969398,-0.004071,0.30174
50,ChemProt,TextRCNN,Bigram,0.389011,49.911164,0.001339,0.101828
55,Economy,TextCNN,Bigram,0.552845,18.893564,0.001272,-0.605034


In [121]:
# get sum of FinalF1 and FinalBaseF1
(df_trim["FinalF1"].sum() - df_trim["InitF1"].sum())/len(df_trim)

-0.01266836115944649

In [122]:
def highlight_larger_fairness(df):
    mask = df['FinalFairness'] > df['InitFairness']
    df_style = pd.DataFrame('', index=df.index, columns=df.columns)
    df_style.loc[mask, ['FinalFairness', 'InitFairness']] = 'background-color: brown'
    return df_style

df_trim.style.apply(highlight_larger_fairness, axis=None)

Unnamed: 0,Dataset_Name,Base_Model,Stereotype,FinalBaseF1,FinalF1,InitF1,InitFairness,FinalFairness,NEpoch
76,ARC,TextCNN,Bigram,0.522761,0.5234,0.528883,42.03656,41.357744,
16,ARC,TextCNN,Idiom,0.501817,0.515303,0.528883,42.03656,41.04194,
26,ARC,TextCNN,Normal,0.536376,0.518467,0.528883,42.03656,41.094526,
52,ARC,TextCNN,Noun,0.529593,0.517699,0.528883,42.03656,40.976736,
24,ARC,TextCNN,RandomMask,0.529666,0.510161,0.528883,42.03656,40.988152,
82,ARC,TextRCNN,Bigram,0.535906,0.535906,0.569049,41.446255,41.779726,
53,ARC,TextRCNN,Idiom,0.533658,0.533658,0.569049,41.446255,41.709315,
27,ARC,TextRCNN,Normal,0.601003,0.601003,0.569049,41.446255,41.953924,
54,ARC,TextRCNN,Noun,0.565773,0.557942,0.569049,41.446255,41.726289,
21,ARC,TextRCNN,RandomMask,0.570077,0.570077,0.569049,41.446255,42.191697,


In [179]:
(df_trim["FinalFairness"].sum() - df_trim["InitFairness"].sum())/len(df_trim)

0.08701799044890844

In [163]:
# if Ngram == 2 set Stereotype to Bigram


In [67]:
sum(df['FinalFairness'] - df['InitFairness'])/len(df)

-0.24080997922028965

In [68]:
sum(df['FinalFairness'] - df['FinalBaseFairness'])/len(df)

-0.18177984947052148

In [69]:
sum(df['FinalF1'] - df['InitF1'])/len(df)

0.026067676740432866

In [70]:
sum(df['FinalF1'] - df['FinalBaseF1'])/len(df)

0.007871892024729533

In [71]:
sum(df['FinalAcc'] - df['FinalBaseAcc'])/len(df)

0.00044227175142268904

In [72]:
sum(df['FinalAcc'] - df['InitAcc'])/len(df)

0.015193846226511494

In [73]:
group_df = df.groupby(['Base_Model','Dataset_Name','Stereotype']).mean().unstack()

In [94]:
metrics = ['FinalAcc','FinalF1','FinalFairness']

In [110]:
# get the difference between Stereotype: "Normal" and "Bigram"
for metric in metrics:
    group_df[metric,'Difference'] = group_df[metric]['Bigram'] - group_df[metric]['Normal']
    print(f"{metric} difference between Bigram and Normal: {group_df[metric,'Difference'].sum()}")

FinalAcc difference between Bigram and Normal: 0.043788959968162455
FinalF1 difference between Bigram and Normal: 0.002090610926741343
FinalFairness difference between Bigram and Normal: -0.15767066933288554


In [111]:
for metric in metrics:
    group_df[metric,'Difference'] = group_df[metric]['RandomMask'] - group_df[metric]['Normal']
    print(f"{metric} difference between Random and Normal: {group_df[metric,'Difference'].sum()}")

FinalAcc difference between Random and Normal: 0.0379653620388376
FinalF1 difference between Random and Normal: 0.054506526536841016
FinalFairness difference between Random and Normal: -1.6722630071792643


In [112]:
for metric in metrics:
    group_df[metric,'Difference'] = group_df[metric]['Idiom'] - group_df[metric]['Normal']
    print(f"{metric} difference between Random and Normal: {group_df[metric,'Difference'].sum()}")

FinalAcc difference between Random and Normal: 0.01626704222176123
FinalF1 difference between Random and Normal: -0.006144645618923417
FinalFairness difference between Random and Normal: 1.0919160452429484


In [105]:
group_df['FinalFairness','Difference'].unstack()

Dataset_Name,ARC,Amazon,ChemProt,Economy,HyperPartisan,News,Parties,SCIERC,Twitter,Yelp_Hotel
Base_Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TextCNN,0.054165,0.071983,0.224031,0.569106,-0.588919,,-0.297137,-0.140036,0.037021,
TextRCNN,0.16225,-0.043818,0.055338,0.030544,-0.20509,,-0.025254,0.206851,-0.268705,


In [None]:
data = group_df.groupby(['Base_Model','Dataset_Name'])
data[]

In [41]:
import pandas as pd
import numpy as np

np.random.seed(0)

# Assume we have the following DataFrame
data = {
    'Gender': np.random.choice(['Male', 'Female'], 200),
    'Class': np.random.choice(['Class_'+str(i+1) for i in range(10)], 200),
    'Score': np.random.randint(60, 100, 200)
}
df = pd.DataFrame(data)

# Group by 'Class' and 'Gender', then calculate mean score
grouped = df.groupby(['Class', 'Gender'])['Score'].mean().unstack()

# Create a new column for the difference between male and female scores
grouped['Score_Difference'] = grouped['Male'] - grouped['Female']

# Now 'grouped' is a new DataFrame where each row is a class, and there's a column for the score difference between male and female
print(grouped)


Gender       Female       Male  Score_Difference
Class                                           
Class_1   73.750000  80.066667          6.316667
Class_10  74.000000  78.312500          4.312500
Class_2   77.875000  85.250000          7.375000
Class_3   82.333333  84.375000          2.041667
Class_4   82.333333  74.777778         -7.555556
Class_5   80.166667  77.909091         -2.257576
Class_6   76.777778  83.083333          6.305556
Class_7   83.500000  77.200000         -6.300000
Class_8   77.250000  78.000000          0.750000
Class_9   76.888889  77.375000          0.486111


In [42]:
grouped['Male']

Class
Class_1     80.066667
Class_10    78.312500
Class_2     85.250000
Class_3     84.375000
Class_4     74.777778
Class_5     77.909091
Class_6     83.083333
Class_7     77.200000
Class_8     78.000000
Class_9     77.375000
Name: Male, dtype: float64

In [76]:
import pandas as pd
import numpy as np

np.random.seed(0)

# Assume we have the following DataFrame
data = {
    'Gender': np.random.choice(['Male', 'Female'], 200),
    'Class': np.random.choice(['Class_'+str(i+1) for i in range(10)], 200),
    'Score': np.random.randint(60, 100, 200),
    'Age': np.random.choice(['Young', 'Old'], 200)
}
df = pd.DataFrame(data)

# Group by 'Class', 'Age' and 'Gender', then calculate mean score
grouped = df.groupby(['Class', 'Age', 'Gender']).mean().unstack()

# Create a new column for the difference between male and female scores
grouped['Score_Difference'] = grouped['Male'] - grouped['Female']

# Now 'grouped' is a new DataFrame where each row is a class with a specific age group, and there's a column for the score difference between male and female
print(grouped)


KeyError: 'Male'

In [85]:
m1 = df.groupby(['Class', 'Age', 'Gender'])['Score'].mean().unstack()


In [86]:
metrics = ['']

Unnamed: 0_level_0,Gender,Female,Male
Class,Age,Unnamed: 2_level_1,Unnamed: 3_level_1
Class_1,Old,77.0,66.0
Class_1,Young,64.0,85.181818
Class_10,Old,77.333333,80.142857
Class_10,Young,64.0,76.888889
Class_2,Old,75.333333,76.5
Class_2,Young,85.5,88.166667
Class_3,Old,82.25,79.0
Class_3,Young,82.5,87.6
Class_4,Old,80.2,73.0
Class_4,Young,85.0,76.2


In [91]:
m2 = df.groupby(['Class', 'Age', 'Gender']).mean().unstack()


In [92]:
m2

Unnamed: 0_level_0,Unnamed: 1_level_0,Score,Score
Unnamed: 0_level_1,Gender,Female,Male
Class,Age,Unnamed: 2_level_2,Unnamed: 3_level_2
Class_1,Old,77.0,66.0
Class_1,Young,64.0,85.181818
Class_10,Old,77.333333,80.142857
Class_10,Young,64.0,76.888889
Class_2,Old,75.333333,76.5
Class_2,Young,85.5,88.166667
Class_3,Old,82.25,79.0
Class_3,Young,82.5,87.6
Class_4,Old,80.2,73.0
Class_4,Young,85.0,76.2


In [93]:
m2['Score']

Unnamed: 0_level_0,Gender,Female,Male
Class,Age,Unnamed: 2_level_1,Unnamed: 3_level_1
Class_1,Old,77.0,66.0
Class_1,Young,64.0,85.181818
Class_10,Old,77.333333,80.142857
Class_10,Young,64.0,76.888889
Class_2,Old,75.333333,76.5
Class_2,Young,85.5,88.166667
Class_3,Old,82.25,79.0
Class_3,Young,82.5,87.6
Class_4,Old,80.2,73.0
Class_4,Young,85.0,76.2
