# Nutritional Labels for ADS: The Effect of Word Embeddings on Bias
# Responsible Data Science: Final Project


In [1]:
import numpy as np
import pandas as pd
import altair as alt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import defaultdict
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

## A. Data Pre-Processing

### Import Predictions

In [2]:
# Directory holding the saved predictions, one .npy file per embedding model
path = 'preds/'
filenames = [
    'fast_preds.npy',
    'glove_preds.npy',
    'numberbatch_preds.npy',
    'without_pretrained.npy',
    'meta_preds.npy',
]
# Display names, aligned index-for-index with `filenames`
pred_names = [
    'FastText',
    'GloVe',
    'ConceptNet',
    'Keras Embeddings',
    'Meta (FT+GV+CN)',
]

# Load every prediction array, preserving the order of `filenames`
y_preds_raw = [np.load(path + fname) for fname in filenames]

### Data Split like ADS

In [3]:
# Contraction -> expansion lookup. Built once here rather than on every call,
# because preprocess() is applied to each comment individually.
_CONTRACTIONS = {
    "ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because",
    "could've": "could have", "couldn't": "could not", "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is",
    "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have",
    "I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have",
    "i'll": "i will", "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is", "let's": "let us", "ma'am": "madam",
    "mayn't": "may not", "might've": "might have", "mightn't": "might not",
    "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
    "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
    "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
    "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
    "so's": "so as", "this's": "this is", "that'd": "that would",
    "that'd've": "that would have", "that's": "that is", "there'd": "there would",
    "there'd've": "there would have", "there's": "there is", "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
    "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
    "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
    "will've": "will have", "won't": "will not", "won't've": "will not have",
    "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
    "you're": "you are", "you've": "you have",
}
# Apostrophe look-alikes that are normalised to "'" so the contraction lookup can match.
_APOSTROPHE_VARIANTS = ("’", "‘", "´", "`")
# Characters that are replaced by a plain space.
_SPACE_REPLACEMENTS = {"_": " ", "`": " "}
# Every character here is padded with spaces so it becomes a separate token.
# The backslash before "×" is written as "\\" (the original "\×" was an invalid
# escape sequence that only worked by accident); the resulting string is unchanged.
_PUNCTUATION = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'


def preprocess(text):
    """Tokenise a comment string.

    Steps, in order:
      1. normalise apostrophe look-alikes to "'";
      2. expand contractions (exact, case-sensitive match on space-separated tokens);
      3. replace "_" with a space;
      4. surround every punctuation/special character with spaces;
      5. split on whitespace.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Tokens; punctuation characters are returned as separate tokens.
    """
    for variant in _APOSTROPHE_VARIANTS:
        text = text.replace(variant, "'")
    # Split on a single space (not arbitrary whitespace) exactly as before, so
    # tokens containing tabs/newlines are left for the final split().
    text = ' '.join(_CONTRACTIONS.get(token, token) for token in text.split(" "))
    for old, new in _SPACE_REPLACEMENTS.items():
        text = text.replace(old, new)
    for mark in _PUNCTUATION:
        text = text.replace(mark, f' {mark} ')
    return text.split()

# ADS - load the original training data
train = pd.read_csv('data/train.csv')

# Identity columns treated as protected groups in the analysis
protected_groups = [
    'male', 'female', 'homosexual_gay_or_lesbian',
    'christian', 'jewish', 'muslim',
    'black', 'white', 'psychiatric_or_mental_illness',
]
# Comment text and toxicity target, followed by the identity columns
columns = ['comment_text', 'target'] + protected_groups

# ADS - split the data with a 20% hold-out and a fixed seed; only the
# validation part of the auxiliary frame is kept.
_, _, _, AUX_val = train_test_split(
    train['comment_text'],
    train[columns],
    test_size=.2,
    random_state=42,
)


### Custom Indexing the Split Data for Analysis

In [4]:
# Lower-case and tokenise every validation comment
AUX_val['comment_text'] = AUX_val['comment_text'].apply(lambda raw: preprocess(raw.lower()))
# Move to a 0..n-1 index so that index labels are row positions
AUX_val = AUX_val.reset_index(drop=True)
# Drop rows with missing values and remember which row positions survive;
# the same positions are later used to select rows from the prediction arrays.
AUX_val = AUX_val[columns].dropna()
non_nan_idx = AUX_val.index.tolist()
AUX_val = AUX_val.reset_index(drop=True)
# Binarise the target and every auxiliary (identity) column at 0.5
for col in ['target'] + protected_groups:
    AUX_val[col] = np.where(AUX_val[col] >= 0.5, 1, 0)
# Ground-truth labels as a plain array
y_val = AUX_val['target'].values
    

In [5]:
# Binarise each model's scores at 0.5, then keep only the rows that survived
# the NaN filter (np.take without an axis works on the flattened array).
y_preds = [
    np.take(np.where(scores >= 0.5, 1, 0), non_nan_idx)
    for scores in y_preds_raw
]
    

## B. Calculating Fairness Metrics

### Overall Accuracy

In [6]:
# One [model name, accuracy] row per embedding model
report_global_accuracy = [
    [name, accuracy_score(pred, y_val)]
    for name, pred in zip(pred_names, y_preds)
]
# Convert to dataframe
report_a = pd.DataFrame(report_global_accuracy, columns=['Embeddings', 'Global Accuracy'])


### Unprivileged Group Accuracy - Classwise

In [7]:
# Accuracy of each model restricted to comments that mention the identity
# (the "unprivileged" group for that identity).
report_class_wise_accuracy_unpriv = []
for c in protected_groups:
    # Row positions of the group are the same for every model: compute once.
    u_index = AUX_val.index[AUX_val[c] == 1].tolist()
    y_true_u = np.take(y_val, u_index)
    for i in range(len(y_preds)):
        temp = accuracy_score(y_true_u, np.take(y_preds[i], u_index))
        report_class_wise_accuracy_unpriv.append([c, pred_names[i], temp])
# Convert to dataframe
report_b = pd.DataFrame(report_class_wise_accuracy_unpriv, columns=['Identity', 'Embeddings', 'Group Accuracy'])
# A scalar is broadcast to every row, so this no longer depends on there being
# exactly 9 identities and 5 models.
report_b['Group'] = 'Unprivileged'

In [8]:
#display(plot_on_class(report_b, 'Unprivileged Accuracy', [0.76, 0.94]))

### Privileged Group Accuracy - Classwise

In [9]:
# Accuracy of each model restricted to comments that do NOT mention the
# identity (the "privileged" group for that identity).
report_class_wise_accuracy_priv = []
for c in protected_groups:
    # Row positions of the group are the same for every model: compute once.
    p_index = AUX_val.index[AUX_val[c] == 0].tolist()
    y_true_p = np.take(y_val, p_index)
    for i in range(len(y_preds)):
        temp = accuracy_score(y_true_p, np.take(y_preds[i], p_index))
        report_class_wise_accuracy_priv.append([c, pred_names[i], temp])
# Convert to dataframe
report_c = pd.DataFrame(report_class_wise_accuracy_priv, columns=['Identity', 'Embeddings', 'Group Accuracy'])
# A scalar is broadcast to every row, so this no longer depends on there being
# exactly 9 identities and 5 models.
report_c['Group'] = 'Privileged'

In [10]:
# Stack the unprivileged and privileged accuracy tables (original row labels are kept)
report_bc = pd.concat(objs=[report_b, report_c])

### Combined Accuracy

In [11]:
#display(plot_on_class(report_c, 'Privileged Accuracy', [0.76, 0.94]))

### Difference in Accuracy (Privileged - Unprivileged)

In [12]:
# Pair the unprivileged and privileged rows (same identity/model order in both
# lists) and store privileged accuracy minus unprivileged accuracy.
report_class_wise_accuracy_delta = [
    [identity, embedding, acc_priv - acc_unpriv]
    for (identity, embedding, acc_unpriv), (_, _, acc_priv)
    in zip(report_class_wise_accuracy_unpriv, report_class_wise_accuracy_priv)
]
# Convert to dataframe
report_d = pd.DataFrame(
    report_class_wise_accuracy_delta,
    columns=['Identity', 'Embeddings', 'Accuracy Difference'],
)


In [13]:
#display(plot_on_class(report_d, 'Accuracy Difference', [-0.1, 0.16]))

### Disparate Impact for Non Toxicity - Classwise

In [14]:
report_class_wise_disparate_impact0 = []
report_class_wise_disparate_impact1 = []
# Disparate impact = P(predicted outcome | unprivileged) / P(predicted outcome | privileged).
# Values less than 80% will signify disparate impact i.e. unprivileged class is less represented in positive outcomes
for c in protected_groups:
    # Group membership does not depend on the model: compute once per identity.
    u_index = AUX_val.index[AUX_val[c] == 1].tolist()
    p_index = AUX_val.index[AUX_val[c] == 0].tolist()
    for i in range(len(y_preds)):
        # Both groups must use the MODEL PREDICTIONS; the unprivileged group
        # previously used the ground-truth labels (y_val) by mistake.
        y_pred_unpriv = np.take(y_preds[i], u_index)
        y_pred_priv = np.take(y_preds[i], p_index)
        # We have inverse mappings of 0 (Non Toxic) and 1 (Toxic) i.e. 0 is desirable and 1 is not desirable
        di0 = (np.count_nonzero(y_pred_unpriv == 0)/len(y_pred_unpriv)) / (np.count_nonzero(y_pred_priv == 0)/len(y_pred_priv))
        di1 = (np.count_nonzero(y_pred_unpriv == 1)/len(y_pred_unpriv)) / (np.count_nonzero(y_pred_priv == 1)/len(y_pred_priv))
        report_class_wise_disparate_impact0.append([c, pred_names[i], di0])
        report_class_wise_disparate_impact1.append([c, pred_names[i], di1])
# Convert to dataframe
report_e0 = pd.DataFrame(report_class_wise_disparate_impact0, columns=['Identity', 'Embeddings', 'Disparate Impact - Non Toxicity'])
report_e1 = pd.DataFrame(report_class_wise_disparate_impact1, columns=['Identity', 'Embeddings', 'Disparate Impact - Toxicity'])


### False Positive Rate Difference - Classwise

In [15]:
# False positive rate difference: FPR(unprivileged) - FPR(privileged),
# where FPR = FP / (FP + TN).
report_class_wise_diff_fpr = []
for c in protected_groups:
    # Group membership and ground truth do not depend on the model: compute once.
    u_index = AUX_val.index[AUX_val[c] == 1].tolist()
    p_index = AUX_val.index[AUX_val[c] == 0].tolist()
    y_true_u = np.take(y_val, u_index)
    y_true_p = np.take(y_val, p_index)
    for i in range(len(y_preds)):
        # confusion_matrix expects (y_true, y_pred). With the arguments swapped,
        # the second unpacked cell is FN rather than FP, so the old code
        # reported FN / (FN + TN) instead of a false positive rate.
        # labels=[0, 1] guarantees a 2x2 matrix even if a class is absent.
        tn_u, fp_u, _, _ = confusion_matrix(y_true_u, np.take(y_preds[i], u_index), labels=[0, 1]).ravel()
        tn_p, fp_p, _, _ = confusion_matrix(y_true_p, np.take(y_preds[i], p_index), labels=[0, 1]).ravel()
        fpr = (fp_u/(fp_u+tn_u)) - (fp_p/(fp_p+tn_p))
        report_class_wise_diff_fpr.append([c, pred_names[i], fpr])
# Convert to dataframe
report_f = pd.DataFrame(report_class_wise_diff_fpr, columns=['Identity', 'Embeddings', 'False Positive Rate Difference'])


## C. Visualization for Identities


In [16]:
# Plots by aggregating on Identities
def plot_on_class(source, y_ax, y_dom):
    """Bar chart of the mean of a metric per identity, with error bars.

    Parameters
    ----------
    source : data passed to ``alt.layer(..., data=source)``; the encodings
        reference an ``Identity`` field and the field named by ``y_ax``.
    y_ax : str
        Name of the metric field to aggregate.
    y_dom : list
        Domain of the y-axis scale of the bar layer.

    Returns
    -------
    The layered 350x350 Altair chart with axis font sizes configured.
    """
    mean_bars = alt.Chart().mark_bar(size=20).encode(
        x=alt.X('Identity:O', axis=alt.Axis(labels=False)),
        y=alt.Y(f'mean({y_ax}):Q', scale=alt.Scale(domain=y_dom), title='Mean'),
        color=alt.Color('Identity:N'),
    )
    ci_bars = alt.Chart().mark_errorbar(extent='ci').encode(
        x='Identity:O',
        y=f'{y_ax}:Q',
    )

    layered = alt.layer(mean_bars, ci_bars, data=source)
    sized = layered.properties(width=350, height=350)
    return sized.configure_axis(labelFontSize=16, titleFontSize=16)

### Accuracy Privileged v/s Unprivileged

In [17]:
# Mean group accuracy (bars) per privileged/unprivileged group
bars = (
    alt.Chart()
    .mark_bar(width=30)
    .encode(
        x=alt.X('Group:O', axis=alt.Axis(labels=False)),
        y=alt.Y('mean(Group Accuracy):Q', title='Mean', scale=alt.Scale(domain=[0.75, 1.0])),
        color=alt.Color('Group:N'),
    )
    .interactive()
)

# Error bars over the same field
error_bars = (
    alt.Chart()
    .mark_errorbar(extent='ci')
    .encode(x='Group:O', y='Group Accuracy:Q')
)

# One facet column per identity; the bare expression is the cell output
(
    alt.layer(bars, error_bars, data=report_bc)
    .properties(width=100)
    .facet(column='Identity:N')
    .configure_axis(labelFontSize=16, titleFontSize=14)
)

- Social identities such as Black, Homosexual (Gay or Lesbian), Muslim and White have lower accuracy than the other identities and than their privileged counterparts.

### Disparate Impact

In [18]:
# Toxicity first, then non-toxicity: (frame, metric column, y-axis domain)
for frame, metric, y_range in (
    (report_e1, 'Disparate Impact - Toxicity', [1, 4.5]),
    (report_e0, 'Disparate Impact - Non Toxicity', [0.65, 1]),
):
    display(plot_on_class(frame, metric, y_range))

### False Positive Rate

In [19]:
# Mean false positive rate difference per identity
display(
    plot_on_class(
        source=report_f,
        y_ax='False Positive Rate Difference',
        y_dom=[0.0, 0.16],
    )
)

## D. Visualizations for Embeddings

### Accuracy

In [20]:
# Global accuracy per embedding; the y-axis is zoomed to [0.918, 0.930]
(
    alt.Chart(report_a)
    .mark_bar(size=30)
    .encode(
        x=alt.X('Embeddings', axis=alt.Axis(labels=False)),
        y=alt.Y('Global Accuracy', scale=alt.Scale(domain=[0.918, 0.930])),
        color=alt.Color('Embeddings:N'),
    )
    .properties(width=300, height=300)
)


In [21]:
# Plots by aggregating on Embeddings
def plot_on_emb(source, y_ax, y_dom, t):
    """Box plot of a metric per embedding model.

    Parameters
    ----------
    source : data for the chart; the encodings reference an ``Embeddings``
        field and the field named by ``y_ax``.
    y_ax : str
        Name of the metric field plotted on the y-axis.
    y_dom : list
        Domain of the y-axis scale.
    t : str
        Chart title.

    Returns
    -------
    A 350x350 Altair box-plot chart with axis font sizes configured.
    """
    boxes = alt.Chart(source, title=t).mark_boxplot(size=30)
    encoded = boxes.encode(
        x=alt.X('Embeddings:O', axis=alt.Axis(labels=False)),
        y=alt.Y(f'{y_ax}:Q', scale=alt.Scale(domain=y_dom)),
        color=alt.Color('Embeddings:N', legend=None),
    )
    sized = encoded.properties(width=350, height=350)
    return sized.configure_axis(labelFontSize=14, titleFontSize=14)

In [22]:
# Mean group accuracy (bars) per privileged/unprivileged group
bars = (
    alt.Chart()
    .mark_bar(width=30)
    .encode(
        x=alt.X('Group:O', axis=alt.Axis(labels=False)),
        y=alt.Y('mean(Group Accuracy):Q', title='Mean', scale=alt.Scale(domain=[0.8, 1.0])),
        color=alt.Color('Group:N'),
    )
    .interactive()
)

# Error bars over the same field
error_bars = (
    alt.Chart()
    .mark_errorbar(extent='ci')
    .encode(x='Group:O', y='Group Accuracy:Q')
)

# One facet column per embedding model; the bare expression is the cell output
(
    alt.layer(bars, error_bars, data=report_bc)
    .properties(width=100)
    .facet(column='Embeddings:N')
    .configure_axis(labelFontSize=14, titleFontSize=14)
)

### Disparate Impact

In [23]:
# Toxicity first, then non-toxicity: (frame, metric column, y-axis domain, title)
for frame, metric, y_range, heading in (
    (report_e1, 'Disparate Impact - Toxicity', [0.5, 5.0], 'Disparate Impact on Toxicity'),
    (report_e0, 'Disparate Impact - Non Toxicity', [0.70, 1], 'Disparate Impact on Non Toxicity'),
):
    display(plot_on_emb(frame, metric, y_range, heading))

### False Positive Rate Difference

In [24]:
# Distribution of the false positive rate difference across identities, per embedding
display(
    plot_on_emb(
        source=report_f,
        y_ax='False Positive Rate Difference',
        y_dom=[0, 0.18],
        t='False Positive Rate Difference',
    )
)

- GloVe embeddings have the lowest median False Positive Rate difference.
- FastText embeddings have the lowest IQR, i.e. the smallest spread of False Positive Rate differences across identities.

## E. FPR Detailed

In [25]:
# FPR detailed analysis: one bar per embedding, one facet column per identity
(
    alt.Chart(report_f)
    .mark_bar()
    .encode(
        x=alt.X('Embeddings:O', axis=alt.Axis(labels=False)),
        y='False Positive Rate Difference:Q',
        color=alt.Color('Embeddings:N'),
        column='Identity:N',
    )
    .configure_axis(labelFontSize=14, titleFontSize=14)
)

## F. Original Author's Bias Detection Scores

In [26]:
# Load the per-subgroup AUC table and show it
auc = pd.read_excel(io='auc.xls')
display(auc)

Unnamed: 0,subgroup,bnsp_auc,bpsn_auc,subgroup_auc,subgroup_size
0,black,0.818785,0.973196,0.833273,1530
1,homosexual_gay_or_lesbian,0.827367,0.972482,0.842704,1022
2,white,0.840458,0.971736,0.844978,2561
3,muslim,0.859703,0.969727,0.859745,2523
4,psychiatric_or_mental_illness,0.886718,0.978235,0.909133,272
5,jewish,0.905592,0.967983,0.911325,773
6,male,0.92886,0.968446,0.922873,3792
7,female,0.92907,0.96726,0.923345,6047
8,christian,0.942399,0.957218,0.926088,3310


### Subgroup AUC

In [27]:
# Subgroup AUC per identity; the y-axis is zoomed to [0.80, 0.95]
(
    alt.Chart(auc, title="Subgroup AUC")
    .mark_bar(size=25)
    .encode(
        x=alt.X('subgroup', axis=alt.Axis(labels=False)),
        y=alt.Y('subgroup_auc', scale=alt.Scale(domain=[0.80, 0.95])),
        color=alt.Color('subgroup:N'),
    )
    .properties(width=350, height=350)
)


### Subgroup BNSP AUC

In [28]:
# BNSP AUC per identity; the y-axis is zoomed to [0.80, 0.95]
(
    alt.Chart(auc, title="Subgroup BNSP AUC")
    .mark_bar(size=25)
    .encode(
        x=alt.X('subgroup', axis=alt.Axis(labels=False)),
        y=alt.Y('bnsp_auc', scale=alt.Scale(domain=[0.80, 0.95])),
        color=alt.Color('subgroup:N'),
    )
    .properties(width=350, height=350)
)

### Subgroup BPSN AUC

In [29]:
# BPSN AUC per identity; the y-axis is zoomed to [0.95, 0.98]
(
    alt.Chart(auc, title="Subgroup BPSN AUC")
    .mark_bar(size=25)
    .encode(
        x=alt.X('subgroup', axis=alt.Axis(labels=False)),
        y=alt.Y('bpsn_auc', scale=alt.Scale(domain=[0.95, 0.98])),
        color=alt.Color('subgroup:N'),
    )
    .properties(width=350, height=350)
)

#### Insight:
- Subgroup AUC and Subgroup BNSP AUC show same trends. And they are inversely correlated to the FPR difference.
- Subgroup AUC and Subgroup BNSP AUC also show same trend as Disparate Impact - Non Toxicity.
- Subgroup BPSN AUC shows a trend similar to the FPR difference.
- Subgroup AUC and its variants are very effective measures for fairness.


## Conclusion:
- Blacks, homosexuals, muslims and whites are the identities with most unfair results. The word embedding and LSTM based classification techniques have all failed at being unbiased in predicting the results for these groups.

## G. Explaining the False Positive Misclassification of Blacks & Homosexuals
Identifying the Common Tokens for all the False Positive Cases (across all 5 embeddings)

In [30]:
# Indexing the false positives for blacks & homosexuals
# fp: (identity, row) -> number of embeddings that produced a false positive on that row
# fn: row of every false negative, one entry per (embedding, row) occurrence
fp = defaultdict(int)
fn = []
# Membership flags as arrays, read by row position.
# NOTE(review): assumes AUX_val has a 0..n-1 index (it is reset during
# pre-processing), so positions coincide with the labels used previously.
is_black = AUX_val['black'].to_numpy() == 1
is_homosexual = AUX_val['homosexual_gay_or_lesbian'].to_numpy() == 1
for y_pred in y_preds:
    # Predicted toxic but actually non-toxic, in ascending row order
    for row in np.flatnonzero((y_pred == 1) & (y_val == 0)).tolist():
        # A comment flagged for both identities is attributed to 'black' only
        if is_black[row]:
            fp[('black', row)] += 1
        elif is_homosexual[row]:
            fp[('homosexual_gay_or_lesbian', row)] += 1
    # Predicted non-toxic but actually toxic
    fn.extend(np.flatnonzero((y_pred == 0) & (y_val == 1)).tolist())

In [31]:
# Getting tokens and count of these identities where misclassification occurred
# for every embedding model
n_models = len(y_preds)  # a row must be a false positive for all of them
WORDS_homosexuals = defaultdict(int)
WORDS_blacks = defaultdict(int)
COUNT_blacks_FP_all = 0
COUNT_homosexuals_FP_all = 0

for (identity, row), hits in fp.items():
    if hits != n_models:
        continue
    # Keep tokens longer than two characters that are not stopwords
    tokens = [w for w in AUX_val['comment_text'][row] if w not in stopwords and len(w) > 2]
    if identity == 'black':
        COUNT_blacks_FP_all += 1
        for w in tokens:
            WORDS_blacks[w] += 1
    elif identity == 'homosexual_gay_or_lesbian':
        COUNT_homosexuals_FP_all += 1
        for w in tokens:
            WORDS_homosexuals[w] += 1

In [32]:
# Report how many comments were false positives under every embedding
for label, count in (
    ('Black False Positives for all 5 embeddings: ', COUNT_blacks_FP_all),
    ('Homosexuals False Positives for all 5 embeddings: ', COUNT_homosexuals_FP_all),
):
    print(label, count)

Black False Positives for all 5 embeddings:  85
Homosexuals False Positives for all 5 embeddings:  54


In [33]:
# Sorting words by descending count (ties keep their first-seen order).
# dict(...) accepts both the original word->count mapping and the list of
# (word, count) pairs this cell leaves behind, so re-running the cell no
# longer fails with "'list' object has no attribute 'items'".
WORDS_homosexuals = sorted(dict(WORDS_homosexuals).items(), key=lambda x: x[1], reverse=True)
WORDS_blacks = sorted(dict(WORDS_blacks).items(), key=lambda x: x[1], reverse=True)
# dataframes
WORDS_homosexuals_DF = pd.DataFrame(WORDS_homosexuals, columns=['Word', 'Count'])
WORDS_blacks_DF = pd.DataFrame(WORDS_blacks, columns=['Word', 'Count'])

### Top 20 FP misclassifying words for Blacks

In [34]:
# Plotting the 20 most frequent words in the always-false-positive comments for the 'black' identity
(
    alt.Chart(
        WORDS_blacks_DF.iloc[:20],
        title='Top 20 False positive Misclassifying Words - Blacks',
    )
    .mark_circle()
    .encode(
        alt.X('Word', scale=alt.Scale(zero=False)),
        alt.Y('Count', scale=alt.Scale(zero=False, padding=1)),
        color=alt.Color('Word', legend=None),
        size=alt.Size('Count', legend=None),
    )
    .properties(width=400, height=200)
    .configure_axis(labelFontSize=14, titleFontSize=14)
)


- Words like black, white, people, trump, racist etc. cause FP misclassification of comments.

### Top 20 FP misclassifying words for Homosexuals

In [35]:
# Plotting the 20 most frequent words in the always-false-positive comments
# for the 'homosexual_gay_or_lesbian' identity.
# The title previously said "Blacks" (copied from the chart above).
alt.Chart(WORDS_homosexuals_DF[:20], title='Top 20 False positive Misclassifying Words - Homosexuals').mark_circle().encode(
    alt.X('Word', scale=alt.Scale(zero=False)),
    alt.Y('Count', scale=alt.Scale(zero=False, padding=1)),
    color=alt.Color('Word', legend=None),
    size=alt.Size('Count', legend=None)
).properties(
    width=400,
    height=200
).configure_axis(
    labelFontSize=14,
    titleFontSize=14
)


- Words like gay, people, one, trump, sex etc. cause FP misclassification of comments.