In [1]:
import pandas as pd
import numpy as np
import pickle
from tqdm.notebook import tqdm

# Read the pickled object that every later cell works on (it is indexed like a
# pandas DataFrame further down).
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
with open("../data/processed/pretrained_distance.pkl", mode="rb") as file:
    pretrained_data = pickle.load(file)

In [2]:
# Keep only the columns the analysis needs and derive the two numeric columns
# in a single chained expression:
#   * cos_distance is coerced to a numeric dtype;
#   * sentiment_confidence c is replaced by 2*c - 1.
# NOTE(review): presumably c lies in [0.5, 1] so that 2*c - 1 lands in [0, 1]
# -- confirm against the classifier that produced it.
# This cell rebinds `pretrained_data`, so running it twice without re-running
# the load cell applies the 2*c - 1 rescaling twice.
pretrained_data = pretrained_data[
    [
        "Person",
        "Race",
        "Gender",
        "cos_distance",
        "sentiment_polarity",
        "sentiment_confidence",
    ]
].assign(
    cos_distance=lambda frame: pd.to_numeric(frame["cos_distance"]),
    sentiment_confidence=lambda frame: frame["sentiment_confidence"] * 2 - 1,
)

In [3]:
# Preview the first rows of the trimmed and rescaled frame.
pretrained_data.head()

Unnamed: 0,Person,Race,Gender,cos_distance,sentiment_polarity,sentiment_confidence
0,Alonzo,African-American,male,0.470043,NEGATIVE,0.074782
1,Jamel,African-American,male,0.414978,NEGATIVE,0.074782
2,Alphonse,African-American,male,0.510046,NEGATIVE,0.074782
3,Jerome,African-American,male,0.512263,NEGATIVE,0.074782
4,Leroy,African-American,male,0.543624,NEGATIVE,0.074782


In [4]:
def getScoreForNames(data, names):
    """Sum the POSITIVE-minus-NEGATIVE weighted mean cosine distance over ``names``.

    For every name, the rows of ``data`` whose ``Person`` equals that name are
    split by ``sentiment_polarity``. Within each polarity the mean of
    ``cos_distance`` is computed with ``sentiment_confidence`` as weights, and
    the name contributes ``mean(POSITIVE) - mean(NEGATIVE)`` to the total.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``Person``, ``sentiment_polarity``,
        ``cos_distance`` and ``sentiment_confidence``.
    names : iterable
        Values to look up in ``data["Person"]``. A name listed twice is
        counted twice.

    Returns
    -------
    number
        The summed per-name differences; the integer ``0`` when ``names`` is
        empty.

    Raises
    ------
    KeyError
        If a name has no POSITIVE rows or no NEGATIVE rows in ``data``. The
        message identifies the name and the missing polarity.
    ZeroDivisionError
        Raised by ``numpy.average`` when the weights of a group sum to zero.
    """
    score_sum = 0

    for name in names:
        person_rows = data[data["Person"] == name]

        weighted_means = {}
        for polarity in ("POSITIVE", "NEGATIVE"):
            polarity_rows = person_rows[person_rows["sentiment_polarity"] == polarity]
            if polarity_rows.empty:
                # Fail with the real cause instead of a bare KeyError on the
                # polarity label that does not say which name was affected.
                raise KeyError(f"no {polarity} rows for name {name!r}")
            weighted_means[polarity] = np.average(
                polarity_rows["cos_distance"],
                weights=polarity_rows["sentiment_confidence"],
            )

        score_sum += weighted_means["POSITIVE"] - weighted_means["NEGATIVE"]

    return score_sum

In [5]:
# Distinct values of "Person" for each of the two race labels compared below.
afam_names = pretrained_data.loc[
    pretrained_data["Race"] == "African-American", "Person"
].unique()

european_names = pretrained_data.loc[
    pretrained_data["Race"] == "European", "Person"
].unique()

def getTestStatisticForRace(data):
    """Return the African-American score minus the European score for ``data``.

    Both scores come from ``getScoreForNames`` and use the module-level name
    lists ``afam_names`` and ``european_names`` as they are at call time.

    NOTE(review): the scores are sums over names, not means, so the difference
    is presumably only meaningful when both lists have the same length --
    confirm.
    """
    afam_score = getScoreForNames(data, afam_names)
    european_score = getScoreForNames(data, european_names)
    return afam_score - european_score

In [6]:
# Race statistic on the unshuffled data; the bare name on the last line makes
# the notebook display its value.
race_test_statistic = getTestStatisticForRace(pretrained_data)
race_test_statistic

-0.008242118085543926

In [7]:
# Distinct values of "Person" for each of the two gender labels compared below.
female_names = pretrained_data.loc[
    pretrained_data["Gender"] == "female", "Person"
].unique()

male_names = pretrained_data.loc[
    pretrained_data["Gender"] == "male", "Person"
].unique()

def getTestStatisticForGender(data):
    """Return the female score minus the male score for ``data``.

    Both scores come from ``getScoreForNames`` and use the module-level name
    lists ``female_names`` and ``male_names`` as they are at call time.

    NOTE(review): the scores are sums over names, not means, so the difference
    is presumably only meaningful when both lists have the same length --
    confirm.
    """
    female_score = getScoreForNames(data, female_names)
    male_score = getScoreForNames(data, male_names)
    return female_score - male_score

In [8]:
# Gender statistic on the unshuffled data; the bare name on the last line makes
# the notebook display its value.
gender_test_statistic = getTestStatisticForGender(pretrained_data)
gender_test_statistic

-0.09082521605430965

In [9]:
# Permutation test: repeatedly shuffle cos_distance across all rows (leaving
# Person, Race, Gender and the sentiment columns in place) and recompute both
# test statistics to build their distributions under shuffling.
NUM_ITERATIONS = 1000
# Fixed seed so that Restart & Run All reproduces the same shuffles.
PERMUTATION_SEED = 0
permutation_rng = np.random.default_rng(PERMUTATION_SEED)

race_permutation_test_statistics = []
gender_permutation_test_statistics = []
# Work on a copy so the observed data in `pretrained_data` stays untouched.
shuffled_data = pretrained_data.copy()

for i in tqdm(range(NUM_ITERATIONS)):
    # Assign a plain NumPy array so the permuted values are written by
    # position. Assigning a pandas object would be re-aligned on index labels,
    # which only shuffles correctly for an unnamed default 0..n-1 index.
    shuffled_data["cos_distance"] = permutation_rng.permutation(
        shuffled_data["cos_distance"].to_numpy()
    )

    new_race_test_statistic = getTestStatisticForRace(shuffled_data)
    race_permutation_test_statistics.append(new_race_test_statistic)

    new_gender_test_statistic = getTestStatisticForGender(shuffled_data)
    gender_permutation_test_statistics.append(new_gender_test_statistic)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [10]:
# Standard deviation (np.std default, ddof=0) of the race statistics collected
# from the shuffled data.
np.std(race_permutation_test_statistics)

0.01688458835502464

In [11]:
# Standard deviation (np.std default, ddof=0) of the gender statistics
# collected from the shuffled data.
np.std(gender_permutation_test_statistics)

0.016748794476626166