In [74]:
%load_ext autoreload
%autoreload 2

import numpy as np
from tqdm.notebook import tqdm
import random
from sklearn.neural_network import MLPClassifier
from collections import Counter

import matplotlib.pyplot as plt
import torch
from torch import nn
from torch import optim
import sklearn
import math

from utils import *
import copy
from sklearn import tree
from sklearn.metrics import fbeta_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from scipy.spatial.distance import cosine
from sklearn.linear_model import LogisticRegression
import copy
import pandas as pd

from models.toxicity_data_models import *
from models.train_toxicity_detection import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset

In [2]:
# Load the pretrained GloVe (Twitter, 25d) embeddings.
# Replace the path below with the location of the GloVe file on your system.

glove_model = loadGloveModel('../All Code/models/glove.twitter.27B.25d.txt')
# The keys of the loaded model form the vocabulary; the last line displays its size.
vocab = glove_model.keys()
len(vocab)

Loading Glove Model


1193514

### Data sources

There are two data sources being used for this analysis.

1. The first is the main Jigsaw dataset. To run the code below, download its `train.csv` file via this link: https://github.com/Nihal2409/Jigsaw-Unintended-Bias-in-Toxicity-Classification/blob/master/Jigsaw_Unintended_Bias_in_Toxicity_Classification.ipynb

2. The second is the specialized rater pool data collected by Goyal et al. Download its `specialized_rater_pools_data.csv` file from the following link: https://www.kaggle.com/datasets/google/jigsaw-specialized-rater-pools-dataset

In [75]:
## Path to the train.csv file obtained from link (1) above; replace with its location on your system.
jigsaw_loc = "jigsaw-unintended-bias-in-toxicity-classification/train.csv"

## Path to the specialized_rater_pools_data.csv file obtained from link (2) above; replace with its location on your system.
spec_raters_loc = "jigsaw-unintended-bias-in-toxicity-classification/specialized_rater_pools_data.csv"


In [91]:
## Fetch and preprocess the data stored at the two locations configured above.
## The helper returns two objects: all_data (from jigsaw_loc, presumably the main Jigsaw data)
## and spec_data (the specialized rater pool annotations, previewed in the next cell).
all_data, spec_data = getJigsawData(jigsaw_loc, spec_raters_loc)

In [77]:
# Preview the specialized rater pool data: one row per (comment id, annotator) rating,
# with the annotator's rater_group in the last column.
spec_data

Unnamed: 0,id,unique_contributor_id,identity_attack,insult,obscene,threat,toxic_score,comment_text,rater_group
0,6080885,5202370157,1.0,1.0,1.0,1.0,1.0,LW2 - A little humor answering those inevitabl...,AA
1,6080885,860598359,1.0,1.0,1.0,1.0,1.0,LW2 - A little humor answering those inevitabl...,AA
2,6080885,8302541685,1.0,1.0,1.0,1.0,1.0,LW2 - A little humor answering those inevitabl...,AA
3,6080885,2601547272,1.0,1.0,1.0,1.0,1.0,LW2 - A little humor answering those inevitabl...,AA
4,6080885,3101946413,1.0,1.0,1.0,1.0,1.0,LW2 - A little humor answering those inevitabl...,AA
...,...,...,...,...,...,...,...,...,...
382495,6047764,2121525947,1.0,1.0,1.0,1.0,1.0,"Respect is not a God given right, as far back ...",Control
382496,6047764,3223755939,1.0,1.0,1.0,1.0,1.0,"Respect is not a God given right, as far back ...",Control
382497,6047764,5722501435,1.0,1.0,1.0,1.0,1.0,"Respect is not a God given right, as far back ...",Control
382498,6047764,5421794538,-1.0,0.0,1.0,1.0,-1.0,"Respect is not a God given right, as far back ...",Control


### Ground truth

We present analyses with respect to both "subjective" ground truth (defined by annotators whose demographic matches the demographic targeted by a post) and "objective" ground truth (defined by the initial Jigsaw data annotators). Please see the paper for further discussion of these definitions.

In this code, you can select "subjective" or "objective" ground truth by changing the variable below.

In [78]:
# Ground-truth definition used for the analysis: "subjective" or "objective" (see the text above).
analysis_type = "subjective"

In [82]:
# Extract a 25-dimensional feature for each post using the pretrained GloVe model.
# getFeatures returns three objects keyed per post (presumably dicts: post -> feature vector,
# post -> label under the chosen analysis_type, post -> id) — TODO confirm against utils.

postsToFeatures, postsToLabels, postsToIds = getFeatures(all_data, spec_data, glove_model, analysis_type)
# Number of posts for which features were extracted.
len(postsToFeatures)

22200

### Human annotators

In [83]:
# Get all annotator ids (the helper also prints how many annotators were found).
# The ids are matched against spec_data['unique_contributor_id'] in the next cell.
experts = getExperts(all_data, spec_data)

953 annotators


In [84]:
# Map every annotator id to the rater group ("AA", "LGBTQ" or "Control") of that annotator.
# The lookup is built in a single pass over spec_data rather than filtering the full frame
# once per annotator; drop_duplicates keeps each annotator's first row, so the group taken
# is the one on the annotator's first appearance in spec_data.
first_rows = spec_data.drop_duplicates(subset="unique_contributor_id")
group_by_annotator = dict(zip(first_rows["unique_contributor_id"], first_rows["rater_group"]))
# Raises KeyError if an id in `experts` has no row in spec_data.
expert_pools = {ann: group_by_annotator[ann] for ann in experts}

## Distribution of annotator demographics from Goyal et al. data
## Displayed as (LGBTQ, AA, Control); Counter yields 0 for a group with no annotators.
pool_counts = Counter(expert_pools.values())
pool_counts["LGBTQ"], pool_counts["AA"], pool_counts["Control"]


(318, 313, 322)

## Online training

#### Generate 25 random train-test splits

In [85]:
# Number of independent train/test partitions to generate.
# The training cell further down indexes train_parts/test_parts by repetition, so it
# must not run more repetitions than this value.
n_splits = 25

train_parts, test_parts = [], []
dSims = []  # not used anywhere in the visible notebook; kept in case other code reads it
# NOTE(review): no random seed is set in the visible notebook, so the partitions presumably
# differ from run to run — confirm in getDictPartition.
for rep in range(n_splits):
    # Partition the posts and store each side as a list so it can be reused per repetition.
    train, test = getDictPartition(postsToFeatures)
    train_parts.append(list(train))
    test_parts.append(list(test))

#### Select the training algorithm by assigning the appropriate value to the variable below

In [86]:
# Training algorithm to run: "Strict-Matching" (default here) or "Smooth-Matching".
algorithm = "Strict-Matching" ## replace with Smooth-Matching if using that

In [None]:

# Values and keys of postsToFeatures as two parallel lists (same dict order), passed to the
# training and testing helpers below.
X = list(postsToFeatures.values())
ps = list(postsToFeatures.keys())

# Per-repetition test metrics; accs, f_scores and roc_scores are appended to in the loop below.
# NOTE(review): prec_scores, rec_scores, rocs_by_k and rocs_group are initialised here but never
# filled in this cell, although the summary cell that follows reads rocs_group — confirm where
# they are meant to be populated.
accs, f_scores, roc_scores, prec_scores, rec_scores = [], [], [], [], []
rocs_by_k = {k:[] for k in range(1,8)}
rocs_group = [[], [], []]

# Number of repetitions; train_parts/test_parts are indexed by rep, so this must not exceed
# the number of splits generated earlier (25).
reps = 25

### Repeat experiment #rep times with different dataset splits
for rep in (range(reps)):
    print ("\nStarting rep", rep)

    # Use the pre-generated partition for this repetition.
    train, test = train_parts[rep], test_parts[rep]
    # Build a fresh deferrer for the selected algorithm on this repetition's train partition.
    deferrer = getDeferrer(algorithm, experts, expert_pools, all_data, postsToFeatures, postsToIds, train)

    ## Train model using the train partition
    deferrer = train_allocation(algorithm, train, deferrer, experts, expert_pools, postsToLabels, postsToIds, X, ps, all_data, spec_data)

    ## Testing allocation model
    # test_allocation returns three values, stored below as accuracy, F1 and ROC score.
    acc, f1, roc = test_allocation(test, deferrer, postsToLabels, postsToIds, X, ps, all_data, spec_data)

    accs.append(acc)
    f_scores.append(f1)
    roc_scores.append(roc)
    

In [67]:
# Mean and standard deviation across repetitions of the test accuracy and the ROC score.
print(np.mean(accs), np.std(accs), np.mean(roc_scores), np.std(roc_scores))

# Per-group ROC summary (mean and std over repetitions for each of the three rocs_group lists).
# Guard against unpopulated lists: np.mean over an empty row yields NaN plus RuntimeWarnings,
# which would silently print meaningless numbers on a fresh kernel.
if all(len(group_rocs) > 0 for group_rocs in rocs_group):
    print(np.mean(rocs_group, axis=1), np.std(rocs_group, axis=1))
else:
    print("rocs_group has not been populated; skipping per-group ROC summary")

0.8362072072072071 0.007395150831686315 0.6646538064145601 0.012958056420711692
[0.66021521 0.65057174 0.66804211] [0.01826828 0.02338322 0.02088908]


In [72]:
def plot_accuracy_curve(acc_runs, label, step):
    """Plot the mean accuracy curve across runs with standard-deviation error bars.

    Parameters
    ----------
    acc_runs : array-like
        Accuracy values; the mean/std are taken over axis 0. Assumes one row per
        run and one column per checkpoint (2-D) — TODO confirm against the caller.
    label : str
        Legend label for the curve.
    step : number
        Spacing between consecutive checkpoints on the x-axis.

    Returns
    -------
    numpy.ndarray
        `acc_runs` converted to an array, so the caller can keep it for later use.
    """
    acc_runs = np.array(acc_runs)
    curve_mean = np.mean(acc_runs, axis=0)
    curve_std = np.std(acc_runs, axis=0)
    xs = np.arange(len(curve_mean)) * step
    plt.errorbar(xs, curve_mean, curve_std, fmt="-o", label=label)
    return acc_runs


# NOTE(review): this cell relies on state not created in the visible notebook:
#   * `accs` must hold one training-accuracy curve per repetition (the training cell above
#     stores a single test accuracy per repetition, for which len() of the mean fails);
#   * `train_accs_smooth_2` and `train_accs_nodsim_2` must come from earlier runs with the
#     other algorithms;
#   * `batch_size` is presumably provided by one of the star imports — confirm.
train_accs_strict_2 = plot_accuracy_curve(accs, "Strict Matching", batch_size)
train_accs_smooth_2 = plot_accuracy_curve(train_accs_smooth_2, "Smooth Matching", batch_size)
train_accs_nodsim_2 = plot_accuracy_curve(train_accs_nodsim_2, "Training with no dSim", batch_size)

plt.ylabel("Accuracy", fontsize=18)
plt.xlabel("Training iterations", fontsize=18)
plt.legend(fontsize=13);