In [2]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn import metrics
from collections import OrderedDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, f1_score, precision_score, recall_score

In [3]:
import seaborn as sns
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.plotting import Plot

In [4]:
# read in data
df = pd.read_csv('data/enwiki.labeled_revisions.20k_2015.csv')
df = df.dropna()
df.head()

Unnamed: 0,rev_id,auto_labeled,damaging,goodfaith,feature.english.badwords.revision.diff.match_delta_decrease,feature.english.badwords.revision.diff.match_delta_increase,feature.english.badwords.revision.diff.match_delta_sum,feature.english.badwords.revision.diff.match_prop_delta_decrease,feature.english.badwords.revision.diff.match_prop_delta_increase,feature.english.badwords.revision.diff.match_prop_delta_sum,...,feature.wikitext.revision.parent.external_links,feature.wikitext.revision.parent.headings,feature.wikitext.revision.parent.ref_tags,feature.wikitext.revision.parent.tags,feature.wikitext.revision.parent.templates,feature.wikitext.revision.parent.wikilinks,feature.wikitext.revision.ref_tags,feature.wikitext.revision.tags,feature.wikitext.revision.templates,feature.wikitext.revision.wikilinks
0,644933637.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,574.0,121.0,0.0,1097.0,1.0,373.0,0.0,1119.0,1.0,381.0
1,629393521.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,13.0,0.0,142.0,33.0,44.0,0.0,142.0,33.0,44.0
2,655365754.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,322.0,1.0,73.0,0.0,355.0,1.0,73.0
3,616502017.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,12.0,6.0,5.0,26.0,12.0,59.0,5.0,26.0,12.0,59.0
4,651762922.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,17.0,6.0,16.0,0.0,26.0,7.0,20.0


In [5]:
# Combine anon and new to a 3-category new feature, anonymous, newcomers, experienced
newcomer_seconds = 3.637819e+06

conditions = [
    (df['feature.revision.user.is_anon'] == True),
    (df['feature.revision.user.is_anon'] == False) & (df['feature.temporal.revision.user.seconds_since_registration'] < newcomer_seconds),
    (df['feature.revision.user.is_anon'] == False) & (df['feature.temporal.revision.user.seconds_since_registration'] >= newcomer_seconds)]
choices = [0,1,2]
df['user.type'] = np.select(conditions, choices)
df.head()

Unnamed: 0,rev_id,auto_labeled,damaging,goodfaith,feature.english.badwords.revision.diff.match_delta_decrease,feature.english.badwords.revision.diff.match_delta_increase,feature.english.badwords.revision.diff.match_delta_sum,feature.english.badwords.revision.diff.match_prop_delta_decrease,feature.english.badwords.revision.diff.match_prop_delta_increase,feature.english.badwords.revision.diff.match_prop_delta_sum,...,feature.wikitext.revision.parent.headings,feature.wikitext.revision.parent.ref_tags,feature.wikitext.revision.parent.tags,feature.wikitext.revision.parent.templates,feature.wikitext.revision.parent.wikilinks,feature.wikitext.revision.ref_tags,feature.wikitext.revision.tags,feature.wikitext.revision.templates,feature.wikitext.revision.wikilinks,user.type
0,644933637.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,121.0,0.0,1097.0,1.0,373.0,0.0,1119.0,1.0,381.0,2
1,629393521.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,13.0,0.0,142.0,33.0,44.0,0.0,142.0,33.0,44.0,2
2,655365754.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,322.0,1.0,73.0,0.0,355.0,1.0,73.0,2
3,616502017.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,5.0,26.0,12.0,59.0,5.0,26.0,12.0,59.0,0
4,651762922.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,17.0,6.0,16.0,0.0,26.0,7.0,20.0,2


In [6]:
# add in sample weights
df['sample_weight'] = np.where(df['damaging']==True, 10, 1)

# delete the two sensitive features
df = df.drop(['feature.revision.user.is_anon', 'feature.temporal.revision.user.seconds_since_registration'], axis=1)

# convert user.type to categorical
df['user.type'] = pd.Categorical(df['user.type'])

# divide into X, X_weights and y
y = df["damaging"]
X_with_weights = df.iloc[:,4:].copy()
X_with_weights.head()

Unnamed: 0,feature.english.badwords.revision.diff.match_delta_decrease,feature.english.badwords.revision.diff.match_delta_increase,feature.english.badwords.revision.diff.match_delta_sum,feature.english.badwords.revision.diff.match_prop_delta_decrease,feature.english.badwords.revision.diff.match_prop_delta_increase,feature.english.badwords.revision.diff.match_prop_delta_sum,feature.english.dictionary.revision.diff.dict_word_delta_decrease,feature.english.dictionary.revision.diff.dict_word_delta_increase,feature.english.dictionary.revision.diff.dict_word_delta_sum,feature.english.dictionary.revision.diff.dict_word_prop_delta_decrease,...,feature.wikitext.revision.parent.ref_tags,feature.wikitext.revision.parent.tags,feature.wikitext.revision.parent.templates,feature.wikitext.revision.parent.wikilinks,feature.wikitext.revision.ref_tags,feature.wikitext.revision.tags,feature.wikitext.revision.templates,feature.wikitext.revision.wikilinks,user.type,sample_weight
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.0,66.0,0.0,...,0.0,1097.0,1.0,373.0,0.0,1119.0,1.0,381.0,2,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,142.0,33.0,44.0,0.0,142.0,33.0,44.0,2,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,322.0,1.0,73.0,0.0,355.0,1.0,73.0,2,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,5.0,26.0,12.0,59.0,5.0,26.0,12.0,59.0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,44.0,0.0,...,0.0,17.0,6.0,16.0,0.0,26.0,7.0,20.0,2,1


In [7]:
# split into train and test set
X_with_weights_train, X_with_weights_test, y_train, y_test = train_test_split(X_with_weights, y, test_size=0.3, random_state=42)

# split train with weight to train and weight
X_train = X_with_weights_train.iloc[:,:-1].copy()
X_train_weights = X_with_weights_train.iloc[:,-1].copy()
X_test = X_with_weights_test.iloc[:,:-1].copy()

In [8]:
# parameters from 
#https://github.com/wikimedia/editquality/blob/master/model_info/enwiki.damaging.md
params= {'min_impurity_decrease': 0.0, 
         'loss': 'deviance', 
         'n_estimators': 700, 
         'min_impurity_split': None, 
         'verbose': 0, 
         'criterion': 'friedman_mse', 
         'subsample': 1.0, 
         #'center': True, 
         #'scale': True, 
         'presort': 'auto', 
         'init': None, 
         #'multilabel': False, 
         'max_depth': 7, 
         'random_state': None, 
         'learning_rate': 0.01, 
         'validation_fraction': 0.1, 
         'warm_start': False, 
         'min_samples_split': 2, 
         'min_samples_leaf': 1, 
         'min_weight_fraction_leaf': 0.0, 
         'n_iter_no_change': None, 
         'max_leaf_nodes': None, 
         'tol': 0.0001, 
         'max_features': 'log2'}
         #'labels': [True, False], 
         #'label_weights': OrderedDict([(True, 10)])

## 3. Dropping the sensitive features

In [9]:
# drop the sensitive features
X_train_nosen = X_train.iloc[:,:-1].copy()
X_test_nosen = X_test.iloc[:,:-1].copy()

In [10]:
gb_clf_nosen = GradientBoostingClassifier(**params)
gb_clf_nosen.fit(X_train_nosen, y_train, sample_weight=X_train_weights)



GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=7,
                           max_features='log2', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=700,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [11]:
# test on test set
y_pred_test = gb_clf_nosen.predict(X_test_nosen)

print("accuracy: ", accuracy_score(y_test, y_pred_test).round(4))
print("balanced accuracy: ", balanced_accuracy_score(y_test, y_pred_test).round(4))
print("f1: ", f1_score(y_test, y_pred_test).round(4))
print("recall: ", recall_score(y_test, y_pred_test).round(4))
print("precision: ", precision_score(y_test, y_pred_test).round(4))
print("auc: ", roc_auc_score(y_test, y_pred_test).round(4))

accuracy:  0.9462
balanced accuracy:  0.7184
f1:  0.371
recall:  0.4742
precision:  0.3046
auc:  0.7184


### 3.1 bias for no sensitive feature model

In [12]:
# No sensitive feature model
df_bias_nosen = pd.DataFrame(columns = ['score', 'label_value', 'user.type'])
df_bias_nosen['label_value'] = y_test
df_bias_nosen['user.type'] = X_test.iloc[:,-1].copy().astype(str)
df_bias_nosen['score'] = gb_clf_nosen.predict(X_test_nosen)

g = Group()
xtab, _ = g.get_crosstabs(df_bias_nosen)
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[[col for col in xtab.columns if col not in absolute_metrics]]

model_id, score_thresholds 0 {'rank_abs': [302]}


  col_group = df.fillna({col: pd.np.nan}).groupby(col)


Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,0,binary 0/1,302,user.type,0,173,821,105,63,758,68,131,863,994,5796
1,0,binary 0/1,302,user.type,1,42,365,25,28,337,17,45,362,407,5796
2,0,binary 0/1,302,user.type,2,87,4308,80,11,4297,7,18,4377,4395,5796


In [13]:
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(4)

Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
0,user.type,0,0.5191,0.8783,0.0767,0.6069,0.1217,0.4809,0.9233,0.3931,0.5728,0.174,0.1318
1,user.type,1,0.3778,0.9309,0.0767,0.5952,0.0691,0.6222,0.9233,0.4048,0.1391,0.1032,0.1106
2,user.type,2,0.3889,0.9817,0.0026,0.9195,0.0183,0.6111,0.9974,0.0805,0.2881,0.0198,0.0041


## 4. Balance # of damaging edits within each group

In [14]:
# group by user.type
grouped = df.groupby(df["user.type"])
df_anon = grouped.get_group(0)
df_new = grouped.get_group(1)
df_exp = grouped.get_group(2)

In [15]:
# for each group, seperate to damaging and non-damaging
# anon
grouped_anon = df_anon.groupby(df["damaging"])
df_anon_pos = grouped_anon.get_group(True)
df_anon_neg = grouped_anon.get_group(False)
print(len(df_anon_pos))
print(len(df_anon_neg))
print(len(df_anon_pos)/len(df_anon))

479
2995
0.13788140472078295


In [16]:
# new
grouped_new = df_new.groupby(df["damaging"])
df_new_pos = grouped_new.get_group(True)
df_new_neg = grouped_new.get_group(False)
print(len(df_new_pos))
print(len(df_new_neg))
print(len(df_new_pos)/len(df_new))

197
1159
0.14528023598820058


In [17]:
# exp
grouped_exp = df_exp.groupby(df["damaging"])
df_exp_pos = grouped_exp.get_group(True)
df_exp_neg = grouped_exp.get_group(False)
print(len(df_exp_pos))
print(len(df_exp_neg))
print(len(df_exp_pos)/len(df_exp))

71
14418
0.004900269169714956


In [18]:
# balance each group by randomly sample non-damaging edits
ratio = 15/85

df_anon_neg_sampled = df_anon_neg.sample(n=math.floor(len(df_anon_pos)/ratio), random_state=42)
df_new_neg_sampled = df_new_neg.sample(n=math.floor(len(df_new_pos)/ratio), random_state=42)
df_exp_neg_sampled = df_exp_neg.sample(n=math.floor(len(df_exp_pos)/ratio), random_state=42)

print(len(df_anon_pos)/len(df_anon_neg_sampled))
print(len(df_new_pos)/len(df_new_neg_sampled))
print(len(df_exp_pos)/len(df_exp_neg_sampled))

0.17649226234340457
0.17652329749103943
0.17661691542288557


In [19]:
dfs = [df_anon_pos, df_new_pos,df_exp_pos, df_anon_neg_sampled, df_new_neg_sampled, df_exp_neg_sampled]
df_balanced = pd.concat(dfs, ignore_index=True)

In [44]:
# divide into X, X_weights and y
y = df_balanced ["damaging"]
X_with_weights = df_balanced .iloc[:,4:].copy()
X_with_weights.head()

Unnamed: 0,feature.english.badwords.revision.diff.match_delta_decrease,feature.english.badwords.revision.diff.match_delta_increase,feature.english.badwords.revision.diff.match_delta_sum,feature.english.badwords.revision.diff.match_prop_delta_decrease,feature.english.badwords.revision.diff.match_prop_delta_increase,feature.english.badwords.revision.diff.match_prop_delta_sum,feature.english.dictionary.revision.diff.dict_word_delta_decrease,feature.english.dictionary.revision.diff.dict_word_delta_increase,feature.english.dictionary.revision.diff.dict_word_delta_sum,feature.english.dictionary.revision.diff.dict_word_prop_delta_decrease,...,feature.wikitext.revision.parent.ref_tags,feature.wikitext.revision.parent.tags,feature.wikitext.revision.parent.templates,feature.wikitext.revision.parent.wikilinks,feature.wikitext.revision.ref_tags,feature.wikitext.revision.tags,feature.wikitext.revision.templates,feature.wikitext.revision.wikilinks,user.type,sample_weight
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,13.0,178.0,13.0,79.0,13.0,177.0,13.0,79.0,0,10
1,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,1.0,-1.0,-1.5,...,96.0,1008.0,431.0,793.0,96.0,1008.0,431.0,793.0,0,10
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,41.0,99.0,51.0,179.0,41.0,99.0,51.0,179.0,0,10
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,7.0,19.0,13.0,32.0,7.0,19.0,13.0,32.0,0,10
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,0.0,...,1.0,9.0,5.0,15.0,1.0,9.0,5.0,15.0,0,10


In [47]:
# split into train and test set
X_with_weights_train, X_with_weights_test, y_train, y_test = train_test_split(X_with_weights, y, test_size=0.3, random_state=42)
# split train with weight to train and weight
X_train = X_with_weights_train.iloc[:,:-1].copy()
X_train_weights = X_with_weights_train.iloc[:,-1].copy()
X_test = X_with_weights_test.iloc[:,:-1].copy()

len(X_train)

3485

In [48]:
# Training
gb_clf_balanced = GradientBoostingClassifier(**params)
gb_clf_balanced.fit(X_train, y_train, sample_weight=X_train_weights)



GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.01, loss='deviance', max_depth=7,
                           max_features='log2', max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=700,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [50]:
# test on test set
y_pred_test = gb_clf_balanced.predict(X_test)

print("accuracy: ", accuracy_score(y_test, y_pred_test).round(4))
print("balanced accuracy: ", balanced_accuracy_score(y_test, y_pred_test).round(4))
print("f1: ", f1_score(y_test, y_pred_test).round(4))
print("recall: ", recall_score(y_test, y_pred_test).round(4))
print("precision: ", precision_score(y_test, y_pred_test).round(4))
print("auc: ", roc_auc_score(y_test, y_pred_test).round(4))

accuracy:  0.8119
balanced accuracy:  0.7136
f1:  0.4973
recall:  0.5673
precision:  0.4427
auc:  0.7136


In [51]:
# Replicate model
df_bias_balanced = pd.DataFrame(columns = ['score', 'label_value', 'user.type'])
df_bias_balanced['label_value'] = y_test
df_bias_balanced['user.type'] = X_test.iloc[:,-1].copy().astype(str)
df_bias_balanced['score'] = gb_clf_balanced.predict(X_test)

g = Group()
xtab, _ = g.get_crosstabs(df_bias_balanced)
absolute_metrics = g.list_absolute_metrics(xtab)
xtab[[col for col in xtab.columns if col not in absolute_metrics]]

model_id, score_thresholds 0 {'rank_abs': [314]}


  col_group = df.fillna({col: pd.np.nan}).groupby(col)


Unnamed: 0,model_id,score_threshold,k,attribute_name,attribute_value,pp,pn,fp,fn,tn,tp,group_label_pos,group_label_neg,group_size,total_entities
0,0,binary 0/1,314,user.type,0,214,739,124,66,673,90,156,797,953,1494
1,0,binary 0/1,314,user.type,1,78,321,40,28,293,38,66,333,399,1494
2,0,binary 0/1,314,user.type,2,22,120,11,12,108,11,23,119,142,1494


In [52]:
xtab[['attribute_name', 'attribute_value'] + absolute_metrics].round(4)

Unnamed: 0,attribute_name,attribute_value,tpr,tnr,for,fdr,fpr,fnr,npv,precision,ppr,pprev,prev
0,user.type,0,0.5769,0.8444,0.0893,0.5794,0.1556,0.4231,0.9107,0.4206,0.6815,0.2246,0.1637
1,user.type,1,0.5758,0.8799,0.0872,0.5128,0.1201,0.4242,0.9128,0.4872,0.2484,0.1955,0.1654
2,user.type,2,0.4783,0.9076,0.1,0.5,0.0924,0.5217,0.9,0.5,0.0701,0.1549,0.162


4979

In [22]:
df_anon_balanced = pd.concat([df_anon_pos, df_anon_neg_sampled], ignore_index=True)
df_new_balanced = pd.concat([df_new_pos, df_new_neg_sampled], ignore_index=True)
df_exp_balanced = pd.concat([df_exp_pos, df_exp_neg_sampled], ignore_index=True)

In [23]:
df_balanced_new = pd.concat([df_anon_balanced.sample(n=100), 
                             df_new_balanced.sample(n=100),
                             df_exp_balanced.sample(n=100)], ignore_index=True)

In [25]:
df_balanced_new.head()

Unnamed: 0,rev_id,auto_labeled,damaging,goodfaith,feature.english.badwords.revision.diff.match_delta_decrease,feature.english.badwords.revision.diff.match_delta_increase,feature.english.badwords.revision.diff.match_delta_sum,feature.english.badwords.revision.diff.match_prop_delta_decrease,feature.english.badwords.revision.diff.match_prop_delta_increase,feature.english.badwords.revision.diff.match_prop_delta_sum,...,feature.wikitext.revision.parent.ref_tags,feature.wikitext.revision.parent.tags,feature.wikitext.revision.parent.templates,feature.wikitext.revision.parent.wikilinks,feature.wikitext.revision.ref_tags,feature.wikitext.revision.tags,feature.wikitext.revision.templates,feature.wikitext.revision.wikilinks,user.type,sample_weight
0,615971434.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,12.0,9.0,18.0,5.0,14.0,9.0,18.0,0,1
1,627654566.0,False,True,False,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,8.0,3.0,19.0,1.0,8.0,3.0,19.0,0,10
2,611741449.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,16.0,25.0,22.0,99.0,16.0,25.0,22.0,98.0,0,1
3,620501782.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,169.0,3.0,49.0,4.0,171.0,3.0,49.0,0,1
4,632609272.0,False,False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,28.0,8.0,156.0,15.0,28.0,8.0,159.0,0,1


In [26]:
df_balanced_new.to_csv("all_balanced.csv", index=False)