In [1]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

import pickle

In [2]:
import All_RUT_Models
import RUT_Utils

Using TensorFlow backend.


In [3]:
# Hyperparameters for this run; they are handed to All_RUT_Models.LR_Model
# when the model is constructed further down.
# NOTE(review): the values look like scikit-learn LogisticRegression settings
# (regularisation penalty, strength, optimiser, class weighting) -- confirm
# against All_RUT_Models.LR_Model.
penalty, solver = 'l2', 'newton-cg'
C = 18
class_weight = 'balanced'

In [4]:
# Output locations: the fitted model is pickled under ./Models/<modelname>/
# and prediction CSVs are written to ./Results/.
modelname = 'LR_debiased'
modelpath = './Models/' + modelname + '/'

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
for output_dir in ( modelpath, './Results/' ):
    os.makedirs( output_dir, exist_ok=True )

In [5]:
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        A string with un-padded hours, two-digit minutes and seconds
        shown with two decimals, e.g. ``"0:00:15.38"``.
    """
    # Round to the displayed precision *before* splitting into fields.
    # Otherwise a value such as 59.999 keeps m == 0 while the seconds field
    # rounds up at format time, producing the invalid "0:00:60.00".
    total = round(sec_elapsed, 2)
    h = int(total / (60 * 60))
    m = int((total % (60 * 60)) / 60)
    s = total % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)

In [6]:
def remove_sc(x):
    """Strip special characters from the string ``x``.

    Only characters for which ``str.isalnum()`` is true, plus the literal
    space character, are kept; every other character is dropped (not
    replaced), so text on either side of it is joined together.
    """
    return "".join(ch for ch in x if ch.isalnum() or ch == ' ')

# Load the debiased training set and keep only the text and label columns.
# Alternative (non-debiased) training data:
# df = pd.read_csv(r'training_data/wiki_train.csv')
# rename() returns a new frame, so the column assignments below operate on an
# independent object rather than on a slice of the raw CSV frame.
df = (
    pd.read_csv('wiki_debias_train.csv')[['comment', 'is_toxic']]
    .rename(columns={'comment': 'Comment', 'is_toxic': 'Toxic'})
)

# Normalise the text: force to str, lower-case, then drop every character
# that is neither alphanumeric nor a space.
df['Comment'] = df['Comment'].astype( 'str' ).str.lower().apply( remove_sc )

# Binarise the label to 0/1 with a direct column assignment. The previous
# chained form (df['Toxic'][mask] = ...) writes through an intermediate
# object, which triggers SettingWithCopyWarning and is a no-op under pandas
# copy-on-write.
df['Toxic'] = df['Toxic'].astype( bool ).astype( int )
df.shape

(99157, 2)

In [7]:
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Hold out 20% of the comments as the test set (fixed seed for repeatability).
# NOTE(review): the split is not stratified on the label; consider passing
# stratify=df.Toxic if the classes are heavily imbalanced.
X_train, X_test, y_train, y_test  = train_test_split( df.Comment, df.Toxic, test_size = 0.20, random_state = 0)

# tf.idf vectorization: fitted on the training comments only, then applied
# to both the training and the test partition
vectorizer = TfidfVectorizer(  )
vectorizer.fit( X_train.values )

xtrain = vectorizer.transform( X_train.values )
xtest = vectorizer.transform( X_test.values )
ytrain = y_train.values
ytest = y_test.values

# split train and val (10% of the already-vectorized training rows; these
# validation rows were therefore part of the data the vectorizer was fitted on)
xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.10, random_state=1 )

# define a model
model = All_RUT_Models.LR_Model( pen=penalty, c=C, sol=solver, class_weight=class_weight )

# train the model
model.fit( xtrain, ytrain )

# save the model
with open( modelpath + modelname + '.pkl', 'wb' ) as f:
    pickle.dump( model, f )

# save the fitted vectorizer next to the model: the pickled classifier can
# only score new text that has been transformed by this exact vectorizer
with open( modelpath + modelname + '_vectorizer.pkl', 'wb' ) as f:
    pickle.dump( vectorizer, f )

# load saved model (round-trip of the file written just above, so the
# metrics below are computed with the persisted artifact)
with open( modelpath + modelname + '.pkl', 'rb' ) as f:
    model = pickle.load( f )

# get predictions (column 1 of predict_proba) for validation and test sets respectively
valpredictions = model.predict_proba( xval )[ :, 1 ]
testpredictions = model.predict_proba( xtest )[ :, 1 ]

# decision threshold: currently fixed at 0.5; the commented-out call is the
# alternative that would pick it from the validation set
threshold = 0.5 #RUT_Utils.optimize_threshold( yval, valpredictions )

# hard labels at the chosen threshold
vallabels = (valpredictions>=threshold).astype( 'int32' )
testlabels = (testpredictions>=threshold).astype( 'int32' )

# compute accuracy, precision, recall, f1 and confusion matrices
# (kept in variables only; nothing is printed or written to disk here)
valaccuracy = accuracy_score( yval, vallabels )
valprecision = precision_score( yval, vallabels )
valrecall =  recall_score( yval, vallabels )
valf1 =  f1_score( yval, vallabels )
valcm =  confusion_matrix( yval, vallabels )

testaccuracy =  accuracy_score( ytest, testlabels )
testprecision =  precision_score( ytest, testlabels )
testrecall =  recall_score( ytest, testlabels )
testf1 =  f1_score( ytest, testlabels )
testcm =  confusion_matrix( ytest, testlabels )

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

Total runtime: 0:00:15.38


In [8]:
# Score the synthetic debias test set with the already-fitted vectorizer
# and model, then report metrics at the same decision threshold.
synthetic = pd.read_csv('Synthetic_debias_test.csv')
synthetic_y = synthetic['Toxic'].values
synthetic_comments = vectorizer.transform( synthetic['Comment'].values )

# column 1 of predict_proba, then hard labels at `threshold`
synthetic_predictions = model.predict_proba( synthetic_comments )[ :, 1 ]
synthetic_labels = (synthetic_predictions>=threshold).astype( 'int32' )

synthetic_accuracy = accuracy_score( synthetic_y, synthetic_labels )
synthetic_precision = precision_score( synthetic_y, synthetic_labels )
synthetic_recall = recall_score( synthetic_y, synthetic_labels )
synthetic_f1 = f1_score( synthetic_y, synthetic_labels )
synthetic_cm = confusion_matrix( synthetic_y, synthetic_labels )

print( f'Test Accuracy: {synthetic_accuracy!s}\n' )

print( f'Test Precision: {synthetic_precision!s}\n' )

print( f'Test Recall: {synthetic_recall!s}\n' )

print( f'Test F1: {synthetic_f1!s}' )

# Reversing both axes is the same 180-degree rotation as two rot90 calls;
# the printed matrix is therefore laid out as [[TP, FN], [FP, TN]].
print( synthetic_cm[ ::-1, ::-1 ] )

# The un-rotated matrix unpacks row by row into TN, FP, FN, TP.
_tn, _fp, _fn, _tp = synthetic_cm.ravel()
print( f'\n\nTN :{_tn!s}' )
print( f'FP :{_fp!s}' )
print( f'FN :{_fn!s}' )
print( f'TP :{_tp!s}' )

Test Accuracy: 0.7990961809727809

Test Precision: 0.9654093163157467

Test Recall: 0.6204221305051982

Test F1: 0.755390878442847
[[23751 14531]
 [  851 37431]]


TN :37431
FP :851
FN :14531
TP :23751


In [9]:
# Replace the stale 'PredictedLabel' column shipped in the CSV with this
# model's outputs. errors='ignore' plus reassignment (instead of
# inplace=True) keeps the cell idempotent: re-running it no longer raises
# KeyError once the column has already been removed.
synthetic = (
    synthetic
    .drop( columns='PredictedLabel', errors='ignore' )
    .assign(
        Predicted_Probability=synthetic_predictions,
        Predicted_Label=synthetic_labels,
    )
)

In [10]:
# Preview the first rows, including the prediction columns added to `synthetic`.
synthetic.head()

Unnamed: 0,Comment,Toxic,Jew,Muslim,Christian,Hindu,Asian,Black,White,Female,Male,Heterosexual,Homosexual_gay_or_lesbian,Sum,Predicted_Probability,Predicted_Label
0,murder american,1,0,0,0,0,0,0,0,0,0,0,0,0,0.882743,1
1,respect trans,0,0,0,0,0,0,0,0,0,0,0,0,0,0.002476,0
2,respect homosexual,0,0,0,0,0,0,0,0,0,0,0,1,1,0.01213,0
3,hug mexican,0,0,0,0,0,0,0,0,0,0,0,0,0,0.101476,0
4,kill sikh,1,0,0,0,0,0,0,0,0,0,0,0,0,0.985878,1


In [11]:
# Write the synthetic-set predictions to ./Results/ without the row index.
predictions_csv = './Results/' + modelname + '_synthetic_predictions.csv'
synthetic.to_csv( predictions_csv, index=False )