In [3]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import keras.backend as K
from keras.models import load_model
from keras.preprocessing import text, sequence
from keras.utils import to_categorical

# os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
import All_RUT_Models
import RUT_Utils

In [3]:
# Hyper-parameters for this run. Everything a reader may want to tune lives here;
# the values are consumed by the tokenisation / training cell further down.

max_len = 150                        # passed as maxlen to sequence.pad_sequences for every text set
embed_size = 300                     # number of columns of the embedding matrix; also forwarded to the model builder
pre_trained_flag = True              # when True, embeddings are read from `embeddingfile` and an embedding matrix is built
embed_trainable = False              # forwarded to All_RUT_Models.CNN_George (presumably freezes the embedding layer — confirm there)
emb_weights_init = 'glorot_normal'   # forwarded to All_RUT_Models.CNN_George (presumably the embedding initialiser — confirm there)
lr_rate = 0.001                      # written to model.optimizer.lr via K.set_value before training
optimizer = 'adam'                   # forwarded to All_RUT_Models.CNN_George
multi_gpu_flag = False               # forwarded to All_RUT_Models.CNN_George
gpus = 2                             # forwarded to All_RUT_Models.CNN_George together with multi_gpu_flag
batch = 64                           # batch_size for model.fit
nepochs = 30                         # epochs (upper bound) for model.fit
patience = 7                         # forwarded to RUT_Utils.F1Metrics; the training log reports a stop once patience reaches this value
decay = True                         # forwarded to RUT_Utils.F1Metrics (presumably enables learning-rate decay — confirm there)
decay_rate = 0.5                     # forwarded to RUT_Utils.F1Metrics (presumably the decay multiplier — confirm there)
decay_after = 3                      # forwarded to RUT_Utils.F1Metrics (presumably epochs without improvement before decaying — confirm there)

In [4]:
# Pre-trained embedding file to load. Exactly one assignment should be active;
# the commented lines are the alternative embedding sets.
#embeddingfile = './General_Embeddings/glove.txt'
#embeddingfile = './General_Embeddings/w2v_cbow.txt'
embeddingfile = './General_Embeddings/w2v_sg.txt'
#embeddingfile = './General_Embeddings/ft_cbow.vec'
# embeddingfile = './General_Embeddings/ft_sg.vec'

# Placeholder; replaced by the output of get_vectors() when pre_trained_flag is set.
embedding_matrix = []
# Vocabulary cap: used as Tokenizer(num_words=...) and as the row limit of the embedding matrix.
max_features = 149024

# Name used for the checkpoint file and for the results CSV.
modelname = 'CNN_George_w2v_sg_biased'

modelpath = './Models/' + modelname + '/'

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs( modelpath, exist_ok=True )
os.makedirs( './Results/', exist_ok=True )

In [5]:
def hms_string(sec_elapsed):
    """Format a duration in seconds as ``H:MM:SS.ss``.

    The duration is rounded to hundredths of a second *before* it is split
    into hours, minutes and seconds. Rounding only the seconds field (as the
    previous implementation did) could print an invalid ``60.00`` seconds
    value, e.g. ``59.996`` -> ``0:00:60.00`` instead of ``0:01:00.00``.

    Args:
        sec_elapsed: Non-negative elapsed time in seconds (int or float).

    Returns:
        str: Hours (unpadded), minutes (2 digits) and seconds (2 integer
        digits, 2 decimals), separated by colons.
    """
    # Work in integer centiseconds so carries propagate correctly.
    total_cs = int(round(sec_elapsed * 100))
    hours, rest_cs = divmod(total_cs, 60 * 60 * 100)
    minutes, rest_cs = divmod(rest_cs, 60 * 100)
    seconds = rest_cs / 100
    return "{}:{:>02}:{:>05.2f}".format(hours, minutes, seconds)

In [6]:
def remove_sc(x):
    """Strip special characters from a string.

    Keeps only characters for which ``str.isalnum()`` is true and the plain
    space character; everything else (punctuation, tabs, newlines, ...) is
    dropped.

    Args:
        x: Input string.

    Returns:
        str: ``x`` with every non-alphanumeric, non-space character removed.
    """
    return "".join(ch for ch in x if ch.isalnum() or ch == ' ')

# Load the training comments and keep only the text and the toxicity label.
df = pd.read_csv('wiki_train.csv')
# df = pd.read_csv(r'training_data/wiki_train.csv')
# .copy() gives an independent frame, so the column assignments below never
# write into a slice of the originally loaded frame.
df = df[['comment', 'is_toxic']].copy()
df.columns = ['Comment', 'Toxic']

# Normalise the text: force to str, lower-case, then drop special characters.
df['Comment'] = df['Comment'].astype( 'str' ).str.lower().apply( remove_sc )

# Map truthy labels to 1 and falsy labels to 0 in a single cast. The previous
# chained assignment (df['Toxic'][mask] = value) raises SettingWithCopyWarning
# and does not modify `df` at all when pandas copy-on-write is enabled.
df['Toxic'] = df['Toxic'].astype( bool ).astype( int )
df.shape

(95692, 2)

In [7]:
def get_coefs( word, *arr ):
    """Pair an embedding token with its coefficient vector.

    Args:
        word: The token (first field of an embedding-file line).
        *arr: The remaining fields of the line; each must be convertible
            to a float.

    Returns:
        tuple: ``(word, vector)`` where ``vector`` is a 1-D float32 array
        holding the values of ``arr`` in order.
    """
    vector = np.array( arr, dtype=np.float32 )
    return word, vector

def get_vectors( tokenizer, embeddings=None, vocab_limit=None, vector_size=None ):
    """Build the embedding matrix for the tokenizer's vocabulary.

    Row ``i`` holds the pre-trained vector of the word whose tokenizer index
    is ``i``. Words without a pre-trained vector keep an all-zero row, and
    row 0 is never written because ``word_index`` values start above it.

    Args:
        tokenizer: Object exposing ``word_index``, a mapping word -> int index.
        embeddings: Mapping word -> vector. Defaults to the module-level
            ``embeddings_index``.
        vocab_limit: Words with index >= this value are skipped. Defaults to
            the module-level ``max_features``.
        vector_size: Embedding dimensionality. Defaults to the module-level
            ``embed_size``.

    Returns:
        numpy.ndarray: Array of shape
        ``(min(vocab_limit, len(word_index) + 1), vector_size)``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if embeddings is None:
        embeddings = embeddings_index
    if vocab_limit is None:
        vocab_limit = max_features
    if vector_size is None:
        vector_size = embed_size

    word_index = tokenizer.word_index
    num_words = min( vocab_limit, len( word_index ) + 1 )
    embedding_matrix = np.zeros( ( num_words, vector_size ) )
    for word, i in word_index.items():
        if i >= vocab_limit:
            continue
        embedding_vector = embeddings.get( word )
        if embedding_vector is None:
            continue
        # Skip vectors of the wrong length. A 1-element vector (e.g. the
        # "<vocab> <dim>" header line of a word2vec text file parsed as an
        # entry) would otherwise be broadcast across the entire row.
        if np.shape( embedding_vector ) != ( vector_size, ):
            continue
        embedding_matrix[i] = embedding_vector
    return embedding_matrix

# Read the pre-trained embedding file into a dict: word -> float32 vector.
# Each line is split on single spaces; the first field is the word and the
# remaining fields are the coefficients (see get_coefs).
if pre_trained_flag == True:
    # The context manager closes the file handle as soon as the dict is
    # built; a bare open() inside the generator left it open.
    with open( embeddingfile, encoding='utf-8' ) as embedding_fh:
        embeddings_index = dict( get_coefs( *line.rstrip().rsplit(' ') ) for line in embedding_fh )

In [8]:
# End-to-end training / evaluation cell:
#   1. hold out 20% of the data as a test set, then 10% of the rest as validation
#   2. tokenise and pad the comments
#   3. build the CNN, train it with the F1-based callback
#   4. reload the saved checkpoint and score validation and test sets
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# NOTE: the commented-out lines below (and at the end of the cell) are the
# remains of an earlier stratified k-fold loop; only a single split is run now.
# valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
# testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
# com_text, com_label, com_predicted, com_prob = [], [], [], []
# com_indices = []

# fold = 1
# for train_index, test_index in skf.split( df.Comment, df.Toxic ):

# 80/20 train/test split with a fixed seed. No `stratify` argument is passed,
# so the class ratio of the two parts is not forced to match.
X_train, X_test, y_train, y_test  = train_test_split( df.Comment, df.Toxic, test_size = 0.20, random_state = 0)

# clear any Keras / TensorFlow state left over from a previous run of this cell
K.clear_session()
tf.reset_default_graph()

# tokenization with the Keras tokenizer; the vocabulary is fitted on the
# training comments only
tokenizer = text.Tokenizer( num_words=max_features )
tokenizer.fit_on_texts( X_train.values )

traincomments = tokenizer.texts_to_sequences( X_train.values )
testcomments = tokenizer.texts_to_sequences( X_test.values )

# pad the tokenized sequences to a common length of max_len
xtrain = sequence.pad_sequences( traincomments, maxlen=max_len )
xtest = sequence.pad_sequences( testcomments, maxlen=max_len )

ytrain = y_train.values
ytest = y_test.values

# split train and val: 10% of the training part becomes the validation set
xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.10, random_state=0 )

# one-hot encode the labels into 2 columns
ytrain = to_categorical( ytrain, 2 )
yval = to_categorical( yval, 2 )
ytest = to_categorical( ytest, 2 )

# check if pre-trained word embeddings flag is true; otherwise embedding_matrix
# keeps the placeholder value assigned in the configuration cell
if pre_trained_flag == True:
    embedding_matrix = get_vectors( tokenizer=tokenizer)

# define a model (the architecture lives in the project module All_RUT_Models)
model = All_RUT_Models.CNN_George( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                                  embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                                  emb_weights_init=emb_weights_init, optimizer=optimizer,
                                  multi_gpu_flag=multi_gpu_flag, gpus=gpus )

# override the optimizer's learning rate with the configured value
K.set_value( model.optimizer.lr, lr_rate )

# train the model with the project's F1 callback, which receives the checkpoint
# path plus the patience / decay settings (presumably it saves the best model to
# that path and stops training early — confirm in RUT_Utils.F1Metrics)
f1metric = RUT_Utils.F1Metrics( modelpath + modelname + '.h5', patience=patience,
                               decay=decay, decay_rate=decay_rate, decay_after=decay_after, softmax=True )
hist = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval,yval ),
                 epochs=nepochs, verbose=0, callbacks=[ f1metric ] )

# load saved model from the checkpoint path that was given to the callback
loaded_model = load_model( modelpath + modelname + '.h5' )

# convert the one-hot labels back to class indices, then get predictions:
# column 1 of the model output is used as the score of class 1
yval = [ np.argmax(y, axis=None, out=None) for y in yval ]
ytest = [ np.argmax(y, axis=None, out=None) for y in ytest ]
valpredictions = loaded_model.predict( xval, verbose=0, batch_size=2048 )[ :, 1 ]
testpredictions = loaded_model.predict( xtest, verbose=0, batch_size=2048 )[ :, 1 ]

# decision threshold: fixed at 0.5; tuning it on the validation set is disabled
threshold = 0.5 #RUT_Utils.optimize_threshold( yval, valpredictions )

# turn the scores into hard 0/1 labels using the threshold
vallabels = (valpredictions>=threshold).astype( 'int32' )
testlabels = (testpredictions>=threshold).astype( 'int32' )

# accuracy, precision, recall, f1 and confusion matrix on the validation set
valaccuracy = accuracy_score( yval, vallabels )
valprecision = precision_score( yval, vallabels )
valrecall =  recall_score( yval, vallabels )
valf1 =  f1_score( yval, vallabels )
valcm =  confusion_matrix( yval, vallabels )    

# the same metrics on the held-out test set
testaccuracy =  accuracy_score( ytest, testlabels )
testprecision =  precision_score( ytest, testlabels )
testrecall =  recall_score( ytest, testlabels )
testf1 =  f1_score( ytest, testlabels )
testcm =  confusion_matrix( ytest, testlabels )

# save for future analysis and ensemble (disabled along with the k-fold loop)
# com_indices.extend( test_index.tolist() )
# com_text.extend( df.loc[ test_index ][ 'Comment' ] )
# com_label.extend( df.loc[ test_index ][ 'Toxic' ].tolist() )
# com_predicted.extend( testlabels[:,0].tolist() )
# com_prob.extend( testpredictions[:,0].tolist() )

# print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )

# fold = fold + 1
# report the wall-clock time of the whole cell
time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

Epoch: 000 --MaxValF1: 0.66242038 --CurValF1: 0.66242038 --Patience: 00 --improved f1: 0.66242038
Epoch: 001 --MaxValF1: 0.66319703 --CurValF1: 0.66319703 --Patience: 00 --improved f1: 0.66319703
Epoch: 002 --MaxValF1: 0.68861847 --CurValF1: 0.68861847 --Patience: 00 --improved f1: 0.68861847
Epoch: 003 --MaxValF1: 0.68861847 --CurValF1: 0.67791636 --Patience: 00
Epoch: 004 --MaxValF1: 0.68861847 --CurValF1: 0.67927076 --Patience: 01
Epoch: 005 --MaxValF1: 0.68861847 --CurValF1: 0.68488990 --Patience: 02
Epoch: 006 --MaxValF1: 0.68861847 --CurValF1: 0.67730239 --Patience: 03
Epoch: 007 --MaxValF1: 0.68861847 --CurValF1: 0.66819222 --Patience: 04
Epoch: 008 --MaxValF1: 0.68861847 --CurValF1: 0.67412379 --Patience: 05
Epoch: 009 --MaxValF1: 0.68861847 --CurValF1: 0.67238690 --Patience: 06
Training stopped due to the patience parameter. --Patience: 07
Total runtime: 0:01:36.17


In [9]:
# Validation-set summary metrics, one per line with a blank line in between.
print( f'Validation Accuracy:{valaccuracy!s}\n' )

print( f'Validation Precision: {valprecision!s}\n' )

print( f'Validation Recall: {valrecall!s}\n' )

print( f'Validation F1: {valf1!s}' )

Validation Accuracy:0.9437042842215256

Validation Precision0.7512116316639742

Validation Recall0.6266846361185984

Validation F10.683321087435709


In [10]:
# Show the validation confusion matrix reversed along both axes (equivalent to
# a 180-degree rotation), so the positive class comes first: [[TP, FN], [FP, TN]].
print( valcm[::-1, ::-1] )

# Row-major order of the un-rotated 2x2 matrix is TN, FP, FN, TP.
_tn, _fp, _fn, _tp = valcm.flatten()
print( f'\n\nTN :{_tn!s}' )
print( f'FP :{_fp!s}' )
print( f'FN :{_fn!s}' )
print( f'TP :{_tp!s}' )

[[ 465  277]
 [ 154 6760]]


TN :6760
FP :154
FN :277
TP :465


In [11]:
# Held-out test-set summary metrics, one per line with a blank line in between.
print( f'Test Accuracy: {testaccuracy!s}\n' )

print( f'Test Precision: {testprecision!s}\n' )

print( f'Test Recall: {testrecall!s}\n' )

print( f'Test F1: {testf1!s}' )

Test Accuracy: 0.9439364648100736

Test Precision: 0.7416173570019724

Test Recall: 0.6238938053097345

Test F1: 0.6776809852808651


In [12]:
# Show the test confusion matrix reversed along both axes (equivalent to a
# 180-degree rotation), so the positive class comes first: [[TP, FN], [FP, TN]].
print( testcm[::-1, ::-1] )

# Row-major order of the un-rotated 2x2 matrix is TN, FP, FN, TP.
_tn, _fp, _fn, _tp = testcm.flatten()
print( f'\n\nTN :{_tn!s}' )
print( f'FP :{_fp!s}' )
print( f'FN :{_fn!s}' )
print( f'TP :{_tp!s}' )

[[ 1128   680]
 [  393 16938]]


TN :16938
FP :393
FN :680
TP :1128


In [13]:
# Score the reloaded model on the synthetic test set (Synthetic_debias_test.csv)
# using the tokenizer, padding length and threshold from the training cell.
synthetic = pd.read_csv('Synthetic_debias_test.csv')

# Encode the comments with the tokenizer fitted on the training data.
synthetic_comments = tokenizer.texts_to_sequences( synthetic['Comment'].values )

# pad the tokenized sequences
synthetic_comments_ = sequence.pad_sequences( synthetic_comments, maxlen=max_len )

# One-hot encode the labels and immediately reduce them back to class indices.
synthetic_y = to_categorical( synthetic['Toxic'].values, 2 )
synthetic_y = [ np.argmax(row) for row in synthetic_y ]

# Column 1 of the model output is used as the score of class 1.
synthetic_predictions = loaded_model.predict( synthetic_comments_, verbose=0, batch_size=2048 )[ :, 1 ]

# Hard labels at the shared threshold, followed by the usual metrics.
synthetic_labels = ( synthetic_predictions >= threshold ).astype( 'int32' )
synthetic_accuracy = accuracy_score( y_true=synthetic_y, y_pred=synthetic_labels )
synthetic_precision = precision_score( y_true=synthetic_y, y_pred=synthetic_labels )
synthetic_recall = recall_score( y_true=synthetic_y, y_pred=synthetic_labels )
synthetic_f1 = f1_score( y_true=synthetic_y, y_pred=synthetic_labels )
synthetic_cm = confusion_matrix( y_true=synthetic_y, y_pred=synthetic_labels )

print( f'Test Accuracy: {synthetic_accuracy!s}\n' )

print( f'Test Precision: {synthetic_precision!s}\n' )

print( f'Test Recall: {synthetic_recall!s}\n' )

print( f'Test F1: {synthetic_f1!s}' )

# Confusion matrix reversed along both axes: [[TP, FN], [FP, TN]].
print( synthetic_cm[::-1, ::-1] )

# Row-major order of the un-rotated 2x2 matrix is TN, FP, FN, TP.
_tn, _fp, _fn, _tp = synthetic_cm.flatten()
print( f'\n\nTN :{_tn!s}' )
print( f'FP :{_fp!s}' )
print( f'FN :{_fn!s}' )
print( f'TP :{_tp!s}' )

Test Accuracy: 0.6652343137767097

Test Precision: 0.6083226303621886

Test Recall: 0.9279295752573011

Test F1: 0.7348800645448246
[[35523  2759]
 [22872 15410]]


TN :15410
FP :22872
FN :2759
TP :35523


In [14]:
# Replace the pre-existing 'PredictedLabel' column with this model's outputs.
# The drop is non-mutating and uses errors='ignore', so the cell can be re-run
# safely; the in-place drop raised KeyError on a second run because the column
# was already gone.
synthetic = (
    synthetic
    .drop( columns='PredictedLabel', errors='ignore' )
    .assign( Predicted_Probability=synthetic_predictions,
             Predicted_Label=synthetic_labels )
)

In [15]:
# Preview the first rows to check the two prediction columns were attached.
synthetic.head()

Unnamed: 0,Comment,Toxic,Jew,Muslim,Christian,Hindu,Asian,Black,White,Female,Male,Heterosexual,Homosexual_gay_or_lesbian,Sum,Predicted_Probability,Predicted_Label
0,murder american,1,0,0,0,0,0,0,0,0,0,0,0,0,0.022528,0
1,respect trans,0,0,0,0,0,0,0,0,0,0,0,0,0,0.008496,0
2,respect homosexual,0,0,0,0,0,0,0,0,0,0,0,1,1,0.008496,0
3,hug mexican,0,0,0,0,0,0,0,0,0,0,0,0,0,0.282306,0
4,kill sikh,1,0,0,0,0,0,0,0,0,0,0,0,0,0.79772,1


In [16]:
# Persist the synthetic-set predictions under ./Results/, named after the model;
# the DataFrame index is not written.
synthetic.to_csv( f'./Results/{modelname}_synthetic_predictions.csv', index=False )