In [1]:
import pandas as pd
import numpy as np
import pickle 

import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import time

# Helpers

In [2]:
def count_words(text):
    """Return the total number of whitespace-separated words in ``text``.

    Args:
        text: iterable of strings (one entry per document).

    Returns:
        int: sum of ``len(entry.split())`` over every entry; 0 when empty.
    """
    return sum(len(entry.split()) for entry in text)

def count_token(text):
    """Return the number of distinct whitespace-separated tokens in ``text``.

    Args:
        text: iterable of strings (one entry per document).

    Returns:
        int: size of the set of all tokens seen across every entry.
    """
    unique_tokens = {token for entry in text for token in entry.split()}
    return len(unique_tokens)


def load_dataset(ds):
    """Read ``Dataset/<name>/data_final.csv`` for the dataset selected by ``ds``.

    Args:
        ds: dataset selector. 1 -> "GabHateCorpus", 2 -> "Implicit_hate_corpus",
            3 -> "SE2019"; any other value selects "Balanced".

    Returns:
        tuple: ``(df, dataset_name)`` where ``df`` is the loaded DataFrame.

    Side effects:
        Prints the normalised value counts of the ``class`` column.
    """
    dataset_name = "Balanced"  # fallback for every selector that is not 1, 2 or 3
    for selector, candidate in (
        (1, "GabHateCorpus"),
        (2, "Implicit_hate_corpus"),
        (3, "SE2019"),
    ):
        if ds == selector:
            dataset_name = candidate
            break

    df = pd.read_csv(f"Dataset/{dataset_name}/data_final.csv")

    class_share = df['class'].value_counts(normalize=True)
    print(class_share)
    return df, dataset_name

def split_data(df):
    """Split ``df`` into an 80/20 train/test partition and derive 0/1 targets.

    Args:
        df: DataFrame with a ``text`` column and a ``class`` column.

    Returns:
        tuple: ``(x_train, y_train, y_train_binary, x_test, y_test, y_test_binary)``.
        The ``*_binary`` arrays hold 1 where the label equals "Hate" and 0 otherwise.

    Side effects:
        Prints the shapes of both partitions and the total row count.
    """
    holdout_fraction = 0.20
    texts = np.array(df["text"])
    labels = np.array(df["class"])

    # A fixed random_state makes the partition identical on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        texts, labels, test_size=holdout_fraction, random_state=42
    )
    print("Train Set :", x_train.shape, y_train.shape)
    print("Test Set  :", x_test.shape, y_test.shape)
    print("Total ", len(df))

    # Numeric form of the labels: "Hate" -> 1, anything else -> 0.
    y_train_binary = np.array([1 if label == "Hate" else 0 for label in y_train])
    y_test_binary = np.array([1 if label == "Hate" else 0 for label in y_test])
    return x_train, y_train, y_train_binary, x_test, y_test, y_test_binary

# Deep Learning

In [3]:
from keras.models import Sequential
from keras.utils.data_utils import pad_sequences
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding
from keras.layers import Conv1D, LSTM, SpatialDropout1D, Bidirectional, GRU, SimpleRNN, TextVectorization

from keras.metrics import BinaryAccuracy,Precision,Recall
import keras
from keras.models import load_model
import tensorflow as tf

from gensim.models import FastText, Word2Vec, KeyedVectors

## Helpers

In [5]:
class TimingCallback(keras.callbacks.Callback):
    """Keras callback that prints the wall-clock duration of one training run.

    The start time is recorded in ``on_train_begin`` and the elapsed time is
    printed in ``on_train_end``; both timestamps stay available afterwards as
    ``starttime`` and ``stoptime``.
    """

    def __init__(self, logs=None):
        # ``logs`` is never used; it is accepted only so existing
        # ``TimingCallback(some_dict)`` calls keep working. The default is
        # ``None`` rather than a shared mutable ``{}``.
        super().__init__()

    def on_train_begin(self, logs=None):
        # Keras passes a single ``logs`` argument to this hook.
        self.starttime = time.time()

    def on_train_end(self, logs=None):
        self.stoptime = time.time()
        print(f"training time {self.stoptime - self.starttime}")
        
def get_classification_report(i, cr):
    """Flatten a classification-report dict into one results-table row.

    Args:
        i: value for the first cell of the row (the model identifier).
        cr: dict with the keys ``'accuracy'``, ``'macro avg'``, ``'Hate'`` and
            ``'Non-Hate'``, laid out like ``classification_report(output_dict=True)``.

    Returns:
        list: ``[i, accuracy, macro precision, macro recall, macro f1,
        Hate f1, Non-Hate f1, Hate support, Non-Hate support]``.
    """
    row = [i, cr['accuracy']]

    macro = cr['macro avg']
    row += [macro[metric] for metric in ('precision', 'recall', 'f1-score')]

    class_labels = ('Hate', 'Non-Hate')
    row += [cr[label]['f1-score'] for label in class_labels]
    row += [cr[label]['support'] for label in class_labels]
    return row

def get_result_table():
    """Create the empty DataFrame that collects one row of metrics per model.

    Returns:
        pandas.DataFrame: zero rows, with the nine result columns in the same
        order as the row produced by ``get_classification_report``.
    """
    return pd.DataFrame(columns=[
        'Model', 'Accuracy', 'precision', 'recall', 'f1-score',
        'hate f1', "non-hate f1", 'hate support', 'non-hate support',
    ])

def get_result_single(y_test, y_test_pred, model_name, result_table):
    """Score one set of predictions and append the metrics to ``result_table``.

    Args:
        y_test: true string labels.
        y_test_pred: predicted string labels.
        model_name: identifier stored in the first column of the new row.
        result_table: DataFrame that is modified in place; the new row is
            written at index ``len(result_table)``.

    Returns:
        None.
    """
    report = classification_report(
        y_test,
        y_test_pred,
        labels=["Hate", "Non-Hate"],
        output_dict=True,
    )
    next_row = len(result_table)
    result_table.loc[next_row] = get_classification_report(model_name, report)

# def get_result_multiple(x_test, y_test, model_to_load):
#     c = ['Model', 'Accuracy', 'precision', 'recall', 'f1-score', 'hate f1', "non-hate f1", 'hate support', 'non-hate support']
#     result_table = pd.DataFrame(columns=c)
#     for i in model_to_load:
#         filename = f"models/{i}"
#         print(filename)
#         old_model = load_model(filename)

#         y_test_pred = old_model.predict(x_test, verbose=0)
#         y_test_pred = np.where(y_test_pred > 0.5, "Hate", "Non-Hate") 
#         y_test_pred = y_test_pred.flatten()

#         cr = classification_report(y_test, y_test_pred, labels=["Hate","Non-Hate"], output_dict=True)
#         result_table.loc[len(result_table)] = get_classification_report(i, cr)
#     return result_table.style.highlight_max(color = 'red', axis = 0)

def nn_predict(model,x_test, y_test_binary):
    """Evaluate ``model`` on the test data and return its string predictions.

    Args:
        model: object exposing ``evaluate(x, y, verbose=...)`` and
            ``predict(x, verbose=...)``.
        x_test: test inputs handed to the model unchanged.
        y_test_binary: 0/1 targets used by ``evaluate``.

    Returns:
        numpy.ndarray: flattened array holding "Hate" where the predicted
        value is greater than 0.5 and "Non-Hate" otherwise.

    Side effects:
        Prints the first two values returned by ``model.evaluate``.
    """
    evaluation = model.evaluate(x_test, y_test_binary, verbose=0)
    print("Score: ", evaluation[0])
    print("Accuracy: ", evaluation[1])

    probabilities = model.predict(x_test, verbose=0)
    is_hate = probabilities > 0.5
    return np.where(is_hate, "Hate", "Non-Hate").flatten()

def save_model_nn(model, model_name, embedding_name, dataset_name):
    """Save ``model`` under ``models/<dataset>_<embedding>_<model>``.

    Args:
        model: object exposing ``save(path)``.
        model_name: architecture label (last part of the file name).
        embedding_name: embedding label (middle part of the file name).
        dataset_name: dataset label (first part of the file name).

    Returns:
        str: the path that was passed to ``model.save``.
    """
    target = "models/{}_{}_{}".format(dataset_name, embedding_name, model_name)
    model.save(target)
    return target

def load_model_nn(model_name):
    """Load a previously saved model from ``models/<model_name>``.

    Args:
        model_name: file or directory name inside ``models/``.

    Returns:
        Whatever ``load_model`` returns for that path.

    Side effects:
        Prints the path before loading.
    """
    path = "models/{}".format(model_name)
    print(path)
    restored = load_model(path)
    return restored

# Metrics handed to ``model.compile`` by ``compile_fit_save``; the "accuracy"
# name is what makes ``val_accuracy`` available in the training history.
# NOTE(review): these metric instances are created once and reused by every
# model compiled through ``compile_fit_save`` — confirm that sharing them
# across models is intended.
METRICS = [
    BinaryAccuracy(name="accuracy"),
    Precision(name="precision"),
    Recall(name="recall")
]

def compile_fit_save(x_train, y_train_binary, x_test,y_test_binary, model, model_name, embedding_name, dataset_name, save, epoch=5, batch_size=32, lr=0.01):
    """Compile ``model``, train it, optionally save it, and report validation accuracy.

    The model is compiled with the Adam optimizer, binary cross-entropy loss
    and the module-level ``METRICS``. ``(x_test, y_test_binary)`` is passed to
    ``fit`` as ``validation_data``, so the reported validation numbers are
    computed on the test split.

    Args:
        x_train, y_train_binary: training inputs and 0/1 targets.
        x_test, y_test_binary: inputs and 0/1 targets used as validation data.
        model: un-compiled Keras model.
        model_name, embedding_name, dataset_name: labels used to build the
            save path when ``save`` is truthy.
        save: when truthy, the trained model is written via ``save_model_nn``.
        epoch: number of training epochs.
        batch_size: mini-batch size.
        lr: learning rate for Adam.

    Returns:
        tuple: ``(model, history)`` — the trained model and the object
        returned by ``model.fit``.
    """
    opt = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt,
                  loss='binary_crossentropy',
                  metrics=METRICS)

    history = model.fit(x_train, y_train_binary, epochs=epoch,
                        validation_data=(x_test, y_test_binary),
                        batch_size=batch_size,
                        callbacks=[TimingCallback()])

    if save:
        save_model_nn(model, model_name, embedding_name, dataset_name)

    # Report the validation accuracy of the LAST epoch, i.e. of the model that
    # is returned; index 0 would be the value after the first epoch only.
    print(f"acc {history.history['val_accuracy'][-1]}")
    return model, history

## Embedding

### GloVe

In [7]:
def glove_em(x_train):
    """Build a text encoder and a frozen embedding layer initialised from GloVe.

    A ``TextVectorization`` layer (no standardisation, output length 50) is
    adapted on ``x_train``. Every vocabulary word found in
    ``Dataset/trained/glove.42B.300d.txt`` gets its 300-d GloVe vector; words
    that are not found keep an all-zero row.

    Args:
        x_train: training texts used to adapt the encoder.

    Returns:
        tuple: ``(custom_encoder, custom_embedding, embedding_name, missWord)``
        where ``missWord`` lists the vocabulary entries without a GloVe vector.

    Side effects:
        Prints the vocabulary size, the number of entries in the GloVe file
        and the hit/miss counts.
    """
    embedding_name = "glove"
    text_length = 50  # every encoded text is padded/truncated to this many tokens
    embedding_dim = 300  # vector width of the glove.42B.300d file

    custom_encoder = TextVectorization(
        standardize=None,
        output_sequence_length=text_length,
    )
    custom_encoder.adapt(x_train)
    vocab = custom_encoder.get_vocabulary()
    print(f"total vocab {len(vocab)}")
    vocab_dict = dict(zip(vocab, range(len(vocab))))

    # Stream the GloVe file and keep only the vectors of words that occur in
    # our vocabulary; nothing else is ever looked up, so holding all entries
    # in memory is unnecessary. ``with`` closes the file even if a line fails
    # to parse.
    embeddings_dic = dict()
    total_words = 0
    with open("Dataset/trained/glove.42B.300d.txt", encoding="utf8") as glove_file:
        for line in glove_file:
            records = line.split()
            if not records:
                continue  # tolerate blank lines instead of raising IndexError
            total_words += 1
            word = records[0]
            if word in vocab_dict:
                embeddings_dic[word] = np.asarray(records[1:], dtype='float32')
    print("Total words ", total_words)

    # One row more than the encoder vocabulary, as in the original layout.
    # NOTE(review): per the Keras docs get_vocabulary() already contains the
    # padding and OOV entries, so the extra row looks unused — confirm before
    # removing it, since it changes the layer's weight shape.
    vocab_length = len(vocab) + 1

    hits = 0
    miss = 0
    missWord = []

    # Row ``index`` of the matrix is the GloVe vector of the vocabulary word
    # with that index; rows of words missing from GloVe stay zero.
    embedding_matrix = np.zeros((vocab_length, embedding_dim))
    for word, index in vocab_dict.items():
        embedding_vector = embeddings_dic.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
            hits += 1
        else:
            miss += 1
            missWord.append(word)
    print("Converted %d words (%d misses)" % (hits, miss))

    custom_embedding = Embedding(vocab_length, embedding_dim,
                embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                trainable=False,
                input_length=text_length,
                mask_zero=True)

    return custom_encoder, custom_embedding, embedding_name, missWord

### FastText, Word2Vec

In [6]:
def get_fasttext_model():
    """Load the fastText vectors stored in word2vec text format.

    Returns:
        tuple: ``(vectors, "fasttext_trained")`` where ``vectors`` is the
        object returned by ``KeyedVectors.load_word2vec_format``.
    """
    vectors = KeyedVectors.load_word2vec_format(
        "./Dataset/trained/wiki-news-300d-1M-subword.vec",
        binary=False,
    )
    return vectors, "fasttext_trained"

def get_word2vec_model():
    """Load the GoogleNews word2vec vectors stored in binary format.

    Returns:
        tuple: ``(vectors, "word2vec_trained")`` where ``vectors`` is the
        object returned by ``KeyedVectors.load_word2vec_format``.
    """
    vectors = KeyedVectors.load_word2vec_format(
        "./Dataset/trained/GoogleNews-vectors-negative300.bin",
        binary=True,
    )
    return vectors, "word2vec_trained"

def pre_trained_em(x_train, model_em, embedding_name):
    """Build a text encoder and a frozen embedding layer from pre-trained vectors.

    A ``TextVectorization`` layer (no standardisation, output length 50) is
    adapted on ``x_train``. Every vocabulary word known to ``model_em`` gets
    that model's 300-d vector; unknown words keep an all-zero row.

    Args:
        x_train: training texts used to adapt the encoder.
        model_em: gensim ``KeyedVectors`` holding 300-d vectors.
        embedding_name: label returned unchanged to the caller.

    Returns:
        tuple: ``(custom_encoder, custom_embedding, embedding_name, missWord)``
        where ``missWord`` lists the vocabulary entries without a vector.

    Side effects:
        Prints the vocabulary size, the number of pre-trained vectors and the
        hit/miss counts.
    """
    text_length = 50  # every encoded text is padded/truncated to this many tokens

    custom_encoder = TextVectorization(
        standardize=None,
        output_sequence_length=text_length,
    )
    custom_encoder.adapt(x_train)
    vocab = custom_encoder.get_vocabulary()
    print(f"total vocab {len(vocab)}")
    vocab_dict = dict(zip(vocab, range(len(vocab))))

    vocab_length = len(vocab) + 1
    embedding_dim = 300

    hits = 0
    miss = 0
    missWord = []

    embedding_matrix = np.zeros((vocab_length, embedding_dim))
    print(f"total vector {len(model_em.index_to_key)}")

    # ``key_to_index`` is a dict, so each membership test is a hash lookup.
    # Testing against the ``index_to_key`` list scans it linearly for every
    # vocabulary word, which is prohibitively slow with millions of keys.
    known_words = model_em.key_to_index
    for word, index in vocab_dict.items():
        if word in known_words:
            embedding_matrix[index] = model_em[word]
            hits += 1
        else:
            miss += 1
            missWord.append(word)

    print("Converted %d words (%d misses)" % (hits, miss))

    custom_embedding = Embedding(vocab_length, embedding_dim,
                embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                trainable=False,
                input_length=text_length,
                mask_zero=True)

    return custom_encoder, custom_embedding, embedding_name, missWord

### No pre-trained embedding

In [7]:
def noTrained_em(x_train):
    """Build a text encoder and an embedding layer without pre-trained weights.

    Args:
        x_train: training texts used to adapt the encoder.

    Returns:
        tuple: ``(custom_encoder, custom_embedding, "no_train")``. Unlike the
        pre-trained variants no list of missing words is returned.

    Side effects:
        Prints the vocabulary size.
    """
    sequence_length = 50  # every encoded text is padded/truncated to this many tokens
    vector_size = 300

    custom_encoder = TextVectorization(
        standardize=None,
        output_sequence_length=sequence_length,
    )
    custom_encoder.adapt(x_train)
    vocabulary = custom_encoder.get_vocabulary()
    print(f"total vocab {len(vocabulary)}")

    custom_embedding = Embedding(
        len(vocabulary) + 1,
        vector_size,
        input_length=sequence_length,
        mask_zero=True,
    )
    return custom_encoder, custom_embedding, "no_train"

## Models

In [8]:
from keras.layers import BatchNormalization
def add_connected_layer(model):
    """Append the classification head to ``model`` in place.

    The head is a ``Dropout(0.2)`` layer followed by a single sigmoid unit.
    Earlier experiments with an extra Dropout + Dense(64, relu) stage and with
    BatchNormalization are disabled.

    Args:
        model: model exposing ``add(layer)``.

    Returns:
        None.
    """
    head_layers = (
        Dropout(0.2),
        Dense(1, activation='sigmoid'),
    )
    for layer in head_layers:
        model.add(layer)

In [10]:
def cnn(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name, save = True, epoch = 10, batch_size=32, lr=0.01):
    """Build and train the convolutional classifier.

    Architecture: encoder -> embedding -> Conv1D(128 filters, kernel 3, relu)
    -> GlobalMaxPooling1D -> head from ``add_connected_layer``.

    Returns:
        tuple: ``(model, history)`` as returned by ``compile_fit_save``.
    """
    model_name = "cnn"
    print(model_name)

    model = Sequential()
    body_layers = (
        custom_encoder,
        custom_embedding,
        Conv1D(128, 3, activation='relu'),
        GlobalMaxPooling1D(),
    )
    for layer in body_layers:
        model.add(layer)
    add_connected_layer(model)

    return compile_fit_save(
        x_train, y_train_binary, x_test, y_test_binary,
        model, model_name, embedding_name, dataset_name,
        save, epoch, batch_size, lr,
    )

def rnn(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name, save = True, epoch = 10, batch_size=32, lr=0.01):
    """Build and train the simple recurrent classifier.

    Architecture: encoder -> embedding -> SimpleRNN(128) -> head from
    ``add_connected_layer``.

    Returns:
        tuple: ``(model, history)`` as returned by ``compile_fit_save``.
    """
    model_name = "rnn"
    print(model_name)

    model = Sequential()
    body_layers = (
        custom_encoder,
        custom_embedding,
        SimpleRNN(128),
    )
    for layer in body_layers:
        model.add(layer)
    add_connected_layer(model)

    return compile_fit_save(
        x_train, y_train_binary, x_test, y_test_binary,
        model, model_name, embedding_name, dataset_name,
        save, epoch, batch_size, lr,
    )

def lstm(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name, save = True, epoch = 10, batch_size=32, lr=0.01):
    """Build and train the LSTM classifier.

    Architecture: encoder -> embedding -> LSTM(128) -> head from
    ``add_connected_layer``. A SpatialDropout1D(0.2) stage after the embedding
    was tried and is disabled.

    Returns:
        tuple: ``(model, history)`` as returned by ``compile_fit_save``.
    """
    model_name = "lstm"
    print(model_name)

    model = Sequential()
    body_layers = (
        custom_encoder,
        custom_embedding,
        LSTM(128),
    )
    for layer in body_layers:
        model.add(layer)
    add_connected_layer(model)

    return compile_fit_save(
        x_train, y_train_binary, x_test, y_test_binary,
        model, model_name, embedding_name, dataset_name,
        save, epoch, batch_size, lr,
    )

def gru(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name, save = True, epoch = 10, batch_size=32, lr=0.01):
    """Build and train the GRU classifier.

    Architecture: encoder -> embedding -> GRU(128) -> head from
    ``add_connected_layer``. A SpatialDropout1D(0.2) stage after the embedding
    was tried and is disabled.

    Returns:
        tuple: ``(model, history)`` as returned by ``compile_fit_save``.
    """
    model_name = "gru"
    print(model_name)

    model = Sequential()
    body_layers = (
        custom_encoder,
        custom_embedding,
        GRU(128),
    )
    for layer in body_layers:
        model.add(layer)
    add_connected_layer(model)

    return compile_fit_save(
        x_train, y_train_binary, x_test, y_test_binary,
        model, model_name, embedding_name, dataset_name,
        save, epoch, batch_size, lr,
    )

# Test

In [11]:
# Selector 4 is none of 1-3, so load_dataset falls back to the "Balanced" dataset.
df, dataset_name = load_dataset(4) 
# 80/20 train/test split plus the 0/1 versions of the labels.
x_train, y_train, y_train_binary, x_test, y_test, y_test_binary = split_data(df)
# Empty results frame; get_result_single adds one row per evaluated model.
df_result = get_result_table()
print(dataset_name)
# Show the first training text as a quick sanity check.
x_train[0]

class
Hate        0.500427
Non-Hate    0.499573
Name: proportion, dtype: float64
Train Set : (27178,) (27178,)
Test Set  : (6795,) (6795,)
Total  33973
Balanced


' if you think im sweating about your petty ass think again bitch bitchimnotscared obviouslyyouare youthreatenedhoe '

In [13]:
def model_start_train(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name, y_test_labels=None, result_table=None):
    """Train and evaluate the CNN, RNN, LSTM and GRU models with one embedding.

    Each model is trained for 8 epochs with batch size 256 and learning rate
    0.001, is not saved, and contributes one row named
    ``<dataset_name>_<embedding_name>_<architecture>`` to the results table.

    Args:
        x_train, y_train_binary: training inputs and 0/1 targets.
        x_test, y_test_binary: test inputs and 0/1 targets.
        custom_encoder, custom_embedding: layers placed at the front of every
            model.
        embedding_name, dataset_name: labels used to name the result rows.
        y_test_labels: true string labels of the test split. Defaults to the
            notebook-level ``y_test`` so existing calls keep working.
        result_table: DataFrame that receives the rows. Defaults to the
            notebook-level ``df_result``.

    Returns:
        None; ``result_table`` is modified in place.

    NOTE(review): the same ``custom_encoder`` / ``custom_embedding`` instances
    are handed to all four models. With a trainable embedding this presumably
    means later models start from weights already updated by earlier ones —
    confirm whether a fresh embedding per model is wanted.
    """
    # Fall back to the notebook globals only when the caller did not pass the
    # values explicitly, so the hidden-state dependency becomes opt-out.
    if y_test_labels is None:
        y_test_labels = y_test
    if result_table is None:
        result_table = df_result

    architectures = (("cnn", cnn), ("rnn", rnn), ("lstm", lstm), ("gru", gru))
    for suffix, build_and_train in architectures:
        model, _history = build_and_train(
            x_train, y_train_binary, x_test, y_test_binary,
            custom_encoder, custom_embedding, embedding_name, dataset_name,
            save=False, epoch=8, batch_size=256, lr=0.001,
        )
        y_test_pred = nn_predict(model, x_test, y_test_binary)
        get_result_single(
            y_test_labels,
            y_test_pred,
            f"{dataset_name}_{embedding_name}_{suffix}",
            result_table,
        )

In [None]:
# word2vec word embedding: load the pre-trained vectors, build the encoder and
# the frozen embedding layer from the training texts, then train and evaluate
# all four architectures.
pre_trained_model, model_name = get_word2vec_model()
custom_encoder, custom_embedding, embedding_name, missWord = pre_trained_em(x_train, pre_trained_model, model_name)
print(embedding_name)
print(dataset_name)

model_start_train(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name)

In [None]:
# fastText word embedding: load the pre-trained vectors and build the encoder
# and the frozen embedding layer from the training texts.
pre_trained_model, model_name = get_fasttext_model()
custom_encoder, custom_embedding, embedding_name, missWord = pre_trained_em(x_train, pre_trained_model, model_name)
print(embedding_name)
print(dataset_name)

# Training is currently disabled for this embedding; uncomment to run it.
#model_start_train(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name)

In [13]:
# GloVe word embedding: build the encoder and the frozen GloVe-initialised
# embedding layer from the training texts.
custom_encoder, custom_embedding, embedding_name, missWord = glove_em(x_train)
print(embedding_name)
print(dataset_name)

# Training is currently disabled for this embedding; uncomment to run it.
#model_start_train(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name)

total vocab 33332
Total words  1917494
Converted 25654 words (7678 misses)
glove
Balanced


In [None]:
# Learned word embedding: the embedding layer gets no pre-trained weights.
# noTrained_em returns three values (there is no missing-word list).
custom_encoder, custom_embedding, embedding_name = noTrained_em(x_train)
print(embedding_name)
print(dataset_name)

# Training is currently disabled for this embedding; uncomment to run it.
#model_start_train(x_train, y_train_binary, x_test,y_test_binary,custom_encoder, custom_embedding, embedding_name, dataset_name)

In [26]:
# Display the metrics collected so far (one row per evaluated model).
# NOTE(review): the stored output below lists five rows that are all named
# "Balanced_glove_cnn", and the cell numbers are out of order — presumably a
# leftover from an earlier session; re-run the notebook top to bottom to confirm.
df_result

Unnamed: 0,Model,Accuracy,precision,recall,f1-score,hate f1,non-hate f1,hate support,non-hate support
0,Balanced_glove_cnn,0.721413,0.72247,0.721545,0.721157,0.72961,0.712703,3384.0,3411.0
1,Balanced_glove_cnn,0.724945,0.724994,0.72497,0.724941,0.725913,0.72397,3384.0,3411.0
2,Balanced_glove_cnn,0.587196,0.601943,0.587934,0.572833,0.651163,0.494504,3384.0,3411.0
3,Balanced_glove_cnn,0.720677,0.721616,0.720801,0.720451,0.728392,0.712511,3384.0,3411.0
4,Balanced_glove_cnn,0.71273,0.714015,0.71257,0.712195,0.699785,0.724605,3384.0,3411.0
