# Import Dataset

In [87]:
import os
import sys
import gensim
import pandas as pd
import numpy as np
from gensim.models.doc2vec import LabeledSentence

csv = 'hate_speech_dataset.csv'
data = pd.read_csv(csv,index_col=0)
data.head()

Unnamed: 0,sentence,physics,race,religion
0,I don't get why negroes always traveling to wh...,-1,1,-1
1,All of you fattys need to stop trying to make ...,1,-1,-1
2,lmao how funny that true does know where the c...,-1,-1,-1
3,@art_is_forever when did she publicly thank him?,-1,-1,-1
4,Post a picture of Khloe already!!!!! Come on!!!!,-1,-1,-1


In [88]:
X = data.sentence
y_physics = data.physics
y_race = data.race
y_religion = data.religion

# Preprocess Data

In [3]:
CONTRACTION_MAP = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }

In [4]:
import re, nltk, string
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from nltk.corpus import wordnet

stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def expand_contractions(text) :
    pattern = re.compile("({})".format("|".join(CONTRACTION_MAP.keys())),flags = re.DOTALL| re.IGNORECASE)
    
    def replace_text(t):
        txt = t.group(0)
        if txt.lower() in CONTRACTION_MAP.keys():
            return CONTRACTION_MAP[txt.lower()]
        
    expand_text = pattern.sub(replace_text,text)
    return expand_text

def remove_repeated_characters(word):
    pattern = re.compile(r"(\w*)(\w)\2(\w*)")
    substitution_pattern = r"\1\2\3"
    while True:
        if wordnet.synsets(word):
            return word
        new_word = pattern.sub(substitution_pattern,word)
        if new_word != word:
            word = new_word
            continue
        else:
            return new_word

def spelling_checker(word):
    checker = suggest(word)
    return checker[0][0]

def clean_symbol(text):
    cleanr = re.compile('&#[0-9]+;')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def clean_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
    
def normalizer(text):
    text = re.sub(r"http\S+", "", text.lower(), flags=re.MULTILINE) #remove url
    text = re.sub('@[^\s]+','',text) #remove username
    text = clean_emoji(text)
    text = clean_symbol(text)
    expand = expand_contractions(text)
    pattern = re.compile("[{}]".format(re.escape(string.punctuation)))
    filter_char =  filter(None,[pattern.sub('' ,expand)])
    text_filter_char =  " ".join(filter_char)
    tokens = nltk.WhitespaceTokenizer().tokenize(text_filter_char)

    lemmas = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    stems = [stemmer.stem(t) for t in lemmas]
    filtered_result = list(filter(lambda l: l not in stop_words, stems))
    concate = ' '.join(filtered_result)
    return concate

In [89]:
X = X.apply(normalizer)
X.head()

0    get whi negro alway travel white countri take ...
1                   fatti need stop tri make fat thing
2                  lmao funni true doe know camara lol
3                                       publicli thank
4                       post pictur khloe alreadi come
Name: sentence, dtype: object

# Grouping Data

In [90]:
import copy
from keras.preprocessing import sequence
from keras.utils import to_categorical

X_physics_clean = []
X_race_clean = []
X_religion_clean = []

X_physics_no_hate = []
X_race_no_hate = []
X_religion_no_hate = []

X_physics_hate = []
X_race_hate = []
X_religion_hate = []

y_physics_clean = []
y_race_clean = []
y_religion_clean = []

y_physics_no_hate = []
y_race_no_hate = []
y_religion_no_hate = []

y_physics_hate = []
y_race_hate = []
y_religion_hate = []

for i in range(len(y_physics)):
    if y_physics[i] != -1:
        if y_physics[i] == 0:
            X_physics_no_hate.append(X[i])
            y_physics_no_hate.append(y_physics[i])
        else:
            X_physics_hate.append(X[i])
            y_physics_hate.append(y_physics[i])

for i in range(len(y_race)):
    if y_race[i] != -1:
        if y_race[i] == 0:
            X_race_no_hate.append(X[i])
            y_race_no_hate.append(y_race[i])
        else:
            X_race_hate.append(X[i])
            y_race_hate.append(y_race[i])

for i in range(len(y_religion)):
    if y_religion[i] != -1:
        if y_religion[i] == 0:
            X_religion_no_hate.append(X[i])
            y_religion_no_hate.append(y_religion[i])
        else:
            X_religion_hate.append(X[i])
            y_religion_hate.append(y_religion[i])

# Undersampling Imbalance Data

In [91]:
from numpy import random
import numpy as np

def is_balance(no_hate, hate):
    if no_hate == hate:
        return True
    elif no_hate > hate:
        return (no_hate <= round(3/2 * hate))
    else: # hate > no_hate
        return (hate <= round(3/2 * no_hate))

def get_distribution(no_hate, hate):
    if no_hate == hate:
        return no_hate, hate
    elif no_hate > hate:
        return round(3/2 * hate), hate
    else: # hate > no_hate
        return no_hate, round(3/2 * no_hate)

# undesampling physics aspect (if needed)
if is_balance(len(y_physics_no_hate), len(y_physics_hate)):
    X_physics_clean.extend(X_physics_no_hate)
    y_physics_clean.extend(y_physics_no_hate)
    X_physics_clean.extend(X_physics_hate)
    y_physics_clean.extend(y_physics_hate)
else:
    no_hate, hate = get_distribution(len(y_physics_no_hate), len(y_physics_hate))
    print("physics" + " " + str(no_hate) + " " + str(hate))
    X_physics_clean.extend(X_physics_no_hate[:no_hate])
    y_physics_clean.extend(y_physics_no_hate[:no_hate])
    X_physics_clean.extend(X_physics_hate[:hate])
    y_physics_clean.extend(y_physics_hate[:hate])

# undesampling race aspect (if needed)
if is_balance(len(y_race_no_hate), len(y_race_hate)):
    X_race_clean.extend(X_race_no_hate)
    y_race_clean.extend(y_race_no_hate)
    X_race_clean.extend(X_race_hate)
    y_race_clean.extend(y_race_hate)
else:
    no_hate, hate = get_distribution(len(y_race_no_hate), len(y_race_hate))
    print("race" + " " + str(no_hate) + " " + str(hate))
    X_race_clean.extend(X_race_no_hate[:no_hate])
    y_race_clean.extend(y_race_no_hate[:no_hate])
    X_race_clean.extend(X_race_hate[:hate])
    y_race_clean.extend(y_race_hate[:hate])

# undesampling religion aspect (if needed)
if is_balance(len(y_religion_no_hate), len(y_religion_hate)):
    X_religion_clean.extend(X_religion_no_hate)
    y_religion_clean.extend(y_religion_no_hate)
    X_religion_clean.extend(X_religion_hate)
    y_religion_clean.extend(y_religion_hate)
else:
    no_hate, hate = get_distribution(len(y_religion_no_hate), len(y_religion_hate))
    print("religion" + " " + str(no_hate) + " " + str(hate))
    X_religion_clean.extend(X_religion_no_hate[:no_hate])
    y_religion_clean.extend(y_religion_no_hate[:no_hate])
    X_religion_clean.extend(X_religion_hate[:hate])
    y_religion_clean.extend(y_religion_hate[:hate])

X_physics_clean = np.asarray(X_physics_clean)
X_race_clean = np.asarray(X_race_clean)
X_religion_clean = np.asarray(X_religion_clean)

y_physics_clean = np.asarray(y_physics_clean)
y_race_clean = np.asarray(y_race_clean)
y_religion_clean = np.asarray(y_religion_clean)

# random order physics dataset
random_idxs = random.choice(len(y_physics_clean), len(y_physics_clean), replace=False)
X_physics_clean = X_physics_clean[random_idxs]
y_physics_clean = y_physics_clean[random_idxs]

# random order race dataset
random_idxs = random.choice(len(y_race_clean), len(y_race_clean), replace=False)
X_race_clean = X_race_clean[random_idxs]
y_race_clean = y_race_clean[random_idxs]

# random order religion dataset
random_idxs = random.choice(len(y_religion_clean), len(y_religion_clean), replace=False)
X_religion_clean = X_religion_clean[random_idxs]
y_religion_clean = y_religion_clean[random_idxs]

print(len(y_physics_clean))
print(len(y_race_clean))
print(len(y_religion_clean))

physics 754 1131
race 112 168
1885
280
344


In [114]:
df_physics = pd.DataFrame({ 'sentence': X_physics_clean, 'physics': y_physics_clean }, columns = ['sentence', 'physics'])
df_race = pd.DataFrame({ 'sentence': X_race_clean, 'race': y_race_clean }, columns = ['sentence', 'race'])
df_religion = pd.DataFrame({ 'sentence': X_religion_clean, 'religion': y_religion_clean }, columns = ['sentence', 'religion'])

# Split Data Train and Test

In [115]:
from sklearn.model_selection import train_test_split

x_train_physics, x_test_physics, y_train_physics, y_test_physics = train_test_split(df_physics['sentence'], df_physics['physics'], 
                                                                                    test_size = 0.2, random_state = 10)

x_train_race, x_test_race, y_train_race, y_test_race = train_test_split(df_race['sentence'], df_race['race'], 
                                                                        test_size = 0.2, random_state = 10)

x_train_religion, x_test_religion, y_train_religion, y_test_religion = train_test_split(df_religion['sentence'], df_religion['religion'], 
                                                                                        test_size = 0.2, random_state = 10)

In [116]:
def labelize_text(text,label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result

all_x_w2v = labelize_text(X, 'ALL')

# physics
x_train_physics = labelize_text(x_train_physics, 'TRAIN')
x_test_physics = labelize_text(x_test_physics, 'TEST')

# race
x_train_race = labelize_text(x_train_race, 'TRAIN')
x_test_race = labelize_text(x_test_race, 'TEST')

# religion
x_train_religion = labelize_text(x_train_religion, 'TRAIN')
x_test_religion = labelize_text(x_test_religion, 'TEST')

  """


# Train Word2Vec

In [11]:
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np

model_w2v = Word2Vec(size=200, min_count=20)
model_w2v.build_vocab([x.words for x in tqdm(all_x_w2v)])
model_w2v.train([x.words for x in tqdm(all_x_w2v)], total_examples=len(all_x_w2v), epochs=1)

100%|██████████████████████████████████████████████████████████████████████████| 4174/4174 [00:00<00:00, 454822.43it/s]
100%|█████████████████████████████████████████████████████████████████████████| 4174/4174 [00:00<00:00, 1733368.80it/s]


(7942, 26188)

## Build Document Vector using Average Word Vector With TF-IDF

In [12]:
from sklearn.preprocessing import scale
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=lambda x: x)
matrix = vectorizer.fit_transform([x.words for x in all_x_w2v])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

def build_Word_Vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: 
            
            continue
    if count != 0:
        vec /= count
    return vec

In [117]:
# physics
train_vecs_physics = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_train_physics))])
test_vecs_physics = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_test_physics))])

# race
train_vecs_race = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_train_race))])
test_vecs_race = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_test_race))])

# religion
train_vecs_religion = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_train_religion))])
test_vecs_religion = np.concatenate([build_Word_Vector(z, 200) for z in tqdm(map(lambda x: x.words, x_test_religion))])

1508it [00:00, 7546.53it/s]
377it [00:00, 10961.21it/s]
224it [00:00, 6312.61it/s]
56it [00:00, 5238.55it/s]
275it [00:00, 6961.69it/s]
69it [00:00, 3416.93it/s]


# Bi-LSTM Hate Detection Model for Physics

In [14]:
from keras.optimizers import SGD

batch_size = 1
num_epochs = 100
hidden_size = 10
timesteps = 1
num_class = 1
data_dim = len(train_vecs_physics[0])
num_data = len(train_vecs_physics)
num_data_test = len(test_vecs_physics)

train_vecs_physics = train_vecs_physics.reshape((num_data, timesteps, data_dim))
y_train_physics = y_train_physics.reshape((num_data, num_class))
test_vecs_physics = test_vecs_physics.reshape((num_data_test, timesteps, data_dim))
y_test_physics = y_test_physics.reshape((num_data_test, num_class))

model_hd_physics = Sequential()
model_hd_physics.add(Bidirectional(LSTM(hidden_size, input_shape=(timesteps, data_dim)), merge_mode='ave'))
model_hd_physics.add(Dropout(0.5))
model_hd_physics.add(Dense(1, activation='sigmoid'))
model_hd_physics.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_hd_physics.fit(train_vecs_physics, y_train_physics, epochs=num_epochs, validation_data=[test_vecs_physics, y_test_physics])

  del sys.path[0]
  from ipykernel import kernelapp as app


Train on 1508 samples, validate on 377 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100


Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1d6a5498b70>

In [15]:
from sklearn.metrics import accuracy_score, classification_report

prediction = {}
prediction['hd_physics'] = model_hd_physics.predict(test_vecs_physics)

for i in range(len(prediction['hd_physics'])):
    prediction['hd_physics'][i][0] = round(prediction['hd_physics'][i][0])

accuracy = {}
accuracy['hd_physics'] = accuracy_score(y_test_physics, prediction['hd_physics'])
print("Accuracy: ", accuracy['hd_physics'], "\n")
print(classification_report(y_test_physics, prediction['hd_physics'], labels = [0, 1]))

Accuracy:  0.7320954907161804 

             precision    recall  f1-score   support

          0       0.74      0.54      0.62       156
          1       0.73      0.87      0.79       221

avg / total       0.73      0.73      0.72       377



# MLP Hate Detection Model for Race

In [175]:
prediction = {}
accuracy = {}
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model_hd_race = MLPClassifier(hidden_layer_sizes=(10,10), alpha=0.005, activation='identity', learning_rate='adaptive', learning_rate_init=0.05, solver='adam')
model_hd_race.fit(train_vecs_race, y_train_race)

prediction['hd_race'] = model_hd_race.predict(test_vecs_race)
accuracy['hd_race'] = accuracy_score(y_test_race, prediction['hd_race'])
print("Accuracy: ", accuracy['hd_race'], "\n")
print(classification_report(y_test_race, prediction['hd_race'], labels = [0, 1]))

Accuracy:  0.8214285714285714 

             precision    recall  f1-score   support

          0       0.82      0.53      0.64        17
          1       0.82      0.95      0.88        39

avg / total       0.82      0.82      0.81        56



## Save Model

In [100]:
from sklearn.externals import joblib
joblib.dump(model_hd_race, 'mlp_hd_race.model')
model_hd_race = joblib.load('mlp_hd_race.model')

# MLP Hate Detection Model for Religion

In [163]:
prediction = {}
accuracy = {}
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model_hd_religion = MLPClassifier(hidden_layer_sizes=(10,10), alpha=0.005, activation='identity', learning_rate='adaptive', learning_rate_init=0.05, solver='adam')
model_hd_religion.fit(train_vecs_religion, y_train_religion)

prediction['hd_religion'] = model_hd_religion.predict(test_vecs_religion)
accuracy['hd_religion'] = accuracy_score(y_test_religion, prediction['hd_religion'])
print("Accuracy: ", accuracy['hd_religion'], "\n")
print(classification_report(y_test_religion, prediction['hd_religion'], labels = [0, 1]))

Accuracy:  0.927536231884058 

             precision    recall  f1-score   support

          0       0.94      0.91      0.92        33
          1       0.92      0.94      0.93        36

avg / total       0.93      0.93      0.93        69



## Save Model

In [181]:
from keras import models
from sklearn.externals import joblib

model_hd_physics.save('bilstm_hd_physics.model')
joblib.dump(model_hd_race, 'mlp_hd_race.model')
joblib.dump(model_hd_religion, 'mlp_hd_religion.model')


['mlp_hd_religion.model']

# Load Model

In [166]:
from keras import models
from sklearn.externals import joblib

model_hd_physics = models.load_model('bilstm_hd_physics.model')
model_hd_race = joblib.load('mlp_hd_race.model')
model_hd_religion = joblib.load('mlp_hd_religion.model')

# Aspect Detection Model for Predict New Data 

In [186]:
# 1:hate, 0:no hate

def predict_aspect(text):
    text = normalizer(text)
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    vecs = build_Word_Vector(tokens, 200)
    vecs_reshape = vecs.reshape((1, 1, 200))
    aspect = {}
    aspect['physics'] = int(round(model_hd_physics.predict(vecs_reshape)[0][0]))
    aspect['race'] = int(round(model_hd_race.predict(vecs)[0]))
    aspect['religion'] = int(round(model_hd_religion.predict(vecs)[0]))
    return aspect

In [187]:
predict_aspect("ugly")



{'physics': 1, 'race': 0, 'religion': 0}