# Start

In [2]:
import pandas as pd
import numpy as np
import pickle 

import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import re
import nltk
from nltk.util import pr
from nltk.corpus import stopwords
import string 

In [1]:
# maximum text
# sb.set()
# pd.set_option('display.max_colwidth', None)

# Load dataset

## SE2019 dataset

In [223]:
df = pd.read_csv("Dataset/SE2019/notstemmer_data.csv")
df['class'].value_counts()

class
Non-Hate    5195
Hate        3781
Name: count, dtype: int64

In [227]:
df.head(5)

Unnamed: 0,class,text,hate
0,Hate,hurray saving us many ways #lockthemup #buildt...,1
1,Hate,would young fighting age men vast majority one...,1
2,Hate,illegals dump kids border like road kill refu...,1
3,Non-Hate,ny times nearly white states pose array proble...,0
4,Non-Hate,orban brussels european leaders ignoring peopl...,0


In [228]:
df[df['text'].isnull()]

Unnamed: 0,class,text,hate


# Split dataset

In [229]:
# Features are the cleaned tweet text, targets are the string class labels.
x, y = (np.array(df[column]) for column in ("text", "class"))

# 70/30 hold-out split; the fixed random_state keeps the same rows in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)
print("Train Set :", *map(np.shape, (x_train, y_train)))
print("Test Set  :", *map(np.shape, (x_test, y_test)))

Train Set : (6283,) (6283,)
Test Set  : (2693,) (2693,)


array(['austria wonder countri refug bill see socialwelfar also help get televisionset refug famili styria ',

In [243]:
x_train[:10]

array(['austria wonderful country refugees bill see socialwelfare also help get televisionset refugees family styria ',
       'please tell bitch next piercing line judgmental everyone fucking sees shut fuck',
       'afghan migrant whose deportation thwarted hero swedish student actually sentenced assault #foxnews',
       ' slavery post white women rape many black women men children u could ever imagine cause ',
       'bewildered eu leaders various plans preventing migrant refugee boat arrivals amp send heres helpful overview ',
       'waking today like ',
       'theyre sending best lot rapists scumbags lowest form dna fake family separatedxxf#stoptheinvasion#deportthemall #noamnesty#buildthewall ',
       'rt #chelseahandler #kimkardashian #kanye #kanyewest #cent #comedy #lol #lmao #memes #bruh #petty #funnyshit #truth ',
       'douglas todd trudeau government goes silent canadas syrian refugees via add alarming list governments fails ',
       'redhead girls hot ginger girls us

# Feature engineering - Word embedding

- https://www.analyticsvidhya.com/blog/2017/06/word-embeddings-count-word2veec/
- CountVectorizer, TfidfTransformer and TfidfVectorizer are frequency-based word embedding techniques
- TfidfTransformer acts on a sparse count matrix, whereas TfidfVectorizer acts on raw text data
- TfidfVectorizer = CountVectorizer + TfidfTransformer

- https://www.analyticsvidhya.com/blog/2018/07/hands-on-sentiment-analysis-dataset-python/
- vectorizer = word embedding process of converting text data to numerical vector

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Word2Vec

- https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#:~:text=Word2Vec%20is%20a%20popular%20algorithm,a%20large%20corpus%20of%20text
- Word2vec is not a single algorithm but a combination of two techniques – CBOW(Continuous bag of words) and Skip-gram model.

In [None]:
from gensim.models import Word2Vec

In [135]:
class w2vVectorizer():
    """Pipeline-compatible transformer that embeds a sentence as the mean of its Word2Vec vectors.

    `fit` trains a gensim Word2Vec model on the whitespace-tokenized training
    sentences; `transform` maps every sentence to the average of the vectors of
    its in-vocabulary words (an all-zero vector when none of its words are known).

    Parameters
    ----------
    vector_size, window, min_count, workers, seed :
        Forwarded unchanged to ``gensim.models.Word2Vec``. The defaults reproduce
        the values that were previously hard-coded. Per the gensim documentation,
        fully deterministic training additionally requires ``workers=1`` (and a
        fixed ``PYTHONHASHSEED``).
    """

    def __init__(self, vector_size=100, window=5, min_count=2, workers=4, seed=1) -> None:
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.seed = seed
        self.w2v_model = None  # set by fit()

    def w2v_vectorizer(self, sentence):
        """Return the mean word vector of `sentence` (zeros if no word is in the vocabulary).

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.w2v_model is None:
            raise RuntimeError("w2vVectorizer must be fitted before it can vectorize text")
        word_vectors = self.w2v_model.wv
        words_vec = [word_vectors[word] for word in sentence.split() if word in word_vectors]
        if len(words_vec) == 0:
            # Size the fallback from the trained vectors so it always matches the
            # real embedding width (also for instances restored from older pickles).
            return np.zeros(word_vectors.vector_size)
        return np.array(words_vec).mean(axis=0)

    def fit(self, x, y=None):
        """Train the Word2Vec model on the iterable of sentences `x`; `y` is ignored."""
        sentences = [sentence.split() for sentence in x]
        self.w2v_model = Word2Vec(
            sentences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            seed=self.seed,
        )
        return self

    def transform(self, x, y=None):
        """Return a 2-D array with one averaged sentence vector per item of `x`."""
        return np.array([self.w2v_vectorizer(sentence) for sentence in x])

## Glove

# Machine learning

In [10]:
# save model
def save_model(model, model_name):
    """Pickle `model` to ``models/<model_name>.pickle``.

    The target directory is created when missing, and the file is written
    inside a ``with`` block so the handle is flushed and closed even if
    pickling fails.

    Parameters
    ----------
    model : any picklable object (here: a fitted sklearn Pipeline).
    model_name : str
        File stem used for the pickle inside ``models/``.
    """
    import os  # local import: only this helper needs it

    filename = f"models/{model_name}.pickle"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "wb") as model_file:
        pickle.dump(model, model_file)

## Decision Tree

In [14]:
#convert given text to a vector base
from sklearn.tree import DecisionTreeClassifier

In [231]:
# Decision tree on raw token counts.
# random_state is fixed because the tree permutes candidate features at each
# split; without a seed the saved model and its scores change on every re-run.
model = Pipeline([('vect', CountVectorizer()),
               ('clf', DecisionTreeClassifier(random_state=42)),
              ])
model_name = "dtc"
model.fit(x_train, y_train)
save_model(model,model_name)

In [232]:
# Decision tree on TF-IDF weighted features.
# random_state is fixed because the tree permutes candidate features at each
# split; without a seed the saved model and its scores change on every re-run.
model = Pipeline([('vect', TfidfVectorizer()),
               ('clf', DecisionTreeClassifier(random_state=42)),
              ])
model_name = "dtc-tfid"
model.fit(x_train, y_train)
save_model(model,model_name)

In [233]:
# Decision tree on averaged Word2Vec sentence vectors.
# random_state fixes the tree's own randomness so re-runs are comparable
# (the Word2Vec training inside w2vVectorizer is a separate source of variation).
model = Pipeline([('vect', w2vVectorizer()),
               ('clf', DecisionTreeClassifier(random_state=42)),
              ])
model_name = "dtc-w2v"
model.fit(x_train, y_train)
save_model(model,model_name)

## Support Vector Machine

- Supervised learning algorithm
- Unlike neural networks, SVMs can work well with small datasets and are generally less prone to overfitting.

In [39]:
from sklearn.linear_model import SGDClassifier

In [234]:
# Linear SVM (hinge loss optimised with SGD) on raw token counts.
model_name = "svm"
model = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("clf", SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
save_model(model.fit(x_train, y_train), model_name)

In [235]:
# Linear SVM (hinge loss optimised with SGD) on TF-IDF weighted features.
model_name = "svm-tfid"
model = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
save_model(model.fit(x_train, y_train), model_name)

In [236]:
# Linear SVM (hinge loss optimised with SGD) on averaged Word2Vec sentence vectors.
model_name = "svm-w2v"
model = Pipeline(steps=[
    ("vect", w2vVectorizer()),
    ("clf", SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
save_model(model.fit(x_train, y_train), model_name)

## Logistic Regression 

In [46]:
from sklearn.linear_model import LogisticRegression

In [237]:
# Logistic regression on raw token counts (large C, i.e. very weak regularisation).
model_name = "lr"
model = Pipeline(steps=[
    ("vect", CountVectorizer()),
    ("clf", LogisticRegression(n_jobs=1, C=1e5, max_iter=6300)),
])
save_model(model.fit(x_train, y_train), model_name)

In [238]:
# Logistic regression on TF-IDF weighted features (large C, i.e. very weak regularisation).
model_name = "lr-tfid"
model = Pipeline(steps=[
    ("vect", TfidfVectorizer()),
    ("clf", LogisticRegression(n_jobs=1, C=1e5, max_iter=6300)),
])
save_model(model.fit(x_train, y_train), model_name)

In [239]:
# Logistic regression on averaged Word2Vec sentence vectors (large C, i.e. very weak regularisation).
model_name = "lr-w2v"
model = Pipeline(steps=[
    ("vect", w2vVectorizer()),
    ("clf", LogisticRegression(n_jobs=1, C=1e5, max_iter=6300)),
])
save_model(model.fit(x_train, y_train), model_name)

# Deep Neural network

In [274]:
from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.utils.data_utils import pad_sequences
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM

from keras.metrics import BinaryAccuracy,Precision,Recall

In [263]:
# Binary targets for the network: 1 for "Hate", 0 for every other label.
y_train_no = np.where(y_train == "Hate", 1, 0)
y_test_no = np.where(y_test == "Hate", 1, 0)

The embedding layer converts text into numeric form and is used as the first layer of the deep learning model.

In [264]:
# Build the word index from the training texts only, then encode both splits with it.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(x_train)

x_train_token, x_test_token = (
    word_tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

In [265]:
# Longest tokenized training sentence: its length, the position of its first
# occurrence, and the total number of sentences.
lengths = [len(tokens) for tokens in x_train_token]
count = len(lengths)
maxx = max(lengths, default=0)
index = lengths.index(maxx) if count else 0

print(maxx, index, count)

39 5201 6283


In [266]:
# +1 because the tokenizer's word indices start at 1; row 0 is left for padding.
vocab_length = 1 + len(word_tokenizer.word_index)
print(vocab_length)

# The longest training sentence found above has 39 tokens, so 50 leaves some headroom.
maxlen = 50

# Post-pad (or truncate) every sequence so all inputs have exactly `maxlen` tokens.
x_train_pad, x_test_pad = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (x_train_token, x_test_token)
)

14084


In [267]:
len(x_train)

6283

In [268]:
# glove embedding
# Each non-empty line of the file is "<word> <v1> <v2> ..."; build a word -> vector lookup.
embeddings_dic = dict()

# `with` guarantees the file is closed even if a malformed line raises while parsing.
with open("Dataset/glove_embedding.txt", encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Blank line (e.g. a trailing newline at EOF): nothing to parse.
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dic[word] = vector_dimensions

In [248]:
len(embeddings_dic['the'])

100

In [269]:
# One 100-dimensional row per tokenizer index. Words present in the GloVe
# lookup get their pretrained vector; all other rows stay zero.
embbedding_matrix = np.zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    if word in embeddings_dic:
        embbedding_matrix[index] = embeddings_dic[word]

embbedding_matrix.shape

## Simple Neural network

In [272]:
# Frozen pretrained embeddings -> flatten -> single sigmoid unit for the binary label.
embedding_layer = Embedding(
    vocab_length,
    100,
    weights=[embbedding_matrix],
    input_length=maxlen,
    trainable=False,
)

model = Sequential([
    embedding_layer,
    Flatten(),
    Dense(1, activation='sigmoid'),
])

In [275]:
# Metrics reported during training and evaluation, in this order after the loss.
METRICS = [
    metric_cls(name=label)
    for metric_cls, label in (
        (BinaryAccuracy, "accuracy"),
        (Precision, "precision"),
        (Recall, "recall"),
    )
]

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

In [276]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 100)           1408400   
                                                                 
 flatten (Flatten)           (None, 5000)              0         
                                                                 
 dense (Dense)               (None, 1)                 5001      
                                                                 
Total params: 1,413,401
Trainable params: 5,001
Non-trainable params: 1,408,400
_________________________________________________________________
None


In [279]:
model.fit(x_train_pad, y_train_no, batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1f9086d0210>

In [280]:
model.evaluate(x_test_pad, y_test_no)



[0.658920407295227, 0.6554028987884521, 0.5896057486534119, 0.5833333134651184]

In [289]:
# The single sigmoid unit yields predictions of shape (n_samples, 1).
# ndarray.flatten() returns a new array, so its result must be assigned;
# previously it was discarded and the labels stayed 2-D.
y_test_pred = model.predict(x_test_pad).flatten()

# Probabilities above 0.5 become class 1 (Hate), everything else class 0.
y_test_pred = np.where(y_test_pred > 0.5, 1, 0)




In [290]:
print(classification_report(y_test_no, y_test_pred))


              precision    recall  f1-score   support

           0       0.70      0.71      0.70      1565
           1       0.59      0.58      0.59      1128

    accuracy                           0.66      2693
   macro avg       0.65      0.65      0.65      2693
weighted avg       0.65      0.66      0.66      2693



## CNN/RNN

## BERT

- https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270
- https://www.youtube.com/watch?v=hOCDJyZ6quA
- tensorflow hub bert https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
- BERT converts a sentence into an embedding vector, which is then fed to a neural network for training
- It consists of a preprocessing step and an embedding (encoder) step
- (4)BERT-RNN: The corresponding representational word vectors were trained by BERT model for the input text, which were then classified by RNN neural network. (5)word2vec-RNN: This model is a traditional text classification model. 4.3.
- BERT is a neural-network-based technique for language processing pre-training
- it is not a classification algorithm 
- BERT generates <b>contextual embeddings</b>, the input to the model is a sentence rather than a single word.

# Results

In [None]:
# Predict data
# NOTE(review): when the notebook is run top to bottom, `model` was last bound to
# the Keras Sequential network, yet this cell calls `.score(x_test, y_test)` on raw
# text and labels — presumably it expects one of the sklearn pipelines trained
# earlier. Confirm which model is intended and bind it explicitly before this cell.
# NOTE(review): the classification report shown after this cell has support 2698,
# while the current test split has 2693 rows, so that output appears to come from
# an earlier run — re-execute to refresh it.
print("Test Data Accuracy  :\t", model.score(x_test, y_test))
y_test_pred = model.predict(x_test)

In [115]:
# Classification report
print(classification_report(y_test, y_test_pred, labels=["Hate","Non-Hate"]))

              precision    recall  f1-score   support

        Hate       0.69      0.67      0.68      1115
    Non-Hate       0.77      0.78      0.78      1583

    accuracy                           0.74      2698
   macro avg       0.73      0.73      0.73      2698
weighted avg       0.74      0.74      0.74      2698



In [None]:
# Heatmap
# Class order shared by the confusion matrix rows/columns and the tick labels.
class_labels = ["Hate","Non-Hate"]
ax = plt.subplot()

# Two-way confusion matrix: rows are actual classes, columns are predicted classes.
matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)
sb.heatmap(matrix, annot=True, fmt=".0f", annot_kws={"size": 18}, ax=ax)

ax.set(xlabel='Predicted', ylabel='Actual')
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels)

# Count
# Per-class totals of the true labels and of the predictions, for comparison.
df1 = pd.DataFrame({'Actual':y_test, 'Predict':y_test_pred})
for column in ('Actual', 'Predict'):
    print(f"Count: {df1[column].value_counts()}")
    print()

# Model compare

In [66]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def get_score(y_test, y_test_pred):
    """Return (accuracy, precision, recall, f1) for the given predictions.

    Precision, recall and F1 are the support-weighted averages over classes
    (``average="weighted"``).
    """
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_test_pred, average="weighted"
    )
    accuracy = accuracy_score(y_test, y_test_pred)
    return accuracy, precision, recall, f1

In [147]:
# load model
def get_result(model_names=None, x=None, y=None):
    """Score saved pipelines on a test set and return a styled comparison table.

    Parameters
    ----------
    model_names : list of str, optional
        File stems to load from ``models/<name>.pickle``. Defaults to the nine
        pipelines trained in this notebook.
    x, y : array-like, optional
        Texts and true labels to evaluate on. Default to the notebook-level
        ``x_test`` / ``y_test``.

    Returns
    -------
    pandas Styler
        One row per model with accuracy and support-weighted precision, recall
        and F1; the maximum of each column is highlighted in red.
    """
    if model_names is None:
        model_names = ["dtc", "dtc-tfid", "dtc-w2v","svm", "svm-tfid", "svm-w2v", "lr", "lr-tfid","lr-w2v"]
    if x is None:
        x = x_test
    if y is None:
        y = y_test

    c = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]
    rows = []

    for name in model_names:
        filename = f"models/{name}.pickle"
        # NOTE: unpickling can execute arbitrary code — only load files written by
        # this notebook's save_model, never pickles from an untrusted source.
        with open(filename, "rb") as model_file:
            old_model = pickle.load(model_file)

        y_test_pred = old_model.predict(x)

        a = accuracy_score(y, y_test_pred)
        prf = precision_recall_fscore_support(y, y_test_pred, average="weighted")

        rows.append([name, a, prf[0], prf[1], prf[2]])

    # Build the frame once instead of appending to it row by row.
    result_table = pd.DataFrame(rows, columns=c)

    return result_table.style.highlight_max(color = 'red', axis = 0)

In [149]:
get_result()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,dtc,0.74935,0.747932,0.74935,0.748249
1,dtc-tfid,0.74378,0.743531,0.74378,0.743649
2,dtc-w2v,0.586706,0.589898,0.586706,0.588024
3,svm,0.768659,0.767394,0.768659,0.766195
4,svm-tfid,0.746008,0.760112,0.746008,0.731861
5,svm-w2v,0.590048,0.611645,0.590048,0.461541
6,lr,0.684738,0.690064,0.684738,0.686326
7,lr-tfid,0.694393,0.697312,0.694393,0.695449
8,lr-w2v,0.708504,0.705758,0.708504,0.705809


In [241]:
get_result()

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,dtc,0.762347,0.762231,0.762347,0.762287
1,dtc-tfid,0.744523,0.744717,0.744523,0.744616
2,dtc-w2v,0.583364,0.585593,0.583364,0.584338
3,svm,0.773115,0.774587,0.773115,0.768277
4,svm-tfid,0.74378,0.767172,0.74378,0.725699
5,svm-w2v,0.597104,0.605277,0.597104,0.498157
6,lr,0.726328,0.729749,0.726328,0.727414
7,lr-tfid,0.725956,0.728042,0.725956,0.726728
8,lr-w2v,0.705904,0.703003,0.705904,0.700796
