In [1]:
import pandas as pd 
import json
import math
import random
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
#import json datafile and convert to dataframe
file = './Data_src/HateXplain.json'
with open(file) as train_file:
    dict_train = json.load(train_file)

train = pd.DataFrame.from_dict(dict_train, orient='index')
train.reset_index(level=0, inplace=True)

In [3]:
#pick a random annotator's label

def annotatorSelect(i):

    annontators = train["annotators"][i]
    annotator = annontators[math.floor(random.uniform(0, 1)*len(annontators))]
    
    return  0 if annotator['label'] == "normal" else 1 if annotator['label'] == "offensive" else 2

labels_list = []
for i in range(len(train)):
    labels_list.append(annotatorSelect(i))
    
labels = np.array(labels_list)

In [4]:
#convert tokenized text to string
def toString(tokens_list):
    return [' '.join(tokens) for tokens in tokens_list]

text = train['post_tokens'].to_numpy()
text_ = toString(text)

In [6]:
#load an nlp model from SpaCy
nlp = spacy.load("en_core_web_md")

In [7]:
#vectorize the text using word2vec
textVect = []

for string in text_:
     textVect.append(nlp(string).vector)

textVect = np.array(textVect)

In [8]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(textVect, labels, random_state = 228, test_size=0.2)

In [9]:
#model training

import threading
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

def TrainNN():
    nn = MLPClassifier(max_iter = 300, hidden_layer_sizes=[100, 100, 100, 100, 100])
    nn.fit(X_train, y_train)

    y_pred_nn = nn.predict(X_test)
    report_nn = classification_report(y_test, y_pred_nn)
    accuracy_nn = accuracy_score(y_test, y_pred_nn)
    #cm = confusion_matrix(y_test, y_pred)
    print('Report for MLP:', report_nn)
    print('Accuracy for MLP:', accuracy_nn)
    #sns.heatmap(cm, annot=True, fmt='d')

def TrainGB():
    clf = GradientBoostingClassifier(verbose=False)
    clf.fit(X_train, y_train)

    y_pred_gb = clf.predict(X_test)
    report_gb = classification_report(y_test, y_pred_gb)
    accuracy_gb = accuracy_score(y_test, y_pred_gb)
    #cm = confusion_matrix(y_test, y_pred)
    print('Report for Gradient Booster:', report_gb)
    print('Accuracy for Gradient Booster:', accuracy_gb)
    #sns.heatmap(cm, annot=True, fmt='d')

def TrainSVM():
    SVM = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
    SVM.fit(X_train, y_train)

    y_pred_svm = SVM.predict(X_test)
    report_svm = classification_report(y_test, y_pred_svm)
    accuracy_svm = accuracy_score(y_test, y_pred_svm)
    #cm = confusion_matrix(y_test, y_pred)
    print('Report for Gradient Booster:', report_svm)
    print('Accuracy for Gradient Booster:', accuracy_svm)

    #sns.heatmap(cm, annot=True, fmt='d')

t1 = threading.Thread(target=TrainNN, args=())
t2 = threading.Thread(target=TrainGB, args=())
t3 = threading.Thread(target=TrainSVM, args=())

t1.start()
t2.start()
t3.start()


t1.join()
t2.join()
t3.join()


Report for MLP:               precision    recall  f1-score   support

           0       0.52      0.58      0.55      1587
           1       0.38      0.33      0.35      1238
           2       0.50      0.48      0.49      1205

    accuracy                           0.48      4030
   macro avg       0.46      0.47      0.46      4030
weighted avg       0.47      0.48      0.47      4030

Accuracy for MLP: 0.47543424317617866
Report for Gradient Booster:               precision    recall  f1-score   support

           0       0.51      0.73      0.60      1587
           1       0.46      0.23      0.30      1238
           2       0.53      0.51      0.52      1205

    accuracy                           0.51      4030
   macro avg       0.50      0.49      0.47      4030
weighted avg       0.50      0.51      0.49      4030

Accuracy for Gradient Booster: 0.509181141439206
Report for Gradient Booster:               precision    recall  f1-score   support

           0       0.5