In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score

In [2]:
df = pd.read_excel("toxicity.xlsx")
df = df.iloc[:, 1:]

In [3]:
df["label"].unique()

array(['Offensive', 'Very offensive', 'Neutral', 'Profanity',
       'Extremely offensive', 'Unknown', 'Hate speech'], dtype=object)

In [4]:
neutral_df = df[df["label"] == "Neutral"]
neutral_size = neutral_df.shape[0]
neutral_size

814

In [5]:
toxic_df = df[df["label"].isin(['Offensive', 'Very offensive', 'Profanity', 'Extremely offensive', 'Hate speech'])].sample(neutral_size)
toxic_df["label"] = "Toxic"

In [6]:
two_class_df = pd.concat([neutral_df, toxic_df])
two_class_df

Unnamed: 0,flirtation,identity_attack,insult,severe_toxicity,sexually_explicit,threat,label
3,0.503426,0.407557,0.796685,0.854638,0.955973,0.343336,Neutral
15,0.447052,0.242548,0.605072,0.718969,0.381421,0.926245,Neutral
18,0.234309,0.327410,0.411611,0.755813,0.067329,0.980107,Neutral
21,0.480975,0.495559,0.758837,0.747318,0.652246,0.415748,Neutral
42,0.385286,0.268339,0.656422,0.747318,0.200259,0.926273,Neutral
...,...,...,...,...,...,...,...
6741,0.588390,0.277701,0.924737,0.753536,0.391835,0.344125,Toxic
9242,0.655491,0.609066,0.877252,0.886901,0.902304,0.471083,Toxic
1139,0.510649,0.138321,0.702443,0.866124,0.799199,0.292326,Toxic
428,0.570768,0.528850,0.909464,0.888702,0.838369,0.319839,Toxic


In [7]:
X = two_class_df.iloc[:, :-1]
y = two_class_df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [8]:
from sklearn.svm import SVC
svm = SVC()
clf = RandomizedSearchCV(svm, {
     'C': [1, 10], 
     'kernel': ( 'rbf', 'poly')
     },
     n_jobs=-1)
search = clf.fit(X_train, y_train)
search.best_params_

{'kernel': 'rbf', 'C': 10}

In [9]:
svm = SVC(kernel='rbf', C=10)
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))
y_pred = svm.predict(X)
print('Precision of SVM classifier: {:.2f}'
     .format(precision_score(y, y_pred, average='macro')))
print('Recall of SVM classifier: {:.2f}'
     .format(recall_score(y, y_pred, average='macro')))

Accuracy of SVM classifier on training set: 0.89
Accuracy of SVM classifier on test set: 0.83
Precision of SVM classifier: 0.88
Recall of SVM classifier: 0.88


In [10]:
from sklearn.neural_network import MLPClassifier
NN = MLPClassifier()
clf = RandomizedSearchCV(NN, {
    'activation': ['tanh', 'relu', 'logistic'],
    'solver': ['sgd', 'adam', 'lbfgs'],
    'alpha': [0.0001, 0.9],
    'learning_rate': ['constant','adaptive']
    },
    n_jobs=-1)
search = clf.fit(X_train, y_train)
search.best_params_

{'solver': 'lbfgs',
 'learning_rate': 'constant',
 'alpha': 0.0001,
 'activation': 'relu'}

In [11]:
NN = MLPClassifier(activation='relu',solver='lbfgs', alpha=0.0001, learning_rate='constant')
NN.fit(X_train, y_train)
print('Accuracy of NN classifier on training set: {:.2f}'
     .format(NN.score(X_train, y_train)))
print('Accuracy of NN classifier on test set: {:.2f}'
     .format(NN.score(X_test, y_test)))
y_pred = NN.predict(X)
print('Precision of NN classifier: {:.2f}'
     .format(precision_score(y, y_pred, average='macro')))
print('Recall of NN classifier: {:.2f}'
     .format(recall_score(y, y_pred, average='macro')))

Accuracy of NN classifier on training set: 0.88
Accuracy of NN classifier on test set: 0.83
Precision of SVM classifier: 0.87
Recall of SVM classifier: 0.87


In [12]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
clf = RandomizedSearchCV(RF, { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
    },
    n_jobs=-1)
search = clf.fit(X_train, y_train)
search.best_params_

{'n_estimators': 700, 'max_features': 'auto'}

In [13]:
RF = RandomForestClassifier(n_estimators=700, max_features='auto')
RF.fit(X_train, y_train)
print('Accuracy of RF classifier on training set: {:.2f}'
     .format(RF.score(X_train, y_train)))
print('Accuracy of RF classifier on test set: {:.2f}'
     .format(RF.score(X_test, y_test)))
y_pred = RF.predict(X)
print('Precision of RF classifier: {:.2f}'
     .format(precision_score(y, y_pred, average='macro')))
print('Recall of RF classifier: {:.2f}'
     .format(recall_score(y, y_pred, average='macro')))

Accuracy of RF classifier on training set: 1.00
Accuracy of RF classifier on test set: 0.83
Precision of SVM classifier: 0.96
Recall of SVM classifier: 0.96


The accuracy of the different models is about the same on the test set, but the precision, `tp / (tp + fp)`, and recall, `tp / (tp + fn)`, of the Random Forest classifier are much better than the other 2 models, which is important given the context of this model (possibly punishing players due to their toxic behaviour), so my selection is going to be a Random Forest.

In [None]:
RF = RandomForestClassifier(n_estimators=700, max_features='log2')
RF.fit(X, y)