# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from nltk.corpus import stopwords
# from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string

# Label Classes

In [12]:
stop = stopwords.words('english')
df = pd.read_csv("dataset/train.csv")
df2 = pd.read_csv("dataset/test.csv")
toxic=df["toxic"].copy()
severe_toxic=df["severe_toxic"].copy()
obscene=df["obscene"].copy()
threat=df["threat"].copy()
insult=df["insult"].copy()
identity_hate=df["identity_hate"].copy()

# Cleaning and preparing

In [13]:
#stemmer = LancasterStemmer()

#remove alphanumeric
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)

# '[%s]' % re.escape(string.punctuation),' ' - replace punctuation with white space
# .lower() - convert all strings to lowercase 
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

# Remove all '\n' in the string and replace it with a space
remove_n = lambda x: re.sub("\n", " ", x)

# Remove all non-ascii characters 
remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]',r' ', x)

# Apply all the lambda functions wrote previously through .map on the comments column
df["comment_text"] = df["comment_text"].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)

#stopwords
df["comment_text"] = df["comment_text"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [41]:
x = df["comment_text"].apply(lambda w:w.split())

In [26]:
lemmatizer=WordNetLemmatizer()
for i in x:
    for word in i:
        word=lemmatizer.lemmatize(word)

In [90]:
y_multilabel = np.c_[toxic, severe_toxic, obscene, threat, insult, identity_hate]
y_multilabel

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)

# Text processing

In [43]:
#tfidf
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix,precision_score,precision_recall_curve,recall_score,f1_score,classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split,cross_val_score
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=100000,max_df=0.99,min_df=0.01)
tfidf_vect.fit(df["comment_text"])

In [91]:
train_x, test_x, train_y, test_y = train_test_split(df['comment_text'],y_multilabel)

In [92]:
xtrain_tfidf =  tfidf_vect.transform(train_x)
xtest_tfidf =  tfidf_vect.transform(test_x)

# Model Training

In [114]:
def train_model(classifier, X_train, Y_train, X_test):
    classifier.fit(X_train,Y_train)
    predictions=classifier.predict(X_test)
    return predictions
# knn_clf = KNeighborsClassifier()
# knn_clf.fit(xtrain_tfidf, train_y)

In [117]:
def accuracy(predictions, test_y):
    allcorr=0
    for i in range(len(predictions)):
        corr=0
        for j in range(6):
            if predictions[i][j] == test_y[i][j]:
                corr=corr+1
        if corr==6:
            allcorr=allcorr+1
    print(allcorr/len(predictions))

# K Neighbours

In [115]:
knn_clf = KNeighborsClassifier()
knn_pred = train_model(knn_clf, xtrain_tfidf, train_y, xtest_tfidf)

In [118]:
knn_accuracy=accuracy(knn_pred, test_y)

0.8579199358283408


# Random Forest

In [119]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier()
forest_pred=train_model(forest_clf, xtrain_tfidf, train_y, xtest_tfidf)

In [120]:
forest_accuracy=accuracy(forest_pred,test_y)

0.900734464693054


# Evaluation

In [121]:
#cross_val_score
cross_val_score(forest_clf, xtrain_tfidf, train_y, cv=3, scoring="accuracy")

array([0.90451959, 0.90429399, 0.90286273])

In [124]:
print(precision_score(train_y, y_train_pred, average="weighted"))
print(recall_score(train_y, y_train_pred, average="weighted"))
print(f1_score(train_y, y_train_pred, average="weighted"))

0.7530765095646093
0.30299437837010973
0.42898422148254517


 model seems to be skewed

# 6 binary classes for each label instead of 1 Multilabel

toxic

In [66]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(xtrain_tfidf, train_y)
knn_clf.predict(xtest_tfidf)

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [8]:
#metrics and more algos

In [69]:
np.array(test_y)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [71]:
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y))/len(test_y)

0.890106033639987

severe_toxic

In [73]:
train_x, test_x, train_y, test_y = train_test_split(df['comment_text'],severe_toxic)

In [74]:
knn_clf.fit(xtrain_tfidf, train_y)
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y))/len(test_y)

0.9896974406537488

obscene

In [75]:
train_x, test_x, train_y, test_y = train_test_split(df['comment_text'],obscene)

In [76]:
knn_clf.fit(xtrain_tfidf, train_y)
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y))/len(test_y)

0.9452786203093274

threat

In [77]:
train_x, test_x, train_y_threat, test_y_threat = train_test_split(df['comment_text'],threat)

In [78]:
knn_clf.fit(xtrain_tfidf, train_y_threat)
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y_threat))/len(test_y_threat)

0.9970671546386584

insult

In [79]:
train_x, test_x, train_y_insult, test_y_insult = train_test_split(df['comment_text'],insult)

In [80]:
knn_clf.fit(xtrain_tfidf, train_y_insult)
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y_insult))/len(test_y_insult)

0.9499912265309703

identity hate

In [81]:
train_x, test_x, train_y_idhate, test_y_idhate = train_test_split(df['comment_text'],identity_hate)

In [82]:
knn_clf.fit(xtrain_tfidf, train_y_idhate)
sum(knn_clf.predict(xtest_tfidf) == np.array(test_y_idhate))/len(test_y_idhate)

0.9913267991878274

In [86]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier()
forest_clf.fit(xtrain_tfidf, train_y)
sum(forest_clf.predict(xtest_tfidf) == np.array(test_y))/len(test_y)

0.9451783520918456

# Random Forest Binary

In [87]:
forest_clf.fit(xtrain_tfidf, train_y_idhate)
sum(forest_clf.predict(xtest_tfidf) == np.array(test_y_idhate))/len(test_y_idhate)

0.9908254581004187

# Exporting to predictions.csv

In [88]:
df2

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [128]:
X_valid = df2["comment_text"]
X_valid_tfidf =  tfidf_vect.transform(X_valid)
final_pred = forest_clf.predict(X_valid_tfidf)
print(final_pred)

[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [129]:
toxic_final = final_pred[:,0]
severe_toxic_final = final_pred[:,1]
obscene_final = final_pred[:,2]
threat_final = final_pred[:,3]
insult_final = final_pred[:,4]
identity_hate_final = final_pred[:,5]
#df.insert(location, column_name, list_of_values)
df2.insert(2, 'toxic', toxic_final)
df2.insert(3, 'severe_toxic', severe_toxic_final)
df2.insert(4, 'obscene', obscene_final)
df2.insert(5, 'threat', threat_final)
df2.insert(6, 'insult', insult_final)
df2.insert(7, 'identity_hate', identity_hate_final)

In [130]:
df.to_csv(r'predictions.csv', index = None, header=True)