In [41]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd # to work with csv files

# matplotlib imports are used to plot confusion matrices for the classifiers
import matplotlib as mpl 
import matplotlib.cm as cm 
import matplotlib.pyplot as plt 
# Pretty display for notebooks
%matplotlib inline

# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import _stop_words

# pre-processing of text
import string
import re

# import classifiers from sklearn
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# import different metrics to evaluate the classifiers
from sklearn.metrics import accuracy_score

# from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn import metrics
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

# import time function from time module to track the training duration
from time import time

In [22]:
trainData = pd.read_csv("./data/train.csv")

In [23]:
# Sum of Classes
np.array(trainData.describe().iloc[1].tolist()) * 159571
# Mean of CLasses
trainData.describe().iloc[1]

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
Name: mean, dtype: float64

In [24]:
stopWords = stopwords.words('english')

In [53]:
trainX = trainData["comment_text"]
trainY = trainData[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [54]:
# trainX = trainX.apply(lambda x: ' '.join([word for word in x.split() if word not in (stopWords)]))
def clean(text):
    text = text.lower()
    # Remove website links
    template = re.compile(r'https?://\S+|www\.\S+') 
    text = template.sub(r'', text)
    
    # Remove HTML tags
    template = re.compile(r'<[^>]*>') 
    text = template.sub(r'', text)
    
    # Remove none ascii characters
    template = re.compile(r'[^\x00-\x7E]+') 
    text = template.sub(r'', text)
    
    # Replace none printable characters
    template = re.compile(r'[\x00-\x0F]+') 
    text = template.sub(r' ', text)

    # Remove special characters
    text = re.sub("'s", '', text)
    template = re.compile('["#$%&\'()\*\+-/:;<=>@\[\]\\\\^_`{|}~]') 
    text = template.sub(r' ', text)
    # Replace multiple punctuation 
    text = re.sub('[.!?]{2,}', '.', text)
    text = re.sub(',+', ',', text) 
    # Remove numbers
    text = re.sub('\d+', ' ', text) 
        
    # Remove extra spaces
    text = re.sub('\s+', ' ', text)
    
    # Remove spaces at the beginning and at the end of string
    text = text.strip()

    # Remove stop words from string
    text = " ".join([token for token in text.split() if token not in stopWords])
    return text


In [55]:
xTrain, xVal, yTrain, yVal = train_test_split(trainX, trainY, random_state = 1)
print(xTrain.shape, yTrain.shape)
print(xVal.shape, yVal.shape)

(119678,) (119678, 6)
(39893,) (39893, 6)


In [56]:
# Step 2-3: Preprocess and Vectorize train and test data
vect = CountVectorizer(preprocessor=clean) # instantiate a vectoriezer
xTrainDTM = vect.fit_transform(xTrain)# use it to extract features from training data
# transform testing data (using training data's features)
xValDTM = vect.transform(xVal)
print(xTrainDTM.shape, xValDTM.shape)

(119678, 139621) (39893, 139621)


In [None]:
dtree_model = DecisionTreeClassifier(max_depth = 2).fit(xTrainDTM, yTrain)
dtree_predictions = dtree_model.predict(xValDTM)

In [58]:
# cm = confusion_matrix(yVal, dtree_predictions)
a12 = RandomForestClassifier()
model1 = MultiOutputClassifier(estimator= a12)
model1.fit(xTrainDTM, yTrain)

MultiOutputClassifier(estimator=RandomForestClassifier())

In [59]:
predictionRandomForestVal = model1.predict(xValDTM)

In [60]:
from sklearn.metrics import accuracy_score, classification_report
list3 = yTrain.columns
print("Training Results")
for i in range(len(list3)):
    acc1 = accuracy_score(yVal.iloc[:, i], predictionRandomForestVal[:,i])
    print(list3[i]+ "==>", acc1)

Training Results
toxic==> 0.9534504800340912
severe_toxic==> 0.9900734464693054
obscene==> 0.97668763943549
threat==> 0.9965908806056201
insult==> 0.9694933948311734
identity_hate==> 0.9914020003509387


# TEST DATA

In [61]:
testData = pd.read_csv("./data/test.csv")
testX = testData["comment_text"]

In [62]:
testXDTM = vect.transform(testX)
print(testXDTM.shape)

(153164, 139621)


In [63]:
predictionRandomForestTest = model1.predict(testXDTM)