# Naïve Bayes classifier with BOW representation

In [1]:
# Number of comments sampled for fitting the model and for held-out evaluation.
TRAIN_SIZE, TEST_SIZE = 5000, 1000

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
# Load the `en_core_web_sm` spaCy pipeline once, up front, so that later cells
# can reuse the same `nlp` object instead of reloading it.
nlp = spacy.load('en_core_web_sm')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

In [3]:
# Read only columns 1-7 of the CSV (column 0 is skipped); per the displayed
# frame these are the comment text followed by the label columns.
df = pd.read_csv(
    'data/train.csv',
    usecols=range(1, 8),
)
df

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
159566,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [4]:
# Draw random train and test subsets of the sizes set in the config cell.
x_train, x_test, y_train, y_test = train_test_split(
    df.loc[:, 'comment_text'],        # X: string array of size (n,)
    df.drop('comment_text', axis=1),  # Y: class labels, 2d-array of size (n, k_classes)
    random_state=42,
    test_size=TEST_SIZE,
    train_size=TRAIN_SIZE)

## Convert corpus to bag-of-words representation

* Remove stop words and non-alphabetic tokens to reduce the dictionary size
* Can't use the faster `HashingVectorizer`, since it doesn't support exporting a dictionary for vectorizing the validation/test set later

In [5]:
def tokenize(text):
    """Turn a raw document into a list of lemmas for the bag-of-words model.

    Stop words and tokens that are not purely alphabetic are discarded.

    Parameters
    ----------
    text : str
        The document to tokenize.

    Returns
    -------
    list of str
        The ``lemma_`` of every kept token, in document order.
    """
    # Only lemmas and the lexical flags is_stop / is_alpha are read below, so
    # skip the dependency parser and the named-entity recognizer for this call.
    # Component names missing from the pipeline are simply ignored.
    doc = nlp(text, disable=['parser', 'ner'])
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]

# `tokenize` replaces the built-in regex tokenizer, so the default
# `token_pattern` would never be used; setting it to None makes that explicit
# and avoids scikit-learn's "token_pattern will not be used" warning on fit.
# Text is lowercased before it is handed to `tokenize`.
vectorizer = CountVectorizer(lowercase=True, tokenizer=tokenize, token_pattern=None)

In [6]:
# Learn the vocabulary from the training comments and build their term-count
# matrix in one step; the bare name on the last line displays the result.
train_bow = vectorizer.fit_transform(raw_documents=x_train)
train_bow

<5000x17577 sparse matrix of type '<class 'numpy.int64'>'
	with 108908 stored elements in Compressed Sparse Row format>

In [7]:
# One multinomial Naive Bayes model per label column, fitted in parallel on all
# available cores. `fit` returns the estimator itself, so construction and
# fitting are chained and the fitted object is displayed on the last line.
classifier = OneVsRestClassifier(
    estimator=MultinomialNB(),
    n_jobs=-1,
).fit(train_bow, y_train)
classifier

OneVsRestClassifier(estimator=MultinomialNB(), n_jobs=-1)

### Evaluate the classifier

The metric for the classifier is the mean column-wise ROC AUC

In [8]:
# Encode the test comments with the vectorizer that was fitted on the training set.
test_bow = vectorizer.transform(x_test)

# Probability estimates for the training split first, then the test split.
y_train_pred, y_test_pred = (
    classifier.predict_proba(bow) for bow in (train_bow, test_bow)
)

In [9]:
# Score each split and report it straight away (train first, then test).
train_auc = roc_auc_score(y_train, y_train_pred)
print('Train ROC AUC:', train_auc)

test_auc = roc_auc_score(y_test, y_test_pred)
print('Test ROC AUC:', test_auc)

Train ROC AUC: 0.9341102932418798
Test ROC AUC: 0.7800159946707271
