#### Import required libraries

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from scipy import sparse

from common import tokenize, tokenize_with_stopw, tokenize_and_lemma, eval_pred

#### Read Data

In [2]:
# Load the training split, the test comments, and the test labels,
# which are stored in a separate CSV from the test comments.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')
test_y = pd.read_csv('../data/test_labels.csv')
# Attach every column of test_y except its first one to test_df, side by side.
# NOTE(review): concat with axis=1 aligns on the row index, not on an id
# column, so this assumes both test files list their rows in the same
# order - confirm. The skipped first column is presumably the row id.
test_df = pd.concat([test_df, test_y.iloc[:,1:]], axis=1, sort=False)
print(train_df.shape, test_df.shape)

(159571, 8) (153164, 8)


#### Initialization

In [3]:
# Evaluation results keyed by experiment name
scores_tracker = {}
non_toxic_label = 'non_toxic'
comment_col = 'comment_text'

# Label columns are everything after the first two columns of train_df.
# The derived non-toxic column is filtered out explicitly so that re-running
# this cell after that column has been appended to train_df still yields
# only the original labels (otherwise it would leak into class_labels).
class_labels = [col for col in train_df.columns.tolist()[2:]
                if col != non_toxic_label]
class_labels

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [4]:
# Create non-toxic class and fillna
# The new column is 1 minus the row-wise maximum over the label columns,
# i.e. 1 only for rows where no label is set (assuming labels are 0/1).
train_df[non_toxic_label] = 1 - train_df[class_labels].max(axis=1)
# Replace missing comment text with the placeholder string 'unknown'
# in both the train and the test frame.
train_df[comment_col] = train_df[comment_col].fillna('unknown')
test_df[comment_col] = test_df[comment_col].fillna('unknown')

#### NB-SVM Model
- Reference paper: https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf
- Reference implementation: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline
- Quoted from the reference implementation
        '''we use sklearn's logistic regression, rather than SVM,
        although in practice the two are nearly identical
        (sklearn uses the liblinear library behind the scenes)'''

##### Functions

In [5]:
def get_mdl(x, y, **kwargs):
    '''
    Fit one binary NB-SVM style classifier: a logistic regression trained
    on features scaled by the Naive Bayes log-count ratio.

    Parameters
    ----------
    x: sparse matrix
        vectorized train data, one row per sample
    y: pandas Series or 1-D array-like
        binary (0/1) train labels, one entry per row of x
    kwargs:
        parameters passed to LogisticRegression

    Returns
    ----------
    m: fitted LogisticRegression model
    r: sparse matrix of type float
        log of
        (probability of x given y=1) / (probability of x given y=0)
    '''

    def pr(x, y_i, y):
        '''Compute Naive Bayes probability output'''
        # Column sums over the rows of class y_i, with add-one smoothing on
        # both the feature totals and the class size.
        p = x[y==y_i].sum(0)
        return (p+1) / ((y==y_i).sum()+1)

    # np.asarray gives the same ndarray as Series.values for a pandas
    # Series, and additionally accepts plain lists / ndarrays.
    y = np.asarray(y)
    r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
    m = LogisticRegression(**kwargs)
    # Scale every feature column by its log-count ratio before fitting
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

@ignore_warnings(category=ConvergenceWarning)
def get_pred_nbsvm(train_y, train_x,
                   test_x,
                   class_labels=class_labels,
                   **kwargs):
    '''
    Fit one model per label and collect its test-set probabilities.

    Returns an array with one row per row of test_x and one column per
    entry of class_labels; column j holds the second column of
    predict_proba for the model fitted on class_labels[j].
    Extra keyword arguments are forwarded to get_mdl.
    '''
    n_samples = test_x.shape[0]
    n_labels = len(class_labels)
    proba = np.zeros((n_samples, n_labels))

    for col, target in enumerate(class_labels):
        print('fit', target)
        fitted, log_ratio = get_mdl(train_x, train_y[target], **kwargs)
        # The test features get the same per-feature scaling as the
        # training features did inside get_mdl.
        scaled_test = test_x.multiply(log_ratio)
        proba[:, col] = fitted.predict_proba(scaled_test)[:, 1]

    return proba

def run_nbsvm(vectorizer,
              train_df, test_df,
              comment_col=comment_col,
              class_labels=class_labels,
              **kwargs):
    '''
    Vectorize, fit, predict and evaluate in one pass.

    The vectorizer is fitted on the train comments and then applied to the
    test comments; predictions come from get_pred_nbsvm (which receives
    **kwargs) and the value returned by eval_pred is handed back.
    '''
    # Text -> feature matrices (vocabulary comes from the train split only)
    x_train = vectorizer.fit_transform(train_df[comment_col])
    x_test = vectorizer.transform(test_df[comment_col])

    # Label columns for fitting and for scoring
    y_train = train_df[class_labels]
    y_true = test_df[class_labels]

    predictions = get_pred_nbsvm(y_train, x_train,
                                 x_test,
                                 class_labels,
                                 **kwargs)
    return eval_pred(y_true, predictions, class_labels)

##### RUN

In [6]:
# Define vectorizers
# Settings common to all four: unigrams + bigrams, drop terms seen in fewer
# than 3 documents or in more than 90% of them, strip accents.
shared_vec_params = dict(ngram_range=(1,2), min_df=3, max_df=0.9,
                         strip_accents='unicode')

# Raw counts with the plain tokenizer
cntvec = CountVectorizer(tokenizer=tokenize, **shared_vec_params)

# TF-IDF variants differ only in the tokenizer they use
tfidf = TfidfVectorizer(tokenizer=tokenize,
                        sublinear_tf=True, **shared_vec_params)
tfidf_stopw = TfidfVectorizer(tokenizer=tokenize_with_stopw,
                              sublinear_tf=True, **shared_vec_params)
tfidf_lemma = TfidfVectorizer(tokenizer=tokenize_and_lemma,
                              sublinear_tf=True, **shared_vec_params)

##### 1.1 CountVectorizer

In [8]:
%%time
# Fit, predict and get scores
# Experiment 1.1: count features from cntvec.
score_cntvec = run_nbsvm(
    cntvec, train_df, test_df,
    comment_col=comment_col,
    class_labels=class_labels,
    # The remaining keywords travel through **kwargs to LogisticRegression
    C=4, dual=False,
    max_iter=200,
    random_state=123, n_jobs=-1)

# Keep the returned score under this experiment's key
scores_tracker['nbsvm_cntvec'] = score_cntvec

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Mean ROC-AUC: 0.9421376159074196
Wall time: 4min 25s


##### 1.2 TF-IDF

In [9]:
%%time
# Fit, predict and get scores
# Experiment 1.2: TF-IDF features from tfidf (plain tokenizer).
score_tfidf = run_nbsvm(
    tfidf, train_df, test_df,
    comment_col=comment_col,
    class_labels=class_labels,
    # The remaining keywords travel through **kwargs to LogisticRegression
    C=4, dual=False,
    max_iter=200,
    random_state=123, n_jobs=-1)
# Keep the returned score under this experiment's key
scores_tracker['nbsvm_tfidf'] = score_tfidf

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Mean ROC-AUC: 0.9762957516442285
Wall time: 2min 56s


##### 1.3 TF-IDF with stopwords removal

In [13]:
%%time
# Fit, predict and get scores
# Experiment 1.3: TF-IDF features from tfidf_stopw (tokenize_with_stopw tokenizer).
score_tfidf_stopw = run_nbsvm(
    tfidf_stopw, train_df, test_df,
    comment_col=comment_col,
    class_labels=class_labels,
    # The remaining keywords travel through **kwargs to LogisticRegression
    C=4, dual=False,
    max_iter=200,
    random_state=123, n_jobs=-1)
# Keep the returned score under this experiment's key
scores_tracker['nbsvm_tfidf_stopw'] = score_tfidf_stopw

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Mean ROC-AUC: 0.9737348660544357
Wall time: 2min 33s


##### 1.4 TF-IDF with lemmatization

In [14]:
%%time
# Fit, predict and get scores
# Experiment 1.4: TF-IDF features from tfidf_lemma (tokenize_and_lemma tokenizer).
score_tfidf_lemma = run_nbsvm(
    tfidf_lemma, train_df, test_df,
    comment_col=comment_col,
    class_labels=class_labels,
    # The remaining keywords travel through **kwargs to LogisticRegression
    C=4, dual=False,
    # NOTE(review): max_iter is 50 here but 200 in the other three runs, and
    # get_pred_nbsvm suppresses ConvergenceWarning, so a non-converged fit
    # would go unnoticed - confirm the lower limit is intentional.
    max_iter=50,
    random_state=123, n_jobs=-1)
# Keep the returned score under this experiment's key
scores_tracker['nbsvm_tfidf_lemma'] = score_tfidf_lemma

fit toxic
fit severe_toxic
fit obscene
fit threat
fit insult
fit identity_hate
Mean ROC-AUC: 0.9772775631141992
Wall time: 2min 58s
