In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

In [2]:
import gzip
import os
import gc

In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from scipy import sparse

In [4]:
# from nltk.tokenize import word_tokenize, 
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, log_loss

In [5]:
class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual=False, n_jobs=1):
        self.C = C
        self.dual = dual
        self.n_jobs = n_jobs

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))

    def predict_proba(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict_proba(x.multiply(self._r))

    def fit(self, x, y):
        # Check that X and y have correct shape
        try:
            y = y.values
        except AttributeError:
            pass
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y==y_i].sum(0)
            return (p+1) / ((y==y_i).sum()+1)

        self._r = sparse.csr_matrix(np.log(pr(x,1,y) / pr(x,0,y)))
        x_nb = x.multiply(self._r)
        self._clf = LogisticRegression(C=self.C, dual=self.dual, n_jobs=self.n_jobs).fit(x_nb, y)
        return self

In [6]:
hyperparam = {'sequence_len': 100,
              'embedding_dim': 300, 
              'filters': 200, 
              'kernel_size': 3,
              'dropout' : 0.5,
              'dense_units': 100,
              'batch_size': 512,
              'epochs': 1000,
              'steps_per_epochs': 15,
              'early_stopping': True,
              'vocab_size': None,
              'learning_rate' : 0.0005,
              'gradient_clip_value' : None,
              'gradient_clip_norm' : None,
              'validation_split': 0.1,
              'missing_word_vectors': 'normal',
              'conv_activation': 'relu', 
              'dense_activation':'relu',
              'n_class': 6}

In [7]:
train = pd.read_csv('./data/train.csv')

In [8]:
train_text = train['comment_text'].astype('str')

In [9]:
test = pd.read_csv('./data/test.csv')

In [10]:
test_text = test['comment_text'].astype('str').values

In [11]:
count_vectorizer = TfidfVectorizer(max_features=None, 
                                   stop_words=set(stopwords.words('english')),
                                   ngram_range=(1,2),
                                   dtype=np.uint32)

In [12]:
count_vectorizer.fit(train_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.uint32'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'until', 'out', 'how', 'here', 'who', 'him', 'he', 'these', 'ma', 'mustn', 'no', 'when', 'm', 'am', 'was', 'why', 'themselves', 've', 'with', 'were', 'so', 'few', 'being', 'won', 'during', 'if', 'isn', 'their', 'll', 'your', 't', 'me', 'be', 'whom', 'wasn', 'about', 'myself', 'its', 'onl...y', 'wouldn', 'been', 'too', 'any', 'yours', 'herself', 'against', 'shouldn', 'our', 'didn', 'very'},
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [13]:
X_train = count_vectorizer.transform(train_text)

In [14]:
X_test = count_vectorizer.transform(test_text)

In [15]:
X_train

<159571x2969685 sparse matrix of type '<class 'numpy.float64'>'
	with 9608487 stored elements in Compressed Sparse Row format>

In [16]:
y_train = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

In [30]:
y_score = []
for i in tqdm_notebook(range(6), total=6):
    model = NbSvmClassifier(n_jobs=6)
    model.fit(x=X_train, y=y_train[:, i])
    y_score.append(model.predict_proba(x=X_test)[:, 1])

A Jupyter Widget

  " = {}.".format(self.n_jobs))





In [32]:
col_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [42]:
for i, name in enumerate(col_names):
    test[name] = y_score[i]

In [47]:
del test['comment_text']

In [49]:
test.to_csv('NBSVM_Bigrams.csv', index=False)

In [50]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=hyperparam['validation_split'], random_state=22)

In [51]:
loss = []
for i in range(6):
    model = NbSvmClassifier(n_jobs=6)
    model.fit(x=X_train, y=y_train[:, i])
    y_score = model.predict_proba(x=X_val)
    loss.append(log_loss(y_true=y_val[:, i], y_pred=y_score))
    print(loss[i])

  " = {}.".format(self.n_jobs))


0.12585622691808185
0.03532838193721931
0.06892130793827003
0.010981361470498557
0.08932669921187385
0.035168523596730214


In [52]:
sum(loss)/6

0.06093041684544564