In [1]:
import numpy as np
import pandas
from nltk import download
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from core.classifier import Classifier, Stemmer

In [3]:
# Fetch the NLTK data packages 'stopwords' and 'punkt'. NLTK logs the status of
# each download and reports "already up-to-date" when the package is present.
# NOTE(review): presumably these are needed by the Stemmer tokenizer from
# core.classifier -- confirm against that module.
download('stopwords')
# The return value of this last call is what the cell displays as its output.
download('punkt')

[nltk_data] Downloading package stopwords to /home/valex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/valex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read ./data/test.csv into a DataFrame and print its column/dtype summary.
test = pandas.read_csv(filepath_or_buffer='./data/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31911 entries, 0 to 31910
Data columns (total 2 columns):
comment_text    31911 non-null object
target          31911 non-null int64
dtypes: int64(1), object(1)
memory usage: 498.7+ KB


In [5]:
# Read ./data/train.csv into a DataFrame and print its column/dtype summary.
# The 'comment_text' and 'target' columns feed the grid searches below.
train = pandas.read_csv(filepath_or_buffer='./data/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127660 entries, 0 to 127659
Data columns (total 2 columns):
comment_text    127660 non-null object
target          127660 non-null int64
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [6]:
# Naive Bayes candidate: token counts -> TF-IDF weighting -> MultinomialNB.
# Stemmer is the project tokenizer class from core.classifier; one instance is
# handed to CountVectorizer as its tokenizer.
nb_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=Stemmer())),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyper-parameters to search; keys follow '<step name>__<parameter>'.
nb_parameters_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without inverse-document-frequency weighting
    'clf__alpha': (1e-2, 1e-3),             # additive smoothing parameter of MultinomialNB
}

# Cross-validated search over every grid combination, using all CPU cores
# (n_jobs=-1). fit() returns the fitted search object, so construction and
# fitting are chained into a single binding.
nb_model = GridSearchCV(nb_pipeline, nb_parameters_grid, n_jobs=-1).fit(
    train['comment_text'], train['target']
)



In [7]:
# Show the CountVectorizer step ('vect', the first pipeline step) of the best
# Naive Bayes pipeline found by the grid search.
nb_model.best_estimator_.named_steps['vect']

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<core.classifier.Stemmer object at 0x7f1925ed3160>,
        vocabulary=None)

In [8]:
# Linear SVM candidate: token counts -> TF-IDF weighting -> SGD-trained
# classifier with hinge loss and L2 penalty.
# Stemmer is the project tokenizer class from core.classifier.
svm_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=Stemmer())),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,       # starting value only; the grid below overrides it
        # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
        # max_iter=5 with tol=None reproduces it: exactly 5 passes over the
        # data, with no tolerance-based early stopping.
        max_iter=5,
        tol=None,
        random_state=42,  # fixed seed so the shuffling is reproducible
    )),
])

# Hyper-parameters to search; keys follow '<step name>__<parameter>'.
svm_parameters_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without inverse-document-frequency weighting
    'clf-svm__alpha': (1e-2, 1e-3),         # multiplier of the regularization term
}

# Cross-validated search over every grid combination, using all CPU cores
# (n_jobs=-1). fit() returns the fitted search object, so construction and
# fitting are chained into a single binding.
svm_model = GridSearchCV(svm_pipeline, svm_parameters_grid, n_jobs=-1).fit(
    train['comment_text'], train['target']
)



In [9]:
# Pick the search with the higher best_score_ (mean cross-validated score of
# its best parameter setting). Naive Bayes wins only when strictly better, so
# ties go to the SVM pipeline.
best_model = (
    nb_model if nb_model.best_score_ > svm_model.best_score_ else svm_model
).best_estimator_

# Wrap the winning pipeline in the project's Classifier and call its dump()
# with the target path.
# NOTE(review): presumably dump() serializes the model to that file -- confirm
# in core.classifier.
# NOTE(review): the `test` frame loaded earlier is not used for a held-out
# check in the cells shown here.
dumpable_model = Classifier(best_model)
dumpable_model.dump('./data/model.pk')

In [10]:
# Display the repr of the selected pipeline (its steps and their parameters).
best_model

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip...near_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))])