# Creating a sentiment classification model with scikit-learn (EN)

## 1. Data retrieval

For this model, the NLTK movie reviews dataset will be used.

In [1]:
import nltk
from pathlib import Path
# Register ~/datasets as an additional NLTK data directory so corpora stored there can be found.
# The path is joined with pathlib's `/` operator instead of string formatting, so the
# separator is always the right one for the current platform.
nltk.data.path.append(str(Path.home() / "datasets"))
import numpy as np
import pandas as pd
from nltk.corpus import movie_reviews

In [2]:
# File identifiers of the reviews in each sentiment category.
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

# One word sequence per review file.
negfeats = [movie_reviews.words(fileids=[file_id]) for file_id in negids]
posfeats = [movie_reviews.words(fileids=[file_id]) for file_id in posids]

# Join every review's words with single spaces so that each sample is one string;
# negative reviews come first, positive reviews second.
X = [" ".join(words) for words in negfeats + posfeats]

# Labels in the same order as X: 0 for negative reviews, 1 for positive reviews.
y = [0] * len(negfeats) + [1] * len(posfeats)

In [3]:
# Share of positive labels (y holds 1 for positive and 0 for negative reviews);
# a value of 0.5 means both classes have the same number of samples.
sum(y)/len(y) # classes are balanced

0.5

## 2. Classical approach
This section uses the scikit-learn toolset to vectorize the text and build a classifier.

In [4]:
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.model_selection import cross_val_score

In [5]:
# Baseline model: TF-IDF features with NLTK's English stop-word list removed,
# followed by a gradient boosting classifier with default hyper-parameters.
english_stop_words = nltk.corpus.stopwords.words('english')
model = make_pipeline(
    TfidfVectorizer(stop_words=english_stop_words),
    GradientBoostingClassifier(),
)


In [6]:
# 5-fold cross-validation of the baseline pipeline. No `scoring` argument is passed, so the
# estimator's default score is reported (accuracy for scikit-learn classifiers), whereas the
# hyper-parameter search further down is scored with F1 — the two numbers are not directly comparable.
cross_val_score(model, X, y, cv=5)

array([0.79  , 0.8125, 0.775 , 0.7875, 0.8075])

In [7]:
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingRandomSearchCV

In [8]:
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# Two-step pipeline used as the search template; the "vec" and "clf" steps are
# replaced and re-parameterised by the values sampled from the search space below.
pipeline_ = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", GradientBoostingClassifier()),
])

# Search space for the vectorizer step: which vectorizer to use, the n-gram range
# (unigrams up to 5-grams), the minimum document frequency (drawn from randint(1, 10))
# and an optional cap on the vocabulary size.
vectorizers = dict(
    vec=[CountVectorizer(), TfidfVectorizer()],
    vec__ngram_range=[(1, upper) for upper in range(1, 6)],
    vec__min_df=randint(1, 10),
    vec__max_features=[None, 200, 500, 1000, 350, 1500],
)

# Search space for the classifier step: number of boosting stages and tree depth,
# both drawn from integer distributions.
classifiers = dict(
    clf=[GradientBoostingClassifier()],
    clf__n_estimators=randint(100, 500),
    clf__max_depth=randint(3, 20),
)

# The search receives a list holding one combined dictionary (vectorizer + classifier options).
param_distributions = [dict(vectorizers, **classifiers)]

In [9]:
%%time
# ~ несколько недель на 6 ядрах Ryzen5
hrscv_ = HalvingRandomSearchCV(
    estimator=pipeline_,
    param_distributions=param_distributions, 
    scoring="f1", 
    n_jobs=6, refit=True, cv=5, verbose=2)
hrscv_.fit(X, y)

n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 20
max_resources_: 2000
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 100
n_resources: 20
Fitting 5 folds for each of 100 candidates, totalling 500 fits
----------
iter: 1
n_candidates: 34
n_resources: 60
Fitting 5 folds for each of 34 candidates, totalling 170 fits
----------
iter: 2
n_candidates: 12
n_resources: 180
Fitting 5 folds for each of 12 candidates, totalling 60 fits
----------
iter: 3
n_candidates: 4
n_resources: 540
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 4
n_candidates: 2
n_resources: 1620
Fitting 5 folds for each of 2 candidates, totalling 10 fits
CPU times: user 50.1 s, sys: 1.34 s, total: 51.5 s
Wall time: 4min 33s


HalvingRandomSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                                ('clf',
                                                 GradientBoostingClassifier())]),
                      n_jobs=6,
                      param_distributions=[{'clf': [GradientBoostingClassifier(max_depth=4,
                                                                               n_estimators=326)],
                                            'clf__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f2415e95130>,
                                            'clf__n_estimators': <scipy.stats._distn_infrastructure.r...0a790>,
                                            'vec': [CountVectorizer(),
                                                    TfidfVectorizer(max_features=1000,
                                                                    min_df=4,
                                                                    ngram_range=(

In [10]:
# Best mean cross-validated score found by the search (F1, as configured via `scoring`),
# followed on the next line by the parameter combination that achieved it.
print(hrscv_.best_score_, hrscv_.best_params_, sep="\n")

0.7808407857091266
{'clf': GradientBoostingClassifier(max_depth=4, n_estimators=326), 'clf__max_depth': 4, 'clf__n_estimators': 326, 'vec': TfidfVectorizer(max_features=1000, min_df=4, ngram_range=(1, 2)), 'vec__max_features': 1000, 'vec__min_df': 4, 'vec__ngram_range': (1, 2)}


In [11]:
# Split the refitted best pipeline into its two fitted steps so that each one
# can be used and stored on its own.
vectorizer_final = hrscv_.best_estimator_.named_steps["vec"]
classifier_final = hrscv_.best_estimator_.named_steps["clf"]

## 4. Saving model

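For sklearn models it is usually fine to serialize with `dill` or `pickle`. Only load such files from sources you trust, because unpickling can execute arbitrary code.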

In [12]:
import dill

In [15]:
# Output locations (relative to the current working directory) for the two serialized
# artefacts: the fitted classifier and the fitted vectorizer.
classifier_path = "./Classifier.pkl"
vectorizer_path = "./Vectorizer.pkl"

In [16]:
# Serialize each fitted artefact with dill into its own binary file,
# classifier first, vectorizer second.
for artefact, target_path in (
    (classifier_final, classifier_path),
    (vectorizer_final, vectorizer_path),
):
    with open(target_path, "wb") as fout:
        dill.dump(artefact, fout)

In [23]:
# Smoke test: vectorize a single, clearly positive word and classify it.
# Labels were built as 0 = negative and 1 = positive, so 1 is the expected answer here.
classifier_final.predict(vectorizer_final.transform(["excellent"]))[0]

1