In [0]:
import tarfile
import os
import shutil
from urllib.request import urlopen
from contextlib import closing

#Data downloading
#relative path of train/test data folder
imdb_train_data_folder = "./aclImdb/train"
imdb_test_data_folder = "./aclImdb/test"

URL="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
ARCHIVE_NAME = "aclImdb_v1.tar"

if not os.path.exists("aclImdb"):
    opener = urlopen(URL)
    
    #downloading and extract all files.
    with open(ARCHIVE_NAME, 'wb') as archive:
        archive.write(opener.read())
        
    with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
        archive.extractall(path='.')
        
    test_folder = os.listdir(imdb_test_data_folder)
    train_folder = os.listdir(imdb_train_data_folder)
    
    #remove .txt, .feat, and unsup folder.
    for item in train_folder:
        if (item.endswith(".feat") or item.endswith(".txt")):
            os.remove(os.path.join(imdb_train_data_folder, item))
    shutil.rmtree(os.path.join(imdb_train_data_folder,"unsup"))
    for item in test_folder:
        if (item.endswith(".feat") or item.endswith(".txt")):
            os.remove(os.path.join(imdb_test_data_folder, item))
    os.remove(ARCHIVE_NAME)
#remove archieve

In [0]:
import sys
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

#loads the data from folder
imdb_train_data = load_files(imdb_train_data_folder,shuffle=False)
print("train_samples: %d" % len(imdb_train_data.data))

imdb_test_data = load_files(imdb_test_data_folder,shuffle=False)
print("test_samples: %d" % len(imdb_test_data.data))

train_samples: 25000
test_samples: 25000


In [0]:
from sklearn.feature_extraction.text import CountVectorizer

#Builds a dictionary of features and transforms 
#documents to feature vectors: 
count_vect= CountVectorizer()
#second one exclude stopwords, like 'the','of'..
count_vect2= CountVectorizer(stop_words='english') 

X_train_counts = count_vect.fit_transform(imdb_train_data.data)
X_train_counts2 = count_vect2.fit_transform(imdb_train_data.data)

print(X_train_counts.shape)
print(X_train_counts2.shape)

(25000, 74849)
(25000, 74538)


In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

#transform our count-matrix to a tf-idf representation
#Similarly. a suffix 2 meaning we remove the stopwords
tfidf_transformer = TfidfTransformer()
tfidf_transformer2 = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

print(X_train_tfidf.shape)
print(X_train_tfidf2.shape)

(25000, 74849)
(25000, 74538)


In [0]:
import pandas as pd

# get the first document
first_vector=X_train_tfidf[10]
first_vector2=X_train_tfidf2[10]
 
# show the TF-IDF scores , compare with/without stopwords
df = pd.DataFrame(first_vector.T.todense(), index=count_vect.get_feature_names(), columns=["tfidf"])
df2 = pd.DataFrame(first_vector2.T.todense(), index=count_vect2.get_feature_names(), columns=["tfidf_no_stopwords"])

In [0]:
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
the,0.252033
he,0.192326
accent,0.172488
of,0.167375
three,0.164270
...,...
flaw,0.000000
flavourless,0.000000
flavouring,0.000000
flavoured,0.000000


In [0]:
df2.sort_values(by=["tfidf_no_stopwords"],ascending=False)

Unnamed: 0,tfidf_no_stopwords
accent,0.214215
spite,0.169063
actresses,0.149836
decrescendos,0.147218
bleibtreau,0.147218
...,...
flavin,0.000000
flavia,0.000000
flava,0.000000
flav,0.000000


# Models

## Logistic Regression

In [0]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
text_clf_lr = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_lr', LogisticRegression(multi_class = 'multinomial')),])
text_clf_lr.fit(imdb_train_data.data, imdb_train_data.target)
# clf_lr = LogisticRegression().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,


### Hyperparameter Tuning

In [0]:
# Hyperparameter tuning using Randomized search
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = imdb_train_data.data
Y = imdb_train_data.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)

# Create lists of parameter for Logistic Regression Classifier:
penalty = ['l1', 'l2', 'elasticnet']
C = [0.001, 0.01, 0.1, 1, 10, 100]
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
solver = ['sag', 'saga']


n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_lr__penalty = penalty, clf_lr__C = C, clf_lr__class_weight = class_weight, clf_lr__solver = solver);
# Call RandomizedSearchCV
rs_clf_lr = RandomizedSearchCV(text_clf_lr, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_lr.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_lr.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_lr.best_estimator_.get_params()['tfidf__use_idf'])
print('Best penalty:', rs_clf_lr.best_estimator_.get_params()['clf_lr__penalty'])
print('Best C:', rs_clf_lr.best_estimator_.get_params()['clf_lr__C'])
print('Best class_weight:', rs_clf_lr.best_estimator_.get_params()['clf_lr__class_weight'])
print('Best solver:', rs_clf_lr.best_estimator_.get_params()['clf_lr__solver'])
print(); print(rs_clf_lr.best_estimator_.get_params()['clf_lr'])



Best n-gram range: (1, 2)
Best use_idf: False
Best penalty: l2
Best C: 10
Best class_weight: {1: 0.5, 0: 0.5}
Best solver: sag

LogisticRegression(C=10, class_weight={0: 0.5, 1: 0.5}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=None, solver='sag', tol=0.0001,
                   verbose=0, warm_start=False)


### Evaluate

In [0]:
docs_test = imdb_test_data.data

print("Logistic Regresssion:")
predicted_lr = rs_clf_lr.predict(docs_test)
print(np.mean(predicted_lr == imdb_test_data.target))
from sklearn import metrics
print(metrics.classification_report(imdb_test_data.target, predicted_lr, target_names=imdb_test_data.target_names))

Logistic Regresssion:
0.8824
              precision    recall  f1-score   support

         neg       0.88      0.88      0.88     12500
         pos       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



## SVM


In [0]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_svm', SGDClassifier()),])
text_clf_svm.fit(imdb_train_data.data, imdb_train_data.target)
# clf_svm = SGDClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

### Hyperparameter Tuning

In [0]:
# Create lists of parameter for SVM Classifier:
alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1]
learning_rate = ['optimal', 'constant', 'adaptive']
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
class_weight.append(None)
eta0 = [0.01, 0.1, 0.5, 0.75, 1] 

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_svm__alpha = alpha, clf_svm__learning_rate = learning_rate, clf_svm__class_weight = class_weight, clf_svm__eta0 = eta0);
# Call RandomizedSearchCV
rs_clf_svm = RandomizedSearchCV(text_clf_svm, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_svm.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_svm.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_svm.best_estimator_.get_params()['tfidf__use_idf'])
print('Best alpha:', rs_clf_svm.best_estimator_.get_params()['clf_svm__alpha'])
print('Best learning_rate:', rs_clf_svm.best_estimator_.get_params()['clf_svm__learning_rate'])
print('Best class_weight:', rs_clf_svm.best_estimator_.get_params()['clf_svm__class_weight'])
print('Best eta0:', rs_clf_svm.best_estimator_.get_params()['clf_svm__eta0'])
print(); print(rs_clf_svm.best_estimator_.get_params()['clf_svm'])



Best n-gram range: (1, 2)
Best use_idf: True
Best alpha: 1e-05
Best learning_rate: constant
Best class_weight: None
Best eta0: 0.5

SGDClassifier(alpha=1e-05, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.5, fit_intercept=True,
              l1_ratio=0.15, learning_rate='constant', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)


### Evaluate

In [0]:
print("SVM:")
predicted_svm = rs_clf_svm.predict(docs_test)
print(np.mean(predicted_svm == imdb_test_data.target))
print(metrics.classification_report(imdb_test_data.target, predicted_svm, target_names=imdb_test_data.target_names))

SVM:
0.88612
              precision    recall  f1-score   support

         neg       0.89      0.88      0.89     12500
         pos       0.88      0.89      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



## Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
text_clf_rf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf_rf', RandomForestClassifier()),])
text_clf_rf.fit(imdb_train_data.data, imdb_train_data.target)
# clf_rf = RandomForestClassifier().fit(X_train_tfidf2, twenty_train.target)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

### Hyperparameter Tuning

In [0]:
# Create lists of parameter for Random Forest Classifier:
# Number of trees in random forest
n_estimators = [10, 100, 500, 750]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [45, 65, 95, 125]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range = ngram_range,
                  tfidf__use_idf = use_idf, clf_rf__n_estimators = n_estimators, clf_rf__max_features = max_features, clf_rf__max_depth=max_depth, clf_rf__min_samples_split = min_samples_split, clf_rf__min_samples_leaf = min_samples_leaf, clf_rf__bootstrap = bootstrap);
# Call RandomizedSearchCV
rs_clf_rf = RandomizedSearchCV(text_clf_rf, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf_rf.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_rf.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_rf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best n_estimators:', rs_clf_rf.best_estimator_.get_params()['clf_rf__n_estimators'])
print('Best max_features:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_features'])
print('Best max_depth:', rs_clf_rf.best_estimator_.get_params()['clf_rf__max_depth'])
print('Best min_samples_split:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_split'])
print('Best min_samples_leaf:', rs_clf_rf.best_estimator_.get_params()['clf_rf__min_samples_leaf'])
print('Best bootstrap:', rs_clf_rf.best_estimator_.get_params()['clf_rf__bootstrap'])
print(); print(rs_clf_rf.best_estimator_.get_params()['clf_rf'])



Best n-gram range: (1, 2)
Best use_idf: False
Best n_estimators: 750
Best max_features: sqrt
Best max_depth: None
Best min_samples_split: 2
Best min_samples_leaf: 4
Best bootstrap: True

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


### Evaluate

In [0]:
print("Random Forest:")
predicted_rf = rs_clf_rf.predict(docs_test)
print(np.mean(predicted_rf == imdb_test_data.target))
print(metrics.classification_report(imdb_test_data.target, predicted_rf, target_names=imdb_test_data.target_names))

Random Forest:
0.86868
              precision    recall  f1-score   support

         neg       0.87      0.86      0.87     12500
         pos       0.86      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



## AdaBoost

In [0]:
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

docs_test = imdb_test_data.data

from sklearn.ensemble import AdaBoostClassifier
text_clf_ada = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf_ada', AdaBoostClassifier())])
text_clf_ada.fit(imdb_train_data.data, imdb_train_data.target)


Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf_ada',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1.0, n_estimators=50,

In [0]:
# Hyperparameter tuning using Randomized search
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = imdb_train_data.data
Y = imdb_train_data.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
stop_words=[]
# Create parameter for TfidfTransformer
use_idf = (True, False)

n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

#The maximum number of estimators at which boosting is terminated. 
n_estimators=[50, 200, 400, 600]

learning_rate=[0.001,0.1,0.3,1]

#clf_ada__base_estimator = base_estimator
# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  tfidf__use_idf = use_idf, clf_ada__n_estimators = n_estimators, clf_ada__learning_rate = learning_rate);
# Call RandomizedSearchCV
rs_clf_ada = RandomizedSearchCV(text_clf_ada, parameters, n_iter=20, cv=custom_cv, random_state=199, return_train_score=False, n_jobs=-1)
rs_clf_ada.fit(X, Y)

# View Best Parameters
print('Best n-gram range:', rs_clf_ada.best_estimator_.get_params()['vect__ngram_range'])
print('Best use_idf:', rs_clf_ada.best_estimator_.get_params()['tfidf__use_idf'])
print('Best n_estimators:', rs_clf_ada.best_estimator_.get_params()['clf_ada__n_estimators'])
print('Best learning_rate:', rs_clf_ada.best_estimator_.get_params()['clf_ada__learning_rate'])
print(); print(rs_clf_ada.best_estimator_.get_params()['clf_ada'])

print(rs_clf_ada)





Best n-gram range: (1, 2)
Best use_idf: True
Best n_estimators: 600
Best learning_rate: 1

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=600, random_state=None)
RandomizedSearchCV(cv=[(array([ 9064,  6051, 17848, ..., 17730, 15725, 19966]),
                        array([20000,  5515,   966, ..., 20230, 19078,  1073]))],
                   error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('vect',
                                              CountVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.int64'>,
                                                              encoding='utf-8',
                                                             

### Evaluate

In [0]:
print("AdaBoost:")
predicted_ada = rs_clf_ada.predict(imdb_test_data.data)
print(np.mean(predicted_ada == imdb_test_data.target))
print(metrics.classification_report(imdb_test_data.target, predicted_ada, target_names=imdb_test_data.target_names))


AdaBoost:
0.863
              precision    recall  f1-score   support

         neg       0.88      0.85      0.86     12500
         pos       0.85      0.88      0.87     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



## Decision Tree

In [0]:
# Building a pipeline that behaves like a compound classifier
from sklearn import tree
from sklearn.pipeline import Pipeline
#text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),('dt', tree.DecisionTreeClassifier())])
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('dt', tree.DecisionTreeClassifier())])

### Hyperparameter Tuning

In [0]:
# Hyperparameter tuning using Randomized search
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
X = imdb_train_data.data
Y = imdb_train_data.target
# Create parameter for CountVectorizer
ngram_range = [(1, 1), (1, 2), (2, 2)]
# Create parameter for TfidfTransformer
use_idf = (True, False)
stop_words=['english', None]

# Create lists of parameter for Decision Tree Classifier
criterion = ['gini', 'entropy']
max_depth = [45,65,95,125]


n = len(X)
idx1 = np.arange(0, n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle = True, random_state = 123)
custom_cv = [(idx1_train, idx1_test)]

# Create a dictionary of all the parameter options. We can access parameters of steps of a pipeline by using '__’
parameters = dict(vect__ngram_range=ngram_range,
                  vect__stop_words=stop_words,
                  tfidf__use_idf = use_idf,
                      dt__criterion=criterion,
                      dt__max_depth=max_depth);
# Call RandomizedSearchCV
rs_clf = RandomizedSearchCV(text_clf, parameters, n_iter=96, cv=custom_cv, random_state=199, return_train_score=False, 
                            n_jobs=-1)
rs_clf.fit(X, Y)
# View Best Parameters
print('Best n-gram range:', rs_clf.best_estimator_.get_params()['vect__ngram_range'])
print('Best stop_words:', rs_clf.best_estimator_.get_params()['vect__stop_words'])
print('Best use_idf:', rs_clf.best_estimator_.get_params()['tfidf__use_idf'])
print('Best criterion:', rs_clf.best_estimator_.get_params()['dt__criterion'])
print('Best max_depth:', rs_clf.best_estimator_.get_params()['dt__max_depth'])
print(); print(rs_clf.best_estimator_.get_params()['dt'])

Best n-gram range: (1, 2)
Best stop_words: english
Best use_idf: False
Best criterion: gini
Best max_depth: 45

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=45, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [0]:
# Let's see the prediction of the test set

docs_test = imdb_test_data.data
import numpy as np
from sklearn import metrics
print("Decision trees:")
predicted_dt1 = rs_clf.predict(docs_test)
print(np.mean(predicted_dt1 == imdb_test_data.target))
print(metrics.classification_report(imdb_test_data.target, predicted_dt1, target_names=imdb_test_data.target_names))


Decision trees:
0.72392
              precision    recall  f1-score   support

         neg       0.74      0.69      0.71     12500
         pos       0.71      0.76      0.73     12500

    accuracy                           0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000

