In [9]:
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import pickle
from sklearn.svm import SVC
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from joblib import dump
from imdb_movie_reviews import utils

MAXLEN = 50

In [5]:
train_data, test_data = tfds.load(
    name="imdb_reviews",
    split=('train', 'test'), 
    as_supervised=True)

In [6]:
x_train, y_train = utils.unpack_dataset(train_data)
x_test, y_test = utils.unpack_dataset(train_data)

In [10]:
print(x_train[0])

b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it."


In [14]:
c_vectorizer = CountVectorizer(max_features=5000)
rf = RandomForestClassifier()

In [15]:
pipeline = Pipeline([('preprocessor', c_vectorizer),
                     ('clf', rf)])

In [16]:
pipeline.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=5000, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabu...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [17]:
pipeline.score(x_test, y_test)

1.0

In [22]:
dump(pipeline, 'random_forest.joblib')

['random_forest.joblib']

In [19]:
class_mapping = {
    0 : "Negative Review",
    1 : "Positive Review"
}

In [20]:
with open('class_mapping.pkl', 'wb') as output:  # Overwrites any existing file.
    pickle.dump(class_mapping, output, pickle.HIGHEST_PROTOCOL)

In [9]:
x_train_t = c_vectorizer.fit_transform(x_train)

In [10]:
#x_train_t = x_train_t.toarray()
x_test_t = c_vectorizer.transform(x_test)

In [11]:
with open('count_vectorizer.pck', 'wb') as output:  # Overwrites any existing file.
    pickle.dump(c_vectorizer, output, pickle.HIGHEST_PROTOCOL)

In [25]:
scaler = StandardScaler(with_mean=False)

In [26]:
x_train_s = scaler.fit_transform(x_train_t)
x_test_s = scaler.transform(x_test_t)

In [29]:
x_train_s.dtype

dtype('float64')

In [42]:
rf = RandomForestClassifier()

In [43]:
rf.fit(x_train_t, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
rf.score(x_test_t, y_test)

0.84324

In [54]:
assess_model(RandomForestClassifier(), 'Random forest')

Random forest scored 84.17999999999999 % accuracy.


In [55]:
assess_model(LogisticRegression(max_iter=10000), 'Logistic regression')

Logistic regression scored 81.176 % accuracy.


In [None]:
assess_model(SVC(), 'SVM')

In [46]:
dump(rf, 'rf.joblib')

['rf.joblib']

In [31]:
with open('rf.pickle', 'wb') as output:  # Overwrites any existing file.
        pickle.dump(rf, output, pickle.HIGHEST_PROTOCOL)

NameError: name 'rf' is not defined

In [17]:
svc = SVC()

In [None]:
svc.fit(x_train_t, y_train)