#### Simple poc that:

1. inputs labeled text (1=positive, 0=negative), source: uci ml repo
2. vectorizes the text
3. trains a naive bayes model
4. serializes the model object
5. outputs the model object
6. inputs the model object
7. makes some predictions using the serialized model

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

pd.set_option('display.max_colwidth', -1)

os.getcwd()

'/Users/tborgstadt/git/ml-sandbox/pred-pos-neg-review'

In [2]:
# input labeled data, imdb review comments
df = pd.read_csv('imdb_labelled.txt', sep='\t', lineterminator='\n')
df.columns = ['text', 'label']
df['lbl'] = np.where(df['label']==0, "Neg", "Pos")
df = df[['lbl','text']]
df.head(5)

Unnamed: 0,lbl,text
0,Neg,"Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out."
1,Neg,"Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent."
2,Neg,Very little music or anything to speak of.
3,Pos,The best scene in the movie was when Gerardo is trying to find a song that keeps running through his head.
4,Neg,"The rest of the movie lacks art, charm, meaning... If it's about emptiness, it works I guess because it's empty."


In [3]:
# train model - create the model object for serializing
print
print df['lbl'].value_counts()

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['lbl'], test_size=0.3, random_state=42) #random_state is set seed

tfidf_vect = TfidfVectorizer()
X_train_count = tfidf_vect.fit_transform(X_train)

model = MultinomialNB().fit(X_train_count, y_train)
X_test_count = tfidf_vect.transform(X_test)

print
print model.score(X_test_count, y_test)
print
print(classification_report(y_test, model.predict(X_test_count)))


Pos    386
Neg    361
Name: lbl, dtype: int64

0.76

              precision    recall  f1-score   support

         Neg       0.81      0.67      0.73       111
         Pos       0.72      0.85      0.78       114

   micro avg       0.76      0.76      0.76       225
   macro avg       0.77      0.76      0.76       225
weighted avg       0.77      0.76      0.76       225



In [4]:
# use model to make a prediction
print model.predict(tfidf_vect.transform(["really funny and kid friendly"]))

['Pos']


In [5]:
# again
print model.predict(tfidf_vect.transform(["sad and even creepy"]))


['Pos']


In [6]:
# again
print model.predict(tfidf_vect.transform(["pathetic story line"]))


['Neg']


In [7]:
# serialize the model
from sklearn.externals import joblib
joblib.dump(model, 'model.joblib')

['model.joblib']

In [8]:
# load serialized model
model_from_binary = joblib.load('model.joblib')

In [9]:
# make prediction with serialized model
print model_from_binary.predict(tfidf_vect.transform(["fun for the whole family "]))

['Pos']
