### Data Cleaning

In [1]:
import pandas as pd
import re

def preprocessor(text):
    """Normalize a raw review: strip markup, lowercase, and keep emoticons.

    Steps:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons (eyes ``:``/``;``/``=``, optional ``-`` nose,
         mouth ``)``/``(``/``D``/``P``) before punctuation is stripped.
      3. Lowercase the text and collapse each run of non-word characters
         into a single space.
      4. Append the emoticons, space-separated, with the ``-`` nose removed.

    Args:
        text: Review text as a ``str``.

    Returns:
        The cleaned string. Leading/trailing spaces produced by step 3 are
        kept as-is; the emoticon suffix is always separated from the last
        word by a space.
    """
    # Raw strings: '\)', '\(' and '\W' are invalid escapes in a normal literal.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    words = re.sub(r'\W+', ' ', text.lower())
    faces = ' '.join(emoticons).replace('-', '')
    # Without this separator a text ending in a word character would fuse with
    # the first emoticon (e.g. "movie:)"), creating a bogus token.
    if faces and not words.endswith(' '):
        words += ' '
    return words + faces

# Read the review dataset and run every entry of the 'review' column through
# `preprocessor` while building the frame.
df = pd.read_csv('movie_data.csv').assign(
    review=lambda frame: frame['review'].apply(preprocessor)
)

### Train & Test





In [2]:
# Positional split: the first `n_train` rows are used for training and every
# remaining row for testing. Plain array slices are end-exclusive, so no row
# ends up in both sets. (`df.loc[:25000]` is label-based and end-INCLUSIVE, which
# put row 25000 into the training set and the test set at the same time.)
n_train = 25000
X_train = df['review'].values[:n_train]
y_train = df['sentiment'].values[:n_train]
X_test = df['review'].values[n_train:]
y_test = df['sentiment'].values[n_train:]

In [3]:
from nltk.stem.porter import PorterStemmer  # translate words to stem, ex: running -> run
from nltk.corpus import stopwords
import nltk
# Stemmer instance used by `tokenizer_porter`.
porter = PorterStemmer()

# Make sure the stopword corpus is available locally, then load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_porter(text):
    """Split ``text`` on whitespace, skip stopwords, and Porter-stem the rest.

    Uses the module-level ``stop`` list and ``porter`` stemmer.

    Returns:
        A list with the stem of every non-stopword token, in input order.
    """
    stems = []
    for token in text.split():
        if token in stop:
            continue
        stems.append(porter.stem(token))
    return stems

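# Commented-out inline version of `tokenizer`. The one actually used is imported
# from the `tokenizer` module below (presumably so the pickled pipeline can
# resolve it when the model is reloaded elsewhere -- TODO confirm).
# def tokenizer(text):
#     return text.split()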

from tokenizer import tokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tzuchun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

# Reviews were already cleaned and lowercased by `preprocessor`, so the
# vectorizer's own accent stripping, lowercasing and preprocessing are disabled.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)

# Hyper-parameter grid; keys are '<pipeline step>__<parameter>' for the
# 'vect' (TF-IDF) and 'clf' (logistic regression) steps.
param = {
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'vect__stop_words': [stop, None],
    'vect__ngram_range': [(1, 1)],
    # The grid tries an L1 penalty, which the default 'lbfgs' solver of recent
    # scikit-learn releases rejects; 'liblinear' supports both 'l1' and 'l2'.
    'clf__solver': ['liblinear'],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}

In [5]:
# Base pipeline for the grid search: TF-IDF features first, then the
# logistic-regression classifier.
lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

#gs_lr_tfidf = GridSearchCV(lr_tfidf, param, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
#gs_lr_tfidf.fit(X_train, y_train)

In [6]:
# 'Best parameter set: %s ' % gs_lr_tfidf.best_params_

In [7]:
# Final pipeline with fixed settings: the imported `tokenizer`, no stopword
# filtering, and an L2-penalised logistic regression with C=10.
lr_tfidf = Pipeline(steps=[
    (
        'vect',
        TfidfVectorizer(
            strip_accents=None,
            lowercase=False,
            preprocessor=None,
            tokenizer=tokenizer,
            stop_words=None,
        ),
    ),
    ('clf', LogisticRegression(penalty='l2', C=10.0, random_state=0)),
])

lr_tfidf.fit(X_train, y_train)
# The score below is computed on the same data the model was fitted on, so it
# is the TRAINING accuracy (the original label said "Test accuracy").
print('Train accuracy: %.3f' % lr_tfidf.score(X_train, y_train))



Test accuracy: 0.988


In [8]:
# Held-out accuracy. `accuracy_score` expects (y_true, y_pred) in that order.
sklearn.metrics.accuracy_score(y_test, lr_tfidf.predict(X_test))

0.89876

In [9]:
import joblib  # persist the model

# Write the fitted pipeline to disk so it can be reused without retraining.
joblib.dump(value=lr_tfidf, filename='movie_review_model.pkl')

['movie_review_model.pkl']

In [10]:
# Reload the persisted pipeline. The `with` block closes the file handle once
# loading finishes (the bare `open()` previously left it open).
# NOTE: joblib files are pickles -- only load files from a trusted source.
with open('movie_review_model.pkl', 'rb') as movie_review_model:
    lr_tfidf = joblib.load(movie_review_model)  # reuse this model

In [11]:
# The model was trained on `preprocessor` output and the vectorizer does not
# lowercase, so new text must go through the same cleaning before prediction;
# otherwise capitalised words never match the learned vocabulary.
lr_tfidf.predict([preprocessor('I love movie')])

array([1])

In [12]:
# Save the reloaded pipeline back to the same file.
joblib.dump(value=lr_tfidf, filename='movie_review_model.pkl')

['movie_review_model.pkl']