# Import packages

In [28]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

import re
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import words

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import dill
import gzip

[nltk_data] Error loading punkt: <urlopen error [Errno 8] nodename nor
[nltk_data]     servname provided, or not known>


# Load data

In [29]:
# Read the training set from a CSV file located relative to the notebook's working directory.
df_train = pd.read_csv("./data/train.csv")

In [30]:
# Optional: uncomment the next line to keep only the first 1,000 rows for a quicker run.
# df_train = df_train.head(1000)

# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,file_id,review,sentiment,polarity
0,neg_10939_3.txt,This movie has some of the worst acting that ...,neg,0
1,pos_5631_8.txt,Kudos to Fawcett to taking on roles that at t...,pos,1
2,pos_8298_8.txt,I m serious as well I mean don t get me wrong...,pos,1
3,pos_9373_9.txt,Like many of you I am a great fan of the real...,pos,1
4,neg_4956_1.txt,it s embarrassing I had like 3 minutes on my ...,neg,0


# Production pipeline

In [31]:
def prep(review):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. Strip HTML markup with BeautifulSoup's 'html.parser'.
      2. Replace every character that is not an ASCII letter with a space.
      3. Lowercase the text.
      4. Tokenize with ``nltk.word_tokenize``.
      5. Stem each token with the English Snowball stemmer.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    text = BeautifulSoup(review, 'html.parser').get_text()     # Remove HTML tags.
    text = re.sub("[^a-zA-Z]", " ", text)                      # Remove non-letters.
    text = text.lower()                                        # Lower case.
    tokens = nltk.word_tokenize(text)                          # Tokenize to each word.
    # Build the stemmer once per call instead of once per token.
    stemmer = nltk.stem.SnowballStemmer('english')
    # Join the stemmed words back into one string separated by spaces.
    return " ".join(stemmer.stem(w) for w in tokens)

In [32]:
# TF-IDF over word 1- to 3-grams with sublinear term-frequency scaling,
# keeping at most 40,000 features.
tv = TfidfVectorizer(ngram_range = (1,3), sublinear_tf = True, max_features = 40000)
# L2-penalized logistic regression (liblinear, dual formulation) with balanced class weights.
lr = LogisticRegression(random_state=5, penalty='l2', dual=True, C=6, class_weight='balanced', solver='liblinear')

estimators = [('tfidf', tv),
              ('clf', lr)]

pipe = Pipeline(estimators)

# Fit on text normalized by prep(): every prediction in this notebook passes its
# input through prep() first, so the vocabulary must be learned from text that
# went through the same stemming. Fitting on the raw reviews leaves stemmed
# inference tokens (e.g. "movi") without a matching feature.
# NOTE: stemming the whole training set makes this cell noticeably slower.
pipe.fit(df_train['review'].apply(prep), df_train['polarity'])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=40000,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=True,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=6, class_weight='balanced', dual=True,
                                    fit_intercept=True, intercept_scal

## Test the created pipeline

In [33]:
def sentimentAnalysis(in_sentence):
    """Classify a sentence as 'Negative' or 'Positive' with the fitted pipeline.

    The sentence is normalized with prep(), passed to the module-level `pipe`
    as a one-element batch, and the predicted class index is mapped to its
    text label.
    """
    cleaned = prep(in_sentence)
    predicted_index = pipe.predict([cleaned])[0]
    labels = ['Negative', 'Positive']
    return labels[predicted_index]

# Quick check of the fitted pipeline on one example sentence.
in_sentence = 'this is a bad movie.'
sentimentAnalysis(in_sentence)

'Negative'

## Save pipeline

In [34]:
# Serialize the fitted pipeline with dill into a gzip-compressed file.
filename = 'model_sentiment_analysis.pk'
with gzip.open(filename, mode='wb') as out_file:
    dill.dump(pipe, out_file, recurse=True)

## Reload the serialized pipeline and check its prediction

In [35]:
# Read the gzip-compressed file back and deserialize the pipeline.
# dill is pickle-based: only load files that come from a trusted source.
with gzip.open(filename, mode='rb') as in_file:
    loaded_model = dill.load(in_file)

In [36]:
# Confirm that the reloaded pipeline still produces a prediction.
test_sentence = 'this is a very bad hotel. there was no heating and i will never recommend'
classes_label = ['Negative', 'Positive']
predicted_index = loaded_model.predict([prep(test_sentence)])[0]
classes_label[predicted_index]

'Negative'

## Save model for App

In [37]:
# Write a second gzip-compressed copy of the pipeline into the app's folder.
filename = './sentimentAPI/model_sentiment_analysis.pk'
with gzip.open(filename, mode='wb') as out_file:
    dill.dump(pipe, out_file, recurse=True)