## References
    
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://www.kaggle.com/snehithatiger/movie-review-sentiment-analysis

In [1]:
import csv
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the two tab-separated competition splits, then the comma-separated
# submission template.
train, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('../input/train.tsv', '../input/test.tsv')
)
sampleSub = pd.read_csv('../input/sampleSubmission.csv')

In [3]:
# Preview the first five labelled rows.
train.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
# Preview the first five unlabelled rows.
test.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [5]:
# Hold out 20% of the labelled phrases for validation. The fixed random_state
# makes the split reproducible after a kernel restart.
# NOTE(review): rows sharing a SentenceId are sub-phrases of one sentence and
# can land on both sides of this split, which may inflate the hold-out score;
# consider a grouped split if an unbiased estimate is needed.
x_train, x_test, y_train, y_test = train_test_split(
    train.Phrase,
    train.Sentiment,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Bag-of-words vectorizer; max_features=None leaves the vocabulary size uncapped.
cv = CountVectorizer(
    max_features=None,
)

In [7]:
# Learn the vocabulary from the training phrases only.
cv.fit(raw_documents=x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [8]:
# Turn both splits into token-count matrices using the fitted vocabulary.
x_train_cv, x_test_cv = (
    cv.transform(phrases) for phrases in (x_train, x_test)
)

In [9]:
# Multinomial Naive Bayes with default settings, trained on the count features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=x_train_cv, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Predicted sentiment labels for the held-out split.
y_pred = nb_classifier.predict(X=x_test_cv)

In [11]:
# Compare accuracy on the training split and on the held-out split.
# accuracy_score takes (y_true, y_pred) in that order. Accuracy is symmetric,
# so the number is unchanged, but the correct order keeps this call safe to
# copy for asymmetric metrics such as precision or recall.
print('Accuracy of Naive Bayes train: ', nb_classifier.score(x_train_cv, y_train))
print('Accuracy of Naive Bayes test: ', metrics.accuracy_score(y_test, y_pred))

Accuracy of Naive Bayes train:  0.67240164039472
Accuracy of Naive Bayes test:  0.6127771369985903


In [12]:
# Raw phrases of the unlabelled competition test set.
x_final_test = test['Phrase']

In [13]:
# Vectorize the competition phrases with the vocabulary fitted on x_train.
x_final_test_cv = cv.transform(raw_documents=x_final_test)

In [14]:
# Predicted sentiment label for each competition phrase.
y_final_test_pred = nb_classifier.predict(X=x_final_test_cv)

In [15]:
# Display the predicted labels as a quick sanity check.
y_final_test_pred

array([3, 3, 2, ..., 2, 2, 1])

In [16]:
# Write the predictions into the submission template. Bracket assignment always
# targets a column; attribute-style assignment would set a plain Python
# attribute instead if the template had no 'Sentiment' column.
sampleSub['Sentiment'] = y_final_test_pred

In [17]:
# Save the submission file without the DataFrame index column.
sampleSub.to_csv(path_or_buf="naive_bayes_cv.csv", index=False)

## Simple Prediction

0 - negative

1 - somewhat negative

2 - neutral

3 - somewhat positive

4 - positive

In [18]:
# Hand-written probe sentences for a quick qualitative check of the model.
sentences = [
    "this movie is bad",
    "this movie is fucking bad",
    "this movie is fucking good",
    "this movie is funny",
]

In [19]:
def predict_sentences(sentences, classifier, transformer):
    """Vectorize raw sentences and return the classifier's predictions.

    Parameters
    ----------
    sentences : iterable
        Raw inputs handed unchanged to ``transformer.transform``.
    classifier : object
        Any object exposing ``predict(features)``.
    transformer : object
        Any object exposing ``transform(sentences)``.

    Returns
    -------
    Whatever ``classifier.predict`` returns for the transformed sentences.
    """
    return classifier.predict(transformer.transform(sentences))

In [21]:
# Label the probe sentences with the in-memory model and print each pair.
sentences_label = predict_sentences(sentences, nb_classifier, cv)
for sentence, label in zip(sentences, sentences_label):
    print("Sentence:  {}\nLabel:  {}".format(sentence, str(label)))

Sentence:  this movie is bad
Label:  0
Sentence:  this movie is fucking bad
Label:  0
Sentence:  this movie is fucking good
Label:  3
Sentence:  this movie is funny
Label:  3


## Save & Load model

https://scikit-learn.org/stable/modules/model_persistence.html

In [22]:
from joblib import dump, load
# Destinations for the persisted classifier and vectorizer.
nb_path, cv_path = 'model/nb_cv.joblib', 'model/cv.joblib'

In [23]:
# joblib.dump does not create missing parent directories, so make sure they
# exist first; otherwise a fresh checkout fails with FileNotFoundError.
for artifact_path in (nb_path, cv_path):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

# Persist both the classifier and the fitted vectorizer: the vectorizer's
# vocabulary is needed to turn new text into the features the classifier expects.
dump(nb_classifier, nb_path)
dump(cv, cv_path)

['model/cv.joblib']

In [24]:
# Restore the classifier from disk. joblib files are pickle-based, so only
# load files you trust.
loaded_nb_clf = load(filename=nb_path)

In [25]:
# Restore the fitted vectorizer from disk (same trust caveat as any pickle-based file).
loaded_cv = load(filename=cv_path)

In [None]:
# Check the save/load round trip by predicting with BOTH restored artifacts.
# The in-memory `cv` was previously passed here, so the vectorizer loaded from
# disk was never exercised.
sentences_label = predict_sentences(sentences, loaded_nb_clf, loaded_cv)
for sentence, label in zip(sentences, sentences_label):
    print("Sentence:  " + sentence + "\nLabel:  " + str(label))