## References
    
https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

https://www.kaggle.com/snehithatiger/movie-review-sentiment-analysis

In [1]:
import csv
import os

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [2]:
# Load the labelled training phrases and the unlabelled test phrases (both
# tab-separated), plus the sample submission file (comma-separated).
train = pd.read_csv('../input/train.tsv', delimiter='\t')
test = pd.read_csv('../input/test.tsv', delimiter='\t')
sampleSub = pd.read_csv('../input/sampleSubmission.csv')

In [3]:
# Preview the first rows of the training table (phrase text plus its Sentiment label).
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [4]:
# Preview the first rows of the test table (same layout, without the Sentiment column).
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


## Feature: Tokenization, Stemming & Lemmatization

In [5]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# Module-level normalisers used by clean(): an English Snowball stemmer and
# a WordNet lemmatizer.
stemmer = SnowballStemmer(language='english')
lemma = WordNetLemmatizer()

# https://www.kaggle.com/nikitpatel/blending-with-lr-xgb-mnb-adaboost-kne-lsvc
def clean(review_raw):
    """Normalise raw phrases by lower-casing, stemming and lemmatising each token.

    Args:
        review_raw: iterable of phrases (e.g. ``train.Phrase.values``). Each
            item is passed through ``str()`` first, so non-string entries
            are converted rather than rejected.

    Returns:
        list of str: one cleaned phrase per input item, in input order, with
        the processed tokens joined by a single space.
    """
    review_clean = []
    # Iterate over the items themselves rather than positions 0..len-1, so a
    # pandas Series with a non-default index works as well as a list/ndarray.
    for raw in review_raw:
        tokens = word_tokenize(str(raw).lower())
        # Lemmatise the stemmed tokens directly. The previous version called
        # word_tokenize(str(<list of stems>)), which turned the list repr's
        # brackets, quotes and commas into spurious tokens in the output.
        tokens = [lemma.lemmatize(stemmer.stem(tok)) for tok in tokens]
        review_clean.append(' '.join(tokens))
    return review_clean

# train.Phrase=clean(train.Phrase.values)
# test.Phrase=clean(test.Phrase.values)

## Feature: CountVectorizer

In [6]:
# Bag-of-words features: fit the vocabulary on the training phrases using
# NLTK's TweetTokenizer, then encode the train and test phrases with it.
cv = CountVectorizer(max_features = None, tokenizer=TweetTokenizer().tokenize)
cv.fit(train.Phrase)
data = cv.transform(train.Phrase)
x_final_test_cv = cv.transform(test.Phrase)

# Hold out 10% of the labelled rows for validation. A fixed random_state
# makes the split, and therefore the reported accuracy, reproducible across
# kernel restarts (it was previously re-drawn at random on every run).
x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(
    data, train.Sentiment, test_size=0.1, random_state=42)

## Feature: TF-IDF

In [7]:
# TF-IDF vectorizer that tokenizes with NLTK's TweetTokenizer; no other options are overridden.
tf_idf = TfidfVectorizer(tokenizer=TweetTokenizer().tokenize)

In [8]:
# Fit the TF-IDF vectorizer on the training phrases; the fitted vectorizer is echoed as the cell output.
tf_idf.fit(train.Phrase)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x1a1c2d8f28>>,
        use_idf=True, vocabulary=None)

In [9]:
# Encode the training phrases with the fitted TF-IDF vectorizer.
data = tf_idf.transform(train.Phrase)
# Keep the TF-IDF test features under their own name. They were previously
# assigned to x_final_test_cv, which silently replaced the CountVectorizer
# test features; code that needs TF-IDF test features should use
# x_final_test_tf.
x_final_test_tf = tf_idf.transform(test.Phrase)

# Hold out 10% of the labelled rows for validation, with a fixed seed so the
# split and the reported accuracy are reproducible.
x_train_tf, x_test_tf, y_train_tf, y_test_tf = train_test_split(
    data, train.Sentiment, test_size=0.1, random_state=42)

## Training

In [10]:
def fit_classifier(classifier, x_train_feature, y_train, x_test_feature, y_test):
    """Fit ``classifier`` and print its accuracy on a held-out set.

    Args:
        classifier: estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train_feature: feature matrix used for fitting.
        y_train: labels matching ``x_train_feature``.
        x_test_feature: feature matrix of the held-out rows.
        y_test: true labels matching ``x_test_feature``.

    Returns:
        The ``classifier`` object that was passed in, after ``fit`` has been
        called on it.
    """
    classifier.fit(x_train_feature, y_train)
    y_pred = classifier.predict(x_test_feature)
    print(classifier)
    # accuracy_score is declared as (y_true, y_pred). Accuracy itself is
    # symmetric, but passing the arguments in the documented order keeps the
    # call correct if it is ever swapped for an asymmetric metric.
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))
    return classifier

def predict_final_result(classifier, x_final_test_feature):
    """Predict labels for the final test features with the given classifier.

    Args:
        classifier: fitted estimator exposing ``predict(X)``.
        x_final_test_feature: feature matrix to predict on; it must come from
            the same vectorizer the classifier was fitted with.

    Returns:
        Whatever ``classifier.predict`` returns for ``x_final_test_feature``.
    """
    # Use the classifier that was passed in. The previous version ignored the
    # argument and always called the global ``nb_classifier``.
    y_final_test_pred = classifier.predict(x_final_test_feature)
    return y_final_test_pred

def save(model, path):
    """Persist ``model`` to ``path`` with joblib, creating the folder if needed.

    Args:
        model: object to serialise (vectorizer or classifier).
        path: destination. When it is a filesystem path (``str`` or
            ``os.PathLike``) its parent directory is created first, so the
            call also works on a fresh checkout where e.g. ``model/`` does
            not exist yet. Any other value (such as an open file object) is
            handed to ``dump`` untouched.
    """
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        # An empty parent means "current directory": nothing to create.
        if parent:
            os.makedirs(parent, exist_ok=True)
    dump(model, path)
    
# sampleSub.to_csv("naive_bayes_cv.csv", index=False)

In [11]:
# Persist both fitted vectorizers so that each saved classifier can later be
# paired with the feature extractor it was trained on.
save(cv, 'model/cv.joblib')
# Bug fix: this call previously dumped `cv` a second time, so tf_idf.joblib
# contained the CountVectorizer instead of the TF-IDF vectorizer.
save(tf_idf, 'model/tf_idf.joblib')

## Naive Bayes classifier

In [12]:
# A single MultinomialNB instance is reused for both feature sets, so the
# statement order matters: each save() has to run before the next
# fit_classifier() call fits the same object again.
nb_classifier = MultinomialNB()
# Fit on the CountVectorizer split, print its accuracy, and persist that model.
fit_classifier(nb_classifier, x_train_cv, y_train_cv, x_test_cv, y_test_cv)
save(nb_classifier, 'model/nb_cv.joblib')
# Fit the same object on the TF-IDF split; from here on `nb_classifier` is the
# model that was last fitted on TF-IDF features.
fit_classifier(nb_classifier, x_train_tf, y_train_tf, x_test_tf, y_test_tf)
save(nb_classifier, 'model/nb_tf.joblib')
# predict_final_result(nb_classifier, x_final_test_cv)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:  0.6143150070485711
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Accuracy:  0.5876585928489043


## Logistic Regression classifier

In [13]:
# A single LogisticRegression instance is reused for both feature sets, so the
# statement order matters: each save() has to run before the next
# fit_classifier() call fits the same object again.
lr_classifier = LogisticRegression() 
# Fit on the CountVectorizer split, print its accuracy, and persist that model.
fit_classifier(lr_classifier, x_train_cv, y_train_cv, x_test_cv, y_test_cv)
save(lr_classifier, 'model/lr_cv.joblib')
# Fit the same object on the TF-IDF split; from here on `lr_classifier` is the
# model that was last fitted on TF-IDF features.
fit_classifier(lr_classifier, x_train_tf, y_train_tf, x_test_tf, y_test_tf)
save(lr_classifier, 'model/lr_tf.joblib')
# predict_final_result(lr_classifier, x_final_test_cv)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy:  0.6417403562732282
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Accuracy:  0.6302704088171216


## Simple Prediction

0 - negative

1 - somewhat negative

2 - neutral

3 - somewhat positive

4 - positive

In [14]:
# Hand-written example reviews used to sanity-check the trained models.
sentences = [
    "this movie is bad",
    "this movie is fucking bad",
    "this movie is fucking good",
    "this movie is funny",
]

In [15]:
def predict_sentences(sentences, classifier, transformer):
    """Vectorize ``sentences`` with ``transformer`` and classify the result.

    Args:
        sentences: raw text inputs accepted by ``transformer.transform``.
        classifier: object exposing ``predict(X)``.
        transformer: object exposing ``transform(sentences)``.

    Returns:
        Whatever ``classifier.predict`` returns for the transformed sentences.
    """
    return classifier.predict(transformer.transform(sentences))

In [16]:
# nb_classifier was last fitted on TF-IDF features (see the Naive Bayes cell),
# so it has to be paired with the TF-IDF vectorizer. Passing `cv` here, as
# before, fed raw counts to a model trained on TF-IDF weights.
sentences_label = predict_sentences(sentences, nb_classifier, tf_idf)
for sentence, label in zip(sentences, sentences_label):
    print("Sentence:  " + sentence + "\nLabel:  " + str(label))

Sentence:  this movie is bad
Label:  1
Sentence:  this movie is fucking bad
Label:  1
Sentence:  this movie is fucking good
Label:  3
Sentence:  this movie is funny
Label:  3
