In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy import displacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the spaCy pipeline shipped in the `en_core_web_sm` package.
# `nlp` is what the cleaning loop and `predict_sentiment` use to tokenise and lemmatise text.
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
# Amazon reviews: a tab-separated file; the preview shows `review` and `sentiment` columns.
amazon_data = pd.read_csv(
    '/content/drive/MyDrive/seminar/dataset/amazon_cells_labelled.txt',
    sep='\t',
)
amazon_data.head()

Unnamed: 0,review,sentiment
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
2,Tied to charger for conversations lasting more...,0
3,The mic is great.,1
4,I have to jiggle the plug to get it to line up...,0


In [4]:
# (rows, columns) of the Amazon review frame.
amazon_data.shape

(999, 2)

In [5]:
# IMDB reviews: comma-separated; `sentiment` holds the strings 'positive' / 'negative'
# at this point and is encoded to integers in a later cell.
movies_data = pd.read_csv(
    '/content/drive/MyDrive/seminar/dataset/IMDB Dataset.csv',
)
movies_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace the textual movie labels with integer codes so they line up with the
# numeric `sentiment` column of the other two datasets. In the preview rows,
# 'negative' comes out as 0 and 'positive' as 1.
le = LabelEncoder()
sentiment_codes = le.fit_transform(movies_data['sentiment'])
movies_data['sentiment'] = sentiment_codes
movies_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
# Keep only the first 1000 movie reviews (positional slice), which matches the
# size of the other two review sets.
movies_data = movies_data.iloc[:1000]
movies_data.shape

(1000, 2)

In [8]:
# Restaurant reviews: a tab-separated file; the preview shows `review` and `sentiment` columns.
resturant_data = pd.read_csv(
    '/content/drive/MyDrive/seminar/dataset/Restaurant_Reviews.tsv',
    sep='\t',
)
resturant_data.head()

Unnamed: 0,review,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
# (rows, columns) of the restaurant review frame.
resturant_data.shape

(1000, 2)

In [10]:
# Seed the combined dataset with the Amazon frame. This binds a second name to
# the same DataFrame object; it does not make a copy.
final_data = amazon_data

In [11]:
# Stack the three review sets into a single frame with a fresh 0..n-1 index.
#
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Building from `amazon_data` instead of from
# `final_data` itself also makes the cell idempotent: re-running it no longer
# appends the movie and restaurant rows a second time.
final_data = pd.concat(
    [amazon_data, movies_data, resturant_data],
    ignore_index=True,
)
final_data.shape

(2999, 2)

In [12]:
# `punct` is a single string of ASCII punctuation characters. The cleaning code
# tests `token not in punct`, which on a string is a substring check.
import string 
punct = string.punctuation

In [13]:
# spaCy's English stop-word collection, copied into a list.
# NOTE(review): membership tests against a list are linear in its length; a set
# would be faster if nothing relies on `stopwords` being a list. Confirm before changing.
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [14]:
# Normalise every review in `final_data`: lemmatise with spaCy, lower-case,
# then drop stop words and punctuation.
#
# The reviews are iterated directly instead of indexing `range(0, 2999)`, so the
# loop always covers exactly the rows present. The hard-coded count would
# silently truncate a larger frame or raise KeyError on a smaller one.

# One-off set copy so each stop-word lookup is O(1) rather than a list scan.
stopword_lookup = set(stopwords)

cleaned_review = []
for review in final_data['review']:
    doc = nlp(review)

    # Use the lower-cased, stripped lemma. If the lemma is the "-PRON-"
    # placeholder, fall back to the lower-cased surface form instead.
    tokens = [
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in doc
    ]

    # `tok not in punct` is a substring test against the punctuation string, so
    # it also rejects empty tokens left behind by strip().
    kept = [
        tok for tok in tokens
        if tok not in stopword_lookup and tok not in punct
    ]

    # Every kept token is prefixed with a space, exactly as before, so each
    # cleaned string still starts with a blank.
    cleaned_review.append("".join(" " + tok for tok in kept))

In [15]:
# Spot-check the first eight cleaned reviews.
for idx in range(8):
    print(cleaned_review[idx])

 good case excellent value
 great jawbone
 tie charger conversation 45 minute major problems
 mic great
 jiggle plug line right decent volume
 dozen contact imagine fun send
 razr owner ...
 needless waste money


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the cleaned reviews into a dense document-term count matrix (X) and take
# the sentiment column as the target (y).
cv = CountVectorizer()
X = cv.fit_transform(cleaned_review).toarray()
y = final_data['sentiment']

In [17]:
# Number of labels; should equal the number of cleaned reviews fed to the vectorizer.
y.shape

(2999,)

In [18]:
# Creating a pickle file for the CountVectorizer
#
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so import the standalone `joblib` package and fall back to the vendored
# copy only on old installations where the standalone package is unavailable.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

joblib.dump(cv, 'vect-transform.pkl')



['vect-transform.pkl']

In [19]:
# Model Building
# Hold out 20% of the rows for testing; random_state=0 pins the shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [20]:
# Fit a multinomial Naive Bayes classifier (alpha=0.2) on the training split.
# The trailing `fit` call is also the cell's displayed value.
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB(alpha=0.2)
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

In [21]:
# Persist the fitted classifier to disk, using the `joblib` name imported in an earlier cell.
joblib.dump(classifier, 'final_sentiment_model.pkl')

['final_sentiment_model.pkl']

In [22]:
def predict_sentiment(sample_review):
    """Classify one raw review string with the fitted vectorizer and classifier.

    The text is normalised before vectorising: each spaCy token is replaced by
    its lower-cased, stripped lemma (tokens whose lemma is the "-PRON-"
    placeholder keep their lower-cased surface form), then stop words and
    punctuation are dropped.

    Relies on the notebook-level `nlp`, `stopwords`, `punct`, `cv` and
    `classifier` objects.

    Parameters
    ----------
    sample_review : str
        Raw review text.

    Returns
    -------
    The result of `classifier.predict` for the single-row feature matrix.
    """
    normalised = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in nlp(sample_review)
    ]

    # `word in punct` is a substring test against the punctuation string.
    kept = [
        word for word in normalised
        if not (word in stopwords or word in punct)
    ]

    # Each kept token is prefixed with a space, so the text starts with a blank.
    cleaned_text = "".join(" " + word for word in kept)

    features = cv.transform([cleaned_text]).toarray()
    return classifier.predict(features)

In [23]:
# A few hand-written reviews to sanity-check the trained model.
test_reviews = [
    'This was nice',
    'you have to improve',
    'loved the food',
    'this was not that good',
    'wow this sucks',
    'really bad one'
]

for review_text in test_reviews:
    # The prediction is a one-element result; its truthiness picks the message.
    is_positive = bool(predict_sentiment(review_text))
    print('This is a POSITIVE review.' if is_positive else 'This is a NEGATIVE review!')

This is a POSITIVE review.
This is a NEGATIVE review!
This is a POSITIVE review.
This is a POSITIVE review.
This is a NEGATIVE review!
This is a NEGATIVE review!
