In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the labelled reviews (tab-separated; columns: id, sentiment, review).
# NOTE(review): the preview below shows a review that starts with a stray
# backslash (\The Classic War of the Worlds\"), which suggests escaped quotes in
# the file are not being parsed as intended. Consider quoting=3 (csv.QUOTE_NONE)
# or escapechar='\\' -- confirm against the raw file before changing.
data = pd.read_csv('labeledTrainData.tsv', sep='\t')
data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# Row counts per sentiment label, to check how balanced the two classes are.
data.groupby('sentiment').count()

Unnamed: 0_level_0,id,review
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12500,12500
1,12500,12500


In [4]:
# Features are the raw review text; the target is the 0/1 sentiment label.
df_x = data['review']
df_y = data['sentiment']
df_x.head()

0    With all this stuff going down at the moment w...
1    \The Classic War of the Worlds\" by Timothy Hi...
2    The film starts with a manager (Nicholas Bell)...
3    It must be assumed that those who praised this...
4    Superbly trashy and wondrously unpretentious 8...
Name: review, dtype: object

In [5]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=0,
)

In [6]:
from nltk.tokenize import RegexpTokenizer

# Tokenise on runs of ASCII letters/digits, so punctuation and markup characters
# never become part of a token.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# A custom tokenizer replaces CountVectorizer's own token_pattern. Passing
# token_pattern=None states that explicitly and avoids scikit-learn's
# "token_pattern will not be used" warning on recent versions; the extracted
# features are unchanged.
cv = CountVectorizer(
    stop_words='english',
    tokenizer=token.tokenize,
    token_pattern=None,
)

# Learn the vocabulary on the training split only, then reuse it for the test split.
x_traincv = cv.fit_transform(x_train)
x_testcv = cv.transform(x_test)
x_traincv.shape

(20000, 67759)

In [7]:
# Multinomial naive Bayes baseline on the word-count features; the last line
# reports its score on the held-out split.
mnb = MultinomialNB().fit(x_traincv, y_train)
mnb.score(x_testcv, y_test)

0.868

In [8]:
# Logistic regression on the same word-count features.
# The solver is pinned because scikit-learn changed the default from 'liblinear'
# to 'lbfgs' in 0.22; relying on the default makes the fitted model (and the
# warnings emitted while fitting) depend on the installed version.
log = LogisticRegression(solver='liblinear')
log.fit(x_traincv, y_train)
log.score(x_testcv, y_test)



0.8772

In [9]:
# 5-fold cross-validation of the logistic-regression model on the training split.
scores = cross_val_score(
    log,
    x_traincv,
    y_train,
    cv=5,
)




In [10]:
# Show the per-fold scores returned by cross_val_score.
print(scores)

[0.87803049 0.88127968 0.87675    0.87621905 0.87496874]


In [11]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the logistic-regression predictions on the held-out split
# (scikit-learn convention: rows are true labels, columns are predicted labels).
pred = log.predict(x_testcv)
confusion = confusion_matrix(y_test, pred)
print("Confusion matrix:\n {}".format(confusion))

Confusion matrix:
 [[2244  304]
 [ 278 2174]]


In [17]:
# Sanity check: classify one hand-written review with both models.
# The review is vectorised with `cv`, so `log` and `mnb` must be the models that
# were fitted on `cv` features (later cells rebind both names to models trained
# on a different vocabulary).
# Another example to try:
#   'This movie is a bit boring but there are some moments that touched my heart'
review = 'This movie is fantastic'
reviewcv = cv.transform([review])

log_pred = log.predict(reviewcv)
mnb_pred = mnb.predict(reviewcv)

print("logistic regression result: {}".format(log_pred))
print("multinomial nb result: {}".format(mnb_pred))

logistic regression result: [1]
multinomial nb result: [1]


In [22]:
import spacy
import en_core_web_sm
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Characters treated as punctuation when filtering tokens.
punctuations = string.punctuation

# spaCy's built-in English stop-word collection (imported above as STOP_WORDS).
stop_words = STOP_WORDS

# Bare English language object: no trained model is loaded in this cell.
# spacy_tokenizer calls it to turn a review into tokens.
parser = English()

# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """Tokenise ``sentence`` into normalised tokens for a bag-of-words model.

    Each token produced by the module-level ``parser`` is replaced by its
    lower-cased, whitespace-stripped lemma. When no lemma is available (an
    empty ``lemma_``, as produced by pipelines without a lemmatizer) or the
    lemma is the ``"-PRON-"`` placeholder, the lower-cased token text is used
    instead. Empty tokens, stop words and punctuation are then dropped.

    Args:
        sentence: Text to tokenise.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    normalised = []
    for word in parser(sentence):
        lemma = word.lemma_
        # Fall back to the surface form instead of emitting "" for every token
        # when the pipeline has not assigned lemmas; otherwise the whole
        # document would be filtered away.
        if not lemma or lemma == "-PRON-":
            text = word.lower_
        else:
            text = lemma.lower()
        normalised.append(text.strip())

    # `punctuations` is a plain string, so `in` is a substring test: it drops
    # single punctuation characters (and any token that happens to be a
    # contiguous slice of that string). Empty tokens are filtered explicitly
    # rather than relying on "" being a substring of every string.
    return [
        tok
        for tok in normalised
        if tok and tok not in stop_words and tok not in punctuations
    ]

In [23]:
# Bag-of-words vectoriser driven by the spaCy tokenizer (unigrams only).
# token_pattern=None makes explicit that the built-in pattern is unused when a
# custom tokenizer is supplied, and avoids scikit-learn's warning about it on
# recent versions; the extracted features are unchanged.
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
    token_pattern=None,
)

In [24]:

# Re-vectorise both splits with the spaCy-based vocabulary.
# Note: this rebinds x_traincv / x_testcv, replacing the matrices built with `cv`.
x_traincv = bow_vector.fit_transform(x_train)
x_testcv = bow_vector.transform(x_test)
x_traincv.shape

(20000, 109270)

In [25]:
# Refit the naive Bayes baseline on the spaCy-tokenised features and score it
# on the held-out split. This rebinds `mnb`.
mnb = MultinomialNB().fit(x_traincv, y_train)
mnb.score(x_testcv, y_test)

0.87

In [26]:
# Refit logistic regression on the spaCy-tokenised features. This rebinds `log`.
# The solver is pinned because scikit-learn changed the default from 'liblinear'
# to 'lbfgs' in 0.22; relying on the default makes the fitted model (and the
# warnings emitted while fitting) depend on the installed version.
log = LogisticRegression(solver='liblinear')
log.fit(x_traincv, y_train)
log.score(x_testcv, y_test)



0.8778