In [1]:
import pandas as pd
import numpy as np

In [2]:
yelp = pd.read_csv('yelp_labelled.txt',names=['review','sentiment'],delimiter='\t')
yelp.head(10)

Unnamed: 0,review,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


In [3]:
yelp.shape

(1000, 2)

In [4]:
yelp.isnull().sum()

review       0
sentiment    0
dtype: int64

In [5]:
yelp['sentiment'].value_counts()

1    500
0    500
Name: sentiment, dtype: int64

In [6]:
import re, string
tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): 
    return tok.sub(r' \1 ', s).split()

In [7]:
yelp

Unnamed: 0,review,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [8]:
y = yelp['sentiment']
X = yelp['review']
print('X : ', X.shape)
print('y: ', y.shape)

X :  (1000,)
y:  (1000,)


In [9]:
from sklearn.model_selection import train_test_split
X_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=4)

In [10]:
X_train.head()
x_test.head()

698    Needless to say, we will never be back here ag...
577         Service was good and the company was better!
763                      But the service was beyond bad.
790    When I received my Pita it was huge it did hav...
520    Oh this is such a thing of beauty, this restau...
Name: review, dtype: object

In [11]:
X_train.value_counts()

I would not recommend this place.                                                                                               2
I love this place.                                                                                                              2
The salad had just the right amount of sauce to not over power the scallop, which was perfectly cooked.                         1
It was so bad, I had lost the heart to finish it.                                                                               1
The patio seating was very comfortable.                                                                                         1
                                                                                                                               ..
We were promptly greeted and seated.                                                                                            1
We ordered some old classics and some new dishes after going there a few times and were so

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
vectorizer_words = TfidfVectorizer(tokenizer=tokenize,analyzer="word",stop_words="english",max_features=10000,
                               ngram_range=(1,3))

In [14]:
vectorizer_char = TfidfVectorizer(tokenizer=tokenize,analyzer="char",stop_words="english",max_features=10000,
                              ngram_range=(1,3))

In [15]:
feature_train_word = vectorizer_words.fit_transform(X_train)
feature_test_word = vectorizer_words.transform(x_test)
# feature_train_char = vectorizer_char.fit_transform(x_test['review'])

In [16]:
feature_train_word

<800x9464 sparse matrix of type '<class 'numpy.float64'>'
	with 14286 stored elements in Compressed Sparse Row format>

In [51]:
from scipy import sparse

In [18]:
# y = yelp['sentiment']
# X = sparse.hstack([feature_word,feature_char])

In [19]:
# from sklearn.model_selection import train_test_split
# X_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=4)

In [17]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(random_state=0)
logistic_model.fit(feature_train_word,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
y_pred = logistic_model.predict(feature_test_word)
# logistic_model.predict_proba(feature_test_word)



In [21]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test,y_pred)
print("Accuracy for Logistic Regression: ", acc)

Accuracy for Logistic Regression:  0.79


In [40]:
from sklearn.ensemble import RandomForestClassifier
random_model = RandomForestClassifier(criterion='entropy',n_estimators=1000)
random_model.fit(feature_train_word,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
y_pred = random_model.predict(feature_test_word)

In [42]:
acc_random = accuracy_score(y_test,y_pred)
print("Accuracy for RandomForest: ", acc_random)

Accuracy for RandomForest:  0.755


In [61]:
from sklearn.ensemble import ExtraTreesClassifier
tree_model = ExtraTreesClassifier(n_estimators=1000)

In [62]:
tree_model.fit(feature_train_word,y_train)
y_pred = tree_model.predict(feature_test_word)
acc_tree = accuracy_score(y_test,y_pred)
print("Accuracy with ExtraTreeClassifier: ", acc_tree)

Accuracy with ExtraTreeClassifier:  0.765
