In [4]:
import pandas as pd

# read file into pandas using a relative path
path = "D:\ML Internship\yelp_labelled.txt"
data = pd.read_table(path, header=None, names=['reviews', 'rating'])

In [5]:
data.shape

(1000, 2)

In [6]:
data.head(5)

Unnamed: 0,reviews,rating
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
data.tail(5)

Unnamed: 0,reviews,rating
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [8]:
data.rating.value_counts()

1    500
0    500
Name: rating, dtype: int64

In [9]:
#Creating data frames

X = data.reviews
y = data.rating

print(X.shape)
print(y.shape)

(1000,)
(1000,)


In [10]:
X.head()

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
4    The selection on the menu was great and so wer...
Name: reviews, dtype: object

In [11]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(750,)
(250,)
(750,)
(250,)


In [12]:
#Part 4: Vectorizing our dataset

# import and instantiate the vectorizer
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [13]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
#Creating document-term matrix
X_train_dtm = vect.transform(X_train)

In [15]:
# examine the document-term matrix
X_train_dtm

<750x1711 sparse matrix of type '<class 'numpy.int64'>'
	with 7315 stored elements in Compressed Sparse Row format>

In [16]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
X_test_dtm

<250x1711 sparse matrix of type '<class 'numpy.int64'>'
	with 2120 stored elements in Compressed Sparse Row format>

In [17]:
#examine the vocabulary and document-term matrix together
pd.DataFrame(X_train_dtm.toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,10,100,11,12,17,1979,20,23,30,...,yellowtail,yelpers,yet,you,your,yourself,yucky,yukon,yummy,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [19]:
# train the model using X_train_dtm (timing it with an IPython "magic command")
%time nb.fit(X_train_dtm, y_train)

Wall time: 2.99 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [20]:
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

In [23]:
# calculate accuracy of class predictions
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.768

In [24]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[104,  30],
       [ 28,  88]], dtype=int64)

In [None]:
nb.predict_proba(X_test_dtm)

In [27]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated)
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([2.07596582e-01, 4.12526407e-03, 2.89432320e-02, 9.99990846e-01,
       5.11404489e-01, 5.56352165e-01, 9.57838071e-01, 5.80758884e-01,
       4.70653252e-01, 9.98371925e-01, 7.42687222e-02, 8.71852821e-01,
       1.83207207e-01, 9.41249903e-01, 5.35117408e-01, 5.31590351e-02,
       1.20620780e-01, 5.13572844e-01, 4.71315543e-01, 9.00400790e-01,
       9.59095073e-01, 9.96206877e-01, 9.22052019e-01, 6.08324846e-01,
       5.66733678e-05, 7.63012634e-02, 1.79241914e-01, 3.49673970e-01,
       9.41756674e-01, 3.97895311e-01, 8.78016466e-01, 5.26923982e-02,
       7.42687222e-02, 9.88898222e-01, 4.40990253e-02, 9.67131067e-01,
       8.46871259e-01, 9.98945384e-01, 4.83149972e-01, 8.41414786e-01,
       9.99996971e-01, 7.68673229e-03, 7.27939161e-01, 1.76973218e-01,
       6.53428251e-01, 6.44769819e-01, 9.99999221e-01, 1.25807772e-02,
       2.85767751e-01, 9.42090675e-02, 3.90181409e-02, 4.30363008e-01,
       9.28060460e-01, 5.64516906e-01, 4.97324893e-01, 7.34108749e-01,
      

In [85]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.8844570252187339

In [29]:
#Part 2: Comparing models
#We will compare multinomial Naive Bayes with logistic regression:

# import and instantiate a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [30]:
# train the model using X_train_dtm
%time logreg.fit(X_train_dtm, y_train)

Wall time: 892 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_dtm)

In [32]:
# calculate accuracy
metrics.accuracy_score(y_test, y_pred_class)

0.82

In [33]:
# print the confusion matrix
metrics.confusion_matrix(y_test, y_pred_class)

array([[109,  25],
       [ 20,  96]], dtype=int64)

In [34]:
# calculate predicted probabilities for X_test_dtm (well calibrated)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob

array([0.36560035, 0.19446008, 0.12653627, 0.99811061, 0.25735859,
       0.56038747, 0.76714332, 0.07915086, 0.77760802, 0.91952515,
       0.11705075, 0.68679827, 0.41810631, 0.86849379, 0.55265883,
       0.25401342, 0.17206028, 0.48647035, 0.54546079, 0.76357181,
       0.85634909, 0.94856847, 0.90128551, 0.86542579, 0.05508935,
       0.21829674, 0.60131293, 0.31363303, 0.71446229, 0.39345453,
       0.7148726 , 0.14379517, 0.11705075, 0.83016978, 0.37913695,
       0.97124434, 0.69461923, 0.95707384, 0.38187942, 0.52917116,
       0.9971229 , 0.11599901, 0.62011382, 0.58067921, 0.60605962,
       0.61463107, 0.99972336, 0.06856227, 0.41092091, 0.39554249,
       0.11372506, 0.65074487, 0.85805172, 0.40706858, 0.51648993,
       0.65287615, 0.29106499, 0.05675191, 0.63552377, 0.4981988 ,
       0.35089484, 0.01336805, 0.21237651, 0.64621877, 0.96969699,
       0.26666157, 0.94644058, 0.21874037, 0.79045566, 0.37576922,
       0.36650533, 0.43432987, 0.24046208, 0.29189707, 0.92743

In [35]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.8844570252187339

In [39]:
#Part 3: Examining a model for further insight
#We will examine the our trained Naive Bayes model to calculate the approximate "goodness" of each token.

In [40]:
# store the vocabulary of X_train
X_train_tokens = vect.get_feature_names()
len(X_train_tokens)

1711

In [41]:
# examine the first 50 tokens
print(X_train_tokens[0:50])

['00', '10', '100', '11', '12', '17', '1979', '20', '23', '30', '30s', '35', '40', '45', '4ths', '5lb', '70', '85', '90', '99', 'about', 'above', 'absolutely', 'absolutley', 'accident', 'accommodations', 'accomodate', 'accordingly', 'accountant', 'acknowledged', 'actually', 'added', 'affordable', 'after', 'afternoon', 'again', 'ago', 'ahead', 'airline', 'airport', 'ala', 'albondigas', 'all', 'allergy', 'almonds', 'almost', 'alone', 'also', 'although', 'always']


In [42]:
# examine the last 50 tokens
print(X_train_tokens[-50:])

['why', 'wienerschnitzel', 'wife', 'wildly', 'will', 'wine', 'wines', 'wings', 'winner', 'wire', 'with', 'without', 'witnessed', 'won', 'wonderful', 'wontons', 'word', 'words', 'work', 'worker', 'working', 'world', 'worries', 'worse', 'worst', 'worth', 'would', 'wouldn', 'wound', 'wow', 'wrap', 'wrapped', 'writing', 'wrong', 'ya', 'yama', 'yay', 'year', 'years', 'yellow', 'yellowtail', 'yelpers', 'yet', 'you', 'your', 'yourself', 'yucky', 'yukon', 'yummy', 'zero']


In [43]:
# Naive Bayes counts the number of times each token appears in each class
nb.feature_count_

array([[1., 3., 1., ..., 0., 1., 5.],
       [0., 0., 1., ..., 1., 2., 0.]])

In [44]:
# rows represent classes, columns represent tokens
nb.feature_count_.shape

(2, 1711)

In [51]:
# number of times each token appears across all HAM messages
bad_token_count = nb.feature_count_[0, :]
bad_token_count

array([1., 3., 1., ..., 0., 1., 5.])

In [52]:
# number of times each token appears across all SPAM messages
good_token_count = nb.feature_count_[1, :]
good_token_count

array([0., 0., 1., ..., 1., 2., 0.])

In [74]:
# create a DataFrame of tokens with their separate ham and spam counts
tokens = pd.DataFrame({'token':X_train_tokens, 'bad':bad_token_count, 'good':good_token_count}).set_index('token')
tokens.head()

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.0,0.0
10,3.0,0.0
100,1.0,1.0
11,1.0,0.0
12,2.0,0.0


In [75]:
# examine 5 random DataFrame rows
tokens.sample(5, random_state=5)

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
signs,1.0,0.0
as,12.0,15.0
fresh,3.0,10.0
bamboo,1.0,0.0
app,0.0,1.0


In [76]:
# Naive Bayes counts the number of observations in each class
nb.class_count_

array([366., 384.])

In [77]:
# add 1 to ham and spam counts to avoid mathematical errors like dividing by zero.
tokens['bad'] = tokens.bad + 1
tokens['good'] = tokens.good + 1
tokens.sample(5, random_state=5)

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
signs,2.0,1.0
as,13.0,16.0
fresh,4.0,11.0
bamboo,2.0,1.0
app,1.0,2.0


In [78]:
# convert the bad and good counts into frequencies
tokens['bad'] = tokens.bad / nb.class_count_[0]
tokens['good'] = tokens.good / nb.class_count_[1]
tokens.sample(5, random_state=5)

Unnamed: 0_level_0,bad,good
token,Unnamed: 1_level_1,Unnamed: 2_level_1
signs,0.005464,0.002604
as,0.035519,0.041667
fresh,0.010929,0.028646
bamboo,0.005464,0.002604
app,0.002732,0.005208


In [79]:
# calculate the ratio of good-to-bad for each token
tokens['good_ratio'] = tokens.good / tokens.bad
tokens.sample(5, random_state=5)

Unnamed: 0_level_0,bad,good,good_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
signs,0.005464,0.002604,0.476562
as,0.035519,0.041667,1.173077
fresh,0.010929,0.028646,2.621094
bamboo,0.005464,0.002604,0.476562
app,0.002732,0.005208,1.90625


In [81]:
# examine the DataFrame sorted by good_ratio
# note: use sort() instead of sort_values() for pandas 0.16.2 and earlier
tokens.sort_values('good_ratio', ascending=False)

Unnamed: 0_level_0,bad,good,good_ratio
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
great,0.002732,0.125000,45.750000
delicious,0.002732,0.052083,19.062500
awesome,0.002732,0.028646,10.484375
fantastic,0.002732,0.028646,10.484375
perfect,0.002732,0.023438,8.578125
loved,0.002732,0.023438,8.578125
spot,0.002732,0.020833,7.625000
happy,0.002732,0.020833,7.625000
excellent,0.002732,0.020833,7.625000
town,0.002732,0.020833,7.625000


In [84]:
# look up the good_ratio for a given token
tokens.loc['great', 'good_ratio']

45.75