### December 6, 2017
### Udemy Machine Learning A-Z
### NLP -- for entire dataset

Bag of Words Model

Take all the unique words across all reviews; each unique word gets its own column.

Each row = one review

Each cell = the number of times that column's word appears in that row's review

In [1]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the restaurant reviews.
# The file is tab-separated: a comma delimiter would be ambiguous whenever
# the free-text review itself contains commas.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',   # columns are split on tabs
    quoting=3,  # 3 == csv.QUOTE_NONE: double quotes are treated as ordinary characters
)

In [3]:
dataset.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Tools for cleaning the reviews before building the Bag of Words model:
#   - strip punctuation and other non-letter characters
#   - drop stop words (common words that carry little meaning)
#   - stem each word so different forms of a word share one column
import re # regular expressions, used to strip non-letter characters
import nltk # Natural Language Toolkit: provides the stop-word list and the stemmer
nltk.download('stopwords') # fetch the stop-word lists (reports "already up-to-date" when present)
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # Porter stemming algorithm
ps = PorterStemmer()
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # turns the cleaned corpus into a word-count matrix

[nltk_data] Downloading package stopwords to /Users/tim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# initialize the corpus -- "corpus" is the usual NLP term for a collection of texts
corpus = [] # empty list

In [6]:
# Clean every review and collect the results in `corpus`.
# Per review: replace non-letters with spaces, lower-case, split into words,
# drop English stop words, stem what is left, and re-join into one string.
#
# NOTE(review): the stop-word list removes negations such as "not", so
# "Crust is not good." is reduced to "crust good" -- consider keeping
# negations when the goal is sentiment classification.
n = len(dataset)

# Build the stop-word set once; rebuilding it for every word of every review
# is loop-invariant work that dominates the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

# Rebuild the list from scratch so re-running this cell never appends a
# second copy of every review to an already-filled corpus.
corpus = []
for raw_review in dataset['Review']:
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems)) # one cleaned string per review

In [7]:
corpus[0:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [8]:
# build the Bag of Words matrix: one row per review, one column per unique word
# note: fit_transform yields a sparse matrix; .toarray() converts it to a dense NumPy array
# note: CountVectorizer has its own options for stop words, lower-casing and token patterns,
# note: but the course instructors recommend cleaning the text manually first
X = cv.fit_transform(corpus).toarray()

In [9]:
X.shape

(1000, 1565)

In [10]:
X.shape[0]

1000

In [11]:
X.shape[1]

1565

In [12]:
# cap the vocabulary size with max_features so rarely used words (e.g., "rick" and "steve") are dropped
# NOTE(review): per the scikit-learn docs, max_features keeps the terms with the highest frequency across the corpus
cv = CountVectorizer(max_features = 1500) # done in sequence here; ordinarily this would be set where cv is first created

In [13]:
# rebuild X with the capped vocabulary (replaces the earlier, wider matrix)
X = cv.fit_transform(corpus).toarray()

In [14]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X.shape

(1000, 1500)

In [16]:
# dependent variable: the last column of the dataset ("Liked"), as a NumPy array
y = dataset.iloc[:, -1].values

In [17]:
y[0:5]

array([1, 0, 0, 1, 1])

In [18]:
# create classification model for bag of words matrix
# common models for NLP are Naive Bayes, Decision Trees or Random Forests -- could run all, and compare results

# split data
# note: train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20
from sklearn.model_selection import train_test_split



In [19]:
# hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    random_state=0,
)

In [20]:
# feature scaling -- not necessary here because the features are small word counts (mostly 0s and 1s)
# from sklearn.preprocessing import StandardScaler

In [21]:
# sc = StandardScaler()

In [22]:
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)  # reuse the scaling learned on the training set; refitting on the test set would leak test statistics

In [23]:
# using naive bayes
from sklearn.naive_bayes import GaussianNB

In [24]:
# Gaussian Naive Bayes classifier with default settings
classifier = GaussianNB()

In [25]:
# fit the classifier on the training split only
classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [26]:
# predict labels for the held-out test reviews
y_pred = classifier.predict(X_test)

In [27]:
# confusion matrix: rows are the actual labels, columns are the predicted labels,
# so the diagonal (cm[0, 0] and cm[1, 1]) counts the correct predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [52]:
sum(y_test)

103

In [30]:
cm

array([[55, 42],
       [12, 91]])

In [48]:
import decimal

In [53]:
# accuracy = correct predictions (diagonal of the confusion matrix) / all predictions
correct_predictions = cm[0,0] + cm[1,1]
total_predictions = np.sum(cm)
accuracy = decimal.Decimal(correct_predictions / total_predictions)

In [54]:
print(round(accuracy, 2))

0.73
