## In this notebook we will use classification models to predict whether a reviewer liked a restaurant solely on the basis of their review, and then measure how good those predictions are
**The data file has 1000 reviews, and for each review there is a Liked value indicating whether the reviewer liked the restaurant or not. We will use 800 reviews to train our models and then use the remaining 200 reviews to see how good the predictions are.**

In [63]:
library(tm)
library(SnowballC)
library(caTools)
library(randomForest)
library(e1071)

In [9]:
# Load the tab-separated reviews file.
# quote = "" turns off quote handling so quote characters inside a review are
# read literally; stringsAsFactors = FALSE keeps the review text as character.
dataset_original <- read.delim(
  "Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)

In [11]:
# Peek at the first rows of the raw data (Review text and Liked label)
head(dataset_original, n = 6L)

Review,Liked
Wow... Loved this place.,1
Crust is not good.,0
Not tasty and the texture was just nasty.,0
Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.,1
The selection on the menu was great and so were the prices.,1
Now I am getting angry and I want my damn pho.,0


In [21]:
# before cleanup
# Read the raw text straight from the data frame: `corpus` is only created in
# the "Clean text" cell further down, so indexing it here fails when the
# notebook is run top to bottom.
paste0("row 1:", dataset_original$Review[1])
paste0("row 841:", dataset_original$Review[841])

In [24]:
# Clean text
# Build a corpus with one document per entry of the Review column
corpus <- VCorpus(VectorSource(dataset_original$Review))
# Lowercase everything (tolower is wrapped in content_transformer() before
# being handed to tm_map)
corpus <- tm_map(corpus, content_transformer(tolower))
# Drop digits such as 1, 50, etc.
corpus <- tm_map(corpus, removeNumbers)
# Drop punctuation such as ',', '...', ':', etc.
corpus <- tm_map(corpus, removePunctuation)
# Drop stop words (i, me, we, it, have, can't, ...) using the default list
corpus <- tm_map(corpus, removeWords, stopwords())
# Extra stop words of your own can be removed the same way, e.g.:
# corpus <- tm_map(corpus, removeWords, c('ours', 'he'))
# Stem each word to its root (e.g. loved -> love)
corpus <- tm_map(corpus, stemDocument)
# Collapse runs of whitespace
corpus <- tm_map(corpus, stripWhitespace)

In [26]:
# after cleanup: show the same two reviews once the corpus has been processed
paste0("row 1:", as.character(corpus[[1]]))
paste0("row 841:", as.character(corpus[[841]]))

In [31]:
# Bag of Words model: a document-term matrix built from the cleaned corpus
dtm <- DocumentTermMatrix(corpus)


<<DocumentTermMatrix (documents: 1000, terms: 1577)>>
Non-/sparse entries: 5435/1571565
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)

List of 6
 $ i       : int [1:5435] 1 1 1 2 2 3 3 3 3 4 ...
 $ j       : int [1:5435] 800 1032 1557 323 589 746 904 1362 1374 90 ...
 $ v       : num [1:5435] 1 1 1 1 1 1 1 1 1 1 ...
 $ nrow    : int 1000
 $ ncol    : int 1577
 $ dimnames:List of 2
  ..$ Docs : chr [1:1000] "1" "2" "3" "4" ...
  ..$ Terms: chr [1:1577] "absolut" "absolutley" "accid" "accommod" ...
 - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
 - attr(*, "weighting")= chr [1:2] "term frequency" "tf"


In [33]:
# The result is a very large, sparse matrix (almost every entry is 0)
print(dtm)
str(dtm)
dim(dtm)  # 1000 rows (reviews) by 1577 columns (terms)

<<DocumentTermMatrix (documents: 1000, terms: 1577)>>
Non-/sparse entries: 5435/1571565
Sparsity           : 100%
Maximal term length: 32
Weighting          : term frequency (tf)
List of 6
 $ i       : int [1:5435] 1 1 1 2 2 3 3 3 3 4 ...
 $ j       : int [1:5435] 800 1032 1557 323 589 746 904 1362 1374 90 ...
 $ v       : num [1:5435] 1 1 1 1 1 1 1 1 1 1 ...
 $ nrow    : int 1000
 $ ncol    : int 1577
 $ dimnames:List of 2
  ..$ Docs : chr [1:1000] "1" "2" "3" "4" ...
  ..$ Terms: chr [1:1577] "absolut" "absolutley" "accid" "accommod" ...
 - attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
 - attr(*, "weighting")= chr [1:2] "term frequency" "tf"


In [34]:
# Drop the rarest terms to shrink the matrix.
# The second argument is the maximal allowed sparsity: with 0.999 a term is
# kept only if it occurs in more than 0.1% of the reviews.
dtm <- removeSparseTerms(dtm, 0.999)
dtm
dim(dtm)
# the column count drops to 691, but sparsity is still very high (99%)

<<DocumentTermMatrix (documents: 1000, terms: 691)>>
Non-/sparse entries: 4549/686451
Sparsity           : 99%
Maximal term length: 12
Weighting          : term frequency (tf)

In [35]:
# The classifiers below need a data frame, so convert the term matrix
dataset <- as.data.frame(as.matrix(dtm))
# Attach the dependent variable (Liked) taken from the original data frame
dataset[["Liked"]] <- dataset_original[["Liked"]]

In [36]:
# Preview the term-count table: it is sparse, almost every entry is 0
head(dataset, n = 6L)

absolut,acknowledg,actual,ago,almost,also,although,alway,amaz,ambianc,...,wow,wrap,wrong,year,yet,youd,your,yummi,zero,Liked
0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Encode the target feature as a two-level factor (0, 1)
dataset[["Liked"]] <- factor(dataset[["Liked"]], levels = c(0, 1))

In [39]:
# Splitting the dataset into the Training set and Test set
set.seed(123)  # fixed seed so the split is reproducible
# One logical flag per row; SplitRatio = 0.8 marks 80% of the rows as TRUE
split <- sample.split(dataset$Liked, SplitRatio = 0.8)
# Index rows directly instead of subset(dataset, split == TRUE): subset()
# evaluates its condition inside the data frame first, so a term column that
# happens to be called "split" would silently replace this logical vector.
training_set <- dataset[split, ]
test_set <- dataset[!split, ]

In [71]:
# Fitting Random Forest Classification to the Training set
# Predictors are every column except the target. Selecting them by name
# (rather than dropping the hard-coded position 692) keeps this correct when
# the number of term columns changes, e.g. with a different sparsity cutoff.
classifier <- randomForest(
  x = training_set[names(training_set) != "Liked"],
  y = training_set$Liked,
  ntree = 10
)

In [72]:
# Predicting the Test set results
# Drop the target by name rather than by the hard-coded position 692, so the
# call stays correct if the number of term columns changes.
y_pred <- predict(classifier, newdata = test_set[names(test_set) != "Liked"])

In [73]:
# Making the Confusion Matrix (rows: actual Liked, columns: predicted)
# The actual labels are looked up by name, not by the hard-coded column 692,
# so the table stays correct if the number of term columns changes.
cm <- table(test_set[["Liked"]], y_pred)
cm

   y_pred
     0  1
  0 66 34
  1 24 76

In [61]:
# Evaluation
# cm has the actual labels in rows and the predictions in columns
# (both ordered 0, 1), so:
TP <- cm[2, 2]  # True Positives:  actual 1, predicted 1
TN <- cm[1, 1]  # True Negatives:  actual 0, predicted 0
FP <- cm[1, 2]  # False Positives: actual 0, predicted 1
FN <- cm[2, 1]  # False Negatives: actual 1, predicted 0

# Accuracy  = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
paste0("Accuracy: ", (TP + TN) / (TP + TN + FP + FN))
paste0("Precision: ", TP / (TP + FP))
paste0("Recall: ", TP / (TP + FN))

**With Naive Bayes Classification**

In [74]:
# Fitting Naive Bayes Classification to the Training set
# Predictors are every column except the target, selected by name rather than
# by the hard-coded position 692 so a change in the number of term columns
# cannot leak the target or drop a real feature.
# NOTE(review): the predictors are numeric term counts, which e1071's
# naiveBayes treats as metric (Gaussian) variables - possibly why nearly every
# test review ends up predicted as 1. Confirm, and consider encoding the
# counts as factors.
nbclassifier <- naiveBayes(
  x = training_set[names(training_set) != "Liked"],
  y = training_set$Liked
)

In [75]:
# Predicting the Test set results
# Drop the target by name rather than by the hard-coded position 692, so the
# call stays correct if the number of term columns changes.
y_pred <- predict(nbclassifier, newdata = test_set[names(test_set) != "Liked"])

In [76]:
# Making the Confusion Matrix (rows: actual Liked, columns: predicted)
# The actual labels are looked up by name, not by the hard-coded column 692,
# so the table stays correct if the number of term columns changes.
cm <- table(test_set[["Liked"]], y_pred)
cm

   y_pred
     0  1
  0  5 95
  1  4 96

In [77]:
# Evaluation
# cm has the actual labels in rows and the predictions in columns
# (both ordered 0, 1), so:
TP <- cm[2, 2]  # True Positives:  actual 1, predicted 1
TN <- cm[1, 1]  # True Negatives:  actual 0, predicted 0
FP <- cm[1, 2]  # False Positives: actual 0, predicted 1
FN <- cm[2, 1]  # False Negatives: actual 1, predicted 0

# Accuracy  = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
paste0("Accuracy: ", (TP + TN) / (TP + TN + FP + FN))
paste0("Precision: ", TP / (TP + FP))
paste0("Recall: ", TP / (TP + FN))

**With Support Vector Machine Classification**

In [79]:
# Fitting a linear-kernel SVM classifier to the Training set.
# The formula Liked ~ . uses every other column of training_set as a predictor.
# Expect a "Cannot scale data" warning for term columns that are constant
# within the training rows.
svmclassifier <- svm(
  formula = Liked ~ .,
  data = training_set,
  type = "C-classification",
  kernel = "linear"
)

"Variable(s) 'boot' and 'brick' and 'eye' and 'given' and 'legit' and 'mall' and 'oven' and 'peanut' and 'pure' and 'scallop' and 'show' and 'tap' constant. Cannot scale data."

In [81]:
# Predicting the Test set results
# Drop the target by name rather than by the hard-coded position 692, so the
# call stays correct if the number of term columns changes.
y_pred <- predict(svmclassifier, newdata = test_set[names(test_set) != "Liked"])

In [82]:
# Making the Confusion Matrix (rows: actual Liked, columns: predicted)
# The actual labels are looked up by name, not by the hard-coded column 692,
# so the table stays correct if the number of term columns changes.
cm <- table(test_set[["Liked"]], y_pred)
cm

   y_pred
     0  1
  0 78 22
  1 19 81

In [83]:
# Evaluation
# cm has the actual labels in rows and the predictions in columns
# (both ordered 0, 1), so:
TP <- cm[2, 2]  # True Positives:  actual 1, predicted 1
TN <- cm[1, 1]  # True Negatives:  actual 0, predicted 0
FP <- cm[1, 2]  # False Positives: actual 0, predicted 1
FN <- cm[2, 1]  # False Negatives: actual 1, predicted 0

# Accuracy  = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall    = TP / (TP + FN)
paste0("Accuracy: ", (TP + TN) / (TP + TN + FP + FN))
paste0("Precision: ", TP / (TP + FP))
paste0("Recall: ", TP / (TP + FN))