# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so double
# quotes inside a review are kept as ordinary characters rather than parsed
# as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

## Cleaning the texts

In [4]:
import re
import nltk
nltk.download('stopwords')
# Stop words are very common words (e.g. "the", "a") that do not help prediction.
from nltk.corpus import stopwords
# Stemming maps each word to its root (e.g. "loved" -> "love"), which shrinks the
# vocabulary and therefore the number of feature columns the models must learn.
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word collection once: neither changes between
# reviews, so there is no reason to recreate them inside the loop.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# Keep "not" in the text; removing it would erase negation ("not good" -> "good").
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # set membership is O(1) per word

corpus = []
# Iterate over every row instead of a hard-coded range(0, 1000), so the corpus
# always has exactly one entry per row of `dataset`, whatever the file size.
for raw_review in dataset['Review']:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    # Drop stop words, then stem what remains.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Show the cleaned reviews (lower-cased, stop words removed, stemmed).
# NOTE(review): this prints the entire corpus; a slice such as corpus[:10]
# would keep the saved notebook output short.
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

## Creating the Bag of Words model

In [11]:
# Tokenization: every distinct word found across the reviews becomes a column,
# and each row stores how often that word occurs in the corresponding review.
# The result is a sparse matrix, i.e. a matrix whose entries are mostly 0.
# NOTE(review): the vocabulary is fitted on the full corpus before the
# train/test split, so test-set words influence the features - confirm this
# is acceptable for the evaluation.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)  # keep at most 1500 vocabulary columns
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()  # dense array, as used by the cells below

# The label is the last column of the dataset.
y = dataset.iloc[:, -1].values

In [12]:
# Number of feature columns (vocabulary size) in the bag-of-words matrix.
X.shape[1]

1500

## Splitting the dataset into the Training set and Test set

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Naive Bayes model and other classifiers on the Training set

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, fitted on the training split.
classifier_logistic = LogisticRegression(
    random_state=0,
)
classifier_logistic.fit(X=X_train, y=y_train)

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=5; Minkowski distance with p=2 is Euclidean distance.
classifier_knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier_knn.fit(X=X_train, y=y_train)

In [21]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel, fitted on the training split.
classifier_svc = SVC(
    kernel='linear',
    random_state=0,
)
classifier_svc.fit(X=X_train, y=y_train)

In [22]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the training split.
classifier_kernel_svc = SVC(
    kernel='rbf',
    random_state=0,
)
classifier_kernel_svc.fit(X=X_train, y=y_train)

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree that chooses splits by the entropy criterion; seeded for repeatability.
classifier_dt = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
classifier_dt.fit(X=X_train, y=y_train)

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 entropy-criterion trees; seeded for repeatability.
classifier_rf = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
classifier_rf.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [25]:
# Predict the test split with every fitted model. The order of the models
# here must stay aligned with the order of the target names below.
fitted_models = (
    classifier,
    classifier_logistic,
    classifier_knn,
    classifier_svc,
    classifier_kernel_svc,
    classifier_dt,
    classifier_rf,
)
(
    y_pred,
    y_pred_logis,
    y_pred_knn,
    y_pred_svc,
    y_pred_kernel_svc,
    y_pred_dt,
    y_pred_rf,
) = (model.predict(X_test) for model in fitted_models)

## Making the Confusion Matrix

In [37]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix for the Naive Bayes predictions: rows are the true
# classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Print each summary metric for the same predictions, one per line.
metric_table = (
    ('accuracy: ', accuracy_score),
    ('f1 ', f1_score),
    ('precision ', precision_score),
    ('recall ', recall_score),
)
for label, metric in metric_table:
    print(label + str(metric(y_test, y_pred)))

[[55 42]
 [12 91]]
accuracy: 0.73
f1 0.7711864406779663
precision 0.6842105263157895
recall 0.883495145631068


In [29]:
# Test-set accuracy of the logistic regression model.
accuracy_score(y_true=y_test, y_pred=y_pred_logis)

0.775

In [30]:
# Test-set accuracy of the k-nearest-neighbours model.
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

0.645

In [31]:
# Test-set accuracy of the linear-kernel SVC.
accuracy_score(y_true=y_test, y_pred=y_pred_svc)

0.79

In [32]:
# Test-set accuracy of the RBF-kernel SVC.
accuracy_score(y_true=y_test, y_pred=y_pred_kernel_svc)

0.78

In [33]:
# Test-set accuracy of the decision tree.
accuracy_score(y_true=y_test, y_pred=y_pred_dt)

0.75

In [34]:
# Test-set accuracy of the random forest.
accuracy_score(y_true=y_test, y_pred=y_pred_rf)

0.725