# Natural Language Processing

## ======= Import the dataset =========

In [1]:
import pandas as pd

# The reviews file is tab-separated. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text instead of being
# interpreted as field delimiters.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

## ====== Cleaning the dataset =======

In [2]:
import re
import nltk
#nltk.download('stopwords')  # uncomment on first run to fetch the stop-word list
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stemmer and the stop-word set ONCE. The previous version created a
# new PorterStemmer and re-read/re-hashed the whole stop-word list for every
# single word, which is pure loop-invariant overhead.
stemmer = PorterStemmer()
english_stopwords = set(stopwords.words('english'))  # set -> O(1) membership test

# corpus[k] is the cleaned text of the k-th review (first column of `dataset`):
# letters only, lower-cased, stop words removed, remaining words stemmed.
corpus = []
for raw_review in dataset.iloc[:, 0]:
    # Replace every non-letter character with a space so words stay separated.
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
    words = letters_only.lower().split()
    stems = [stemmer.stem(word) for word in words if word not in english_stopwords]
    # Re-join into one space-separated string, the input format CountVectorizer expects.
    corpus.append(' '.join(stems))

## ====== Create the Bag of Words model ======

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Keep only the 1500 most frequent terms across the corpus as features.
# NOTE(review): the vocabulary is fitted on the full corpus, i.e. before the
# data is split into train and test sets, so held-out reviews influence which
# terms are kept. Consider fitting on the training reviews only.
cv = CountVectorizer(max_features=1500)

# Dense document-term count matrix: one row per review, one column per term.
x = cv.fit_transform(corpus).toarray()

# Target values are taken from the second column of the dataset.
y = dataset.iloc[:, 1].values

## ======= Splitting the dataset ============

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

## ======= Utilize different classification models =======

### ======= Naive Bayes =======

In [5]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split.
# NOTE(review): the features are word counts; MultinomialNB is the more usual
# Naive Bayes variant for count data and may be worth comparing.
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.673657, and
 std is 0.049905.

The accuracy for the training set is 0.921250.

The accuracy for the test set is 0.730000.



### ======== Logistic Regression =======

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Fit logistic regression on the training split (seeded for repeatability).
classifier = LogisticRegression(random_state=0)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.774978, and
 std is 0.033634.

The accuracy for the training set is 0.958750.

The accuracy for the test set is 0.710000.





### ======== K Nearest Neighbors =======

In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# k-NN with 8 neighbours; p=2 selects Euclidean distance (Minkowski metric).
classifier = KNeighborsClassifier(n_neighbors=8, p=2)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.662390, and
 std is 0.041453.

The accuracy for the training set is 0.746250.

The accuracy for the test set is 0.595000.



### ====== Linear SVM =======

In [8]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and regularisation strength C=2.
# NOTE(review): decision_function_shape only matters for multi-class problems;
# the labels here look binary (0/1), in which case it has no effect - confirm.
classifier = SVC(
    kernel='linear',
    decision_function_shape='ovo',
    random_state=0,
    C=2,
)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.762430, and
 std is 0.031449.

The accuracy for the training set is 0.986250.

The accuracy for the test set is 0.735000.



### ====== Kernel SVM =======

In [9]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, C=5 and gamma='auto'
# (gamma = 1 / n_features).
# NOTE(review): the output recorded for this cell shows roughly 0.5 accuracy on
# the training set, the test set and in cross-validation, i.e. chance level.
# That suggests the kernel width is badly matched to the unscaled count
# features; try gamma='scale' or a grid search over C/gamma before drawing
# conclusions about kernel SVMs from this result.
classifier = SVC(
    kernel='rbf',
    decision_function_shape='ovo',
    random_state=0,
    C=5,
    gamma='auto',
)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.507486, and
 std is 0.007774.

The accuracy for the training set is 0.511250.

The accuracy for the test set is 0.495000.



### ====== Decision Tree =======

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Single decision tree using the entropy split criterion (seeded for repeatability).
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.705036, and
 std is 0.045112.

The accuracy for the training set is 0.996250.

The accuracy for the test set is 0.710000.



### ====== Random Forest =======

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

# Random forest of 35 trees using the Gini split criterion (seeded for repeatability).
classifier = RandomForestClassifier(
    criterion='gini',
    n_estimators=35,
    random_state=0,
)
classifier.fit(x_train, y_train)

# Predictions on both splits feed the confusion matrices below.
y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

# 10-fold cross-validation on the training split only.
accuracies = cross_val_score(classifier, x_train, y_train, cv=10)
print('The averaged accuracy is %f, and\n std is %f.\n' % (accuracies.mean(), accuracies.std()))

# labels=[0, 1] fixes the row/column order, so the diagonal (trace) counts the
# correctly classified samples.
cm_train = confusion_matrix(y_train, y_pred_train, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
print('The accuracy for the training set is %f.\n' % (cm_train.trace() / len(x_train)))
print('The accuracy for the test set is %f.\n' % (cm_test.trace() / cm_test.sum()))

The averaged accuracy is 0.741272, and
 std is 0.023536.

The accuracy for the training set is 0.996250.

The accuracy for the test set is 0.740000.

