# Natural Language Processing


Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

 Importing the dataset

In [None]:
# The reviews file is tab-separated. quoting=3 (csv.QUOTE_NONE) tells the parser to
# treat double quotes as ordinary characters, so quotes inside a review cannot break parsing.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=3)

In [None]:
# Quick overview of the dataset dimensions.
n_rows, n_cols = dataset.shape  # shape is (rows, columns)
print("Total number of columns:", n_cols, "\nTotal number of rows:", n_rows)

In [None]:
# Preview the first ten rows of the dataset.
dataset.head(10)



Cleaning the texts


In [None]:
import re  # regular expressions, used to strip non-letter characters from each review
import nltk  # NLTK supplies the English stopword list
nltk.download('stopwords')  # download the stopword corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces a word to its stem, e.g. "loved" -> "love"

# Build the loop-invariant helpers once instead of once per review (or per word).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')  # common words that carry little signal (a, and, the, ...)
all_stopwords.remove('not')  # keep "not": negation changes the meaning of a review
stopword_set = set(all_stopwords)  # a set gives constant-time membership tests

corpus = []  # cleaned reviews, in the same order as the rows of the dataset
for raw_review in dataset['Review']:  # iterate the values directly; no dependence on index labels
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace every non-letter character with a space
    review = review.lower()  # normalise case
    review = review.split()  # split into a list of words
    review = [ps.stem(word) for word in review if word not in stopword_set]  # drop stopwords, stem the rest
    review = ' '.join(review)  # re-assemble the cleaned words into a single string
    corpus.append(review)

In [None]:
# Show only the first few cleaned reviews; dumping the whole list floods the notebook output.
corpus[:10]


Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at the 1500 most frequent tokens; rarer tokens
# (e.g. names or one-off words) are left out of the feature matrix.
cv = CountVectorizer(max_features=1500)
word_counts = cv.fit_transform(corpus)  # sparse document-term count matrix
X = word_counts.toarray()  # dense array: one row per review, one column per kept token
y = dataset.iloc[:, 1].values  # labels are read from the second column of the dataset

In [None]:
# Inspect the bag-of-words feature matrix.
X

In [None]:
# Inspect the label vector taken from the dataset's second column.
y

Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=0,
)


In [None]:
# Report how many samples ended up in each split (features and labels should match).
n_train_x, n_train_y = len(X_train), len(y_train)
n_test_x, n_test_y = len(X_test), len(y_test)
print("The sizes of the train set, x and y are,", n_train_x, ",", n_train_y, ', respectively')
print("The sizes of the test set, x and y are,", n_test_x, ",", n_test_y, ', respectively')

## Fitting Naive Bayes to the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes model and train it on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

Predicting the Test set results

In [None]:
# Predict a label for every review in the held-out test split.
y_pred = classifier.predict(X=X_test)

In [None]:
# Side-by-side view: first column is the predicted label, second column is the true label.
print(np.column_stack((y_pred, y_test)))

Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)

## Using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Train a logistic regression model on the training split (fit returns the fitted estimator).
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Using K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# K-nearest neighbours with k=5; the Minkowski metric with p=2 is the Euclidean distance.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Using SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Support vector classifier; kernel='linear' gives a linear decision boundary.
classifier = SVC(kernel='linear', random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Using Kernel SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Support vector classifier with the Gaussian (RBF) kernel for a non-linear decision boundary.
classifier = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Using Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Decision tree that chooses splits by entropy; random_state fixes the tie-breaking.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Random forest of 10 entropy-based trees; random_state makes the result reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict the labels of the held-out test split.
y_pred = classifier.predict(X_test)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews that were classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Prediction Accuracy:", accuracy)


## Final Words

With a test-set accuracy of 0.79 (79%), the model built with the SVM classifier using a linear kernel (kernel = 'linear') performed best. The second best was the SVM classifier using the RBF kernel (kernel = 'rbf').
