<a href="https://colab.research.google.com/github/xxcramseyxx/NLP_Rest_Reviews/blob/main/natural_language_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
df = pd.read_csv('/content/Restaurant_Reviews.tsv', delimiter= '\t', quoting = 3) #\t makes the tab the seperator quoting=3 ignores quotes

## Cleaning the texts

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
  review = re.sub('^a-zA-Z', ' ', df['Review'][i]) #hat means not in this instance
  review = review.lower()
  review = review.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not') 
  #checks stopwords list returns ones that are not with stemming
  review = [ps.stem(word) for word in review if not word in set()]
  review = ' '.join(review) #' '. applies a space between words in review
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(corpus)

['wow... love thi place.', 'crust is not good.', 'not tasti and the textur wa just nasty.', 'stop by dure the late may bank holiday off rick steve recommend and love it.', 'the select on the menu wa great and so were the prices.', 'now i am get angri and i want my damn pho.', "honeslti it didn't tast that fresh.)", 'the potato were like rubber and you could tell they had been made up ahead of time be kept under a warmer.', 'the fri were great too.', 'a great touch.', 'servic wa veri prompt.', 'would not go back.', 'the cashier had no care what so ever on what i had to say it still end up be wayyy overpriced.', 'i tri the cape cod ravoli, chicken, with cranberry...mmmm!', 'i wa disgust becaus i wa pretti sure that wa human hair.', 'i wa shock becaus no sign indic cash only.', 'highli recommended.', 'waitress wa a littl slow in service.', 'thi place is not worth your time, let alon vegas.', 'did not like at all.', 'the burritto blah!', 'the food, amazing.', 'servic is also cute.', 'i cou

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values

In [None]:
len(X[0])

1500

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy = accuracy_score(y_test,y_pred)
print('Accuracy is: {}'.format(accuracy))

[[50 47]
 [12 91]]
Accuracy is: 0.705
