# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting=3)

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleaning the texts

In [4]:
import re         # Support for regular expressions (RE)
import nltk       # Natural Language Toolkit

print('Reviews before cleaning\n')
for i in range(5):
    print(i+1,'->',dataset['Review'][i])

# Downloading Stop Words 
# A stop word is a commonly used word (such as “the”, “a”, “an”, “in”)
nltk.download('stopwords')

# Importing Stop Words package
from nltk.corpus import stopwords

# Import Stemming package from nltk
from nltk.stem.porter import PorterStemmer

corpus = []

for i in range(0, 1000):
    # Removing all regular expressions from the revew other than letters
    review = re.sub('[^a-zA-Z]',' ',dataset['Review'][i])

    # Converting the string to lower case
    review = review.lower()

    # Removing non significant words i.e., 'the','is','that','a','then','it','was'
    # And Stemming i.e., situation where two or more words have a common root.
    review = review.split()
    ps = PorterStemmer() # creating stemming object
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] 

    # joining the list to make a string
    review = ' '.join(review)     
    
    # adding all the cleaned reviews to a new list
    corpus.append(review)
    
# Displaying reviews after cleaning
print('Reviews after cleaning\n')
for i in range(5):
    print(i+1,'->',corpus[i])

Reviews before cleaning

1 -> Wow... Loved this place.
2 -> Crust is not good.
3 -> Not tasty and the texture was just nasty.
4 -> Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.
5 -> The selection on the menu was great and so were the prices.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\brigu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Reviews after cleaning

1 -> wow love place
2 -> crust good
3 -> tasti textur nasti
4 -> stop late may bank holiday rick steve recommend love
5 -> select menu great price


## Creating the Bag of Words model

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

In [6]:
y = dataset.iloc[:,-1].values

## Splitting the dataset into the Training set and Test set

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [8]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [9]:
y_pred = classifier.predict(X_test)

## Making the Confusion Matrix

In [10]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score(y_test,y_pred)

[[55 42]
 [12 91]]


0.73

### Accuracy
$$ Accuracy = \frac{TP + TN}{TP + TN + FP + FN} $$
<br>
here, TP - True Positive, 
      TN - True Negative, 
      FP - False Positive, 
      FN - False Negative

In [11]:
accuracy = (cm[0,0] + cm[1,1])/(cm[0,0] + cm[1,1] + cm[0,1] + cm[1,0])
accuracy

0.73

### Precision
$$ Precision = \frac{TP}{TP + FP} $$
<br>
here, TP - True Positive, 
      TN - True Negative, 
      FP - False Positive, 
      FN - False Negative

In [12]:
precision = cm[0,0]/(cm[0,0] + cm[0,1])
precision

0.5670103092783505

### Recall
$$ Recall = \frac{TP}{TP + FN} $$
<br>
here, TP - True Positive, 
      TN - True Negative, 
      FP - False Positive, 
      FN - False Negative

In [13]:
recall = cm[0,0]/(cm[0,0] + cm[1,0])
recall

0.8208955223880597

### F1 Score
$$ F1 Score = \frac{2 * Precision * Recall}{Precision + Recall} $$

In [14]:
score = (2 * precision * recall) / (precision + recall)
score

0.6707317073170731