# Sentiment analysis with NLP

### Importing the libraries

In [46]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score

[nltk_data] Downloading package stopwords to /Users/surya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Importing the dataset

In [6]:
# Load the tab-separated reviews file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as
# ordinary text instead of being interpreted as field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Cleaning the text

In [18]:
# Build `corpus`: one cleaned string per review, in the same order as
# dataset['Review'].
#
# Cleaning steps applied to each review:
#   1. replace every character that is not an ASCII letter with a space
#   2. lower-case and split on whitespace
#   3. drop English stop words (except 'not') and stem what remains
#   4. re-join the stems with single spaces

# Loop-invariant helpers are created once instead of once per review.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# 'not' is removed from the stop-word list so negations such as
# "not good" are kept in the cleaned text.
all_stopwords.remove('not')
# A set gives constant-time membership tests; build it once, not per word.
stopword_set = set(all_stopwords)

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 integer index.
for raw_review in dataset['Review']:
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
    tokens = letters_only.lower().split()
    stems = [ps.stem(token) for token in tokens if token not in stopword_set]
    corpus.append(' '.join(stems))

### Creating the Bag of Words model

In [20]:
# Bag of Words: fit the vectorizer on the cleaned corpus, capped at 1500
# features, and materialise the result as a dense array.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Target vector: the 'Liked' column as a plain array.
y = dataset['Liked'].values

### Splitting the data into training and test sets

In [22]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Training the Naive Bayes model on the training set

In [25]:
# Create the Gaussian Naive Bayes model, then fit it on the training split.
# The result of fit() is assigned back to `classifier`, as in a chained call.
classifier = GaussianNB()
classifier = classifier.fit(X_train, y_train)

### Predicting

In [45]:
# Predict labels for the held-out reviews.
y_pred = classifier.predict(X_test)

# Show the first five rows side by side: column 0 = predicted, column 1 = actual.
print(np.column_stack((y_pred, y_test))[:5])

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]]


### Confusion Matrix

In [47]:
# Confusion matrix of actual vs. predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Last expression of the cell: the accuracy on the test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73