## Question 01 :: Using NLP predict whether the review is positive or negative for a given dataset
[https://drive.google.com/open?id=1-TJWzdxapGhp2aElncd6RH6zOpSAf69X]

In [1]:
import nltk
# Importing libraries for tokenizing in various ways
from nltk.tokenize import line_tokenize, sent_tokenize, WordPunctTokenizer, word_tokenize
# Importing stopwwords collection
from nltk.corpus import stopwords
# Importing libraries for stemming
from nltk.stem import PorterStemmer
# to save and load models
import pickle
# Importing data sets
from nltk.corpus import state_union, movie_reviews, gutenberg
# Importing punction sentence tokenizer
from nltk.tokenize import PunktSentenceTokenizer
# Importing regular expressions
import re
# Importing to lemmatize like stem
from nltk.stem import WordNetLemmatizer
# to randomly shuffle
import random
# collection of dictionary, lemma ,examples. 
from nltk.corpus import wordnet 
# Importing sklearn classifiers to be used in NLP for classification
from nltk.classify.scikitlearn import SklearnClassifier
import numpy as np
import pandas as pd

In [2]:
# Importing the data into a dataframe
try:
    dataframe = pd.read_csv("Restaurant_Reviews.tsv", sep='\t')
except Exception as e:
    print(f"Error : {e}")

In [3]:
# Reading the first few rows of the dataframe
dataframe.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Understanding the shape of the dataframe
dataframe.shape

(1000, 2)

### Text Cleaning and Preparation

In [5]:
# Reading and writing the rows of data into document list as tuples
document = []
for row in range(0,dataframe.shape[0]):
    sentence = dataframe.iloc[row,0]
    category = dataframe.iloc[row,1]
    document.append((sentence,category))


In [6]:
document

[('Wow... Loved this place.', 1),
 ('Crust is not good.', 0),
 ('Not tasty and the texture was just nasty.', 0),
 ('Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
  1),
 ('The selection on the menu was great and so were the prices.', 1),
 ('Now I am getting angry and I want my damn pho.', 0),
 ("Honeslty it didn't taste THAT fresh.)", 0),
 ('The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
  0),
 ('The fries were great too.', 1),
 ('A great touch.', 1),
 ('Service was very prompt.', 1),
 ('Would not go back.', 0),
 ('The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.',
  0),
 ('I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!', 1),
 ('I was disgusted because I was pretty sure that was human hair.', 0),
 ('I was shocked because no signs indicate cash only.', 0),
 ('Highly recommended.', 1),
 ('Waitress was a little slow 

### Tokenization

In [7]:
all_words = []
for row in range(0,dataframe.shape[0]):
    words = word_tokenize(dataframe.iloc[row,0])
    for word in words:
        all_words.append(word.lower())

In [8]:
all_words[0:5]

['wow', '...', 'loved', 'this', 'place']

In [9]:
# Taking the frequency of the words(or tokens)
words_freq = nltk.FreqDist(all_words)

In [10]:
words_freq

FreqDist({'.': 823, 'the': 585, 'and': 392, ',': 366, 'i': 356, 'was': 308, '!': 251, 'a': 237, 'to': 219, 'is': 174, ...})

> here we'll try after removing the the stop words

In [11]:
top_words = list(words_freq.keys())[0:5000]

In [12]:
len(top_words)

2079

### Feature Selection

In [13]:
def find_features(doc):
    words = set(doc)
    features={}
    for key in top_words:
        features[key] = (key in words)
    return features

In [14]:
feature_sets = [(find_features(words),category) for (words,category) in document]

In [15]:
len(feature_sets)

1000

In [16]:
# feature_sets

### Train Test Split

In [17]:
training_set = feature_sets[:1900]
test_set = feature_sets[1900:1905]
cross_val = feature_sets[1905:]

In [18]:
test_file = open('test_001.pkl','wb')
pickle.dump(test_set,test_file)
test_file.close()

### Model Selection and Evaluation

In [19]:
try:
    classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
except Exception as e:
    print(e)

In [20]:
file = open('nltk_NLP.pkl','wb')
pickle.dump(classifier,file)
file.close()