This notebook is adapted from the following example:
https://medium.com/@sabber/classifying-yelp-review-comments-using-lstm-and-word-embeddings-part-1-eb2275e4066b

In [43]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential 
from keras.layers import Dense, Flatten, LSTM,  Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding


##Plotly 
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

#Others
import json
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from nltk.stem.porter import *
from nltk.stem.snowball import *

from sklearn.manifold import TSNE

In [44]:
def clean_text(text):
    #Remove punctuation
    text = text.translate(string.punctuation)
    
    #Convert words to lower case and split them
    text = text.lower().split()
    
    #Remove stop words
    stops = set(stopwords.words('english'))
    text = [w for w in text if not w in stops and len(w)>=3]
    
    text = " ".join(text)
    
    #Clean text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]"," ", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    
    return text

In [25]:
pd.set_option('display.max_colwidth', -1)

In [22]:
with open('/Users/tawfiq/Downloads/yelp_academic_dataset_review.json') as json_file:      
    data = json_file.readlines()
    # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
    data = list(map(json.loads, data)) 

In [23]:
# Build the Reviews DataFrame from the parsed review records held in `data`.
Reviews = pd.DataFrame(data)

In [26]:
# Preview the first rows to check which columns were loaded.
Reviews.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,"The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...",0,msQe1u7Z_XuqjGoqhB0J5g
1,pomGBqfbxcqPv14c3XH-ZQ,0,2012-11-13,0,dDl8zu1vWPdKGihJrwQbpw,5,I love this place! My fiance And I go here atleast once a week. The portions are huge! Food is amazing. I love their carne asada. They have great lunch specials... Leticia is super nice and cares about what you think of her restaurant. You have to try their cheese enchiladas too the sauce is different And amazing!!!,0,msQe1u7Z_XuqjGoqhB0J5g
2,jtQARsP6P-LbkyjbO1qNGg,1,2014-10-23,1,LZp4UX5zK3e-c5ZGSeo3kA,1,Terrible. Dry corn bread. Rib tips were all fat and mushy and had no flavor. If you want bbq in this neighborhood go to john mulls roadkill grill. Trust me.,3,msQe1u7Z_XuqjGoqhB0J5g
3,elqbBhBfElMNSrjFqW3now,0,2011-02-25,0,Er4NBWCmCD4nM8_p1GRdow,2,"Back in 2005-2007 this place was my FAVORITE thai place EVER. I'd go here ALLLLL the time. I never had any complaints. Once they started to get more known and got busy, their service started to suck and their portion sizes got cut in half. I have a huge problem with paying MORE for way less food. The last time I went there I had the Pork Pad se Ew and it tasted good, but I finished my plate and was still hungry. I used to know the manager here and she would greet me with a ""Hello Melissa, nice to see you again, diet coke & pad thai or pad se ew?"" Now a days, I know she still knows me but she disregards my presence. Also, I had asked her what was up with the new portion sizes and she had no answer for me. Great food but not worth the money. I havent been back in over a year because I refuse to pay $10-15 for dinner and still be hungry after. Sorry PinKaow, you are not what you used to be!!",2,msQe1u7Z_XuqjGoqhB0J5g
4,Ums3gaP2qM3W1XcA5r6SsQ,0,2014-09-05,0,jsDu6QEJHbwP2Blom1PLCA,5,Delicious healthy food. The steak is amazing. Fish and pork are awesome too. Service is above and beyond. Not a bad thing to say about this place. Worth every penny!,0,msQe1u7Z_XuqjGoqhB0J5g


In [28]:
#dropping Nan values
#Reviews = Reviews.dropna()
#changing the review value to a numeric type so we can do operations using it
#Reviews = Reviews[Reviews.stars.apply(lambda x:x.isnumeric())]
#drop any records with no score
Reviews = Reviews[Reviews.stars.apply(lambda x:x!="")]
#drop any records with no text for review
Reviews = Reviews[Reviews.text.apply(lambda x:x!="")]

In [33]:
# Binarise the star ratings into sentiment labels:
#   1 (positive) when the score is above 3
#   0 (negative) when the score is 3 or below
labels = Reviews['stars'].map(lambda stars: int(int(stars) > 3))

In the following code snippet, I use `map`, one of pandas' efficient built-in methods, on a pandas Series (a single column). `map` applies an external function that takes a string argument and performs several cleaning steps. The function cleans up punctuation, converts all the words to lowercase, and uses the `nltk` stop-word list to remove stop words from the text. It then performs some regex operations to clean out the unnecessary parts of the text. Finally, it uses `SnowballStemmer` to stem the words; stemming is another important part of NLP.

In [None]:
#apply the clean text function to the body field
Reviews['text'] = Reviews['text'].map(lambda x:clean_text(x))

In [None]:
#create sequence
vocabulary_size = 20000
tokenizer = Tokenizer(num_words = vocabulary_size)
tokenizer.fit_on_texts(Reviews['text'])

sequences = tokenizer.texts_to_sequence(Reviews['text'])
data = pad_sequences(sequences, maxlen=50)

In [None]:
#network architecture

model = Sequential()
model.add(Embedding(20000, 100, input_length=50))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Fit the model

model.fit(data, np.array(labels), validation_split=0.4, epochs=3)