In [1]:
import pandas as pd


data = pd.read_csv("NOLA Restaurants.csv")

In [17]:
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
  

'''
Preprocess a string.
:parameter
    :param text: string - name of column containing text
    :param lst_stopwords: list - list of stopwords to remove
    :param flg_lemm: bool - whether lemmitisation is to be applied
:return
    cleaned text
'''
def utils_preprocess_text(text, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and  characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yahms\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yahms\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


In [18]:
STOPWORDS = nltk.corpus.stopwords.words("english")

data["text_clean"] = data["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_lemm=True, 
          lst_stopwords=STOPWORDS))
data.head(10)

Unnamed: 0,business_id,name,review_count,avg_stars,review_id,user_id,text,useful,stars,wordcount,train,text_clean
0,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,2u8kIWm1CrMGuwGTW1HBGQ,nn6DoANEtr7SgvWWgrh2oQ,The service is awesome...staff is very friendl...,0,5,32,True,service awesomestaff friendly knowledgeable go...
1,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,EwarwhOOmnB22qESUv_VPw,G6ZnatT96yzdcX81PZyT3g,New and cool spot in downtown New Orleans!\nGr...,0,5,51,False,new cool spot downtown new orleans great varie...
2,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,EdsUPECvSzIj503qt13Pwg,dRvKAgf9a0DSKioJSv1p0Q,We went here before a concert. It was super bu...,0,4,60,False,went concert super busy service sort slow lot ...
3,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,XEfHuaszNLgeUu-6gXB-qQ,JDOeSXX33nUx4q-AmUFBSw,My friends and I had a great meal at Copper Vi...,0,4,177,True,friend great meal copper vine space beautifull...
4,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,W7_4Dd6xTuWuTDqe68XBtQ,GCldu8eAzez5rSFpijNZ3A,"What a great concept! 38 wines on tap, oh yeah...",1,5,65,True,great concept 38 wine tap oh yeah fantastic sp...
5,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,iFSgljVogeJYq44jppE2zQ,ScskFjiX1EwhJritfYHPuQ,Our waitress Rose was amazing. Made me feel li...,0,5,18,True,waitress rose amazing made feel like home bar ...
6,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,PEVRSj1KilCMmxw0l1at1A,4HTjgdTXIK07za49VvfF7A,Best place in the CBD. Rolled in rough around ...,0,5,52,True,best place cbd rolled rough around edge rose s...
7,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,b4IUHT-DvIeEneCNx7O0jA,eUVmYjvgSPTojgff5CToXg,"So the waiter was great, the atmosphere was GR...",1,4,132,False,waiter great atmosphere great red blend copper...
8,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,dx_dmJDhUrQJ16-Oqp73Eg,FbQiilAZrTValxD_X3cPZw,"I have been here twice, and will continue to c...",1,5,242,False,twice continue come perfect setting instagramw...
9,YNjyv0gfOr2g8lbmUpTnKg,Copper Vine,350,4.5,Qekf7LisFJNPn6aFzKB-Jg,BqAu7D5qHMQsrlRW8bRl-w,My sister and I ate here last Sunday and it wa...,0,5,37,True,sister ate last sunday beyond amazing atmosphe...


In [20]:
Train = data[data['train']== True]
Test = data[data['train']== False]

In [37]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



tokenizer = Tokenizer(num_words = 3000, oov_token = '<OOV>', split=' ')
tokenizer.fit_on_texts(Train['text_clean'])
len(tokenizer.word_index)

150845

In [55]:
len([t for t in tokenizer.word_counts.values() if int(t) >= 25])

12830

In [61]:
tokenizer = Tokenizer(num_words = 12000, oov_token = '<OOV>', split=' ')
tokenizer.fit_on_texts(Train['text_clean'])
X_train = tokenizer.texts_to_sequences(Train['text_clean'])
X_train = pad_sequences(X_train, padding="pre")

In [62]:
X_test = tokenizer.texts_to_sequences(Test['text_clean'])
X_test = pad_sequences(X_test, maxlen = X_train.shape[1], padding="pre")

In [63]:
Y_test = Test['stars']
Y_train = Train['stars']

In [66]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 150845 unique tokens.


We're using GloVe's twitter vectors, as they may match up better with yelp slang.
# https://nlp.stanford.edu/projects/glove/ (download here!)
The max dimensionality of the embedded vectors is 200-dimensional, which we;ll try to use.


In [70]:
import numpy as np

embeddings_index = {}
f = open('glove.twitter.27B.200d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


Found 1193514 word vectors.


In [78]:
embedding_matrix = np.zeros((12001, 200))
for word, i in word_index.items():
    if i > 12000:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


In [89]:
from keras.layers import Embedding, Input

embedding_layer = Embedding(12001,
                            200,
                            weights=[embedding_matrix],
                            input_length=237,
                            trainable=False)


In [92]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical


lstm_out = 196
batch_size = 32

model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.2))
model.add(LSTM(lstm_out, dropout = 0.2))
model.add(Dense(5,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 237, 200)          2400200   
                                                                 
 dropout (Dropout)           (None, 237, 200)          0         
                                                                 
 lstm (LSTM)                 (None, 196)               311248    
                                                                 
 dense (Dense)               (None, 5)                 985       
                                                                 
Total params: 2,712,433
Trainable params: 312,233
Non-trainable params: 2,400,200
_________________________________________________________________
None


In [94]:
Y_train = pd.get_dummies(Y_train)
Y_test = pd.get_dummies(Y_test)

In [103]:
np.save(file="X_test.npy",arr=X_test)
np.save(file="X_train.npy",arr=X_train)
np.save(file="Y_test.npy",arr=Y_test)
np.save(file="Y_train.npy",arr=Y_train)

In [107]:
Y_train

Unnamed: 0,1,2,3,4,5
0,0,0,0,0,1
3,0,0,0,1,0
4,0,0,0,0,1
5,0,0,0,0,1
6,0,0,0,0,1
...,...,...,...,...,...
445947,0,0,0,1,0
445948,0,0,0,0,1
445950,0,1,0,0,0
445951,0,0,0,0,1


In [99]:
model.fit(X_train, Y_train, batch_size = batch_size, verbose = 1)


score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size, epochs=10)
print("Score: %.2f" % (score))
print("Validation Accuracy: %.2f" % (acc))




TypeError: Invalid keyword arguments: ['epochs']

334466