Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow.keras.models import  Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout2D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
import re
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

%matplotlib inline

Using TensorFlow backend.


Reading data from zipfile

In [0]:
with zipfile.ZipFile("/content/drive/My Drive/SA.zip", 'r') as file:
  file.extractall("")

Exploratory Data Analysis

In [3]:
train_data = pd.read_csv("/content/SA/train_data.csv")
train_data.head()

Unnamed: 0,trn_id,text
0,trn_1,Well this place got me to write my first revie...
1,trn_2,A very good Greek restaurant with tasty food. ...
2,trn_3,"Website says open, Google says open, Yelp says..."
3,trn_4,If I could give zero stars I would. When we wa...
4,trn_5,They have great food & definitely excellent se...


In [4]:
train_label = pd.read_csv("/content/SA/train_label.csv")
train_label.head()

Unnamed: 0,trn_id,label
0,trn_1,2
1,trn_2,5
2,trn_3,1
3,trn_4,1
4,trn_5,5


In [5]:
test_data = pd.read_csv("/content/SA/test_data.csv")
test_data.head()

Unnamed: 0,test_id,text
0,test_1,trying to have a nice quiet dinner. the annou...
1,test_2,Been getting food to go from here for over 3yr...
2,test_3,Ugh. I've had to eat here a couple of times be...
3,test_4,The people here are so nice! I ordered on eat ...
4,test_5,Heard alot of good things about this place and...


In [0]:
data = pd.merge(train_data, train_label)
data.head()

Unnamed: 0,trn_id,text,label
0,trn_1,Well this place got me to write my first revie...,2
1,trn_2,A very good Greek restaurant with tasty food. ...,5
2,trn_3,"Website says open, Google says open, Yelp says...",1
3,trn_4,If I could give zero stars I would. When we wa...,1
4,trn_5,They have great food & definitely excellent se...,5


In [0]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 650000 entries, 0 to 649999
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   trn_id  650000 non-null  object
 1   text    650000 non-null  object
 2   label   650000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 19.8+ MB


In [0]:
pd.value_counts(data['label'])

5    130000
4    130000
3    130000
2    130000
1    130000
Name: label, dtype: int64

Converting train data to lower case

In [0]:
print(data['text'][0])
print("")
data['text'] = data['text'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
print(data['text'][0])

Well this place got me to write my first review. With all the yelp reviews I figured it would be a good taco spot (and the 4-5 dollar price range per taco) so my wife and I decided to stop by. Usually a place that has that price range like Puesto in San Diego or several taco food trucks in LA they have amazing tacos. Maybe it was a bad day but the steak and the chicken tacos I had were way over cooked and honestly only tasted like the sauce you chose to put on in. The add on for rice and beans were 2 dollars which got you a ketchup cup of each lol my wife's tacos were a little bit better though. The corn tortillas weren't the greatest either

Well this place got me to write my first review With all the yelp reviews I figured it would be a good taco spot and the 45 dollar price range per taco so my wife and I decided to stop by Usually a place that has that price range like Puesto in San Diego or several taco food trucks in LA they have amazing tacos Maybe it was a bad day but the steak

Tokenization of train features

In [0]:
max_feature = 6500

token = Tokenizer(num_words=max_feature, split=' ')

token.fit_on_texts(data['text'])

X = token.texts_to_sequences(data['text'])

Padding of text sequences

In [0]:
X = pad_sequences(X, maxlen=650)

In [0]:
y = data['label']
y = np.array(y)

Splitting data into train and test data

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(520000, 650) (520000,)
(130000, 650) (130000,)


Converting categorical labels to binary form


*   label = 1 is label_0
*   label = 2 is label_1
*   label = 3 is label_2
*   label = 4 is label_3
*   label = 5 is label_4

In [0]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)

In [0]:
y_train = np_utils.to_categorical(y_train,5)
y_test = np_utils.to_categorical(y_test,5)

Building Model

In [0]:
model = Sequential()

In [0]:
model.add(Embedding(input_dim=max_feature, output_dim=256, input_length=X.shape[1]))

model.add(LSTM(units=256, activation='tanh', recurrent_activation='sigmoid', dropout=0.4, recurrent_dropout=0.4))

model.add(Dense(units=5, activation='softmax'))



In [0]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 650, 256)          1664000   
_________________________________________________________________
lstm (LSTM)                  (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 5)                 1285      
Total params: 2,190,597
Trainable params: 2,190,597
Non-trainable params: 0
_________________________________________________________________


In [0]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Creating Callback Checkpoint to save each epochs

In [0]:

filepath="/content/drive/My Drive/Data Science/checkpoint/weights.{epoch:02d}-{val_accuracy:.2f}.hdf5"

checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, mode='max')

callbacks_list = [checkpoint]

In [0]:
history = model.fit(X_train, y_train, batch_size=100, epochs=5, validation_data= (X_test, y_test), callbacks=callbacks_list)
history

Epoch 1/5
Epoch 00001: saving model to /content/drive/My Drive/Data Science/checkpoint/weights.01-0.65.hdf5
Epoch 2/5
Epoch 00002: saving model to /content/drive/My Drive/Data Science/checkpoint/weights.02-0.67.hdf5
Epoch 3/5

In [0]:
new_model = load_model("/content/drive/My Drive/Data Science/checkpoint/weights.04-0.67.hdf5")



In [0]:
history = model.fit(X_train, y_train, batch_size=100, epochs=5, validation_data= (X_test, y_test), callbacks=callbacks_list, initial_epoch=4)
history

Epoch 5/5
Epoch 00005: saving model to /content/drive/My Drive/Data Science/checkpoint/weights.05-0.65.hdf5


<tensorflow.python.keras.callbacks.History at 0x7f90685314e0>

In [0]:
SA_model = model.save(filepath= '/content/drive/My Drive/Data Science/SA/SA.h5')

Load model to evaluate and predict classes

In [6]:
yelp_review = load_model("/content/drive/My Drive/Data Science/SA/SA.h5")



In [7]:
print(test_data['text'][0])
print("")
test_data['text'] = test_data['text'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
print(test_data['text'][0])

trying to have a nice quiet dinner.  the announcer for the awards giveaways is way too loud in the restaurant

trying to have a nice quiet dinner  the announcer for the awards giveaways is way too loud in the restaurant


In [0]:
max_feature = 6500

token = Tokenizer(num_words=max_feature, split=' ')

token.fit_on_texts(test_data['text'])

X = token.texts_to_sequences(test_data['text'])

In [0]:
X = pad_sequences(X, maxlen=650)

In [11]:
test_data_pred = yelp_review.predict_classes(X)
test_data_pred

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


array([1, 1, 2, ..., 2, 0, 3])

In [13]:
test_data_pred = pd.DataFrame(test_data_pred,columns=['pred_label'])
test_data_pred.head()

Unnamed: 0,pred_label
0,1
1,1
2,2
3,0
4,2


In [16]:
pred_df = pd.concat([test_data['text'],test_data_pred], axis=1)
pred_df.head()

Unnamed: 0,text,pred_label
0,trying to have a nice quiet dinner the announ...,1
1,Been getting food to go from here for over 3yr...,1
2,Ugh Ive had to eat here a couple of times beca...,2
3,The people here are so nice I ordered on eat 2...,0
4,Heard alot of good things about this place and...,2


In [0]:
yelp_review.save("/content/drive/My Drive/Data Science/SA/yelp_review.h5")