# Sentiment Analysis
Multi-class Classification
Text generation
Machine Translation

Sequence Data
Many-To-One
Many-To-Many

In [1]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score
from tensorflow.keras.layers import Dense, GRU, LSTM, Embedding, Dropout
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nlp = spacy.load("en_core_web_sm")

In [2]:
data = pd.read_csv('review.csv')
data.head()

Unnamed: 0,reviewTime,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime
0,2014-05-21,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400
1,2014-01-14,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600
2,2014-06-26,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800
3,2014-04-03,A8AJS1DW7L3JJ,3998899561,Agata Majchrzak,"[1, 1]",This is a fantastic case. Very stylish and pro...,5,Perfect Case,1396483200
4,2014-04-13,A2YO4SCWAWNYBI,3998899561,Alex Maslakov,"[0, 0]",this case fits perfectly on the s4 and keeps m...,5,Just what I needed,1397347200


In [3]:
data = data.dropna(axis=0, subset=['reviewText'])

In [4]:
data.shape

(55014, 9)

In [5]:
def preprocess(string):
    doc = nlp(string)
    lemma = [token.lemma_ for token in doc if token.lemma_.isalpha() or token.lemma_ not in STOP_WORDS]
    return ' '.join(lemma)

In [6]:
X = []
sentences = list(data['reviewText'])
for sen in sentences:
    X.append(preprocess(sen))

y = np.array(data['overall'].map({1:0, 2:0, 3:1, 4:1, 5:1}))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [9]:
print(len(tokenizer.word_index))

28867


In [10]:
vocab_size = len(tokenizer.word_index)

maxlen=200
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [11]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size+1, output_dim=300, input_length=maxlen, trainable=True, name="Input"))
model.add(Dense(300, name="Dense1"))
model.add(Dropout(rate=0.35, name="Dropout1"))
model.add(LSTM(64, return_sequences=True, dropout=0.2, name="LSTM"))
model.add(GRU(64, return_sequences=False, dropout=0.2, name="GRU"))
model.add(Dropout(rate=0.25, name="Dropout2"))
model.add(Dense(32, name="Dense2"))
model.add(Dense(1, activation='sigmoid', name="Output"))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (Embedding)           (None, 200, 300)          8660400   
                                                                 
 Dense1 (Dense)              (None, 200, 300)          90300     
                                                                 
 Dropout1 (Dropout)          (None, 200, 300)          0         
                                                                 
 LSTM (LSTM)                 (None, 200, 64)           93440     
                                                                 
 GRU (GRU)                   (None, 64)                24960     
                                                                 
 Dense2 (Dense)              (None, 64)                4160      
                                                                 
 Dropout2 (Dropout)          (None, 64)                0

In [None]:
model.fit(X_train, y_train, batch_size=500, epochs=40, validation_split=0.3)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40

In [70]:
model.evaluate(X_test, y_test)



[0.3549400866031647, 0.8842169046401978]