In [1]:
########## import libraries ############
import numpy as np
import pandas as pd
from numpy import array
from numpy import asarray
from numpy import zeros
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Embedding
import IPython; from IPython.display import display, HTML
def dfPrint(df):
    display(HTML(df.to_html()))
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cardioid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using TensorFlow backend.


In [2]:
############# import data ##############
def import_data():
    train_path = "./data/nlp_trip_advisor/train.csv"
    data = pd.read_csv(train_path)
    data = data[data.Is_Response.isnull() == False]
#     data['Is_Response'] = data['Is_Response'].map(int)
    data = data[data['Description'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape)
    data = pd.get_dummies(data, columns=["Is_Response"])
    data.drop(['Browser_Used', 'Device_Used', 'Is_Response_not happy'], axis=1, inplace=True)
    data.columns = ['User_ID', "Description", "Sentiment"]
    return data

data = import_data()
dfPrint(data.sample(5))

dataset loaded with shape (38932, 5)


Unnamed: 0,User_ID,Description,Sentiment
23231,id33557,My mom and I had a wonderful stay @ The Jane. ...,0
37847,id48173,This Holiday Inn's location is great for a vac...,1
26845,id37171,I feel like I'm writing a lot of five star rev...,1
8156,id18482,Great location on the beach. Good restaurants ...,0
13962,id24288,I stayed at the Westin LAX many times.\nIt's t...,1


In [3]:
################ clean data ##############
def clean_decription(line):
    words = list(set(line.split()))
    words = list(sorted([x.lower() for x in words]))
#     print (len(words))
#     print (words)
    words2 = words.copy()
    for word in list(set(words2).intersection(stop_words)):
        words.remove(word)
#     out = ' '.join(words)
    out = ' '.join(e for e in words if e.isalnum())
    if out == "":
        print(words)
#     table = str.maketrans({key: None for key in string.punctuation})
#     out.translate(table, string.punctuation)
    return out
data["Description"] = data["Description"].map(clean_decription)
#     print (len(words))
#     print (words)
#     break
# data

In [11]:
############### define documents ################
docs = data["Description"]
############# define class labels ################
labels = data["Sentiment"]
############## prepare tokenizer #################
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

########## integer encode the documents ##########
encoded_docs = t.texts_to_sequences(docs)
########## pad documents to a max length ##########
max_length = 200
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

print(docs.sample(5))
print(vocab_size)

23716                                    clean comfy great
14285    ac acceptable bad bed bugs carpet cheaper chel...
7634     also art asked baths bit booked booked certain...
1030     across aka ally along also always amazing anni...
38702    actually all along annoying appointed carting ...
Name: Description, dtype: object
39586


In [9]:
print(encoded_docs[0:5])

[[461, 99, 15, 19, 654, 34, 846, 312, 429, 836, 86, 3, 580, 5, 78, 1480, 2, 7], [891, 1442, 52, 1256, 87, 79, 4511, 3652, 1305, 741, 151, 50, 21, 665, 2967, 864, 95, 314, 400, 875, 288, 70, 1523, 56, 76, 29, 196, 306, 581, 581, 532, 402, 348, 742, 40, 748, 1161, 793, 634, 1427, 111, 68, 443, 5930, 1144, 1177, 10, 820, 231, 98, 690, 1428, 1603, 65, 3, 21333, 35, 293, 4, 1223, 5, 8, 128, 484, 2, 523, 8798, 181, 36, 16754, 935, 7], [22, 22, 366, 271, 4716, 247, 1940, 39, 1675, 73, 99, 83, 2705, 15, 1785, 108, 1124, 21, 47, 310, 26, 149, 489, 1451, 1086, 509, 23, 211, 16, 44, 42, 6, 233, 1, 1332, 349, 1327, 40, 57, 12, 193, 2527, 307, 7637, 536, 735, 1963, 49, 4882, 429, 1220, 55, 224, 14247, 571, 86, 1397, 3, 9, 121, 35, 4, 326, 2, 10627, 166, 118, 252, 18, 11, 7], [4961, 313, 235, 87, 73, 21, 1357, 289, 3081, 355, 28, 651, 6, 2002, 196, 209, 155, 596, 596, 715, 169, 3, 3, 17, 3289, 701, 8, 2102, 71, 41, 88, 11, 717], [7638, 319, 100, 823, 37, 699, 521, 226, 1043, 360, 2284, 1333, 1277, 3

In [5]:
########### load the whole embedding into memory ############
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for num, line in enumerate(f):
    values = line.split()
    word = values[0]
    if word in stop_words :
        print (word)
        continue
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    
f.close()
max_width = 100
print('Loaded %s word vectors.' % len(embeddings_index))

the
of
to
and
in
a
for
that
on
is
was
with
he
as
it
by
at
from
his
an
be
has
are
have
but
were
not
this
who
they
had
i
which
will
their
or
its
after
been
we
more
about
up
when
there
all
out
she
other
her
than
over
into
some
you
if
no
can
do
only
most
against
so
them
what
him
during
before
while
where
because
now
between
did
just
under
such
then
any
through
being
down
off
both
those
these
our
here
should
very
my
how
until
same
won
each
does
own
me
few
too
again
your
once
further
having
himself
why
am
doing
themselves
itself
above
whom
below
s
re
d
m
nor
t
herself
myself
don
y
ma
o
yourself
ourselves
haven
ours
theirs
shan
yours
hers
ain
ll
yourselves
ve
doesn
didn
isn
aren
wasn
couldn
hasn
shouldn
wouldn
weren
Loaded 399851 word vectors.


In [6]:
########### create a weight matrix for words in training docs ############
embedding_matrix = zeros((vocab_size, max_width))
mil=0
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        mil+=1
        embedding_matrix[i] = embedding_vector
print(mil)

29657


In [7]:
################ define model#################
model = Sequential()
e = Embedding(vocab_size, max_width, weights=[embedding_matrix], input_length=max_length, trainable=True)
model.add(e)
model.add(Flatten())
model.add(Dense(50,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(25,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(25,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(25,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(25,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(25,  kernel_initializer="normal",activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

############## compile the model ##############
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc','binary_accuracy'])
############# summarize the model ##############
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          3958600   
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 25)                500025    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 25)                650       
_________________________________________________________________
dropout_2 (Dropout)          (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 25)                650       
__________

In [8]:
############### split into train test data #################
padded_docs_train, padded_docs_test, labels_train, labels_test = train_test_split(padded_docs, labels, test_size=0.2, random_state=69)
################### fit the model ##########3#############
model.fit(padded_docs_train, labels_train, epochs = 3, validation_split = 0.2,verbose=1)

Train on 24916 samples, validate on 6229 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f7487d0d8d0>

In [9]:
# evaluate the model
loss, accuracy,binaryAccuracy = model.evaluate(padded_docs_test, labels_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))
print('binaryAccuracy: %f' % (binaryAccuracy*100))
predicted = model.predict(padded_docs_test)

Accuracy: 83.986131
binaryAccuracy: 83.986131
