In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bz2
import random
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, SpatialDropout1D
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer

In [2]:
train_file = bz2.BZ2File('../input/amazonreviews/train.ft.txt.bz2')
test_file = bz2.BZ2File('../input/amazonreviews/test.ft.txt.bz2')

In [3]:
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

In [4]:
labels = []
titles = []
reviews = []

for line in train_file_lines:
  line = line.decode("utf-8")
  labels.append(line[0:10])
  tt = line.find(':')
  titles.append(line[11:tt])
  reviews.append(line[tt+1:])

Train_Data = pd.DataFrame({'Title':titles,'Review':reviews,'Label':labels})
Train_Data

Unnamed: 0,Title,Review,Label
0,Stuning even for the non-gamer,This sound track was beautiful! It paints the...,__label__2
1,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this...,__label__2
2,Amazing!,This soundtrack is my favorite music of all t...,__label__2
3,Excellent Soundtrack,I truly like this soundtrack and I enjoy vide...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divin...",__label__2
...,...,...,...
3599995,Don't do it!!,The high chair looks great when it first come...,__label__1
3599996,"Looks nice, low functionality",I have used this highchair for 2 kids now and...,__label__1
3599997,"compact, but hard to clean","We have a small house, and really wanted two ...",__label__1
3599998,what is it saying?,not sure what this book is supposed to be. It...,__label__1


In [5]:
labels = []
titles = []
reviews = []

for line in test_file_lines:
  line = line.decode("utf-8")
  labels.append(line[0:10])
  tt = line.find(':')
  titles.append(line[11:tt])
  reviews.append(line[tt+1:])

Test_Data = pd.DataFrame({'Title':titles,'Review':reviews,'Label':labels})
Test_Data

Unnamed: 0,Title,Review,Label
0,Great CD,My lovely Pat has one of the GREAT voices of ...,__label__2
1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sm...,__label__2
2,Batteries died within a year ...,I bought this charger in Jul 2003 and it work...,__label__1
3,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powere...,__label__2
4,Great for the non-audiophile,Reviewed quite a bit of the combo players and...,__label__2
...,...,...,...
399995,Unbelievable- In a Bad Way,We bought this Thomas for our son who is a hu...,__label__1
399996,"Almost Great, Until it Broke...",My son recieved this as a birthday gift 2 mon...,__label__1
399997,Disappointed !!!,"I bought this toy for my son who loves the ""T...",__label__1
399998,Classic Jessica Mitford,This is a compilation of a wide range of Mitf...,__label__2


In [6]:
def cleantext(text):
    
    # remove URL
    text = re.sub(r"http\S+", "", text)
    
    # remove special characters and numbers
    text = re.sub('[^ a-zA-Z\']', '', text)
    
    return text

In [7]:
Test_Data['Label'] = np.where(Test_Data['Label']=='__label__2',1,0)
Train_Data['Label'] = np.where(Train_Data['Label']=='__label__2',1,0)

In [8]:
Train_Data['Title'] = Train_Data['Title'].apply(lambda x: cleantext(x))
Train_Data['Review'] = Train_Data['Review'].apply(lambda x: cleantext(x))
Test_Data['Title'] = Test_Data['Title'].apply(lambda x: cleantext(x))
Test_Data['Review'] = Test_Data['Review'].apply(lambda x: cleantext(x))

In [9]:
stops = stopwords.words('english')
keep = ['not','do', 'does', 'did', "doesn","didn", "don", "isn't", 't']
for k in keep:
    stops.remove(k)

In [10]:
Train_Data['Title'] = Train_Data['Title'].apply(lambda x: ' '.join([PorterStemmer().stem(w).lower() for w in word_tokenize(x) if w not in stops]))
Train_Data['Review'] = Train_Data['Review'].apply(lambda x: ' '.join([PorterStemmer().stem(w).lower() for w in word_tokenize(x) if w not in stops]))
Test_Data['Title'] = Test_Data['Title'].apply(lambda x: ' '.join([PorterStemmer().stem(w).lower() for w in word_tokenize(x) if w not in stops]))
Test_Data['Review'] = Test_Data['Review'].apply(lambda x: ' '.join([PorterStemmer().stem(w).lower() for w in word_tokenize(x) if w not in stops]))

In [11]:
max_features = 20000
maxlen = 128
embed_size = 128

In [12]:
tokenizer = Tokenizer(num_words=max_features)

In [13]:
Train_data = Train_Data

In [14]:
tokenizer.fit_on_texts(Train_Data['Title']+Train_Data['Review'])

In [15]:
train = tokenizer.texts_to_sequences(Train_Data['Title']+Train_Data['Review'])
test = tokenizer.texts_to_sequences(Test_Data['Title']+Test_Data['Review'])

In [16]:
train_seq = pad_sequences(train, maxlen=maxlen, padding='post')
test_seq = pad_sequences(test, maxlen=maxlen, padding='post')

In [17]:

model = Sequential()
model.add(Embedding(max_features, embed_size, input_length = maxlen))
model.add(SpatialDropout1D(0.4))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 128)          2560000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 128, 128)          0         
_________________________________________________________________
flatten (Flatten)            (None, 16384)             0         
_________________________________________________________________
dense (Dense)                (None, 2)                 32770     
Total params: 2,592,770
Trainable params: 2,592,770
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
model.fit(x=train_seq, y=to_categorical(Train_Data['Label']), epochs=2, verbose=1, batch_size=64)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fda9b498850>

In [19]:
model.evaluate(test_seq, to_categorical(Test_Data['Label']))



[0.2673531770706177, 0.8968499898910522]