In [None]:
import pandas as pd
import json, gzip
from urllib.request import urlopen
import multiprocessing as mp
from cytoolz import *
from ftfy import fix_text
import re
from sklearn.model_selection import *

Do not forget to put your GPU option on (under Settings).

## Download and feature engineering

In [None]:
# Location of the gzip-compressed JSON review file (see the `.json.gz` suffix).
url = (
    'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'
    'reviews_Patio_Lawn_and_Garden_5.json.gz'
)

In [None]:
# Download the archive into memory and decompress it. The response is used as
# a context manager so the underlying connection is closed as soon as the
# payload has been read, instead of being left for the garbage collector.
with urlopen(url) as response:
    data = gzip.decompress(response.read())

# Split the decompressed payload into a list of lines (still raw `bytes`).
data = data.splitlines()

In [None]:
# Sanity check on the download: print the container type, then display one
# raw line (the element at position 4) as the cell output.
print(type(data))
data[4]

In [None]:
# Parse each raw line as a JSON document and load the parsed records into a
# DataFrame (one row per line).
df = pd.DataFrame([json.loads(raw_line) for raw_line in data])
df.head()

In [None]:
# Number of reviews for each distinct value of the `overall` rating column.
df.groupby('overall').size()

Get rid of four-star reviews (Dr. Malouf believed that they are too wishy-washy).

In [None]:
# Keep only the rows whose `overall` rating differs from 4. The explicit copy
# gives an independent frame, so the column assignments that follow do not
# write into a view of the unfiltered data.
df = df.loc[df['overall'].ne(4)].copy()

In [None]:
# Binary label: 1 when the rating is strictly above 4, otherwise 0.
df['sentiment'] = df['overall'].gt(4).astype('int64')

In [None]:
# Class balance of the binary label. The Series method replaces the top-level
# `pd.value_counts` helper, which newer pandas releases deprecate.
df['sentiment'].value_counts()

## Cleaning text.
Fixing encoding.

The method applied below (`fix_text`) is documented here: https://ftfy.readthedocs.io/en/latest/

In [None]:
# Run ftfy's `fix_text` over every review in a pool of worker processes and
# write the repaired strings back into the same column.
with mp.Pool() as pool:
    df['reviewText'] = pool.map(fix_text, df['reviewText'])

# Preview the first few repaired reviews.
df['reviewText'].head()

As you can see, I use two methods for replacement: the basic Python `.replace` and `re.sub`. I did this because the first one ignores regular-expression syntax, which sometimes makes it simpler to apply.

In [None]:
def preproc_text(x, misspellings=None):
    """Normalise one review into a single-spaced string of word tokens.

    The steps run in a fixed order, because later steps rely on earlier ones:

    1. Drop possessive ``'s`` and the word ``is``; expand ``&`` to ``and``.
    2. Turn whitespace control characters, ``/``, ``-`` and every other
       non-alphanumeric run (including ``_``) into a single space.
    3. Mask numbers: a lone digit becomes ``#``; any longer digit run, or two
       digits separated by one space (what is left of ``5/8`` or ``2.5``),
       becomes ``##``.
    4. Split words that were glued together (``MowingSo`` -> ``Mowing So``,
       ``UPDATESIm`` -> ``UPDATES Im``).
    5. Remove the articles ``a`` / ``an``.
    6. Apply the misspelling table, then trim and collapse spaces.

    Parameters
    ----------
    x : object
        The review text; it is converted with ``str()`` first.
    misspellings : dict[str, str] or None
        Mapping of pattern -> replacement. Keys are handed to ``re.sub`` as
        regular expressions and matched case-insensitively. When ``None``
        (the default) the module-level ``misspell_dict`` is used.

    Returns
    -------
    str
        The cleaned text.
    """
    if misspellings is None:
        # Resolved at call time, so the table may be defined in a later cell
        # than this function.
        misspellings = misspell_dict

    x = str(x)
    x = x.replace("'s ", ' ')
    x = re.sub(r'[\t\n\r\f\v]', ' ', x)
    x = x.replace('&', ' and ')
    x = x.replace(' is ', ' ')
    for punct in "/-":
        x = x.replace(punct, ' ')  # removing the symbols between words
    # Raw strings below: '\W' and '\d' are not valid Python string escapes.
    x = re.sub(r'[\W_]+', ' ', x)  # keep only alphanumeric characters
    # Fix things like 5/8 or 2.5, which by now have a space in place of the
    # slash or period.
    x = re.sub(r'\d \d', '##', x)
    # Every remaining digit becomes '#'; the next line then caps any run of
    # '#' at two, so multi-digit numbers end up as '##'. (A separate
    # '\d{2,}' pass after this point could never match and was dropped.)
    x = re.sub(r'\d', '#', x)
    x = re.sub(r'#{2,}', '##', x)
    # Split stuck words like "MowingSo", "yearsORIGINAL" or "UPDATESIm".
    x = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', x)
    x = re.sub(r'(^[Aa]n )|( [Aa]n )|(^[Aa] )|( [Aa] )', ' ', x)
    for wrong, right in misspellings.items():
        x = re.sub(wrong, right, x, flags=re.IGNORECASE)
    x = x.strip()
    x = re.sub(r' +', ' ', x)  # removing redundant spaces in the middle
    return x

I found a list of typical misspellings in a script which was shared with me. I modified it.

In [None]:
# Replacement table used by `preproc_text`: each key is passed to `re.sub` as
# a pattern (matched case-insensitively) and replaced by its value, in the
# order listed here.
# Keys with surrounding spaces only match whole, space-delimited words; the
# contraction entries are written without apostrophes, which is how they look
# after punctuation has been replaced by spaces.
# NOTE(review): order appears to matter for 'programme' / 'programr' — the
# first turns "programmer" into "programr" and the second restores it.
misspell_dict = {' aqm ': ' am ',
                # Trailing space added: without it the replacement was glued
                # to the following word ("I m happy" -> "i amhappy").
                ' I m ': ' I am ',
                ' I d ': ' I would ',
                'they ve ': 'they have ',
                ' won t ': ' will not ',
                ' ll ': ' will ', 
                ' can t ': ' can not ',
                ' haven t': ' have not',
                ' didn t':' did not',
                ' aren t': ' are not',
                ' doesn t':' does not',
                ' hasn t':' has not',
                ' wasn t':' was not',
                ' couldn t': ' could not',
                ' isn t': ' not',
                'colour':'color',
                'centre':'center',
                'shouldn t':'should not',
                'favourite':'favorite',
                'travelling':'traveling',
                'counselling':'counseling',
                'theatre':'theater',
                'cancelled':'canceled',
                'labour':'labor',
                'organisation':'organization',
                'wwii':'world war',
                'citicise':'criticize',
                'instagram': 'social medium',
                'whatsapp': 'social medium',
                'snapchat': 'social medium',
                'Snapchat': 'social medium',
                'pinterest' : 'social medium',
                'WeChat' : 'social medium',
                'mny' : 'many',
                'quora' : 'social medium',
                'bitcoin' : 'dollar',
                'cryptocurrency' : 'dollar',
                'behaviour' : 'behavior',
                'programme': 'program',
                'realise':'realize',
                'defence':'defense',
                'cryptocurrencies' : 'currencies',
                'Brexit':'brexit',
                'honours':'honors',
                'learnt':'learned',
                'upvote':'like',
                'licence':'license',
                'Whatis':'what is',
                'aluminium':'aluminum',
                'favour':'favor',
                'modelling':'modeling',
                'recognise':'recognize',
                'grey':'gray',
                'programr':'programmer',
                'travelled':'traveled',
                'cheque':'check',
                'judgement':'judgment',
                'neighbour':'neighbor',
                'analyse':'analyze',
                'practise':'practice',
                'litre':'liter'
               }

In [None]:
# Clean every review with `preproc_text` in a pool of worker processes and
# write the results back into the same column.
with mp.Pool() as pool:
    df['reviewText'] = pool.map(preproc_text, df['reviewText'])

# Show one cleaned review. Positional access is used because rows were
# filtered out earlier without resetting the index, so the *label* 4 is not
# guaranteed to exist any more and `df.reviewText[4]` could raise KeyError.
df['reviewText'].iloc[4]

In [None]:
# Word count of every review, splitting on single spaces.
review_lengths = [len(text.split(' ')) for text in df["reviewText"]]

# Length of the longest review, in words.
wordN = max(review_lengths)
print(wordN)

# All reviews that reach that maximum length.
long_ref = [
    text
    for text, n_words in zip(df["reviewText"], review_lengths)
    if n_words == wordN
]
long_ref

It is long... Here is a phrase from its middle: "I don t know how many words Amazon allows for reviews but I can see this will become blog"

There was the following problem: some words are joined together, like "MowingSo" or "yearsORIGINAL" or "UPDATESIm".  I fixed it in my preprocessing method.

# Tokenization and Classification with `keras`

In [None]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, Embedding,Dropout, MaxPooling1D, LSTM,  BatchNormalization
from keras import regularizers

It turned out that the `keras` module does not accept sparse matrices. There are ways to handle this problem. 

1. Introduce a method which, for each batch, converts the chosen rows into a dense matrix.

2. Go to `keras` backend, `tensorflow`, which has methods for sparse matrices.

3. Reduce dimensions: 
  - using embeddings (the most popular on kaggle.com)
  - chop off too frequent words with CountVectorizer, hoping that they are too ubiquitous to be significant for our classification
  - use PCA or LSI. 

In [None]:
# Vocabulary cap handed to the tokenizer, and the sequence length used for
# padding (the word count of the longest review).
max_words = 5000
max_sequence_length = wordN

# Fit the tokenizer on the cleaned reviews, then encode the same reviews as
# lists of integer word indices.
review_texts = df['reviewText']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(review_texts)
sequences = tokenizer.texts_to_sequences(review_texts)

In [None]:
# Check what kind of container `texts_to_sequences` returned.
type(sequences)

In [None]:
# Peek at the first three encoded reviews.
sequences[:3]

Now we can turn our sequences into a numpy array

In [None]:
# Bring every encoded review to the common length `max_sequence_length`.
# NOTE: this rebinds `data`, which earlier held the raw downloaded lines;
# from here on it is the padded array that is passed to the train/test split.
# NOTE(review): padding/truncation side is left at the Keras default
# (presumably 'pre') — confirm against the installed Keras version.
data = sequence.pad_sequences(sequences, maxlen=max_sequence_length)
print(type(data))
data.shape

In [None]:
# Hold out 20% of the reviews, stratified on the sentiment label so both
# parts keep the same class balance; the seed makes the split repeatable.
labels = df['sentiment']
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# Number of distinct words seen by the tokenizer, plus one for the padding
# index 0.
vocab_size = len(tokenizer.word_index) + 1

# Size of the embedding table. The tokenizer was created with
# `num_words=max_words`, so the encoded sequences only contain indices below
# `max_words`; sizing the table to the full vocabulary would allocate rows
# that can never be looked up.
max_features = min(max_words, vocab_size) if max_words else vocab_size

maxlen = max_sequence_length   # input length expected by the embedding layer
batch_size = 750
embedding_dims = 100           # width of each word vector
filters = 250                  # output channels of each Conv1D layer
kernel_size = 8                # words covered by one convolution window
hidden_dims = 64               # units in the dense hidden layer
epochs = 50
vocab_size

In [None]:
# Layer stack, listed in forward order.
CNN_model = Sequential([
    # Map word indices to dense vectors.
    Embedding(max_features, embedding_dims, input_length=maxlen),

    # First convolution block: learns filters over groups of `kernel_size`
    # consecutive words, followed by dropout and down-sampling.
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    Dropout(0.3),
    MaxPooling1D(pool_size=5),

    # Second convolution block (same settings, no dropout).
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=5),

    # (An LSTM(64) layer at this position is currently disabled.)
    Flatten(),
    Dropout(0.2),

    # Simple hidden layer with combined L1/L2 weight regularisation.
    Dense(
        hidden_dims,
        activation='relu',
        kernel_regularizer=regularizers.l1_l2(l1=0.1, l2=0.7),
    ),
    Dropout(0.2),

    # Single-unit output squashed with a sigmoid for the binary label.
    Dense(1, activation='sigmoid'),
])

CNN_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train silently; the held-out split is evaluated after every epoch and the
# per-epoch metrics are kept in `history`.
history = CNN_model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0,
    validation_data=(x_test, y_test),
)

In [None]:
from matplotlib import pyplot as plt

# Accuracy per epoch on the training data and on the held-out data, with a
# dashed reference line at 100%.
fig, ax = plt.subplots()
ax.set_title('Accuracy')
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='test')
ax.set_ylim(.76, 1.01)
ax.axhline(y=1.0, color="purple", linestyle="--")
ax.legend()
plt.show()