### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.model_selection import train_test_split

from transformers import *
import torch
import keras

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils



import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Definitions

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in tqdm(parse(path)):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
def process_df(df):
    df['text'] = my_utils.preprocess(df['text'])
    return df

### Start

In [None]:
dataset_movies = getDF('datasets_raw/reviews_Movies_and_TV_5.json.gz')
dataset_movies.shape

In [5]:
dataset_kindle = getDF('datasets_raw/reviews_Kindle_Store_5.json.gz')
dataset_kindle.shape

982619it [01:25, 11438.96it/s]


(982619, 9)

In [None]:
dataset_home = getDF('datasets_raw/reviews_Home_and_Kitchen_5.json.gz')
dataset_home.shape

In [6]:
dataset = dataset_kindle

In [None]:
dataset = pd.concat([dataset_movies, dataset_home, dataset_kindle])

In [7]:
dataset = dataset.drop(columns=['reviewerID', 'asin', 'reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'])

In [8]:
dataset = dataset.rename(columns={'reviewText': 'text', 'overall': 'sentiment'})

In [9]:
dataset.shape

(982619, 2)

In [10]:
n_cores = 45

In [11]:
%%time
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

CPU times: user 2.96 s, sys: 11.7 s, total: 14.7 s
Wall time: 5min 3s


(982619, 2)

In [12]:
Counter(dataset.sentiment)

Counter({5.0: 575264, 4.0: 254013, 3.0: 96194, 2.0: 34130, 1.0: 23018})

In [13]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf8')
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [14]:
glove_embedding_dim = 300
glove_embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

720it [00:00, 7196.42it/s]

Loading Glove Model


400000it [01:03, 6336.84it/s]

Done. 400000  words loaded!





In [15]:
max_features = 20000

In [16]:
# dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [17]:
# d = dataset.sample(1000000)

In [18]:
vectorizer = TfidfVectorizer(max_features = max_features)

In [19]:
vectorizer.fit(dataset.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [20]:
vocab = vectorizer.get_feature_names()

In [21]:
d = dataset #.sample(300000)

In [22]:
d.text = d.text.apply(lambda x: x.split(" "))

In [23]:
# %%time
# d.text = d.text.apply(lambda x: [i for i in x if i in vocab])

In [24]:
n_cores = 45

In [25]:
def process_df(df):
    df['text'] = df['text'].apply(lambda x: [i for i in x if i in vocab])
    return df

In [26]:
%%time
n = int(d.shape[0]/n_cores)
list_df = [d[i:i+n] for i in range(0, d.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

d_ = pd.concat(processed_list_df)
d_.shape

CPU times: user 26.4 s, sys: 25.9 s, total: 52.3 s
Wall time: 10min 2s


(982619, 2)

In [27]:
d_ = d_[d_.text.apply(lambda x: len(x)>3 and len(x)<50)]

In [28]:
d_.shape

(782823, 2)

In [29]:
d_.text = d_.text.apply(lambda x: " ".join(x))

In [30]:
X_train, X_test, y_train, y_test = train_test_split(d_.text, d_.sentiment, test_size=0.3, random_state=37)

In [31]:
%%time
tk = Tokenizer(num_words=max_features, split=" ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

CPU times: user 40.9 s, sys: 1.88 s, total: 42.8 s
Wall time: 42.8 s


In [32]:
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    547976.000000
mean         18.211281
std          11.302874
min           4.000000
25%           9.000000
50%          15.000000
75%          25.000000
max          49.000000
Name: text, dtype: float64

In [33]:
MAX_LEN = 50

In [34]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [35]:
X_test_seq_trunc.shape

(234847, 50)

In [36]:
X_train_seq_trunc.shape

(547976, 50)

In [37]:
max_features

20000

In [38]:
len(tk.word_index.items())

19919

In [39]:
emb_matrix = np.zeros((max_features, glove_embedding_dim))

for w, i in tk.word_index.items():
    try:
        vect = glove_embeddings_index[w]
        emb_matrix[i] = vect
    except:
        pass

In [40]:
emb_matrix.shape

(20000, 300)

In [41]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [42]:
emb_model2 = models.Sequential()
emb_model2.add(layers.Embedding(max_features, 300, input_length=MAX_LEN))
emb_model2.add(Bidirectional(LSTM(1024)))
emb_model2.add(Dropout(0.7))
emb_model2.add(layers.Dense(256, activation='relu'))
emb_model2.add(layers.Dense(5, activation='softmax'))
emb_model2.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           6000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 2048)              10854400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               524544    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 1285      
Total params: 17,380,229
Trainable params: 17,380,229
Non-trainable params: 0
_________________________________________________________________


In [43]:
emb_model2.layers[0].set_weights([emb_matrix])
emb_model2.layers[0].trainable = True

In [44]:
emb_model2.compile(optimizer='rmsprop', loss='categorical_crossentropy' , metrics=['accuracy'])

In [45]:
history = emb_model2.fit(X_train_seq_trunc , y_train_oh, epochs=10,
                    batch_size=512 , validation_data=(X_test_seq_trunc, y_test_oh), verbose=1)


Train on 547976 samples, validate on 234847 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
embeddings = emb_model2.layers[0].get_weights()[0]

In [None]:
embeddings.shape

In [None]:
len(tk.word_index.items())

In [None]:
words_embeddings = {w:embeddings[idx] for w, idx in tk.word_index.items() if idx < max_features}

In [None]:
import pickle

with open('nongit_resources/words_embeddings_trained2.pickle', 'wb') as handle:
    pickle.dump(words_embeddings, handle)