### Imports

In [1]:
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from bert_embedding import BertEmbedding
from allennlp.commands.elmo import ElmoEmbedder

from transformers import *
import torch
import keras

import imp, gzip
import pickle, nltk
import gensim
import multiprocessing
from copy import deepcopy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import utils as my_utils



import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from pathlib import Path

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

### Definitions

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in tqdm(parse(path)):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
def process_df(df):
    df['text'] = my_utils.preprocess2(df['text'])
    return df

### Start

In [None]:
# dataset_movies = getDF('datasets_raw/reviews_Movies_and_TV_5.json.gz')
# dataset_movies.shape

In [None]:
# dataset_kindle = getDF('datasets_raw/reviews_Kindle_Store_5.json.gz')
# dataset_kindle.shape

In [5]:
dataset_home = getDF('datasets_raw/reviews_Home_and_Kitchen_5.json.gz')
dataset_home.shape

551682it [00:46, 11840.58it/s]


(551682, 9)

In [6]:
dataset = dataset_home

In [7]:
# dataset = pd.concat([dataset_movies, dataset_home, dataset_kindle])

In [8]:
dataset = dataset.drop(columns=['reviewerID', 'asin', 'reviewerName', 'helpful', 'summary', 'unixReviewTime', 'reviewTime'])

In [9]:
dataset = dataset.rename(columns={'reviewText': 'text', 'overall': 'sentiment'})

In [10]:
dataset.shape

(551682, 2)

In [None]:
# dataset = pd.read_csv("../Notebooks/NLP_training/amazon_reviews_cleaned.csv")
# dataset['text'] = dataset.clean_reviewtext
# dataset['sentiment'] = dataset.overall
# dataset = dataset.fillna("")

In [12]:
n_cores = 45

In [13]:
%%time
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

CPU times: user 1.31 s, sys: 8.4 s, total: 9.71 s
Wall time: 57 s


(551682, 2)

In [14]:
Counter(dataset.sentiment)

Counter({5.0: 349696, 4.0: 105508, 2.0: 24313, 3.0: 45059, 1.0: 27106})

In [15]:
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r', encoding='utf8')
    model = {}
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        embedding = np.array([float(val) for val in splitLine[1:]])
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [16]:
glove_embedding_dim = 300
glove_embeddings_index = loadGloveModel("nongit_resources/glove.6B.300d.txt")

706it [00:00, 7058.32it/s]

Loading Glove Model


400000it [01:03, 6332.17it/s]

Done. 400000  words loaded!





In [21]:
glove_vocab = glove_embeddings_index.keys()

In [22]:
len(glove_vocab)

400000

In [24]:
def process_glove_vocab(df):
    df['text'] = df['text'].apply(lambda x: [i for i in x if i in glove_vocab])
    return df

In [25]:
dataset.text = dataset.text.apply(lambda x: x.split(" "))

In [27]:
%%time
n = int(dataset.shape[0]/n_cores)
list_df = [dataset[i:i+n] for i in range(0, dataset.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_glove_vocab, list_df)
pool.close()

dataset = pd.concat(processed_list_df)
dataset.shape

CPU times: user 22.6 s, sys: 20.2 s, total: 42.8 s
Wall time: 44.7 s


(551682, 2)

In [36]:
dataset['len'] = dataset.text.apply(lambda x: len(x))

In [37]:
dataset.len.describe()

count    551682.000000
mean         48.610469
std          59.685705
min           0.000000
25%          17.000000
50%          29.000000
75%          57.000000
max        2891.000000
Name: text, dtype: float64

In [42]:
dataset = dataset[dataset.len > 100]

In [43]:
dataset = dataset[dataset.len < 300]

In [44]:
dataset.shape

(54425, 3)

In [59]:
dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [60]:
max_features = 20000

In [61]:
# dataset.text = dataset.text.apply(lambda x: " ".join(x))

In [62]:
# d = dataset.sample(1000000)

In [63]:
vectorizer = TfidfVectorizer(max_features = max_features)

In [64]:
vectorizer.fit(dataset.text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=20000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [65]:
vocab = vectorizer.get_feature_names()

In [66]:
len(vocab)

20000

In [67]:
d = dataset #.sample(300000)

In [68]:
d.text = d.text.apply(lambda x: x.split(" "))

In [69]:
# %%time
# d.text = d.text.apply(lambda x: [i for i in x if i in vocab])

In [70]:
n_cores = 45

In [71]:
def process_df(df):
    df['text'] = df['text'].apply(lambda x: [i for i in x if i in vocab])
    return df

In [72]:
%%time
n = int(d.shape[0]/n_cores)
list_df = [d[i:i+n] for i in range(0, d.shape[0],n)]

pool = multiprocessing.Pool(n_cores)
processed_list_df = pool.map(process_df, list_df)
pool.close()

d_ = pd.concat(processed_list_df)
d_.shape

CPU times: user 5.54 s, sys: 17.5 s, total: 23 s
Wall time: 2min 23s


(54425, 3)

In [78]:
# d_ = d_[d_.text.apply(lambda x: len(x)>3 and len(x)<50)]
# d_.shape
d_.text = d_.text.apply(lambda x: " ".join(x))

In [79]:
X_train, X_test, y_train, y_test = train_test_split(d_.text, d_.sentiment, test_size=0.3, random_state=37, st)

In [80]:
%%time
tk = Tokenizer(split=" ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

CPU times: user 11.1 s, sys: 266 ms, total: 11.4 s
Wall time: 11.4 s


In [81]:
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    38097.000000
mean       153.124131
std         47.102336
min         68.000000
25%        116.000000
50%        139.000000
75%        178.000000
max        299.000000
Name: text, dtype: float64

In [82]:
MAX_LEN = 300

In [83]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [84]:
X_test_seq_trunc.shape

(16328, 300)

In [85]:
X_train_seq_trunc.shape

(38097, 300)

In [86]:
max_features = len(vocab)

In [88]:
max_features

20000

In [89]:
max_features = len(tk.word_index.items())

In [90]:
emb_matrix = np.zeros((max_features, glove_embedding_dim))

for w, i in tk.word_index.items():
    try:
        vect = glove_embeddings_index[w]
        emb_matrix[i] = vect
    except:
        pass

In [91]:
emb_matrix.shape

(19700, 300)

In [92]:
Counter(emb_matrix.sum(axis=1))[0.0] #words without embeddings

1

In [93]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [94]:
def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [95]:
emb_model2 = models.Sequential()
emb_model2.add(layers.Embedding(max_features, 300, input_length=MAX_LEN))
emb_model2.add(Bidirectional(LSTM(200)))
emb_model2.add(Dropout(0.3))
emb_model2.add(layers.Dense(256, activation='relu'))
emb_model2.add(layers.Dense(5, activation='softmax'))
emb_model2.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          5910000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 400)               801600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               102656    
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 1285      
Total params: 6,815,541
Trainable params: 6,815,541
Non-trainable params: 0
_________________________________________________________________


In [96]:
emb_model2.layers[0].set_weights([emb_matrix])
emb_model2.layers[0].trainable = True

In [97]:
emb_model2.compile(optimizer='adam', loss='categorical_crossentropy' , metrics=['accuracy', f1_m])

In [98]:
history = emb_model2.fit(X_train_seq_trunc , y_train_oh, epochs=20,
                    batch_size=512 , validation_data=(X_test_seq_trunc, y_test_oh), verbose=1)


Train on 38097 samples, validate on 16328 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
 7680/38097 [=====>........................] - ETA: 44s - loss: 0.1298 - accuracy: 0.9565 - f1_m: 0.9568

KeyboardInterrupt: 

In [None]:
embeddings = emb_model2.layers[0].get_weights()[0]

In [None]:
embeddings.shape

In [None]:
len(tk.word_index.items())

In [None]:
words_embeddings = {w:embeddings[idx] for w, idx in tk.word_index.items() if idx < max_features}

In [None]:
import pickle

with open('nongit_resources/words_embeddings_trained2.pickle', 'wb') as handle:
    pickle.dump(words_embeddings, handle)