<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [30]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTENC
import sys
import numpy as np
from numpy import array
import os
import pandas as pd
import pickle
from tqdm.auto import tqdm
sys.path.append('../')
from src.pipeline_helpers import get_proportions
from src.clean_data import normalize_text
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Dense, Embedding, Flatten, Activation, LeakyReLU,Bidirectional, LSTM, Dropout
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint

In [3]:
# Resolve the project's data folder as a sibling of the directory the notebook runs in.
working_dir = os.getcwd()
project_root = os.path.dirname(working_dir)
data_path = project_root + '/data/'

In [4]:
# Load the prepared feature and label arrays from the data folder.
features_file = data_path + 'X_lemmatized_prepared.npy'
labels_file = data_path + 'y_lemmatized_prepared.npy'
X, y = np.load(features_file), np.load(labels_file)

In [5]:
# Restore the pickled [class_weights, tokenizer] list saved for the stemmed text.
# The path is relative to the current working directory, not to data_path.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('stemmed_tokenizer_and_weights.pickle', 'rb') as pickle_file:
    stemmed_data = pickle.load(pickle_file)

# Index 0 holds the class-weight dict, index 1 the Tokenizer object.
stemmed_class_weights, stemmed_tokenizer = stemmed_data[0], stemmed_data[1]

In [6]:
# Restore the pickled [class_weights, tokenizer] list saved for the lemmatized text.
# The path is relative to the current working directory, not to data_path.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('lemmatized_tokenizer_and_weights.pickle', 'rb') as pickle_file:
    lemmatized_data = pickle.load(pickle_file)

# Index 0 holds the class-weight dict, index 1 the Tokenizer object.
lemmatized_class_weights, lemmatized_tokenizer = lemmatized_data[0], lemmatized_data[1]

In [7]:
# Inspect the whole unpickled list (class weights at index 0, tokenizer at index 1).
print(lemmatized_data)

[{0: 0.34422726120068387, 1: 5.178850532274937, 2: 0.4602384272550789, 3: 2.638870598563948, 4: 3.2109228949677617, 5: 25.856067588325654}, <keras_preprocessing.text.Tokenizer object at 0x7ff0fcd3a790>]


In [8]:
# Inspect the whole unpickled list for the stemmed variant (same layout as above).
print(stemmed_data)

[{0: 0.3525764366557979, 1: 5.548715624055606, 2: 0.4836829099952055, 3: 1.8841922705908913, 4: 2.8652777777777776, 5: 27.543804380438043}, <keras_preprocessing.text.Tokenizer object at 0x7ff0fcce8f40>]


In [9]:
# Class index -> weight mapping; this is the dict later passed to model.fit as class_weight.
print(lemmatized_class_weights)

{0: 0.34422726120068387, 1: 5.178850532274937, 2: 0.4602384272550789, 3: 2.638870598563948, 4: 3.2109228949677617, 5: 25.856067588325654}


In [10]:
# Class index -> weight mapping for the stemmed variant.
# NOTE(review): not used by the training cell visible in this notebook — shown for comparison only.
print(stemmed_class_weights)

{0: 0.3525764366557979, 1: 5.548715624055606, 2: 0.4836829099952055, 3: 1.8841922705908913, 4: 2.8652777777777776, 5: 27.543804380438043}


In [17]:
# Shape of the feature array loaded from X_lemmatized_prepared.npy.
X.shape

(168323, 200)

In [18]:
# Shape of the label array loaded from y_lemmatized_prepared.npy.
y.shape

(168323, 6)

In [19]:
# Peek at the feature array; NumPy abbreviates large arrays when printing.
print(X)

[[  13    5    6 ...    0    0    0]
 [ 370  212  270 ...    0    0    0]
 [ 105  208   52 ...    0    0    0]
 ...
 [  27  888   27 ...    0    0    0]
 [ 893 1129   12 ...    0    0    0]
 [1199  102   25 ...    0    0    0]]


In [20]:
# Peek at the label array; NumPy abbreviates large arrays when printing.
print(y)

[[0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 ...
 [0 0 0 0 1 0]
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]]


In [21]:
# Same X shape check as earlier in the notebook, repeated right before the train/test split.
X.shape

(168323, 200)

In [22]:
# Same y shape check as earlier in the notebook, repeated right before the train/test split.
y.shape

(168323, 6)

In [23]:
# Hold out 20% of the rows for testing; stratify on y so both subsets keep
# the same class mix, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Next, I check the proportion of each class in the training and test sets to confirm that sklearn's `stratify` option worked as expected. Each class must be represented in the same proportion in both subsets, so I defined a function in `pipeline_helpers` that calculates the normalized proportion of each class.

In [24]:

# Compute the per-class proportions of each subset with the project helper,
# then print both so they can be compared side by side.
y_train_proportions, y_test_proportions = (
    get_proportions(y_train),
    get_proportions(y_test),
)

print(f"y_train proportions: \n {y_train_proportions}\n")
print(f"y_test proportions: \n {y_test_proportions}")

y_train proportions: 
 {0: 0.3621322164297702, 1: 0.051909281290380076, 2: 0.006453385613925649, 3: 0.03219266586463485, 4: 0.48418215033640777, 5: 0.0631674315673781}

y_test proportions: 
 {0: 0.36215654240308925, 1: 0.05195306698351403, 2: 0.0064755680974305655, 3: 0.03216990940145552, 4: 0.48421208970741125, 5: 0.06318134561116887}


In [114]:
# Recompute and print the training-label proportions only.
# NOTE(review): this repeats part of the preceding cell, and its execution count (114)
# is out of sequence, so the saved output likely reflects a different y_train than
# the split above — re-run or remove this cell.
y_train_proportions = get_proportions(y_train)
print(f"y_train proportions: \n {y_train_proportions}\n")

y_train proportions: 
 {0: 0.3445756884637642, 1: 0.05817476256935698, 2: 0.006059161929400552, 3: 0.030043911903870374, 4: 0.4727167511999183, 5: 0.08846376416924805}



In [71]:
# Shape of the training labels.
# NOTE(review): the saved output (391188 rows) cannot come from an 80% split of the
# 168323 rows loaded above; it appears to be left over from another run — re-run to refresh.
y_train.shape

(391188, 6)

In [72]:
# Shape of the training features.
# NOTE(review): the saved output (391188 rows) cannot come from an 80% split of the
# 168323 rows loaded above; it appears to be left over from another run — re-run to refresh.
X_train.shape

(391188, 200)

In [271]:
# Number of label columns; the same expression sizes the model's softmax output layer.
y_train.shape[1]

6

In [25]:
# Passed to the Embedding layer as its input dimension (number of distinct token ids).
# NOTE(review): presumably matches the vocabulary cap used when X was tokenized — confirm;
# any token id >= this value would fall outside the embedding table.
num_words = 10000

In [35]:
# Two stacked bidirectional LSTMs over a learned embedding, followed by a small
# dense head. embed_dims is reused as the width of every hidden layer.
embed_dims = 32
model = Sequential([
    # Token ids -> dense vectors; input length is the number of columns in X_train.
    Embedding(num_words, embed_dims, input_length=X_train.shape[1]),
    # First recurrent block keeps the full sequence so a second LSTM can consume it.
    Bidirectional(LSTM(embed_dims, return_sequences=True)),
    Dense(embed_dims, activation='relu'),
    Dropout(0.2),
    # Second recurrent block returns only its final output.
    Bidirectional(LSTM(embed_dims)),
    Dense(embed_dims, activation='relu'),
    Dropout(0.2),
    Dense(embed_dims, activation='relu'),
    Dropout(0.2),
    # One softmax unit per label column.
    Dense(y_train.shape[1], activation='softmax'),
])

In [36]:
# Compile the model for multi-class classification on one-hot labels.
# The optimizer instance is passed to compile() directly; previously `opt` was
# created but the string 'adam' was used instead, so any tuning applied to
# `opt` (e.g. a custom learning rate) would silently have had no effect.
opt = tf.keras.optimizers.Adam()
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    # NOTE(review): 'Accuracy' is capitalized here; depending on the Keras version this
    # may resolve differently from the lowercase 'accuracy' shortcut — confirm it
    # reports the intended categorical accuracy.
    metrics=['Accuracy', 'AUC',
             'Precision', 'Recall'],
)


In [37]:
# Checkpoint callback: after each epoch, save the model only when val_loss
# reaches a new minimum, so the file on disk is always the best epoch so far.
model_path = os.path.dirname(working_dir) + '/models/'
best_model_file = model_path + 'LSTM_1_best_model.h5'
checkpoint = ModelCheckpoint(
    filepath=best_model_file,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks = [checkpoint]

In [38]:
# Train for a fixed number of epochs. Keras carves the validation set out of the
# tail of the training arrays (validation_split), the class weights rescale the
# training loss per class, and the checkpoint callback stores the best epoch.
num_epochs = 10
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    validation_split=0.2,
    class_weight=lemmatized_class_weights,
    callbacks=callbacks,
    verbose=2,
)

Epoch 1/10
3367/3367 - 466s - loss: 2.5432 - Accuracy: 0.4771 - auc: 0.8317 - precision: 0.6503 - recall: 0.4240 - val_loss: 1.2189 - val_Accuracy: 0.5136 - val_auc: 0.8715 - val_precision: 0.7000 - val_recall: 0.4840

Epoch 00001: val_loss improved from inf to 1.21893, saving model to /Users/WillemCole/Desktop/DataScience/Projects/sf_building_complaints/models/best_model.h5
Epoch 2/10
3367/3367 - 870s - loss: 1.9424 - Accuracy: 0.5225 - auc: 0.8740 - precision: 0.6719 - recall: 0.4849 - val_loss: 1.1902 - val_Accuracy: 0.5342 - val_auc: 0.8737 - val_precision: 0.6931 - val_recall: 0.4865

Epoch 00002: val_loss improved from 1.21893 to 1.19019, saving model to /Users/WillemCole/Desktop/DataScience/Projects/sf_building_complaints/models/best_model.h5
Epoch 3/10
3367/3367 - 446s - loss: 1.8014 - Accuracy: 0.5433 - auc: 0.8851 - precision: 0.6729 - recall: 0.4959 - val_loss: 1.1455 - val_Accuracy: 0.5829 - val_auc: 0.8817 - val_precision: 0.6642 - val_recall: 0.4442

Epoch 00003: val_loss

In [40]:
 # Persist the per-epoch metrics dict next to the saved model.
 # NOTE(review): the file is written with pickle even though its name ends in .h5,
 # so it must be read back with pickle.load, not an HDF5 reader.
 # NOTE(review): this cell is indented by one space; IPython accepts that, but
 # the code would not run as-is if exported to a plain .py script.
 with open(model_path + 'LSTM_model_1_history.h5', 'wb') as history_file:
     pickle.dump(history.history, history_file)
