<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [30]:
import sys
import numpy as np
from numpy import array
import os
import pandas as pd
import pickle
from tqdm.auto import tqdm
sys.path.append('../')
from src.pipeline_helpers import get_proportions
from src.clean_data import normalize_text
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Dense, Embedding, Flatten, Activation, LeakyReLU,Bidirectional, LSTM, BatchNormalization, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.callbacks import EarlyStopping

In [69]:
working_dir = os.getcwd()
data_path = os.path.dirname(working_dir) + '/data/'
df = pd.read_csv(data_path + 'preprocessed.csv')

In [3]:
df.head()

Unnamed: 0,complaint_description,assigned_division
0,Date last observed: 29-jun-20; time last ob...,Housing Inspection Services
1,Unpermitted interior framing at 3rd level atti...,Electrical Inspection Division
2,Elevator (in the parking area )to condos is in...,Housing Inspection Services
3,Complainant is concerned about the lenght of t...,Building Inspection Division
4,Date last observed: 06-jan-21; time last ob...,Building Inspection Division


In [4]:
#ensure there are no null values or duplicates
print(f"Null: \n {df.isna().sum()} \n")
print(f"Duplicates:  {df.duplicated().sum()}")

Null: 
 complaint_description    0
assigned_division        0
dtype: int64 

Duplicates:  0


In [5]:
df.shape

(183607, 2)

In [6]:
df.assigned_division.value_counts()

Housing Inspection Services       86793
Building Inspection Division      63267
Plumbing Inspection Division      16241
Code Enforcement Section          10680
Electrical Inspection Division     5515
Disabled Access Division           1111
Name: assigned_division, dtype: int64

Now that we have loaded our preprocessed data we're ready to prepare it for machine learning using the normalize_text function declared in our clean_data module. 

We will prepare a stemmed and lemmatized version of the data so that we can explore the change in performance from both approaches. Lemmatization takes significantly longer, but can yield improved accuracy. 

We are using tqdm.pandas to keep track of our functions with a progress bar. 

In [7]:
#create stemmed text
# tqdm.pandas()
# stemmed = df.copy()
# stemmed['complaint_description'] = stemmed['complaint_description'].progress_apply(lambda x: normalize_text(x))
# stemmed.to_csv(data_path + '/stemmed_text.csv')

In [8]:
#create lemmatized text
# tqdm.pandas()
# lemmatized = df.copy()
# lemmatized['complaint_description'] = lemmatized['complaint_description'].progress_apply(lambda x: normalize_text(x, lemmatize = True))
# lemmatized.to_csv(data_path + '/lemmatized_text.csv')

In [210]:
lemm = pd.read_csv(data_path + 'lemmatized_text.csv',index_col=[0])
stemmed = pd.read_csv(data_path + 'stemmed_text.csv',index_col=[0])

In [211]:
lemm.head()

Unnamed: 0,complaint_description,assigned_division
0,"['date', 'last', 'observe', 'jun', 'time', 'la...",Housing Inspection Services
1,"['unpermitte', 'interior', 'frame', 'rd', 'lev...",Electrical Inspection Division
2,"['elevator', 'parking', 'area', 'condo', 'inop...",Housing Inspection Services
3,"['complainant', 'concerned', 'lenght', 'time',...",Building Inspection Division
4,"['date', 'last', 'observe', 'jan', 'time', 'la...",Building Inspection Division


In [212]:
print(lemm.isna().sum())
print("Duplicated :", lemm.duplicated().sum())

complaint_description    0
assigned_division        0
dtype: int64
Duplicated : 15284


In [213]:
lemm.drop_duplicates(inplace = True)

In [214]:
lemm.assigned_division.value_counts()

Housing Inspection Services       81498
Building Inspection Division      60955
Plumbing Inspection Division      10631
Code Enforcement Section           8737
Electrical Inspection Division     5417
Disabled Access Division           1085
Name: assigned_division, dtype: int64

In [215]:
classes_lemmatized = lemm.assigned_division
class_weights_lemmatized = class_weight.compute_class_weight('balanced',
                                                 classes = classes.unique(),y = classes)
class_weights_lemmatized = dict(enumerate(class_weights_lemmatized))

In [216]:
print(class_weights_lemmatized)

{0: 0.34422726120068387, 1: 5.178850532274937, 2: 0.4602384272550789, 3: 2.638870598563948, 4: 3.2109228949677617, 5: 25.856067588325654}


In [217]:
classes.size

168323

In [218]:
classes_stemmed = stemmed.assigned_division
class_weights_stemmed = class_weight.compute_class_weight('balanced',
                                                 classes = classes_stemmed.unique(),y = classes_stemmed)
class_weights_stemmed = dict(enumerate(class_weights_stemmed))

In [219]:
print(class_weights_stemmed)

{0: 0.3525764366557979, 1: 5.548715624055606, 2: 0.4836829099952055, 3: 1.8841922705908913, 4: 2.8652777777777776, 5: 27.543804380438043}


In [220]:
encoder = LabelBinarizer()

X_lemmatized = lemm.complaint_description
y_lemmatized = encoder.fit_transform(classes)

X_stemmed = stemmed.complaint_description
y_stemmed = encoder.fit_transform(classes_stemmed)

In [221]:
X_lemmatized.shape

(168323,)

In [222]:
y_lemmatized.shape

(168323, 6)

In [223]:
X_stemmed.shape

(183607,)

In [224]:
y_stemmed.shape

(183607, 6)

In [225]:
print(X_lemmatized)

0         ['date', 'last', 'observe', 'jun', 'time', 'la...
1         ['unpermitte', 'interior', 'frame', 'rd', 'lev...
2         ['elevator', 'parking', 'area', 'condo', 'inop...
3         ['complainant', 'concerned', 'lenght', 'time',...
4         ['date', 'last', 'observe', 'jan', 'time', 'la...
                                ...                        
183600    ['go', 'beyond', 'scope', 'work', 'remove', 'e...
183601    ['date', 'last', 'observe', 'mar', 'time', 'la...
183603    ['bathroom', 'community', 'bathroom', 'area', ...
183604    ['van', 'ness', 'construction', 'site', 'van',...
183605                ['outdoor', 'pipe', 'leak', 'studio']
Name: complaint_description, Length: 168323, dtype: object


In [226]:
print(y_lemmatized)

[[0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 ...
 [0 0 0 0 1 0]
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]]


In [227]:
print(X_stemmed)

0         ['date', 'last', 'observ', 'jun', 'time', 'las...
1         ['unpermit', 'interior', 'frame', 'rd', 'level...
2               ['elev', 'park', 'area', 'condo', 'inoper']
3         ['complain', 'concern', 'lenght', 'time', 'tak...
4         ['date', 'last', 'observ', 'jan', 'time', 'las...
                                ...                        
183602    ['need', 'renew', 'boiler', 'permit', 'permit'...
183603    ['bathroom', 'commun', 'bathroom', 'area', 'ke...
183604    ['van', 'ness', 'construct', 'site', 'van', 'n...
183605                ['outdoor', 'pipe', 'leak', 'studio']
183606    ['need', 'renew', 'boiler', 'permit', 'permit'...
Name: complaint_description, Length: 183607, dtype: object


In [228]:
print(y_stemmed)

[[0 0 0 0 1 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 ...
 [1 0 0 0 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]


In [229]:
#define tensorflow tokenizer for the lemmatized text
num_words = 10000
tokenizer_lemmatized = Tokenizer(num_words=num_words, oov_token='<UNK>')

#fit tokenizer to text
tokenizer_lemmatized.fit_on_texts(X_lemmatized)

#save the tokenizer so we can use it later to process data for predictions
lemmatized_data = [class_weights_lemmatized,tokenizer_lemmatized]
with open('lemmatized_tokenizer_and_weights.pickle', 'wb') as f:
    pickle.dump(lemmatized_data, f)


#define variables for word count and index
word_count = tokenizer.word_counts
word_index = tokenizer.word_index


#encode the data into a sequence 
X_lemmatized_sequences = tokenizer.texts_to_sequences(X_lemmatized)

#pad the sequences
X_lemmatized = pad_sequences(X_sequences, padding='post',
                truncating='post', maxlen=200)

In [230]:
#define tensorflow tokenizer for the stemmed text
num_words = 10000
tokenizer_stemmed = Tokenizer(num_words=num_words, oov_token='<UNK>')

#fit tokenizer to text
tokenizer_stemmed.fit_on_texts(X_stemmed)

#save the tokenizer so we can use it later to process data for predictions
stemmed_data = [class_weights_stemmed,tokenizer_stemmed]
with open('stemmed_tokenizer_and_weights.pickle', 'wb') as handle:
    pickle.dump(stemmed_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

#encode the data into a sequence 
X_sequences_stemmed = tokenizer_stemmed.texts_to_sequences(X_stemmed)

#pad the sequences
X_stemmed = pad_sequences(X_sequences_stemmed, padding='post',
                truncating='post', maxlen=200)

In [207]:
print(stemmed_data)

[{0: 0.3525764366557979, 1: 5.548715624055606, 2: 0.4836829099952055, 3: 1.8841922705908913, 4: 2.8652777777777776, 5: 27.543804380438043}, <keras_preprocessing.text.Tokenizer object at 0x7fd36c79dee0>]


In [208]:
print(lemmatized_data)

[{0: 0.34422726120068387, 1: 5.178850532274937, 2: 0.4602384272550789, 3: 2.638870598563948, 4: 3.2109228949677617, 5: 25.856067588325654}, <keras_preprocessing.text.Tokenizer object at 0x7fd3605b6400>]


In [100]:
y_stemmed.shape

(183607, 6)

In [47]:
display(word_count)

OrderedDict([("'date'", 22266),
             ("'last'", 40165),
             ("'observe'", 38319),
             ("'jun'", 1878),
             ("'time'", 21038),
             ("'floor'", 30969),
             ("'nd'", 4748),
             ("'unit'", 38049),
             ("'exact'", 21035),
             ("'location'", 22989),
             ("'main'", 17400),
             ("'bldg'", 20620),
             ("'build'", 63665),
             ("'type'", 21406),
             ("'insectsrodent'", 1235),
             ("'additional'", 20228),
             ("'information'", 19590),
             ("'many'", 1742),
             ("'different'", 390),
             ("'fly'", 695),
             ("'infestation'", 2007),
             ("'affect'", 627),
             ("'multiple'", 1519),
             ("'face'", 1022),
             ("'rear'", 9809),
             ("'several'", 1961),
             ("'attempt'", 357),
             ("'counter'", 499),
             ("'solution'", 72),
             ("'management'", 1692)

In [48]:
display(word_index)

{'<UNK>': 1,
 "'build'": 2,
 "'permit'": 3,
 "'work'": 4,
 "'last'": 5,
 "'observe'": 6,
 "'unit'": 7,
 "'floor'": 8,
 "'water'": 9,
 "'location'": 10,
 "'without'": 11,
 "'construction'": 12,
 "'date'": 13,
 "'type'": 14,
 "'time'": 15,
 "'exact'": 16,
 "'bldg'": 17,
 "'additional'": 18,
 "'information'": 19,
 "'window'": 20,
 "'wall'": 21,
 "'illegal'": 22,
 "'residencedwelle'": 23,
 "'main'": 24,
 "'leak'": 25,
 "'wo'": 26,
 "'bathroom'": 27,
 "'door'": 28,
 "'do'": 29,
 "'property'": 30,
 "'kitchen'": 31,
 "'room'": 32,
 "'garage'": 33,
 "'back'": 34,
 "'perform'": 35,
 "'house'": 36,
 "'heat'": 37,
 "'fire'": 38,
 "'person'": 39,
 "'paint'": 40,
 "'mold'": 41,
 "'front'": 42,
 "'electrical'": 43,
 "'use'": 44,
 "'identity'": 45,
 "'roof'": 46,
 "'ceiling'": 47,
 "'rear'": 48,
 "'break'": 49,
 "'go'": 50,
 "'damage'": 51,
 "'area'": 52,
 "'come'": 53,
 "'plumb'": 54,
 "'nt'": 55,
 "'scope'": 56,
 "'street'": 57,
 "'pm'": 58,
 "'neighbor'": 59,
 "'problem'": 60,
 "'side'": 61,
 "'al

In [86]:
display(X)

array([[  13,    5,    6, ...,    0,    0,    0],
       [ 370,  212,  270, ...,    0,    0,    0],
       [ 105,  208,   52, ...,    0,    0,    0],
       ...,
       [  27,  888,   27, ...,    0,    0,    0],
       [ 893, 1129,   12, ...,    0,    0,    0],
       [1199,  102,   25, ...,    0,    0,    0]], dtype=int32)

In [82]:
display(y)

array([[0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0]])

In [148]:
X_stemmed.shape

(183607, 200)

In [149]:
y_stemmed.shape

(183607, 6)

In [152]:
np.save(data_path + 'X_lemmatized_prepared.npy', X_lemmatized)
np.save(data_path + 'y_lemmatized_prepared.npy', y_lemmatized)

np.save(data_path + 'X_stemmed_prepared.npy', X_stemmed)
np.save(data_path + 'y_stemmed_prepared.npy', y_stemmed)