<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [215]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.layers import Input, Dropout, Dense, Flatten
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tqdm.auto import tqdm
from transformers import BertTokenizer
from transformers import TFDistilBertModel, DistilBertConfig, DistilBertTokenizer
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard

# Make the project root importable before pulling in local helpers.
sys.path.append('../')
from src.pipeline_helpers import get_proportions

In [216]:
# Resolve the data and model folders, which sit next to the directory
# this notebook is executed from.
working_dir = os.getcwd()
project_root = os.path.dirname(working_dir)
data_path = f'{project_root}/data/'
model_path = f'{project_root}/models/'

In [217]:
# Load the preprocessed complaints CSV from the data directory.
preprocessed_csv = f'{data_path}preprocessed.csv'
df = pd.read_csv(preprocessed_csv)

In [218]:
# Peek at ten random rows; the seed keeps the displayed sample identical
# across Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,complaint_description,assigned_division
148462,Needs to renew boiler permit for permit no 960...,Plumbing Inspection Division
114602,Drain in bathtub is slow. leaky windows in be...,Housing Inspection Services
86142,No violations found at the time of inspection-...,Housing Inspection Services
162618,Buckets of gas,Housing Inspection Services
1314,Attached c. green's letter dated - 3/16/95 (pa...,Housing Inspection Services
84508,"No heat, roach infestation and shower in basem...",Housing Inspection Services
158143,Date last observed: 19-mar-19; time last ob...,Building Inspection Division
149530,Responding to an emergency call out form sfpd ...,Building Inspection Division
54820,"Big pile of junks - desk, wood, mattress in fr...",Building Inspection Division
48074,Front door closer not working properly. hinge...,Housing Inspection Services


In [219]:
# Count missing values per column before modelling.
df.isna().sum()

complaint_description    0
assigned_division        0
dtype: int64

In [220]:
# Features are the free-text complaint descriptions; targets are the
# division each complaint was assigned to.
X = df['complaint_description']
y = df['assigned_division']

In [221]:
# One-hot encode the division labels. The encoder is fitted on the raw
# DataFrame column instead of on `y` itself, so re-running this cell never
# feeds an already binarized array back into the encoder (which would
# replace the division names in `encoder.classes_`).
encoder = LabelBinarizer()
y = encoder.fit_transform(df.assigned_division)

In [222]:
# Stratified 80/10/10 split: first carve off a 20% hold-out set, then halve
# that hold-out into the test and validation sets.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=.5, stratify=y_holdout, random_state=42
)

In [223]:
# Summarise how the training labels are distributed across classes.
# NOTE(review): get_proportions comes from src.pipeline_helpers; judging by the
# printed output it returns a {class index: share} mapping — confirm in the helper.
y_train_proportions = get_proportions(y_train)
print(f"y_train proportions: \n {y_train_proportions}\n")

y_train proportions: 
 {0: 0.35785110590354746, 1: 0.060562557617469266, 2: 0.48990506752239604, 3: 0.09170238070105066}



In [224]:
# Inspect the training texts (pandas truncates the display).
X_train

96420     There is construction work starting early in t...
79745     Construction work without a permit. a door has...
132263                                     Illegal unit....
131793    Illegal work going on in basement area. work w...
15135     Complainant is having problems with the neighb...
                                ...                        
18490     The manager/owner of the bldg. turns off water...
101443    The property at 1654 kirkwood is an abandoned,...
151227    Date last observed: 12-sep-19;    time last ob...
142366    A 3-story building - outside lath has been det...
91065     Construction debris -tar, tarps, exposed tar n...
Name: complaint_description, Length: 142101, dtype: object

In [225]:
# Inspect the one-hot encoded training labels.
y_train

array([[0, 0, 1, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0]])

In [226]:
# Column order of the one-hot labels: position i corresponds to classes_[i].
encoder.classes_

array(['Building Inspection Division', 'Code Enforcement Section',
       'Housing Inspection Services', 'Plumbing Inspection Division'],
      dtype='<U28')

In [227]:
def tokenize(sentences, tokenizer, max_length=128):
    """Encode sentences into fixed-length token-id and attention-mask tensors.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: Hugging Face tokenizer exposing ``encode_plus``.
        max_length: number of tokens each sequence is padded or truncated to.
            Must match the input length the downstream model is built with.

    Returns:
        Tuple ``(input_ids, input_masks)`` of tensors with one row per
        sentence and ``max_length`` columns.
    """
    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # Only ids and attention masks are consumed below, so token type ids
        # are not requested.
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True,
                                       max_length=max_length, truncation=True, padding='max_length',
                                       return_attention_mask=True, return_token_type_ids=False)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])

    return (tf.convert_to_tensor(input_ids), tf.convert_to_tensor(input_masks))

In [228]:
# Tokenizer matching the pretrained DistilBERT checkpoint used below.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode every split with the same tokenizer. The validation and test calls
# previously referenced `bert_tokenizer`, which is never defined in this
# notebook and raises NameError on a fresh kernel.
X_train_input_ids, X_train_input_masks = tokenize(X_train, distilbert_tokenizer)
X_val_input_ids, X_val_input_masks = tokenize(X_val, distilbert_tokenizer)
X_test_input_ids, X_test_input_masks = tokenize(X_test, distilbert_tokenizer)

  0%|          | 0/142101 [00:00<?, ?it/s]

  0%|          | 0/17763 [00:00<?, ?it/s]

  0%|          | 0/17763 [00:00<?, ?it/s]

In [229]:
DISTILBERT_DROPOUT = 0.2
DISTILBERT_ATT_DROPOUT = 0.2

# Configuration for the pretrained encoder: custom dropout rates, and all
# hidden states are returned alongside the final one.
config = DistilBertConfig(
    output_hidden_states=True,
    attention_dropout=DISTILBERT_ATT_DROPOUT,
    dropout=DISTILBERT_DROPOUT,
)

# Headless pretrained DistilBERT that emits raw hidden states; the
# classification head is attached separately.
distilbert = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

# Freeze the encoder so only layers added on top of it are trained.
for layer in distilbert.layers:
    layer.trainable = False

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'vocab_projector', 'activation_13', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [236]:
MAX_LENGTH = 128
# NOTE(review): LAYER_DROPOUT is not used by build_model; it appears reserved
# for optional extra layers between the encoder and the output layer.
LAYER_DROPOUT = 0.2
LEARNING_RATE = 5e-5
RANDOM_STATE = 42

def build_model(transformer, num_classes, max_length=MAX_LENGTH,
                learning_rate=LEARNING_RATE, random_state=RANDOM_STATE):
    """
    Builds a multi-class classifier on top of a headless Hugging Face
    transformer.

    Input:
      - transformer:    base Hugging Face transformer with no head.
      - num_classes:    number of target classes, i.e. the width of the
                        one-hot label vectors. Must be at least 2.
      - max_length:     number of encoded tokens per input sequence.
      - learning_rate:  learning rate handed to the Adam optimizer.
      - random_state:   seed for the output layer's weight initializer.

    Output:
      - model:          a compiled tf.keras.Model that feeds the [CLS]
                        representation of the transformer into a softmax
                        classification layer.

    Raises:
      - ValueError:     if num_classes is smaller than 2; a softmax over a
                        single unit always outputs 1 and cannot learn.
    """
    if num_classes < 2:
        raise ValueError(
            f"num_classes must be at least 2 for a softmax output, got {num_classes}"
        )

    # Metrics to monitor (named so they do not shadow any `metrics` module).
    monitored_metrics = [
        tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc'),
    ]

    # Seeded weight initializer to keep the head's starting weights reproducible.
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=random_state)

    # Input layers: token ids and the matching attention masks.
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name='input_ids',
                                            dtype='int32')
    input_masks_layer = tf.keras.layers.Input(shape=(max_length,),
                                              name='input_attention',
                                              dtype='int32')

    # Hidden state of the transformer's last layer. The inputs are passed as
    # a positional list: token ids first, attention mask second.
    last_hidden_state = transformer([input_ids_layer, input_masks_layer])[0]

    # Keep only the output for the [CLS] token, located at index 0 of every
    # encoded sequence; slicing it out yields 2D data (batch, hidden size).
    cls_token = last_hidden_state[:, 0, :]

    ##                                                 ##
    ## Define additional dropout and dense layers here ##
    ##                                                 ##

    # Output layer: one softmax unit per class (multi-class classification).
    output = tf.keras.layers.Dense(num_classes,
                                   activation='softmax',
                                   kernel_initializer=weight_initializer,
                                   bias_initializer='zeros'
                                   )(cls_token)

    # Define the model
    model = tf.keras.Model([input_ids_layer, input_masks_layer], output)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=monitored_metrics)

    return model

In [237]:
# Build the classifier; the number of classes is the width of the one-hot labels.
model = build_model(transformer=distilbert,num_classes=y_train.shape[1])

In [238]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_attention (InputLayer)    [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_7 (TFDisti TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 input_attention[0][0]            
__________________________________________________________________________________________________
tf.__operators__.getitem_8 (Sli (None, 768)          0           tf_distil_bert_model_7[1][

In [239]:
# Sanity check: one row per training sample, one column per token position.
print(X_train_input_ids.shape)

(142101, 128)


In [240]:
# Define callbacks for our model.
# NOTE(review): checkpoints are written under 'LSTM/' although this notebook
# trains a DistilBERT-based model — confirm the intended folder name.
checkpoint_dir = model_path + 'LSTM/'

# Make sure the target folder exists so the first checkpoint write cannot
# fail on a missing directory.
os.makedirs(checkpoint_dir, exist_ok=True)

# Keep only the weights with the lowest validation loss seen so far.
checkpoint = ModelCheckpoint(filepath=checkpoint_dir + f'model_{dt.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")}_best.h5',
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min')

tqdm_callback = tfa.callbacks.TQDMProgressBar()

callbacks = [checkpoint,
             tqdm_callback]


In [242]:
EPOCHS = 4
BATCH_SIZE = 64
# Batches per epoch, counting the final partial batch (ceiling division).
NUM_STEPS = -(-X_train_input_ids.shape[0] // BATCH_SIZE)

# Train the model. `steps_per_epoch` is intentionally not passed: with
# in-memory tensors and `batch_size`, Keras derives the number of steps
# itself, whereas the previous floor-divided value skipped the last partial
# batch of every epoch.
history = model.fit(
    x=[X_train_input_ids, X_train_input_masks],
    y=y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=([X_val_input_ids, X_val_input_masks], y_val),
    verbose=2,
    callbacks=callbacks,
)

In [243]:
# Show the trained model object.
model

<keras.engine.functional.Functional at 0x172103e50>

In [247]:
# Predict per-class probabilities for the test set with `predict`
# (one softmax row per test sample).
y_pred = model.predict([X_test_input_ids, X_test_input_masks])

In [None]:
# Collapse probability rows and one-hot label rows to integer class indices.
predictions = y_pred.argmax(axis=1)
test_labels = y_test.argmax(axis=1)

In [None]:
# Confusion matrix of the test predictions, labelled with the division names
# (rows: true division, columns: predicted division). Passing `labels` fixes
# the class order and keeps the matrix square even if a class is absent.
class_ids = np.arange(len(encoder.classes_))
confusion = pd.DataFrame(
    metrics.confusion_matrix(test_labels, predictions, labels=class_ids),
    index=encoder.classes_,
    columns=encoder.classes_,
)
confusion

In [None]:
# Per-class precision, recall and F1 on the test set, reported with the
# division names instead of bare class indices.
print(metrics.classification_report(
    test_labels,
    predictions,
    labels=np.arange(len(encoder.classes_)),
    target_names=encoder.classes_,
))