# Training Pipeline

### Imports

In [1]:
import os

import hopsworks
import joblib
import keras_tuner as kt
import matplotlib as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from hsml.model_schema import ModelSchema
from hsml.schema import Schema
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import (LSTM, Bidirectional, Dense, Dropout,
                                     Embedding, GlobalMaxPooling1D)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from wandb.keras import WandbCallback

import wandb

2023-01-15 17:27:26.353602: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Connect to Hopsworks, Huggingface & WandB

In [2]:
project = hopsworks.login() 

fs = project.get_feature_store() 

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5322




Connected. Call `.close()` to terminate connection gracefully.


In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# Login to wandb
wandb.login()

2023-01-15 17:27:41,746 ERROR: Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33meengel7[0m ([33mtwo_data_scientists[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Load Data from Huggingface

In [5]:
ds = load_dataset("eengel7/sentiment_analysis_training", split='train')
data_df = pd.DataFrame(data = ds, columns=['Sentiment',  'Headline'])

y = pd.get_dummies(data_df['Sentiment'])
X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(data_df['Headline'].to_list()),y, test_size=0.2, random_state=42)



## LSTM Model

In [6]:
# Preprocessing & Model params
voc_size = 5000 
max_len = 60
embedding_vector_features = 40

# Training params
epochs = 20
batch_size = 256

In [12]:
run = wandb.init(project='sentiment_analysis')



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01671273005000001, max=1.0)…



In [11]:
callbacks = [
    EarlyStopping(     #EarlyStopping is used to stop at the epoch where val_accuracy does not improve significantly
        monitor='val_accuracy',
        min_delta=1e-4,
        patience=4,
        verbose=1
    ),
    ModelCheckpoint(
        filepath='weights.h5',
        monitor='val_accuracy', 
        mode='max', 
        save_best_only=True,
        save_weights_only=True,
        verbose=1
    ),
    WandbCallback()
]



In [8]:
# Hypertuning the model
def model_builder(hp):
      '''
      Args:
      hp - Keras tuner object
      '''
      model = Sequential()
      model.add(Embedding(voc_size,embedding_vector_features,input_length = max_len))
      model.add(Bidirectional(LSTM(128, return_sequences=True))) 
      model.add(Bidirectional(LSTM(64, return_sequences=True)))
      model.add(GlobalMaxPooling1D()) #Pooling Layer decreases sensitivity to features, thereby creating more generalised data for better test results.
      model.add(Dense(1024))
      model.add(Dropout(0.25)) #Dropout layer nullifies certain random input values to generate a more general dataset and prevent the problem of overfitting.
      model.add(Dense(512))
      model.add(Dropout(0.25))
      model.add(Dense(256))
      model.add(Dropout(0.25))
      model.add(Dense(128))
      model.add(Dropout(0.25))
      model.add(Dense(64))
      model.add(Dropout(0.25))
      model.add(Dense(3, activation='softmax')) #softmax is used as the activation function for multi-class classification problems where class membership is required on more than two class labels.


      # Tune the learning rate for the optimizer
      # Choose an optimal value from 0.01, 0.001, or 0.0001
      hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
      model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
      return model

In [9]:
# Instantiate the tuner
tuner = kt.Hyperband(model_builder, # the hypermodel
                    objective='val_accuracy', # objective to optimize
                    directory='dir', # directory to save logs 
                    project_name='ht_learning_rate')

# hypertuning settings
tuner.search_space_summary() 

2023-01-15 17:27:59,747 INFO: Reloading Oracle from existing project dir/ht_learning_rate/oracle.json


2023-01-15 17:27:59.791905: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


2023-01-15 17:28:01,302 INFO: Reloading Tuner from dir/ht_learning_rate/tuner0.json
Search space summary
Default search space size: 1
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [13]:
# Perform hypertuning
tuner.search(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), callbacks=[callbacks])

2023-01-15 17:28:23,968 INFO: Oracle triggered exit


In [14]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"The hyperparameter search is complete. The optimal learning rate for the optimizeris {best_hps.get('learning_rate')}.")

The hyperparameter search is complete. The optimal learning rate for the optimizeris 0.001.


In [15]:
# Fit Model

    # Build the model with the optimal hyperparameters
model = tuner.hypermodel.build(best_hps)
model.summary()

model.fit(X_train, 
          y_train, 
          batch_size = batch_size, 
          validation_data=(X_test, y_test), 
          epochs = epochs, 
          callbacks=callbacks)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 60, 40)            200000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 60, 256)          173056    
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 60, 128)          164352    
 nal)                                                            
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_6 (Dense)             (None, 1024)              132096    
                                                      

[34m[1mwandb[0m: Adding directory to artifact (/Users/evaengel/news_articles_sentiment/src/wandb/run-20230115_172749-1600scf7/files/model-best)... Done. 0.1s


Epoch 2/20
Epoch 2: val_accuracy improved from 0.60565 to 0.68261, saving model to weights.h5
2023-01-15 17:30:32,168 INFO: Assets written to: /Users/evaengel/news_articles_sentiment/src/wandb/run-20230115_172749-1600scf7/files/model-best/assets


[34m[1mwandb[0m: Adding directory to artifact (/Users/evaengel/news_articles_sentiment/src/wandb/run-20230115_172749-1600scf7/files/model-best)... Done. 0.1s


Epoch 3/20
Epoch 3: val_accuracy did not improve from 0.68261
Epoch 4/20
Epoch 4: val_accuracy did not improve from 0.68261
Epoch 5/20
Epoch 5: val_accuracy did not improve from 0.68261
Epoch 6/20
Epoch 6: val_accuracy did not improve from 0.68261
Epoch 6: early stopping


<keras.callbacks.History at 0x138aab250>

In [16]:
# Close W&B run
wandb.finish()



0,1
accuracy,▁▅▆▇██
epoch,▁▂▄▅▇█
loss,█▅▃▂▂▁
val_accuracy,▁█▇▆▆▆
val_loss,▂▁▂▄▅█

0,1
accuracy,0.91685
best_epoch,1.0
best_val_loss,0.71087
epoch,5.0
loss,0.22976
val_accuracy,0.65609
val_loss,1.30248


## Upload the model

In [17]:
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(input_schema, output_schema)

In [18]:
mr = project.get_model_registry()

model_dir="headlines_sentiment_model"
if os.path.isdir(model_dir) == False:
    os.mkdir(model_dir)

joblib.dump(model, model_dir + "/headlines_sentiment_model.pkl")

Connected. Call `.close()` to terminate connection gracefully.
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......bidirectional
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......bidirectional_1
.........backward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........forward_layer
............cell
...............vars
..................0
..................1
..................2
............vars
.........layer
............cell
...............vars
............vars
.........vars
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
.......

['headlines_sentiment_model/headlines_sentiment_model.pkl']

In [21]:
headlines_sentiment_model = mr.python.create_model(
        name = "headlines_sentiment_model", 
        model_schema=model_schema,
        description="Predicting Sentiment of Headlines",
        version = 1
    )

In [23]:
headlines_sentiment_model.save(model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/5322/models/headlines_sentiment_model/1


Model(name: 'headlines_sentiment_model', version: 1)