# Training Pipeline

We use an LSTM network as the classification model for predicting the sentiment of headlines.


### Imports

In [8]:
import os

import hopsworks
import joblib
import numpy as np
import pandas as pd
from datasets import load_dataset
from hsml.model_schema import ModelSchema
from hsml.schema import Schema
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential

### Connect to Hopsworks & Hugging Face

In [70]:
# Log in to Hopsworks; `hopsworks` is imported in the notebook's import cell
# so that all dependencies are declared in one place.
project = hopsworks.login()

# Handle to the project's feature store.
fs = project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/5322




Connected. Call `.close()` to terminate connection gracefully.


In [None]:
# Log in to the Hugging Face Hub from within the notebook
# (`notebook_login` is imported from `huggingface_hub`).
notebook_login()

## Load Data from Hugging Face

In [62]:
# Load the "train" split of the sentiment-analysis training dataset.
ds = load_dataset(
    "eengel7/sentiment_analysis_training",
    split="train",
)

Using custom data configuration eengel7--sentiment_analysis_training-fd8af630411595dd
Found cached dataset parquet (/Users/evaengel/.cache/huggingface/datasets/eengel7___parquet/eengel7--sentiment_analysis_training-fd8af630411595dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [75]:
# Build a DataFrame from the dataset, restricted to the label and text columns.
data_df = pd.DataFrame(data=ds, columns=["Sentiment", "Headline"])

# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_df["Headline"],
    data_df["Sentiment"],
    test_size=0.2,
    random_state=42,
)

In [76]:
# Describe the model's interface for the registry: the schema is derived from
# the training inputs (headlines) and training targets (sentiment labels).
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

In [64]:
# Convert every split into a plain Python list via `.to_list()`.
X_train, X_test, y_train, y_test = (
    split.to_list() for split in (X_train, X_test, y_train, y_test)
)

## LSTM Model

In [65]:
# Hyperparameters (chosen arbitrarily, not tuned).
voc_size = 5000  # first positional argument of the Embedding layer
max_len = 60  # `input_length` of the Embedding layer
embedding_vector_features = 40  # second positional argument of the Embedding layer

In [66]:
# Binary classifier: embedding -> dropout -> LSTM -> dropout -> sigmoid unit.
model = Sequential(
    [
        # mask_zero=True tells downstream layers to treat index 0 as padding.
        Embedding(
            voc_size,
            embedding_vector_features,
            input_length=max_len,
            mask_zero=True,
        ),
        Dropout(0.5),
        LSTM(200),
        Dropout(0.5),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [67]:
# Train for 10 epochs with the held-out split passed as validation data.
# NOTE(review): assumes the `Headline` values are already integer-encoded,
# fixed-length sequences that the Embedding layer can consume, and that the
# labels are 0/1 — TODO confirm against the dataset.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Upload the model

In [79]:
# Handle to the project's model registry.
mr = project.get_model_registry()

# Local directory that holds the serialized model artifact.
model_dir = "headlines_sentiment_model"
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of testing for the directory before calling mkdir.
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): the Keras model is pickled with joblib. Pickles are tied to the
# library versions that wrote them and must only be loaded from trusted
# sources; the format is kept as-is so existing consumers of the .pkl file
# keep working.
joblib.dump(model, model_dir + "/headlines_sentiment_model.pkl")

Connected. Call `.close()` to terminate connection gracefully.
Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-01-13 12:49:34         2502
metadata.json                                  2023-01-13 12:49:34           64
variables.h5        

['headlines_sentiment_model/headlines_sentiment_model.pkl']

In [83]:
# Alias retained so that any cell referring to `history1` keeps working.
history1 = history

# Final-epoch metrics. The training accuracy alone overstates how well the
# model generalizes, so the accuracy on the held-out split (available because
# `validation_data` was passed to `fit`) is registered next to it.
acc = history.history["accuracy"][-1]
val_acc = history.history["val_accuracy"][-1]

# Create the registry entry (metadata only) for the trained model.
headlines_sentiment_model = mr.python.create_model(
    name="headlines_sentiment_model",
    metrics={"accuracy": acc, "val_accuracy": val_acc},
    model_schema=model_schema,
    description="Predicting Sentiment of Headlines",
)

In [84]:
# Save the registry entry, passing the local directory that holds the
# serialized model artifact.
headlines_sentiment_model.save(model_dir)


  0%|          | 0/6 [00:00<?, ?it/s]

Model created, explore it at https://c.app.hopsworks.ai:443/p/5322/models/headlines_sentiment_model/1


Model(name: 'headlines_sentiment_model', version: 1)