In [None]:
from sys import version_info
import os
import io
import joblib
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

import mlflow
import dvc.api
import warnings

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# MLflow endpoints are taken from the environment; a missing variable
# raises KeyError right here instead of failing later in the notebook.
backend_uri = os.environ['MLFLOW_TRACKING_URI']
artifact_uri = os.environ['MLFLOW_ARTIFACT_STORE']
mlflow.set_tracking_uri(backend_uri)

# Interpreter version formatted as "major.minor.micro".
PYTHON_VERSION = ".".join(str(part) for part in version_info[:3])

# Local file names for the serialized preprocessing pipeline and Keras model.
pipeline_path = "pipe.pkl"
model_path = "model.h5"

# DVC coordinates of the dataset.
path = 'dataset/housing.csv'
repo = '../.git'
version = 'v1'  # Git tag

# Resolve the storage URL of that file at that revision.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

In [None]:
def fetch_data_from_s3(path, repo, version):
    """Read a DVC-tracked CSV and return it as a DataFrame.

    The file content is obtained with ``dvc.api.read`` for ``path`` inside
    ``repo`` at revision ``version`` and then parsed as comma-separated text.
    """
    csv_text = dvc.api.read(path=path, repo=repo, rev=version)
    buffer = io.StringIO(csv_text)
    return pd.read_csv(buffer, sep=",")


def fetch_data_from_fs(url):
    """Parse the comma-separated file at ``url`` into a DataFrame.

    ``url`` is handed to ``pandas.read_csv`` unchanged.
    """
    frame = pd.read_csv(filepath_or_buffer=url, sep=",")
    return frame


def fetch_data(url):
    """Load the dataset, choosing the reader from the prefix of ``url``.

    The text before the first ":" in ``url`` is taken as the storage type.
    When it upper-cases to "S3" the data is read through
    ``fetch_data_from_s3``; any other value is passed to
    ``fetch_data_from_fs``.
    """
    storage_type, _, _ = url.partition(":")
    if storage_type.upper() != "S3":
        return fetch_data_from_fs(url)
    # NOTE: this branch does not use ``url`` itself; it reads the file
    # identified by the notebook-level ``path``, ``repo`` and ``version``.
    return fetch_data_from_s3(path, repo, version)

In [None]:
# Load the dataset from wherever `data_url` points and preview the first five rows.
housing = fetch_data(url=data_url)
housing.head(n=5)

In [None]:
# Column roles: regression target, categorical features, numeric features.
targetCol = "median_house_value"
catCols = ["ocean_proximity"]
numCols = ["housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income"]

# Numeric features: fill missing values (SimpleImputer's default strategy),
# then standardize.
num_pipeline = Pipeline([
                    ("Imputer", SimpleImputer()),
                    ("Scaler", StandardScaler())
                ])

full_pipeline = ColumnTransformer(
    [
        ("Numerical_Pipeline", num_pipeline, numCols),
        # handle_unknown="ignore": a category that was not present when the
        # encoder was fitted is encoded as all zeros instead of raising a
        # ValueError during validation or serving.
        ("OneHot", OneHotEncoder(handle_unknown="ignore"), catCols),
    ],
    # Always stack the transformer outputs into a dense array: the result is
    # fed directly to the Keras model, so it must not depend on the
    # density heuristic that decides between sparse and dense output.
    sparse_threshold=0,
)

In [None]:
# Separate the target from the model features.
y = housing[targetCol]
x = housing[numCols + catCols]

# Hold out 20% of the rows for validation (shuffled, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation split.
x_train = full_pipeline.fit_transform(x_train)
x_val = full_pipeline.transform(x_val)

In [None]:
# Fix TensorFlow's global seed before any layer is built so that randomly
# initialised weights are repeatable across "Restart & Run All".
tf.random.set_seed(42)

# Two small ReLU hidden layers followed by a single regression output.
model = Sequential([
    # One input per column of the preprocessed feature matrix.
    Input(shape=(x_train.shape[1],)),
    Dense(11, activation='relu'),
    Dense(11, activation='relu'),
    # Linear output: a ReLU on the only output unit yields zero gradient
    # whenever its pre-activation is negative, which can leave the network
    # stuck predicting 0 for every sample.
    Dense(1, activation='linear'),
])
model.summary()

In [None]:
# Train on the preprocessed training split with mean-squared-error loss.
model.compile(optimizer="Adam", loss="mse", metrics=[tf.keras.metrics.MeanSquaredError()])
history = model.fit(x=x_train, y=y_train, batch_size=32, epochs=10)

# Evaluate on the held-out split. scikit-learn metrics are documented as
# metric(y_true, y_pred); keep that order so the call stays correct if the
# metric is ever replaced by one that is not symmetric.
preds = model.predict(x_val)
rmse = np.sqrt(mean_squared_error(y_val, preds))
print("rmse: ", rmse)

In [None]:
# Persist both halves of the inference stack: the trained Keras model and
# the fitted preprocessing pipeline. The trailing ';' keeps joblib's return
# value out of the cell output.
tf.keras.models.save_model(model=model, filepath=model_path)
joblib.dump(value=full_pipeline, filename=pipeline_path);

In [None]:
# Map artifact names to the serialized pipeline and model files. The dict is
# handed to `mlflow.pyfunc.log_model`, and the wrapper class looks the files
# up again through `context.artifacts` using the same keys.
artifacts = dict(pipeline=pipeline_path, model=model_path)

# Define the model class
class ModelWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that chains the fitted preprocessing pipeline and
    the Keras regressor.

    ``load_context`` restores both objects from the logged artifacts;
    ``predict`` renames the incoming columns, runs them through the pipeline
    and returns the Keras model's predictions.
    """

    # Column names the preprocessing pipeline was fitted with, in the order
    # the incoming frame is expected to supply them.
    FEATURE_COLUMNS = ["housing_median_age", "total_rooms", "total_bedrooms",
                       "population", "households", "median_income", "ocean_proximity"]

    def load_context(self, context):
        """Load the pipeline and the Keras model from ``context.artifacts``."""
        # Imported here rather than in the class body: a class-level import
        # only creates a class attribute and does not put the name in scope
        # for the methods, whereas a function-level import travels with the
        # method wherever the model is loaded.
        import joblib
        import tensorflow as tf

        self.pipeline = joblib.load(context.artifacts["pipeline"])
        self.model = tf.keras.models.load_model(context.artifacts["model"])

    def predict(self, context, model_input):
        """Return predictions for ``model_input``.

        ``model_input`` is treated as a DataFrame whose columns are already
        in ``FEATURE_COLUMNS`` order; they are relabelled positionally.
        """
        # Work on a copy so the caller's DataFrame keeps its own column names.
        features = model_input.copy()
        features.columns = list(self.FEATURE_COLUMNS)
        input_matrix = self.pipeline.transform(features)
        return self.model.predict(input_matrix)

In [None]:
# Reuse the experiment when it already exists; otherwise create it with the
# configured artifact location and fetch the resulting experiment object.
experiment_name = "tf_housing"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name, artifact_location=artifact_uri)
    experiment = mlflow.get_experiment(experiment_id)

mlflow.set_experiment(experiment.name)

mlflow_pyfunc_model_path = "mlflow_tf_housing"
# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed afterwards.
with mlflow.start_run():
    # Data provenance.
    mlflow.log_param('data_url', data_url)
    mlflow.log_param('data_version', version)
    # Shape of the preprocessed training matrix.
    mlflow.log_param('input_rows', x_train.shape[0])
    mlflow.log_param('input_columns', x_train.shape[1])

    mlflow.log_metric("rmse", rmse)
    # Log the wrapper together with the pipeline and model files it loads.
    mlflow.pyfunc.log_model(mlflow_pyfunc_model_path,
                            python_model=ModelWrapper(),
                            artifacts=artifacts)