In [None]:
from sys import version_info
import os
import io
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

import mlflow
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec
import dvc.api
import warnings

# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# MLflow endpoints are taken from the environment; an unset variable raises
# KeyError in this cell rather than surfacing later during logging.
backend_uri = os.environ['MLFLOW_TRACKING_URI']
artifact_uri = os.environ['MLFLOW_ARTIFACT_STORE']
mlflow.set_tracking_uri(backend_uri)

# Coordinates of the DVC-tracked dataset. These module-level names are also
# read by the data-loading helpers and by the MLflow logging cell.
path = 'dataset/housing.csv'
repo = '../.git'
version = 'v1'  # Git tag

# URL that DVC reports for `path` at revision `version`.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

In [None]:
def fetch_data_from_s3(path, repo, version):
    """Read a DVC-tracked CSV via ``dvc.api.read`` and parse it with pandas.

    Args:
        path: File location forwarded to ``dvc.api.read`` as ``path``.
        repo: Project location forwarded as ``repo``.
        version: Revision forwarded as ``rev``.

    Returns:
        pandas.DataFrame built from the comma-separated text returned by DVC.
    """
    raw_text = dvc.api.read(path=path, repo=repo, rev=version)
    # read_csv needs a file-like object, so wrap the in-memory text.
    text_buffer = io.StringIO(raw_text)
    return pd.read_csv(text_buffer, sep=',')


def fetch_data_from_fs(url):
    """Load a comma-separated file into a DataFrame.

    Args:
        url: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        pandas.DataFrame with the parsed contents.
    """
    frame = pd.read_csv(url, sep=',')
    return frame


def fetch_data(url):
    """Load the dataset, choosing the loader from the URL's scheme.

    The text before the first ``:`` in ``url`` is compared, case-insensitively,
    with ``"S3"``. Any other value (including a URL with no colon at all) is
    handed to ``fetch_data_from_fs``.

    Note: the S3 branch does not use ``url`` beyond its scheme; it reads the
    module-level ``path``, ``repo`` and ``version`` variables instead.

    Args:
        url: Dataset location string.

    Returns:
        Whatever the selected loader returns.
    """
    scheme = url.split(":")[0]
    if scheme.upper() != "S3":
        return fetch_data_from_fs(url)
    return fetch_data_from_s3(path, repo, version)

In [None]:
# Load the dataset resolved by DVC.
housing = fetch_data(data_url)

# Turn the feature and target columns into (n, 1) arrays for the estimator.
x_train = np.reshape(housing["housing_median_age"].values, (-1, 1))
y_train = np.reshape(housing["median_house_value"].values, (-1, 1))

# Fit on the full dataset; `fit` returns the estimator itself.
model = LinearRegression().fit(x_train, y_train)

# RMSE is computed on the same rows the model was fitted on, so it is a
# training error, not a held-out estimate.
predictions = model.predict(x_train)
rmse = np.sqrt(mean_squared_error(y_train, predictions))
rmse

In [None]:
# Signature logged with the model: a single "double" input column named
# after the training feature, and one unnamed "double" output column.
input_schema = Schema(
    [ColSpec(type="double", name="housing_median_age")]
)
output_schema = Schema(
    [ColSpec(type="double")]
)
sign = ModelSignature(inputs=input_schema, outputs=output_schema)

In [None]:
# Reuse the "single_feature" experiment if it already exists; otherwise
# create it with the configured artifact location and look it up by id.
experiment = mlflow.get_experiment_by_name("single_feature")
if experiment is None:
    experiment_id = mlflow.create_experiment(
        "single_feature",
        artifact_location=artifact_uri,
    )
    experiment = mlflow.get_experiment(experiment_id)

mlflow.set_experiment(experiment.name)

model_path = "single_feature"
with mlflow.start_run():
    # Data provenance and input dimensions, logged one parameter at a time
    # in insertion order.
    run_params = {
        'data_url': data_url,
        'data_version': version,
        'input_rows': x_train.shape[0],
        'input_columns': x_train.shape[1],
    }
    for param_name, param_value in run_params.items():
        mlflow.log_param(param_name, param_value)

    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, model_path, signature=sign)