In [1]:
import pandas as pd
import pickle
import os
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

import mlflow

In [2]:
!ls

mlflow.db                 mlflow_walk_through.ipynb [34mmlruns[m[m


In [3]:
# os.chdir('../')
os.getcwd()

'/Users/dsn/Downloads/MLOps_Zoomcamp/Week 2 - Training/notebooks'

In [4]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc_taxi_experiment")

<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='nyc_taxi_experiment', tags={}>

In [5]:
# Specify the location of the file
green_trip_data_Jan_2021 = '../data/green_tripdata_2021-01.parquet'
green_trip_data_Feb_2021 = '../data/green_tripdata_2021-02.parquet'

In [11]:
def read_dataframe(filename, pickup_time, drop_off_time, pu_id, do_id):
    df = pd.read_parquet(filename)
    df['duration'] = pd.to_datetime(df[drop_off_time]) - pd.to_datetime(df[pickup_time])  # calculate duration and convert to minutes
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)
 
    df = df[((df.duration>=1) & (df.duration<=60))]       # filter to only duration within and less than hour

    categorical = [pu_id, do_id]

    df[categorical] = df_Jan[categorical].fillna(-1)
    df[categorical] = df[categorical].astype(str)
    return df

In [8]:
df_Jan.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge'],
      dtype='object')

In [14]:
df_Jan = read_dataframe(green_trip_data_Jan_2021,
                            pickup_time="lpep_pickup_datetime",
                            drop_off_time='lpep_dropoff_datetime',
                            pu_id ='PULocationID',
                            do_id='DOLocationID'
                             )
df_Feb = read_dataframe(green_trip_data_Feb_2021,
                            pickup_time="lpep_pickup_datetime",
                            drop_off_time='lpep_dropoff_datetime',
                            pu_id ='PULocationID',
                            do_id='DOLocationID'
                             )

In [29]:
numerical = ['trip_distance']
categorical= ['PULocationID', 'DOLocationID']

dict_train = df_Jan[numerical + categorical].to_dict(orient='records')
dict_eval = df_Feb[numerical + categorical].to_dict(orient='records')


dv = DictVectorizer()
X_train = dv.fit_transform(dict_train) #this returns a sparse cmr matrix
X_eval = dv.transform(dict_eval)

Y_train = df_Jan['duration'].values
Y_eval = df_Feb['duration'].values


In [30]:
X_train.shape, X_eval.shape

((73908, 507), (61921, 507))

In [36]:
# lr = LinearRegression()
# lr.fit(X_train, Y_train)

# Y_pred = lr.predict(X_train) # Make prediciton on the train
# mean_squared_error(Y_train, Y_pred, squared=False)

In [37]:
# with open('../models/linreg.bin', 'wb') as f_out:
#     pickle.dump((dv, lr), f_out)

In [33]:
Y_eval.shape, y_pred.shape

((61921,), (73908,))

In [61]:
with mlflow.start_run():
    mlflow.set_tag("developer", "warrie")
    
    mlflow.log_param("train-data-path", green_trip_data_Jan_2021)
    mlflow.log_param("eval-data-path", green_trip_data_Feb_2021)


    alpha = 0.01
    mlflow.log_param("alpha", alpha)
    
    lr_lasso = Lasso(alpha)
    lr_lasso.fit(X_train , Y_train)

    y_pred = lr_lasso.predict(X_eval)

    rmse= mean_squared_error(Y_eval, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    mlflow.log_artifact(local_path='../models/linreg.bin')

In [38]:
# Using xgboost in logging parameters:

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

  from pandas import MultiIndex, Int64Index


In [44]:
train = xgb.DMatrix(X_train, label=Y_train)
valid = xgb.DMatrix(X_eval, label=Y_eval)

In [None]:
def objective(params):

    with mlflow.start_run():
        mlflow.set_tag("model",  "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round =1000,
            evals =[(valid ,"validation")],
            early_stopping_rounds =50

        )

        y_pred = booster.predict(valid)
        rmse=mean_squared_error(Y_eval, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return{'loss': rmse, 'status':STATUS_OK}

search_space ={

    "max_depth": scope.int(hp.quniform('max_depth',4,100,1)),
    "learning_rate": hp.loguniform('learning_rate', -3, 0),
    "reg_alpha": hp.loguniform("reg_alpha", -5,-1),
    "reg_lambda": hp.loguniform("reg_lambda", -6, -1),
    "min_cjild_weight": hp.loguniform('min_child_weight', -1, 3),
    "objective": 'reg:linear',
    'seed': 42
}

best_result  = fmin(
    fn=objective,
    space=search_space,
    algo = tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [None]:

params ={
    'learning_rate':	0.050492944008818104,
        'max_depth'	:4,
        'min_child_weight'	:2.287940332571657,
        'objective':	'reg:linear',
        'reg_alpha':	0.03134130292019771,
        'reg_lambda':	0.006582082224902776,
        'seed':	42
}

mlflow.xgboost.autolog()
with mlflow.start_run():
#     mlflow.set_tag()
#     mlflow.log_param()
    
    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round =1000,
                evals =[(valid ,"validation")],
                early_stopping_rounds =50
    )



In [65]:
mlflow.xgboost.autolog(disable=True)

In [68]:
# second way of logging models
with mlflow.start_run():

    best_params ={
        'learning_rate':	0.050492944008818104,
            'max_depth'	:4,
            'min_child_weight'	:2.287940332571657,
            'objective':	'reg:linear',
            'reg_alpha':	0.03134130292019771,
            'reg_lambda':	0.006582082224902776,
            'seed':	42
    }


    mlflow.log_params(best_params)
    booster = xgb.train(
                params=params,
                dtrain=train,
                num_boost_round =1000,
                evals =[(valid ,"validation")],
                early_stopping_rounds =50
    )
    
    y_pred = booster.predict(valid)
    rmse = mean_squared_error(Y_eval, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    with open("../models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("../models/preprocessor.b", artifact_path="preprocessor")


    mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')




[0]	validation-rmse:20.29694
[1]	validation-rmse:19.45103
[2]	validation-rmse:18.65185
[3]	validation-rmse:17.89769
[4]	validation-rmse:17.18630
[5]	validation-rmse:16.51530
[6]	validation-rmse:15.88263
[7]	validation-rmse:15.28688
[8]	validation-rmse:14.72630
[9]	validation-rmse:14.19811
[10]	validation-rmse:13.70389
[11]	validation-rmse:13.23729
[12]	validation-rmse:12.79978
[13]	validation-rmse:12.38939
[14]	validation-rmse:12.00713
[15]	validation-rmse:11.64665
[16]	validation-rmse:11.30872
[17]	validation-rmse:10.99427
[18]	validation-rmse:10.70150
[19]	validation-rmse:10.42771
[20]	validation-rmse:10.17292
[21]	validation-rmse:9.93377
[22]	validation-rmse:9.71044
[23]	validation-rmse:9.50350
[24]	validation-rmse:9.31017
[25]	validation-rmse:9.13132
[26]	validation-rmse:8.96399
[27]	validation-rmse:8.80866
[28]	validation-rmse:8.66515
[29]	validation-rmse:8.53185
[30]	validation-rmse:8.40829
[31]	validation-rmse:8.29376
[32]	validation-rmse:8.18788
[33]	validation-rmse:8.08980
[34

In [66]:
dv

DictVectorizer()

In [71]:

logged_model = 'runs:/0b8aa72bdf694b5285a2ad5c1a848091/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)



# loaded_model.predict(pd.DataFrame(data))



In [73]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 0b8aa72bdf694b5285a2ad5c1a848091

In [74]:
xgboost_model = mlflow.xgboost.load_model(logged_model)



In [77]:
# xgboost_model
y_pred[:10]

array([17.975735 ,  7.1397743, 21.053053 , 24.254833 , 10.949154 ,
       16.876991 , 13.757508 ,  9.674001 ,  8.705003 , 19.328863 ],
      dtype=float32)