In [8]:
import pandas as pd
import numpy as np
import mlflow
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from pathlib import Path

In [3]:
def file_load(path:str) -> pd.DataFrame:
    """Read a green-taxi trip parquet file and derive the modelling columns.

    Parameters
    ----------
    path : str
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        Columns ``duration`` (minutes), ``PU_DO`` (pickup and drop-off
        location IDs joined as ``"<PU>_<DO>"``) and ``trip_distance``.
        Only trips lasting between 1 and 60 minutes (inclusive) are kept,
        and the surviving rows keep their original index labels.
    """
    cols = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID',
       'trip_distance']
    trips = pd.read_parquet(path, columns=cols)

    # Trip length in minutes, from the pickup/drop-off timestamps.
    minutes = (trips['lpep_dropoff_datetime'] - trips['lpep_pickup_datetime']).dt.total_seconds() / 60
    trips = trips.assign(duration=minutes)

    # Explicit copy: adding a column to a boolean-mask slice is what triggers
    # SettingWithCopyWarning (and ambiguous view/copy semantics) in pandas < 3.
    in_range = (trips['duration'] >= 1) & (trips['duration'] <= 60)
    trips = trips.loc[in_range].copy()

    trips['PU_DO'] = trips['PULocationID'].astype(str) + "_" + trips['DOLocationID'].astype(str)

    return trips[['duration', 'PU_DO', 'trip_distance']]

In [4]:
# Run file_load on the green_tripdata_2021-01 parquet file (path is relative to
# the notebook's working directory) and keep the result as df_train.
df_train = file_load('../Data/green_tripdata_2021-01.parquet')

In [5]:
# Display the first rows of df_train as a quick look at the derived columns.
df_train.head()

Unnamed: 0,duration,PU_DO,trip_distance
0,3.933333,43_151,1.01
1,8.75,166_239,2.53
2,5.966667,41_42,1.12
3,7.083333,168_75,1.99
7,2.316667,75_75,0.45


In [9]:
# Use a SQLite file (mlflow.db, relative to the working directory) as the MLflow tracking store.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Make 'nyc-taxi-experiment' the active experiment; it is set after the tracking URI
# so the experiment is looked up in that store.
mlflow.set_experiment('nyc-taxi-experiment')
# Local 'models' directory (presumably for saved model files — confirm against later cells).
models_folder = Path('models')
# exist_ok=True keeps this cell re-runnable when the directory already exists.
models_folder.mkdir(exist_ok=True)

In [10]:
def X_feature(df:pd.DataFrame, dv:"DictVectorizer | None" = None):
    """Vectorize the ``PU_DO`` and ``trip_distance`` columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``PU_DO`` and ``trip_distance``.
    dv : DictVectorizer, optional
        Vectorizer to use.
        * ``None``: a new ``DictVectorizer`` is fitted on ``df``. That
          vectorizer is local to this call and is not returned, so pass
          your own instance if it has to be reused on other data.
        * Unfitted instance (no ``vocabulary_`` attribute yet): it is fitted
          on ``df`` in place, so the caller keeps the fitted vectorizer.
        * Fitted instance: it is only applied to ``df``; it is never refitted.

    Returns
    -------
    The feature matrix produced by the vectorizer's ``fit_transform`` or
    ``transform``.
    """
    records = df[['PU_DO', 'trip_distance']].to_dict(orient='records')

    if dv is None:
        dv = DictVectorizer()

    # A fitted DictVectorizer exposes ``vocabulary_``. Reuse its mapping so that
    # validation/test features line up with the training columns. (The original
    # called ``fit`` here, which refits and returns the vectorizer, not a matrix.)
    if hasattr(dv, 'vocabulary_'):
        return dv.transform(records)

    return dv.fit_transform(records)

In [None]:
def model_training(X_train, y_train, X_val, y_val, dv):
    
    with mlflow.start_run():
        train = xgb.DMatrix(X_train, y_train)
        val = xgb.DMatrix(X_val, y_val)