In [15]:
import mlflow
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Ridge

from sklearn.metrics import mean_squared_error

In [21]:
# Use a SQLite file in the working directory as the MLflow tracking backend.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
# Activate the experiment that every subsequent run in this notebook is filed under;
# left as the last expression so the Experiment object is displayed.
mlflow.set_experiment(experiment_name='nyc-taxi-experiment2')

2024/07/21 15:57:18 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment2' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/mazin/Documents/GIT/ML/Training/mlruns/2', creation_time=1721595438405, experiment_id='2', last_update_time=1721595438405, lifecycle_stage='active', name='nyc-taxi-experiment2', tags={}>

In [6]:
def read_dataframe(fileurl):
    """Load a yellow-taxi trip parquet file and derive the modelling columns.

    Parameters
    ----------
    fileurl : str or path-like
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        Only the trips whose duration is between 1 and 60 minutes (inclusive),
        with a ``duration`` column in minutes, ``PULocationID`` and
        ``DOLocationID`` cast to ``str``, and a ``PU_DO`` column holding
        ``"<PULocationID>-<DOLocationID>"``.
    """
    df = pd.read_parquet(fileurl)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the unfiltered frame, so the
    # column assignments below write to an independent frame instead of a
    # slice (which is what triggers SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # IDs become strings so they are treated as categories and can be
    # concatenated into a single pickup/drop-off pair feature.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '-' + df['DOLocationID']

    return df

In [7]:
# Load and clean both monthly trip files in one pass: the first URL becomes the
# training frame, the second the validation frame.
df_train, df_val = map(
    read_dataframe,
    (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet',
    ),
)

In [8]:
# Inspect the cleaned training frame, including the derived `duration` and `PU_DO` columns.
df_train

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration,PU_DO
0,1,2024-04-01 00:02:40,2024-04-01 00:30:42,0.0,5.20,1.0,N,161,7,1,...,3.5,0.5,8.65,0.00,1.0,43.25,2.5,0.0,28.033333,161-7
1,2,2024-04-01 00:41:12,2024-04-01 00:55:29,1.0,5.60,1.0,N,264,264,1,...,1.0,0.5,10.00,0.00,1.0,37.90,0.0,0.0,14.283333,264-264
2,2,2024-04-01 00:48:42,2024-04-01 01:05:30,1.0,3.55,1.0,N,186,236,1,...,1.0,0.5,5.10,0.00,1.0,30.60,2.5,0.0,16.800000,186-236
3,2,2024-04-01 00:56:02,2024-04-01 01:05:09,1.0,1.06,1.0,N,137,164,2,...,1.0,0.5,0.00,0.00,1.0,15.00,2.5,0.0,9.116667,137-164
4,1,2024-04-01 00:08:32,2024-04-01 00:10:24,1.0,0.70,1.0,N,236,263,1,...,3.5,0.5,2.00,0.00,1.0,12.10,2.5,0.0,1.866667,236-263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3514283,2,2024-04-30 23:11:18,2024-04-30 23:20:04,,1.24,,,230,68,0,...,0.0,0.5,3.15,0.00,1.0,18.89,,,8.766667,230-68
3514284,2,2024-04-30 23:11:14,2024-04-30 23:52:18,,21.40,,,155,107,0,...,0.0,0.5,0.00,6.94,1.0,73.89,,,41.066667,155-107
3514286,2,2024-04-30 23:16:22,2024-04-30 23:26:46,,1.98,,,161,234,0,...,0.0,0.5,0.00,0.00,1.0,28.27,,,10.400000,161-234
3514287,2,2024-04-30 23:04:10,2024-04-30 23:09:25,,0.31,,,148,79,0,...,0.0,0.5,0.00,0.00,1.0,23.13,,,5.250000,148-79


In [10]:
# Model inputs: the pickup/drop-off pair (categorical) plus the trip distance (numeric).
categorical = ['PU_DO']
feature_columns = categorical + ['trip_distance']
dv = DictVectorizer()

# The vectorizer is fitted on the training records only; the validation records
# are then encoded with that same fitted mapping.
train_dicts = df_train[feature_columns].to_dict('records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train['duration'].values

val_dicts = df_val[feature_columns].to_dict('records')
X_val = dv.transform(val_dicts)
y_val = df_val['duration'].values

In [16]:
# Baseline: Ridge regression with scikit-learn's default regularisation strength.
lr = Ridge()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE, computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working across versions.
mean_squared_error(y_val, y_pred) ** 0.5




5.702997115630224

In [17]:
# Baseline: ordinary least-squares regression on the same features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE, computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the root explicitly keeps this cell working across versions.
mean_squared_error(y_val, y_pred) ** 0.5




5.814239786410228

In [27]:
# Train a Ridge model inside an MLflow run so its inputs and score are tracked.
with mlflow.start_run():

    mlflow.set_tag("developer", "mazin")

    # Record the URLs the training/validation frames are actually read from in
    # this notebook, rather than machine-specific local paths.
    mlflow.log_param("train-data-url", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet")
    mlflow.log_param("valid-data-url", "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet")

    alpha = 0.1
    # Param key spelled "alpha" (was "aplha") so runs can be filtered by it.
    mlflow.log_param("alpha", alpha)
    lr = Ridge(alpha=alpha)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)

    # sqrt(MSE) instead of `squared=False`, which is deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)




In [1]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:

# Wrap the vectorised features and duration targets in XGBoost's DMatrix
# container, one for fitting and one for evaluation.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
def objective(params):
    """Hyperopt objective: train one XGBoost booster and report its validation RMSE.

    Each call opens its own MLflow run, tags it ``model=xgboost``, logs
    ``params`` and the resulting ``rmse`` metric. Reads the module-level
    ``train`` / ``valid`` DMatrix objects and the ``y_val`` targets.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': <rmse>, 'status': STATUS_OK}`` as expected by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            # Stop once the validation metric has not improved for 50 rounds.
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # sqrt(MSE) instead of `squared=False`, which is deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperparameter search space. hp.loguniform(label, low, high) samples
# exp(uniform(low, high)), so the bounds below are exponents.
search_space = {
    # Integer tree depth in [4, 100]; quniform yields floats, scope.int casts them.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),        # exp(-3) .. 1
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),               # exp(-5) .. exp(-1)
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),             # exp(-6) .. exp(-1)
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),  # exp(-1) .. exp(3)
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name for the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 50 TPE-guided trials; every trial calls objective().
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)