In [1]:
!python -V

Python 3.9.24


In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [6]:
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/workspaces/mlops-zoomcamp/03-Orchestration/artifacts/1', creation_time=1761664682548, experiment_id='1', last_update_time=1761664682548, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [7]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    return df

In [8]:
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [9]:
categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [10]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [11]:
import xgboost as xgb

In [18]:
from pathlib import Path
models_folder = Path('models')
models_folder.mkdir(exist_ok=True)

In [12]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:linear',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")



[0]	validation-rmse:11.44482
[1]	validation-rmse:10.77202
[2]	validation-rmse:10.18363
[3]	validation-rmse:9.67396
[4]	validation-rmse:9.23166
[5]	validation-rmse:8.84808
[6]	validation-rmse:8.51883
[7]	validation-rmse:8.23597
[8]	validation-rmse:7.99320
[9]	validation-rmse:7.78709
[10]	validation-rmse:7.61022
[11]	validation-rmse:7.45952
[12]	validation-rmse:7.33049
[13]	validation-rmse:7.22098
[14]	validation-rmse:7.12713
[15]	validation-rmse:7.04752
[16]	validation-rmse:6.98005
[17]	validation-rmse:6.92232
[18]	validation-rmse:6.87112
[19]	validation-rmse:6.82740
[20]	validation-rmse:6.78995
[21]	validation-rmse:6.75792
[22]	validation-rmse:6.72994
[23]	validation-rmse:6.70547
[24]	validation-rmse:6.68390
[25]	validation-rmse:6.66421
[26]	validation-rmse:6.64806
[27]	validation-rmse:6.63280
[28]	validation-rmse:6.61924
[29]	validation-rmse:6.60773




🏃 View run persistent-yak-513 at: http://localhost:5000/#/experiments/1/runs/7b6d6b04648d4c268941d92c1aac31e8
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [13]:
import pandas as pd

results = pd.DataFrame({
    'actual_duration': y_val,
    'predicted_duration': y_pred
})
print(results.head(10))
results['difference'] = results['actual_duration'] - results['predicted_duration']
print("Average difference:", results['difference'].mean())

results.to_csv("models/predictions_vs_actuals.csv", index=False)
mlflow.log_artifact("models/predictions_vs_actuals.csv", artifact_path="predictions")


   actual_duration  predicted_duration
0        17.916667           16.125685
1         6.500000            7.607263
2        15.250000           19.447042
3        18.233333           24.010212
4         8.966667           10.241908
5         7.850000           17.095030
6         9.700000           14.144274
7        11.283333            9.423327
8         8.733333            9.298021
9         1.716667           19.088913
Average difference: 1.2705986995605845


In [16]:
import kagglehub
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import mlflow
from pathlib import Path

# -----------------------------
# 🔹 Step 1: Download Dataset
# -----------------------------
path = kagglehub.dataset_download("santanukundu/delhivery-dataset")
print("Path to dataset files:", path)

# The dataset usually contains "train_data.csv" or similar.
# Let's read it (you can adjust filename as per your local path).
df = pd.read_csv(f"{path}/Delhivery.csv")  # adjust if filename differs
print(df.head())

# -----------------------------
# 🔹 Step 2: Basic Cleaning
# -----------------------------
# Let's inspect columns
print("Columns:", df.columns)

# Example columns: ['ID', 'Weight', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
#                   'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases', 'Product_importance',
#                   'Gender', 'Discount_offered', 'Weight_in_gms', 'Reached_on_Time_Y_N']

# Drop ID if present
if 'ID' in df.columns:
    df = df.drop(columns=['ID'])

# Target variable
target = 'Reached_on_Time_Y_N'

# Convert target from Y/N to binary 0/1
df[target] = df[target].map({'Y': 1, 'N': 0})

# -----------------------------
# 🔹 Step 3: Split dataset
# -----------------------------
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# -----------------------------
# 🔹 Step 4: Define features
# -----------------------------
categorical = [
    'Warehouse_block',
    'Mode_of_Shipment',
    'Product_importance',
    'Gender'
]
numerical = [
    'Customer_care_calls',
    'Customer_rating',
    'Cost_of_the_Product',
    'Prior_purchases',
    'Discount_offered',
    'Weight_in_gms'
]

# -----------------------------
# 🔹 Step 5: Vectorize features
# -----------------------------
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

# -----------------------------
# 🔹 Step 6: MLflow setup
# -----------------------------
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("delhivery-predict-delay")

models_folder = Path('models')
models_folder.mkdir(exist_ok=True)

# -----------------------------
# 🔹 Step 7: Train XGBoost model
# -----------------------------
with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'learning_rate': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'seed': 42
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )

    # -----------------------------
    # 🔹 Step 8: Evaluate model
    # -----------------------------
    y_pred_prob = booster.predict(valid)
    y_pred = (y_pred_prob > 0.5).astype(int)

    rmse = root_mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("mae", mae)
    mlflow.log_metric("r2", r2)

    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R²: {r2:.4f}")

    # -----------------------------
    # 🔹 Step 9: Save artifacts
    # -----------------------------
    with open("models/preprocessor_delhivery.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor_delhivery.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

    # Save predictions for analysis
    results = pd.DataFrame({
        "actual": y_val,
        "predicted": y_pred,
        "probability": y_pred_prob
    })
    results.to_csv("models/delhivery_predictions.csv", index=False)
    mlflow.log_artifact("models/delhivery_predictions.csv", artifact_path="predictions")


Path to dataset files: /home/codespace/.cache/kagglehub/datasets/santanukundu/delhivery-dataset/versions/1


FileNotFoundError: [Errno 2] No such file or directory: '/home/codespace/.cache/kagglehub/datasets/santanukundu/delhivery-dataset/versions/1/Delhivery.csv'