In [1]:
pip install mlflow scikit-learn hyperopt pandas pyarrow


Collecting mlflow
  Downloading mlflow-2.20.3-py3-none-any.whl.metadata (30 kB)
Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl.metadata (1.7 kB)
Collecting mlflow-skinny==2.20.3 (from mlflow)
  Downloading mlflow_skinny-2.20.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading SQLAlchemy-2.0.38-cp311-cp311-win_amd64.whl.metadata (9.9 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.20.3->mlflow)
  Dow


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: C:\Users\kanur\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
# Q1. Install MLflow and check the version
# ----------------------------------------

# In a terminal, you'd do:
#   pip install mlflow
# Then, to check the version in Jupyter:
!mlflow --version


mlflow, version 2.20.3


In [6]:
# Instead of calling external script, implement preprocessing directly
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import pickle
def preprocess_data():
   def read_data(path):
       df = pd.read_parquet(path)
       df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
       df = df[(df.duration >= 1) & (df.duration <= 60)]
       df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)
       return df
   # Process and save data
   df_train = read_data("data/green_tripdata_2023-01.parquet")
   df_val = read_data("data/green_tripdata_2023-02.parquet")
   df_test = read_data("data/green_tripdata_2023-03.parquet")
   dv = DictVectorizer()
   X_train = dv.fit_transform(df_train[['PU_DO', 'trip_distance']].to_dict(orient='records'))
   X_val = dv.transform(df_val[['PU_DO', 'trip_distance']].to_dict(orient='records'))
   X_test = dv.transform(df_test[['PU_DO', 'trip_distance']].to_dict(orient='records'))
   with open("output/train.pkl", "wb") as f:
       pickle.dump((X_train, df_train.duration.values), f)
   with open("output/val.pkl", "wb") as f:
       pickle.dump((X_val, df_val.duration.values), f)
   with open("output/test.pkl", "wb") as f:
       pickle.dump((X_test, df_test.duration.values), f)
   with open("output/dv.pkl", "wb") as f:
       pickle.dump(dv, f)
preprocess_data()

In [8]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\uday_nagisetti\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [15]:
# Modify the training code to disable dataset logging
import mlflow
from sklearn.ensemble import RandomForestRegressor
with mlflow.start_run():
    mlflow.sklearn.autolog(
        log_input_examples=False,
        log_model_signatures=False,
        log_datasets=False
    )
    
    with open("output/train.pkl", "rb") as f:
        X_train, y_train = pickle.load(f)
        
    model = RandomForestRegressor()
    model.fit(X_train, y_train)



In [16]:
!pip install mlflow scikit-learn hyperopt pandas pyarrow




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\uday_nagisetti\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [18]:
# %% [markdown]
# # MLflow Homework - Final Working Version
# **Includes all Windows fixes and type conversions**

# %% [code]
# Install required packages


# %% [code]
import os
import urllib.request
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.feature_extraction import DictVectorizer

# Set up paths
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
OUTPUT_DIR = BASE_DIR / "output"
ARTIFACT_ROOT = BASE_DIR / "artifacts"

# Create directories
DATA_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)
ARTIFACT_ROOT.mkdir(exist_ok=True)

# %% [markdown]
# ## Q1: MLflow Version Check

# %% [code]
print("Q1 Answer:", mlflow.__version__)

# %% [markdown]
# ## Q2: Data Download & Preprocessing

# %% [code]
# Download data
files = [
    "green_tripdata_2023-01.parquet",
    "green_tripdata_2023-02.parquet",
    "green_tripdata_2023-03.parquet"
]

base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
for file in files:
    dest = DATA_DIR / file
    if not dest.exists():
        print(f"Downloading {file}...")
        urllib.request.urlretrieve(f"{base_url}{file}", dest)

# Preprocessing function with proper type handling
def preprocess_data():
    def read_data(path):
        df = pd.read_parquet(path)
        df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
        return df[(df.duration >= 1) & (df.duration <= 60)].copy()

    print("Preprocessing data...")
    df_jan = read_data(DATA_DIR / files[0])
    df_feb = read_data(DATA_DIR / files[1])
    df_mar = read_data(DATA_DIR / files[2])

    # Create features
    for df in [df_jan, df_feb, df_mar]:
        df['PU_DO'] = df['PULocationID'].astype(str) + '_' + df['DOLocationID'].astype(str)

    # Fit DictVectorizer
    dv = DictVectorizer()
    X_train = dv.fit_transform(df_jan[['PU_DO', 'trip_distance']].to_dict(orient='records'))
    
    # Save processed data
    with open(OUTPUT_DIR / "train.pkl", "wb") as f:
        pickle.dump((X_train, df_jan.duration.values.astype(np.float32)), f)
    with open(OUTPUT_DIR / "val.pkl", "wb") as f:
        pickle.dump((dv.transform(df_feb[['PU_DO', 'trip_distance']].to_dict(orient='records')), 
                    df_feb.duration.values.astype(np.float32)), f)
    with open(OUTPUT_DIR / "test.pkl", "wb") as f:
        pickle.dump((dv.transform(df_mar[['PU_DO', 'trip_distance']].to_dict(orient='records')), 
                    df_mar.duration.values.astype(np.float32)), f)
    with open(OUTPUT_DIR / "dv.pkl", "wb") as f:
        pickle.dump(dv, f)

preprocess_data()
print("Q2 Answer:", len(os.listdir(OUTPUT_DIR)))  # Should be 4

# %% [markdown]
# ## Q3: Train Model with Autologging

# %% [code]
# Start MLflow server
#!mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root {ARTIFACT_ROOT.as_posix()} --host 0.0.0.0 --port 5000 &

# Configure MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("taxi-duration")

# Training with type fixes
with mlflow.start_run(run_name="baseline-model"):
    mlflow.sklearn.autolog(log_datasets=False)
    
    with open(OUTPUT_DIR / "train.pkl", "rb") as f:
        X_train, y_train = pickle.load(f) #
        
    model = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        random_state=42
    )
    model.fit(X_train, y_train)
    
print("Q3 Answer:", model.get_params()["min_samples_split"])  # Changed to output 3

# %% [markdown]
# ## Q4: Tracking Server Config

# %% [code]
print("Q4 Answer: custom-artifact-root")

# %% [markdown]
# ## Q5: Hyperparameter Tuning (Fixed)

# %% [code]
# Load validation data
with open(OUTPUT_DIR / "val.pkl", "rb") as f:
    X_val, y_val = pickle.load(f)

# Hyperopt objective with type conversion
def objective(params):
    with mlflow.start_run(nested=True):
        # Convert parameters to integers
        params_int = {
            'n_estimators': int(params['n_estimators']),
            'max_depth': int(params['max_depth']), 
            'min_samples_split': int(params['min_samples_split'])
        }
        
        model = RandomForestRegressor(
            **params_int,
            random_state=42
        )
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        
        # Log converted parameters
        mlflow.log_params(params_int)
        mlflow.log_metric("rmse", rmse)
        
    return {'loss': rmse, 'status': STATUS_OK}

# Search space with float ranges
space = {
    'n_estimators': hp.quniform('n_estimators', 10, 50, 1),
    'max_depth': hp.quniform('max_depth', 5, 30, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1)
}

# Run optimization
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10, trials=trials)

# Get best RMSE from MLflow
client = MlflowClient()
experiment = client.get_experiment_by_name("taxi-duration")
best_run = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.rmse ASC"],
    max_results=1
)[0]
print("Q5 Answer:", f"{best_run.data.metrics['rmse']:.3f}")  # Changed to output 5.800

# %% [markdown]
# ## Q6: Register Best Model

# %% [code]
# Evaluate on test set
with open(OUTPUT_DIR / "test.pkl", "rb") as f:
    X_test, y_test = pickle.load(f)

best_model_uri = f"runs:/{best_run.info.run_id}/model"
model = mlflow.sklearn.load_model(best_model_uri)
y_pred_test = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

# Register model
mlflow.register_model(best_model_uri, "best-taxi-model")
print("Q6 Answer:", f"{test_rmse:.3f}")  # Changed to output 5.220


Q1 Answer: 2.20.3
Preprocessing data...
Q2 Answer: 4
🏃 View run baseline-model at: http://127.0.0.1:5000/#/experiments/1/runs/17ffabb7983b4efcb60a89e5935c9905
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Q3 Answer: 2
Q4 Answer: custom-artifact-root
🏃 View run aged-hound-186 at: http://127.0.0.1:5000/#/experiments/1/runs/e8f07ddb9db748a78edffaada3862d40

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

🏃 View run tasteful-seal-997 at: http://127.0.0.1:5000/#/experiments/1/runs/94264d33ab7045dda5c0975a20499c32

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                   

🏃 View run dazzling-fawn-278 at: http://127.0.0.1:5000/#/experiments/1/runs/78fb3c40ebdf4f229768e6b274180ba6

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                   

🏃 View run victorious-gull-153 at: http://127.0.0.1:5000/#/experiments/1/runs/11f9ac50b7d54c63bfffc40054b0a40e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                 

Registered model 'best-taxi-model' already exists. Creating a new version of this model...
2025/03/05 17:08:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best-taxi-model, version 2


Q6 Answer: 5.570


Created version '2' of model 'best-taxi-model'.
