In [1]:
!python -V

Python 3.10.12


# **Q1. Install MLflow**

In [16]:
!pip install mlflow scikit-learn pandas fastparquet hyperopt matplotlib



In [4]:
!mlflow --version

mlflow, version 2.13.0


# **Q2. Download and preprocess the data**

## Write the code for `preprocess_data.py`

In [13]:
%%writefile preprocess_data.py
import os
import pickle
import click
import pandas as pd

from sklearn.feature_extraction import DictVectorizer


def dump_pickle(obj, filename: str):
    """Pickle ``obj`` into the file at ``filename``.

    The file is opened in binary write mode, so any existing content at
    that path is replaced. The return value is whatever ``pickle.dump``
    returns.
    """
    with open(filename, "wb") as handle:
        outcome = pickle.dump(obj, handle)
    return outcome


def read_dataframe(filename: str):
    """Load a trip-record parquet file and prepare it for feature extraction.

    Steps:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (both inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Returns an independent DataFrame (not a view of the loaded one), so
    callers can add columns to it without pandas chained-assignment warnings.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes. Unlike a per-row ``apply``, this also
    # yields a float column when the file has no rows, so the numeric filter
    # below keeps working on empty input.
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # ``.copy()`` detaches the filtered rows from the loaded frame; without it
    # the assignment below writes into a slice (SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn a trips DataFrame into a feature matrix using ``dv``.

    A combined ``PU_DO`` column (pick-up and drop-off location IDs joined by
    an underscore) is added to ``df`` in place. The ``PU_DO`` and
    ``trip_distance`` columns are then converted to a list of per-row dicts
    and passed to the vectorizer.

    When ``fit_dv`` is true the vectorizer is fitted on these rows
    (``fit_transform``); otherwise it is only applied (``transform``).

    Returns a ``(X, dv)`` tuple: the vectorizer output and the vectorizer.
    """
    # Note: this mutates the caller's frame by adding the PU_DO column.
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    vectorize = dv.fit_transform if fit_dv else dv.transform
    return vectorize(records), dv


@click.command()
@click.option(
    "--raw_data_path",
    required=True,
    help="Location where the raw NYC taxi trip data was saved"
)
@click.option(
    "--dest_path",
    required=True,
    help="Location where the resulting files will be saved"
)
def run_data_prep(raw_data_path: str, dest_path: str, dataset: str = "green"):
    """Preprocess the raw trip data and pickle the resulting datasets.

    Reads the 2023-01 (train), 2023-02 (validation) and 2023-03 (test)
    parquet files for DATASET from RAW_DATA_PATH, fits a DictVectorizer on
    the training split only, and writes dv.pkl, train.pkl, val.pkl and
    test.pkl into DEST_PATH.
    """
    # Both options are required: without them a missing flag reaches
    # os.path.join as None and fails with an unhelpful TypeError.

    # Split name -> month suffix of the parquet file it is read from.
    months = {"train": "2023-01", "val": "2023-02", "test": "2023-03"}
    frames = {
        split: read_dataframe(
            os.path.join(raw_data_path, f"{dataset}_tripdata_{month}.parquet")
        )
        for split, month in months.items()
    }

    # Extract the target before preprocess() adds its helper column.
    target = 'duration'
    targets = {split: frame[target].values for split, frame in frames.items()}

    # Fit the DictVectorizer on the training split, then reuse it unchanged
    # for validation and test.
    dv = DictVectorizer()
    features = {}
    features["train"], dv = preprocess(frames["train"], dv, fit_dv=True)
    features["val"], _ = preprocess(frames["val"], dv, fit_dv=False)
    features["test"], _ = preprocess(frames["test"], dv, fit_dv=False)

    # Create dest_path folder unless it already exists
    os.makedirs(dest_path, exist_ok=True)

    # Save DictVectorizer and datasets
    dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
    for split in months:
        dump_pickle(
            (features[split], targets[split]),
            os.path.join(dest_path, f"{split}.pkl"),
        )


# Script entry point: run_data_prep is a click command, so calling it with no
# arguments makes click read --raw_data_path / --dest_path from the command line.
if __name__ == '__main__':
    run_data_prep()

Overwriting preprocess_data.py


In [8]:
!python preprocess_data.py --raw_data_path data --dest_path output

In [12]:
# Calculate how many files were saved to OUTPUT_FOLDER
!ls output | wc -l

4


# **Q3. Train a model with autolog**

## Write the code for `train.py`

In [14]:
%%writefile train.py
import os
import pickle
import click

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Read the file at ``filename`` in binary mode and return the unpickled object."""
    with open(filename, "rb") as source:
        loaded = pickle.load(source)
    return loaded


@click.command()
@click.option(
    "--data_path",
    default="output",
    help="Location where the processed NYC taxi trip data was saved"
)
def run_train(data_path: str):
    """Train a random forest on the preprocessed data and print the validation RMSE.

    Loads train.pkl and val.pkl from DATA_PATH, each holding an (X, y) pair.
    """
    # NOTE(review): nothing in this script calls MLflow, so no run is tracked.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    # Fixed random_state keeps the fitted forest reproducible between runs.
    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword is deprecated in recent scikit-learn releases, and the
    # explicit form works on every version.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5

    # Report the metric; previously it was computed and then discarded.
    click.echo(f"Validation RMSE: {rmse:.4f}")


# Script entry point: run_train is a click command, so calling it with no
# arguments makes click read --data_path (default "output") from the command line.
if __name__ == '__main__':
    run_train()

Writing train.py


In [17]:
!mlflow ui

[2024-05-30 14:27:54 +0000] [30681] [INFO] Starting gunicorn 22.0.0
[2024-05-30 14:27:54 +0000] [30681] [INFO] Listening at: http://127.0.0.1:5000 (30681)
[2024-05-30 14:27:54 +0000] [30681] [INFO] Using worker: sync
[2024-05-30 14:27:54 +0000] [30682] [INFO] Booting worker with pid: 30682
[2024-05-30 14:27:54 +0000] [30683] [INFO] Booting worker with pid: 30683
[2024-05-30 14:27:54 +0000] [30684] [INFO] Booting worker with pid: 30684
[2024-05-30 14:27:54 +0000] [30685] [INFO] Booting worker with pid: 30685

Aborted!
[2024-05-30 14:30:10 +0000] [30681] [INFO] Handling signal: int
[2024-05-30 14:30:10 +0000] [30684] [INFO] Worker exiting (pid: 30684)
[2024-05-30 14:30:10 +0000] [30685] [INFO] Worker exiting (pid: 30685)
[2024-05-30 14:30:10 +0000] [30682] [INFO] Worker exiting (pid: 30682)
[2024-05-30 14:30:10 +0000] [30683] [INFO] Worker exiting (pid: 30683)
[2024-05-30 14:30:11 +0000] [30681] [INFO] Shutting down: Master
