Question 1. Select the Tool

You can use the same tool you used when completing the module, or choose a different one for your homework.

What's the name of the orchestrator you chose?


Question 2. Version
What's the version of the orchestrator?

Question 3. Creating a pipeline
Let's read the March 2023 Yellow taxi trips data.

In [1]:
import pandas as pd

# Load the March 2023 Yellow taxi trip records from the local parquet file
df = pd.read_parquet('data/yellow_tripdata_2023-03.parquet')

# Row count of the raw frame
num_records = df.shape[0]

# Report the count twice: once with thousands separators, once as a bare number
print(
    f"Number of records loaded: {num_records:,}",
    f"Answer: {num_records}",
    sep="\n",
)


Number of records loaded: 3,403,766
Answer: 3403766


In [2]:
# Question 4: Data preparation
def read_dataframe(filename):
    """Load a taxi trips parquet file and prepare it for modelling.

    The function adds a ``duration`` column holding the time between
    ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` in minutes, keeps
    only the trips whose duration lies in the closed interval [1, 60], and
    casts the ``PULocationID`` and ``DOLocationID`` columns to strings.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, independent of the frame that was read.
    """
    df = pd.read_parquet(filename)

    # Trip length in minutes: timedelta -> seconds -> minutes
    elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows. Without it, the column
    # assignment below writes into a slice of the original frame and pandas
    # emits SettingWithCopyWarning.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# Run the preparation step on the March 2023 file
df_prepared = read_dataframe('data/yellow_tripdata_2023-03.parquet')

# Compare the prepared frame against the raw frame loaded in the first cell
result_size = len(df_prepared)
print("\n".join([
    f"Size after data preparation: {result_size:,}",
    f"Original size: {len(df):,}",
    f"Records removed: {len(df) - result_size:,}",
    f"Answer: {result_size}",
]))


Size after data preparation: 3,316,216
Original size: 3,403,766
Records removed: 87,550
Answer: 3316216


In [3]:
# Question 5: Train a model
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

def train_model_pipeline(df):
    """Fit a DictVectorizer and a default LinearRegression on prepared trips.

    The ``PULocationID``, ``DOLocationID`` and ``trip_distance`` columns are
    converted to per-row dictionaries, vectorized into a sparse matrix, and
    used to fit a linear regression against the ``duration`` column. The
    fitted intercept is printed twice, rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three feature columns and ``duration``.

    Returns
    -------
    tuple
        ``(vectorizer, regressor)``: the fitted DictVectorizer and the fitted
        LinearRegression, in that order.
    """
    # Pickup and dropoff IDs stay as two separate features (no combined key)
    feature_columns = ['PULocationID', 'DOLocationID'] + ['trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    # sparse=True keeps the encoded matrix sparse to limit memory use
    vectorizer = DictVectorizer(sparse=True)
    design_matrix = vectorizer.fit_transform(records)
    target = df['duration'].values

    # Default hyper-parameters; fit() returns the estimator itself
    regressor = LinearRegression().fit(design_matrix, target)

    intercept_text = f"{regressor.intercept_:.2f}"
    print(f"Model intercept: {intercept_text}")
    print(f"Answer: {intercept_text}")

    return vectorizer, regressor

# Train on a fixed-size random subset instead of the full prepared frame
# to avoid memory issues
sample_size = 100000  # number of rows drawn for training
print(f"Original dataset size: {len(df_prepared):,} records")

# random_state pins the draw so re-running the cell selects the same rows
df_sample = df_prepared.sample(n=sample_size, random_state=42)
print(f"Training on sample of {len(df_sample):,} records")

# Fit the vectorizer and the regression model on the sampled rows
dv, model = train_model_pipeline(df=df_sample)


: 

In [None]:
# Question 6: Register the model with MLFlow
import mlflow
import mlflow.sklearn
import os
import json

# Point MLflow at the local SQLite tracking store, then select the experiment
# that subsequent runs are recorded under
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="taxi-duration-experiment")

def register_model_with_mlflow(model, dv, df_sample):
    """Log the model and vectorizer to MLflow and record the RMSE on ``df_sample``.

    Within a single MLflow run this logs ``model`` under the artifact path
    ``"model"`` (also registering it as ``"taxi-duration-regressor"``), logs
    ``dv`` under ``"vectorizer"``, and logs an ``rmse`` metric computed by
    predicting on ``df_sample`` and comparing against its ``duration`` column.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; logged through ``mlflow.sklearn.log_model``.
    dv : fitted vectorizer
        Must expose ``transform`` on a list of record dictionaries.
    df_sample : pandas.DataFrame
        Frame with ``PULocationID``, ``DOLocationID``, ``trip_distance`` and
        ``duration`` columns.

    Returns
    -------
    tuple
        ``(run_id, artifact_uri)`` of the MLflow run that was created.
    """
    # Imported locally, as the original cell did, so this block does not rely
    # on the notebook's import cell providing the name.
    from sklearn.metrics import mean_squared_error

    feature_columns = ['PULocationID', 'DOLocationID', 'trip_distance']

    with mlflow.start_run() as run:
        # Log the model and register it in the model registry
        mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path="model",
            registered_model_name="taxi-duration-regressor"
        )

        # Log the vectorizer alongside the model (not registered)
        mlflow.sklearn.log_model(
            sk_model=dv,
            artifact_path="vectorizer"
        )

        # Log the RMSE on the supplied sample for completeness
        X = dv.transform(df_sample[feature_columns].to_dict(orient='records'))
        y_pred = model.predict(X)
        y_true = df_sample['duration'].values
        # Take the square root of the MSE explicitly: the `squared=False`
        # keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
        # where passing it raises TypeError.
        rmse = float(mean_squared_error(y_true, y_pred) ** 0.5)
        mlflow.log_metric("rmse", rmse)

        print(f"MLflow run ID: {run.info.run_id}")
        print(f"Model logged to: {run.info.artifact_uri}")

        return run.info.run_id, run.info.artifact_uri

# Log and register the trained model, keeping the run's id and artifact URI
run_id, artifact_uri = register_model_with_mlflow(
    model=model,
    dv=dv,
    df_sample=df_sample,
)


In [None]:
# Find and read the MLModel file to get model size
import os
import yaml
import json
from pathlib import Path

def find_model_size():
    """Read ``model_size_bytes`` from the MLmodel file of the most recent run.

    Searches the local ``mlruns`` directory: picks the most recently modified
    experiment directory (name made of digits), then the most recently
    modified run directory inside it (32-character name), and reads
    ``artifacts/model/MLmodel`` from that run as YAML.

    Returns
    -------
    The value stored under ``model_size_bytes`` in the MLmodel file, the
    string ``'Not found'`` when that key is absent, or ``None`` (after
    printing a diagnostic) when any directory or the file is missing.
    """
    def modified_at(path):
        # Sort key: last-modification timestamp of a directory entry
        return path.stat().st_mtime

    # MLflow stores artifacts in the mlruns directory
    tracking_root = Path("mlruns")
    if not tracking_root.exists():
        print("mlruns directory not found")
        return None

    experiments = [p for p in tracking_root.iterdir() if p.is_dir() and p.name.isdigit()]
    if not experiments:
        print("No experiment directories found")
        return None

    newest_experiment = max(experiments, key=modified_at)
    runs = [p for p in newest_experiment.iterdir() if p.is_dir() and len(p.name) == 32]
    if not runs:
        print("No run directories found")
        return None

    newest_run = max(runs, key=modified_at)

    # The model was logged under the "model" artifact path
    mlmodel_path = newest_run / "artifacts" / "model" / "MLmodel"
    if not mlmodel_path.exists():
        print(f"MLmodel file not found at expected path: {mlmodel_path}")
        return None

    with open(mlmodel_path, 'r') as f:
        mlmodel_content = yaml.safe_load(f)

    model_size_bytes = mlmodel_content.get('model_size_bytes', 'Not found')
    print(f"MLmodel file found at: {mlmodel_path}")
    print(f"Model size in bytes: {model_size_bytes}")
    print(f"Answer: {model_size_bytes}")

    return model_size_bytes

# Find the model size recorded in the most recent run's MLmodel file;
# model_size is None when the mlruns directory, a run, or the file is missing
model_size = find_model_size()
