In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data to the repository. Only the code to run the experiments is interesting and should be pushed.

In [None]:
mlflow.set_tracking_uri("http://localhost:5000")

In [2]:
"""
This script trains a linear regression model to predict the age of an abalone
based on its physical characteristics. It uses the MLflow library to track
experiments and log models.

The main steps are:
1. Load the data
2. Prepare the data
3. Split into training and testing sets
4. Train the model
5. Evaluate performance
6. Log results to MLflow

The model is trained with different alpha values to evaluate their impact on performance.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn


def load_data():
    """Loads the abalone dataset and performs necessary transformations."""
    df = pd.read_csv("../data/abalone.csv")
    df["Age"] = df["Rings"] + 1.5
    X = df.drop(columns=["Rings", "Age", "Sex"])
    y = df["Age"]
    return X, y


def split_data(X, y):
    """Splits the data into training and testing sets."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    return X_train, X_test, y_train, y_test


def train_and_log(X_train, X_test, y_train, y_test, alpha):
    """Trains the model, evaluates its performance, and logs results to MLflow."""
    with mlflow.start_run():
        model = LinearRegression()
        model.fit(X_train, y_train)

        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)

        train_mse = mean_squared_error(y_train, y_pred_train)
        test_mse = mean_squared_error(y_test, y_pred_test)
        test_r2 = r2_score(y_test, y_pred_test)

        mlflow.log_metric("train_mse", train_mse)
        mlflow.log_metric("test_mse", test_mse)
        mlflow.log_metric("test_r2", test_r2)
        mlflow.log_param("alpha", alpha)
        mlflow.sklearn.log_model(model, "linear_model")

        print(f"Train MSE: {train_mse}")
        print(f"Test MSE: {test_mse}")
        print(f"Test R2: {test_r2}")


if __name__ == "__main__":
    X, y = load_data()
    X_train, X_test, y_train, y_test = split_data(X, y)
    train_and_log(X_train, X_test, y_train, y_test, alpha=0.01)

Train MSE: 4.874421563913713
Test MSE: 5.055541144299385
Test R2: 0.532984475772452
