# Init.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import(
    mean_absolute_error as MAE,
    mean_squared_error as MSE,
    mean_absolute_percentage_error as MAPE,

)
import joblib
import warnings
# Ignore every warning issued through the `warnings` module from here on.
# NOTE(review): this also hides deprecation/usage warnings from the libraries
# imported above, so real problems may go unnoticed.
warnings.filterwarnings('ignore')

# Prep.

In [None]:
# Load ./data/trip_data_clean.parquet into a DataFrame, print its schema
# summary, and display five randomly sampled rows as a quick sanity check.
data = pd.read_parquet(path="./data/trip_data_clean.parquet")
data.info()
data.sample(n=5)

## Split Data

In [None]:
# The last column is the regression target; the feature set is the slice
# `columns[:1]`, i.e. only the first column of the frame.
target = data.columns[-1]
features = data.columns[:1]

X, y = data[features], data[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Modeling

In [None]:
# Baseline Model 1: Random Prediction
def random_predict(X, low=None, high=None, seed=42):
    """
    Baseline "model": draw one uniform random prediction per row of X.

    Parameters
    ----------
    X : sized collection
        Only ``len(X)`` is used, to decide how many predictions to draw.
    low, high : float, optional
        Bounds of the uniform distribution. When omitted they default to the
        minimum / maximum of the module-level target ``y`` (the full target,
        i.e. including rows that end up in the test split).
    seed : int, default=42
        Seed of the private random generator; the same seed always yields
        the same sequence.

    Returns
    -------
    np.ndarray
        ``len(X)`` values drawn uniformly between ``low`` and ``high``.
    """
    if low is None:
        low = y.min()
    if high is None:
        high = y.max()

    # A private RandomState produces the same numbers as
    # `np.random.seed(seed)` followed by `np.random.uniform(...)`, but does
    # not reset the global NumPy random state as a side effect of predicting.
    rng = np.random.RandomState(seed)
    return rng.uniform(low=low, high=high, size=len(X))

# Baseline Model 2: Linear Regression
# `fit` returns the estimator itself, so construction and training are chained.
lr_model = LinearRegression().fit(X_train, y_train)
lr_train_pred, lr_test_pred = (
    lr_model.predict(split) for split in (X_train, X_test)
)

# Expected Best Model: Decision Tree Regressor
# The fixed random_state keeps the fitted tree reproducible between runs.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_train_pred, dt_test_pred = (
    dt_model.predict(split) for split in (X_train, X_test)
)

# Evaluation

In [None]:
# Results table: one row per (model, data split) pair, filled by evaluate_model.
df_metrics = pd.DataFrame(
    columns=["Model", "Set_Category", "MAE", "MSE", "RMSE", "MAPE"],
)

def evaluate_model(name, dataset, y_true, y_pred):
    """
    Score one set of predictions and record the scores in ``df_metrics``.

    Parameters
    ----------
    name : str
        Model label stored in the "Model" column.
    dataset : str
        Split label stored in the "Set_Category" column.
    y_true, y_pred : array-like
        Actual and predicted target values, passed to the metric functions.

    Returns
    -------
    pd.DataFrame
        The module-level ``df_metrics`` table, including the new row.
    """
    squared_error = MSE(y_true, y_pred)
    scores = [
        MAE(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),  # RMSE is derived from the MSE
        MAPE(y_true, y_pred),
    ]

    # The new row is labelled with the current row count of the table.
    df_metrics.loc[len(df_metrics)] = [name, dataset, *scores]
    return df_metrics

# Score every model on both splits; rows are appended to df_metrics in the
# order listed here.
for model_name, split_name, actual, predicted in (
    # Evaluate Random Prediction
    ("Random Prediction", "Train", y_train, random_predict(X_train)),
    ("Random Prediction", "Test", y_test, random_predict(X_test)),
    # Evaluate Linear Regression
    ("Linear Regression", "Train", y_train, lr_train_pred),
    ("Linear Regression", "Test", y_test, lr_test_pred),
    # Evaluate Decision Tree Regressor
    ("Decision Tree Regressor", "Train", y_train, dt_train_pred),
    ("Decision Tree Regressor", "Test", y_test, dt_test_pred),
):
    evaluate_model(model_name, split_name, actual, predicted)

df_metrics

## Check Fit Status (Good, Underfit, or Overfit)

In [None]:
def check_fit_status(df, metric="MAE", threshold=0.1):
    """
    Classify models as Overfit / Underfit / Good fit.

    For every model the Train and Test values of ``metric`` are compared and
    the verdict is written to that model's Test row(s).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns ["Model", "Set_Category", metric], where
        "Set_Category" marks rows as "Train" or "Test". It is not modified.
    metric : str, default="MAE"
        Metric to compare between Train and Test
    threshold : float, default=0.1
        Relative tolerance (10% by default).
        If |Train - Test| > threshold * min(Train, Test), considered misfit.

    Returns
    -------
    pd.DataFrame
        Copy of df with an extra column 'Fit_Status'. Train rows, and models
        that lack either a Train or a Test row, keep the value None.
    """
    result = df.copy()

    # Fit_Status is only filled in on Test rows; everything else stays None.
    result["Fit_Status"] = None

    is_train = result["Set_Category"] == "Train"
    is_test = result["Set_Category"] == "Test"

    # Work model by model
    for model in result["Model"].unique():
        is_model = result["Model"] == model
        test_mask = is_model & is_test

        train_errs = result.loc[is_model & is_train, metric]
        test_errs = result.loc[test_mask, metric]

        # Without both halves of the pair there is nothing to compare.
        if train_errs.empty or test_errs.empty:
            continue

        train_err = train_errs.iloc[0]
        test_err = test_errs.iloc[0]

        gap = abs(test_err - train_err)
        smaller = min(train_err, test_err)

        # Guard the division: identical errors (including 0 vs 0) have no
        # gap at all, while any gap relative to a zero error is unbounded.
        if gap == 0:
            rel_gap = 0.0
        elif smaller == 0:
            rel_gap = float("inf")
        else:
            rel_gap = gap / smaller

        if rel_gap <= threshold:
            status = "Good fit"
        elif test_err > train_err:
            status = "Overfit (high test error)"
        else:
            status = "Underfit (high train error)"  # rare but possible

        # Assign status to the test row
        result.loc[test_mask, "Fit_Status"] = status

    return result

# Rebind df_metrics to the annotated copy returned by check_fit_status (it
# gains a "Fit_Status" column), then display the table.
df_metrics = check_fit_status(df_metrics)
df_metrics

# Use Model

In [None]:
# Manual prediction using Linear Regression coefficients
# Slope (first coefficient) and intercept of the fitted linear model.
a, b = lr_model.coef_[0], lr_model.intercept_


def f_x(x):
    """Evaluate the fitted line a*x + b at x."""
    return a*x + b


print(f"f(x) = {a} * x + {b}")
print(f"f(`trip_distance`) = {a} * `trip_distance` + {b}", "\n")

# Print the hand-computed prediction, rounded to two decimals, for three inputs.
for distance in (10, 20, 30):
    print(f"f({distance}) = $ {f_x(distance).round(2)}")

In [None]:
# Prediction using Linear Regression model
# Each input is a single row with a single feature value; the cell displays a
# tuple holding one rounded prediction array per input.
tuple(lr_model.predict([[distance]]).round(2) for distance in (10, 20, 30))

In [None]:
# Prediction using Decision Tree Regressor
# Each input is a single row with a single feature value; the cell displays a
# tuple holding one rounded prediction array per input.
tuple(dt_model.predict([[distance]]).round(2) for distance in (10, 20, 30))

In [None]:
# Export Model
import os

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs("./model", exist_ok=True)

# Persist both fitted estimators so they can be reloaded with joblib.load.
joblib.dump(lr_model, "./model/lr.joblib")
joblib.dump(dt_model, "./model/dt.joblib")
