In [None]:
import numpy as np
import pandas as pd

def top_n_categories(y_true, y_pred, features_df, feature_col, top_n=5, error_type="mae"):
    """
    Print the best and worst categories of a feature, ranked by average error.

    For every distinct value of ``feature_col`` the per-sample errors are
    averaged and the number of samples in that category is counted. The
    ``top_n`` categories with the lowest and with the highest average error
    are then printed.

    Parameters
    ----------
    y_true : pandas Series (array-like)
        Ground truth values
    y_pred : pandas Series (array-like)
        Model predictions
    features_df : DataFrame
        DataFrame with features. Its ``feature_col`` values are attached by
        position, so rows must be in the same order as y_true/y_pred.
    feature_col : str
        Name of the column holding the categories to group by
    top_n : int, default=5
        Number of categories to show in each of the two tables
    error_type : str, default="mae"
        Error metric: "mae" (mean absolute error) or "mse" (mean squared error)

    Returns
    -------
    None
        The two tables are printed, not returned.

    Raises
    ------
    ValueError
        If ``error_type`` is neither "mae" nor "mse".
    """
    # Fail early with a clear message; otherwise the "err" column would never
    # be created and the groupby below would raise an opaque KeyError.
    if error_type not in ("mae", "mse"):
        raise ValueError(
            f"Unsupported error_type {error_type!r}; expected 'mae' or 'mse'"
        )

    df = pd.DataFrame({
        "y_true": y_true,
        "y_pred": y_pred,
        "error": y_true - y_pred,
    })

    # .values strips the index, so categories are matched to rows by position.
    df[feature_col] = features_df[feature_col].values

    if error_type == "mae":
        df["err"] = df["error"].abs()
    else:  # "mse"
        df["err"] = df["error"] ** 2

    grouped = df.groupby(feature_col).aggregate(
        avg_error=("err", "mean"),
        frequency=("err", "count"),
    )

    # Ascending by average error: head = most accurate, tail = least accurate
    # (so within the "worst" table the very worst category is the last row).
    ranked = grouped.sort_values("avg_error")
    best = ranked.head(top_n).reset_index()
    worst = ranked.tail(top_n).reset_index()

    print(f"Top {top_n} Best Categories:")
    print(best)

    print(f"\nTop {top_n} Worst Categories:")
    print(worst)

In [None]:
def top_n_accuracy(y_true, y_pred, top_n=5):
    """
    Print the most and least accurately predicted samples by absolute error.

    Parameters
    ----------
    y_true : pandas Series (array-like)
        Ground truth values
    y_pred : pandas Series (array-like)
        Model predictions
    top_n : int, default=5
        Number of samples to show in each of the two tables

    Returns
    -------
    None
        The two tables are printed, not returned.
    """
    errors = np.abs(y_true - y_pred)

    results = pd.DataFrame({
        "y_true": y_true,
        "y_pred": y_pred,
        "error": errors,
    })

    # Ascending by absolute error: head = most accurate, tail = least accurate.
    ranked = results.sort_values(by="error")
    most_accurate = ranked.head(top_n)
    least_accurate = ranked.tail(top_n)

    print(f"Top {top_n} Most Accurate Predictions:")
    print(most_accurate)

    print(f"\nTop {top_n} Least Accurate Predictions:")
    print(least_accurate)

In [None]:
def test_rand_sample_from_df(df, num_samples, model):
    """
    Predict a random sample from the given dataframe.
    
    Parameters
    ----------
    df : pandas Series (array-like)
        Dataframe to sample from
    num_samples : Integer
        Number of random samples to predict
    model : 
        Trained model used to predict samples
    """
    sample = df.sample(n=num_samples)
    print(type(sample))
    model.predict(sample)


def test_single_sample_from_df(df, df_idx, model):
    sample = df.iloc[df_idx]
    print(type(sample))
    model.predict(sample)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

def plot_residuals(y_true, y_pred):
    """
    Draw diagnostic plots for a regression model's predictions.

    Three figures are shown in sequence:

    1. Predicted vs. true values, with the identity line drawn across the
       range of the predictions.
    2. Residuals (``y_true - y_pred``) vs. predicted values.
    3. A two-panel figure: histogram (with KDE) of the residuals on the left
       and a Q-Q plot of the residuals on the right.

    Parameters
    ----------
    y_true : pandas Series (array-like)
        Ground truth values
    y_pred : pandas Series (array-like)
        Model predictions; must support ``.min()`` and ``.max()``

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    residuals = y_true - y_pred

    # Figure 1: predicted vs. true, plus the y = x reference line.
    _, ax_fit = plt.subplots(figsize=(5, 5))
    ax_fit.scatter(y_pred, y_true)
    pred_range = [y_pred.min(), y_pred.max()]
    ax_fit.plot(pred_range, pred_range)
    ax_fit.set_xlabel("Predicted values")
    ax_fit.set_ylabel("True values")
    ax_fit.set_title("Predicted vs True")
    plt.show()

    # Figure 2: residuals vs. predicted.
    _, ax_res = plt.subplots(figsize=(5, 5))
    ax_res.scatter(y_pred, residuals)
    ax_res.set_xlabel("Predicted values")
    ax_res.set_ylabel("Residuals")
    ax_res.set_title("Residuals vs Predicted")
    plt.show()

    # Figure 3: residual distribution (left) and Q-Q plot (right).
    fig_dist, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(residuals, kde=True, bins=30, ax=ax_hist)
    ax_hist.set_xlabel("Residuals")
    ax_hist.set_title("Histogram of Residuals")

    # Without an explicit ax, qqplot opens a brand-new figure, which leaves
    # this panel empty and puts the title on the stray figure.
    sm.qqplot(residuals, fit=True, ax=ax_qq)
    ax_qq.set_title("Q-Q Plot of Residuals")

    fig_dist.tight_layout()
    plt.show()

    