In [None]:
import pandas as pd
from pyspark.sql import SparkSession

In [None]:
# Create (or reuse) the Spark session for this notebook.
# Arrow is enabled on the builder so that it is already active for the
# pandas -> Spark conversions (createDataFrame) as well as the later
# Spark -> pandas conversions (toPandas).
spark = (
    SparkSession.builder
    .appName("RegressionEvaluation")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

In [None]:
# Read the train and test CSV files into pandas DataFrames (train first).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("train_predictions.csv", "test_predictions.csv")
)

In [None]:
# Convert both pandas frames into Spark DataFrames on the active session.
train_df, test_df = map(spark.createDataFrame, (train_data, test_data))

In [None]:
# Turn on the Arrow option for pandas <-> Spark conversions on this session.
# NOTE(review): this runs after the createDataFrame calls in the previous
# cell, so it can only affect conversions executed from here on (e.g. the
# toPandas calls below) unless the option was already enabled.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Bring at most 100000 rows of the two plotted columns from each Spark
# DataFrame back into pandas (train first, then test).
train_pandas_df, test_pandas_df = (
    spark_frame.select("prediction", "actual").limit(100000).toPandas()
    for spark_frame in (train_df, test_df)
)

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

def actual_vs_predicted(train_df, test_df):
    """
    Build an actual-vs-predicted scatter plot for the train and test sets.

    The figure holds five traces, in this order: train points, test points,
    a dashed y = x line, the train least-squares fit line and the test
    least-squares fit line. The three line traces start as
    ``visible="legendonly"``. The fitted equations are written into an
    annotation box anchored at the top-left of the plot area.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns for the training set.
    test_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns for the test set.

    Returns
    -------
    fig : plotly.graph_objects.Figure

    Notes
    -----
    The fit lines are computed from rows where both 'actual' and
    'prediction' are present; rows with a missing value in either column are
    ignored for the fit only (the scatter traces still receive the full
    columns).
    """
    def _fit_line(df):
        # np.polyfit cannot cope with NaN inputs, so fit on complete rows only.
        complete = df[["actual", "prediction"]].dropna()
        slope, intercept = np.polyfit(complete["actual"], complete["prediction"], 1)
        return slope, intercept

    # 1) global data extents for the y = x reference line
    all_vals = pd.concat([
        train_df["actual"], train_df["prediction"],
        test_df["actual"], test_df["prediction"],
    ])
    lo, hi = all_vals.min(), all_vals.max()

    # 2) least-squares fit coefficients (prediction as a function of actual)
    m_tr, b_tr = _fit_line(train_df)
    m_te, b_te = _fit_line(test_df)

    fig = go.Figure()

    # 3) scatter points; the colour is set on the marker itself rather than
    #    through line.color, which a markers-only trace does not draw
    fig.add_trace(go.Scattergl(
        x=train_df["actual"], y=train_df["prediction"],
        mode="markers", name="Train",
        marker=dict(color="royalblue", opacity=0.7),
    ))
    fig.add_trace(go.Scattergl(
        x=test_df["actual"], y=test_df["prediction"],
        mode="markers", name="Test",
        marker=dict(color="crimson", symbol="x", opacity=0.7),
    ))

    # 4) y = x reference line (hidden until toggled from the legend)
    fig.add_trace(go.Scatter(
        x=[lo, hi], y=[lo, hi],
        mode="lines",
        name="Perfect fit",
        line=dict(color="black", dash="dash"),
        visible="legendonly"
    ))

    # 5) train best-fit line, drawn across the range of train actuals
    x_tr = [train_df["actual"].min(), train_df["actual"].max()]
    fig.add_trace(go.Scatter(
        x=x_tr,
        y=[m_tr * x_tr[0] + b_tr, m_tr * x_tr[1] + b_tr],
        mode="lines",
        name="Train fit",
        line=dict(color="royalblue", width=2),
        visible="legendonly"
    ))

    # 6) test best-fit line, drawn across the range of test actuals
    x_te = [test_df["actual"].min(), test_df["actual"].max()]
    fig.add_trace(go.Scatter(
        x=x_te,
        y=[m_te * x_te[0] + b_te, m_te * x_te[1] + b_te],
        mode="lines",
        name="Test fit",
        line=dict(color="crimson", width=2),
        visible="legendonly"
    ))

    # 7) equations box at top-left (paper coordinates)
    eq_text = (
        f"<b>Train fit</b>: y = {m_tr:.2f}x + {b_tr:.2f}<br>"
        f"<b>Test fit</b>: y = {m_te:.2f}x + {b_te:.2f}"
    )
    fig.add_annotation(
        x=0, y=1, xref="paper", yref="paper",
        text=eq_text,
        showarrow=False,
        align="left",
        bgcolor="white",
        bordercolor="black",
        borderwidth=1,
        xanchor="left",
        yanchor="top"
    )

    # 8) styling
    fig.update_layout(
        title=dict(text="Actual vs. Predicted", x=0.5),
        legend=dict(orientation="h", x=0.5, y=-0.15, xanchor="center", yanchor="top"),
        xaxis_title="Actual",
        yaxis_title="Predicted",
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700, height=500,
        margin=dict(t=80, b=60)
    )
    fig.update_xaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)
    fig.update_yaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)

    return fig


In [None]:
# Render the actual-vs-predicted figure for the extracted pandas frames.
actual_vs_predicted(train_df=train_pandas_df, test_df=test_pandas_df)

In [None]:
import pandas as pd
import plotly.graph_objects as go

def residuals_vs_predicted(train_df, test_df):
    """
    Scatter residuals (actual - prediction) against predictions.

    One marker trace is drawn per split (train, then test), plus a dashed
    grey horizontal line at residual 0 that spans the full plot width.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns for the training set.
    test_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns for the test set.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    # (legend name, data, colour, marker style) per split, in draw order
    split_specs = (
        ("Train", train_df, "royalblue", dict(opacity=0.8)),
        ("Test", test_df, "crimson", dict(symbol="x", opacity=0.8)),
    )

    figure = go.Figure()
    for label, frame, colour, marker_style in split_specs:
        figure.add_trace(go.Scattergl(
            x=frame["prediction"],
            y=frame["actual"] - frame["prediction"],
            mode="markers",
            name=label,
            line=dict(color=colour, width=2),
            marker=marker_style,
        ))

    # zero-residual guide: x in paper coordinates so it always spans the plot
    figure.add_shape(
        type="line",
        xref="paper", yref="y",
        x0=0, x1=1,
        y0=0, y1=0,
        line=dict(color="grey", dash="dash"),
    )

    # centred title, horizontal legend below the plot, white background
    figure.update_layout(
        title=dict(text="Residuals vs. Predicted", x=0.5, xanchor="center"),
        legend=dict(orientation="h", x=0.5, xanchor="center", y=-0.15, yanchor="top"),
        xaxis_title="Predicted",
        yaxis_title="Residual",
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700,
        height=500,
        margin=dict(t=80, b=60),
    )

    # light grey grid on both axes, no zero line or axis line
    grid_style = dict(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)
    figure.update_xaxes(**grid_style)
    figure.update_yaxes(**grid_style)

    return figure


In [None]:
# Render the residuals-vs-predicted figure for the extracted pandas frames.
residuals_vs_predicted(train_df=train_pandas_df, test_df=test_pandas_df)

In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px

def residuals_qq_plot(train_df, test_df):
    """
    Creates a Normal Q–Q plot of residuals for train and test sets.

    Residuals are ``actual - prediction``. For each split the sorted
    residuals are plotted against the quantiles of a normal distribution
    with the same mean and (population) standard deviation, so points on the
    y = x baseline indicate normally distributed residuals.

    Missing residuals (NaN) are dropped before the quantiles are computed;
    otherwise a single NaN would turn the mean/std, and with them every
    theoretical quantile, into NaN. A split without any usable residual
    yields an empty trace, and the baseline is omitted when there is nothing
    to plot at all.

    Parameters
    ----------
    train_df : pandas.DataFrame
        DataFrame containing 'actual' and 'prediction' columns for training set.
    test_df : pandas.DataFrame
        DataFrame containing 'actual' and 'prediction' columns for test set.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    # helper: (theoretical quantiles, sorted sample quantiles) for one split
    def qq_data(res):
        values = np.asarray(res, dtype=float)
        values = np.sort(values[~np.isnan(values)])
        n = values.size
        if n == 0:
            return values, values
        # plotting positions (i - 0.5) / n, i = 1..n
        probs = (np.arange(1, n + 1) - 0.5) / n
        # theoretical N(mu, sigma) quantiles
        mu, sigma = values.mean(), values.std(ddof=0)
        theor = stats.norm.ppf(probs) * sigma + mu
        return theor, values

    # 1) residuals -> Q-Q coordinates
    theor_tr, res_tr = qq_data(train_df["actual"] - train_df["prediction"])
    theor_te, res_te = qq_data(test_df["actual"] - test_df["prediction"])

    # 2) everything that will be plotted, used for the baseline extents
    plotted = np.concatenate([theor_tr, res_tr, theor_te, res_te])

    # 3) build figure
    fig = go.Figure()
    palette = px.colors.qualitative.Plotly
    train_color, test_color = palette[0], palette[1]

    # WebGL traces, like the other scatter plots in this notebook; these
    # traces carry one marker per residual
    fig.add_trace(go.Scattergl(
        x=theor_tr,
        y=res_tr,
        mode="markers",
        name="Train",
        marker=dict(color=train_color, opacity=0.8)
    ))
    fig.add_trace(go.Scattergl(
        x=theor_te,
        y=res_te,
        mode="markers",
        name="Test",
        marker=dict(color=test_color, symbol="x", opacity=0.8),
    ))

    # dashed y = x reference, hidden until toggled from the legend
    if plotted.size:
        lo, hi = plotted.min(), plotted.max()
        fig.add_trace(go.Scatter(
            x=[lo, hi], y=[lo, hi],
            mode="lines",
            name="Baseline (y=x)",
            line=dict(color="grey", dash="dash"),
            visible="legendonly"
        ))

    # 4) styling
    fig.update_layout(
        title=dict(text="Normal Q–Q Plot of Residuals (Train & Test)", x=0.5),
        legend=dict(orientation="h", x=0.5, y=-0.15, xanchor="center", yanchor="top"),
        xaxis_title="Theoretical Quantiles (Normal)",
        yaxis_title="Sample Quantiles (Residuals)",
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700, height=500,
        margin=dict(t=80, b=60)
    )
    fig.update_xaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)
    fig.update_yaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)

    return fig


In [None]:
# Render the residual Q-Q figure for the extracted pandas frames.
residuals_qq_plot(train_df=train_pandas_df, test_df=test_pandas_df)

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

def residual_and_error_toggle(train_df, test_df, bins=50):
    """
    Creates a single Plotly figure with a toggle between:
      - signed residual distribution (y - ŷ)
      - absolute error distribution |y - ŷ|

    The figure holds four overlaid histogram traces (train/test residuals,
    then train/test absolute errors). Only the residual pair is visible
    initially; two buttons above the plot switch between the pairs and
    update the figure title.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Must each have 'actual' and 'prediction' columns.
    bins : int
        Number of bins for the histograms.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    # compute residuals and errors
    train_res = train_df["actual"] - train_df["prediction"]
    test_res = test_df["actual"] - test_df["prediction"]
    train_err = train_res.abs()
    test_err = test_res.abs()

    # colors
    palette = px.colors.qualitative.Plotly
    train_color, test_color = palette[0], palette[1]

    def _histogram(values, name, color, bingroup, visible):
        # Train and test traces of the same view share a bingroup so that the
        # overlaid bars use the same bin edges and their counts are comparable.
        return go.Histogram(
            x=values,
            nbinsx=bins,
            name=name,
            bingroup=bingroup,
            visible=visible,
            marker=dict(color=color, line=dict(color="white", width=1)),
            opacity=0.8,
        )

    # build all four traces; residuals are shown first
    fig = go.Figure([
        _histogram(train_res, "Train Residual", train_color, "residual", True),
        _histogram(test_res, "Test Residual", test_color, "residual", True),
        _histogram(train_err, "Train Error", train_color, "abs_error", False),
        _histogram(test_err, "Test Error", test_color, "abs_error", False),
    ])

    # overlay mode
    fig.update_layout(barmode="overlay", bargap=0.1)

    def _button(label, visible, title):
        # Dotted attribute paths update only the title text. Passing a bare
        # string title or a nested {"yaxis": {...}} container would overwrite
        # the whole container and drop the centring / grid styling set below.
        return dict(
            label=label,
            method="update",
            args=[
                {"visible": visible},
                {"title.text": title, "yaxis.title.text": "Count"},
            ],
        )

    buttons = [
        _button("Residuals", [True, True, False, False], "Residual Distribution"),
        _button("Absolute Errors", [False, False, True, True], "Absolute Error Distribution"),
    ]

    fig.update_layout(
        updatemenus=[dict(
            type="buttons",
            buttons=buttons,
            direction="left",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.5,
            xanchor="center",
            y=1.15,
            yanchor="top"
        )],
        title=dict(text="Residual Distribution", x=0.5),
        legend=dict(orientation="h", x=0.5, y=-0.15, xanchor="center", yanchor="top"),
        xaxis_title="Value",
        yaxis_title="Count",
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700,
        height=500,
        margin=dict(t=100, b=60),
        hovermode="x"
    )

    fig.update_xaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)
    fig.update_yaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)

    return fig


In [None]:
# Render the residual / absolute-error histogram toggle with default bins.
residual_and_error_toggle(train_df=train_pandas_df, test_df=test_pandas_df)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

def rec_curve(train_df, test_df, num_points=200):
    """
    Plots Regression Error Characteristic (REC) curves for train & test.

    For each tolerance on an evenly spaced grid from 0 to the largest finite
    absolute error (across both splits), the curve gives the percentage of
    samples whose absolute error is within that tolerance. Samples with a
    missing error never count as "within tolerance" but still count towards
    the sample total.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns.
    test_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns.
    num_points : int, default=200
        Number of points in the tolerance grid.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    def _abs_errors(df):
        return np.abs(np.asarray(df["actual"] - df["prediction"], dtype=float))

    def _pct_within(errors, grid):
        # Sort once and count with a binary search per tolerance instead of
        # re-scanning every sample for every grid point. np.sort places NaN
        # last, so NaN errors are never counted as within tolerance.
        if errors.size == 0:
            return np.full(grid.shape, np.nan)
        ordered = np.sort(errors)
        return np.searchsorted(ordered, grid, side="right") / errors.size * 100

    # compute abs errors
    train_err = _abs_errors(train_df)
    test_err = _abs_errors(test_df)

    # tolerance grid from 0 to max error; the maximum is taken over finite
    # values only, because Python's max() returns NaN or not depending on
    # argument order when one side is NaN
    all_err = np.concatenate([train_err, test_err])
    finite_err = all_err[np.isfinite(all_err)]
    max_err = finite_err.max() if finite_err.size else 0.0
    tol = np.linspace(0, max_err, num_points)

    # compute CDF (% of samples within each tolerance)
    train_cdf = _pct_within(train_err, tol)
    test_cdf = _pct_within(test_err, tol)

    # build figure
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=tol, y=train_cdf, mode="lines", name="Train",
        line=dict(width=2, dash="dot")
    ))
    fig.add_trace(go.Scatter(
        x=tol, y=test_cdf, mode="lines", name="Test",
        line=dict(width=2, dash="dot")
    ))

    # styling
    fig.update_layout(
        title=dict(text="Regression Error Characteristic (REC) Curve", x=0.5),
        xaxis_title="Error Tolerance (|y – ŷ|)",
        yaxis_title="% Samples within Tolerance",
        legend=dict(orientation="h", x=0.5, y=-0.15, xanchor="center", yanchor="top"),
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700, height=500,
        margin=dict(t=80, b=60),
        hovermode="x unified"
    )
    fig.update_xaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False)
    fig.update_yaxes(showgrid=True, gridcolor="lightgrey", zeroline=False, showline=False, range=[0, 100])

    return fig


In [None]:
# Render the REC curves with the default tolerance grid.
rec_curve(train_df=train_pandas_df, test_df=test_pandas_df)

In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

def calibration_plot(train_df, test_df, bins=10):
    """
    Plots a binned calibration plot (average predicted vs average actual):
    - Bins data by predicted values into quantiles
    - Computes average predicted and average actual in each bin
    - Overlays train & test curves plus the y=x identity line

    Average actual is on the x axis and average predicted on the y axis.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns.
    test_df : pandas.DataFrame
        Must contain 'actual' and 'prediction' columns.
    bins : int
        Number of quantile-based bins. Duplicate quantile edges are dropped,
        so fewer bins may result.

    Returns
    -------
    fig : plotly.graph_objects.Figure
    """
    def _binned_means(df):
        # Work on a new frame (assign) so the caller's DataFrame is untouched.
        binned = df.assign(
            bin=pd.qcut(df["prediction"], q=bins, duplicates="drop")
        )
        # 'bin' is categorical: observed=True is passed explicitly so the
        # result does not depend on the pandas default for `observed` and the
        # related FutureWarning is not raised.
        return (
            binned.groupby("bin", observed=True)
                  .agg(avg_pred=("prediction", "mean"),
                       avg_actual=("actual", "mean"))
                  .reset_index()
        )

    # 1) + 2) bin by predicted quantiles and aggregate means per bin
    train_grp = _binned_means(train_df)
    test_grp = _binned_means(test_df)

    # 3) global min/max for identity line
    all_vals = pd.concat([
        train_grp["avg_pred"], train_grp["avg_actual"],
        test_grp["avg_pred"], test_grp["avg_actual"],
    ])
    lo, hi = all_vals.min(), all_vals.max()

    # 4) colors
    palette = px.colors.qualitative.Plotly
    train_color, test_color = palette[0], palette[1]

    # 5) build figure
    fig = go.Figure()

    # identity line y = x (a layout shape, so it has no legend entry)
    fig.add_shape(
        type="line",
        xref="x", yref="y",
        x0=lo, y0=lo,
        x1=hi, y1=hi,
        line=dict(color="black", dash="dash")
    )

    # train curve
    fig.add_trace(go.Scatter(
        y=train_grp["avg_pred"],
        x=train_grp["avg_actual"],
        mode="lines+markers",
        name="Train",
        line=dict(color=train_color),
        marker=dict(color=train_color)
    ))

    # test curve
    fig.add_trace(go.Scatter(
        y=test_grp["avg_pred"],
        x=test_grp["avg_actual"],
        mode="lines+markers",
        name="Test",
        line=dict(color=test_color, dash="dot"),
        marker=dict(color=test_color, symbol="x")
    ))

    # 6) styling
    fig.update_layout(
        title=dict(text="Calibration Plot (Binned Actual vs Predicted)", x=0.5),
        legend=dict(orientation="h",
                    x=0.5, xanchor="center",
                    y=-0.15, yanchor="top"),
        yaxis_title="Average Predicted",
        xaxis_title="Average Actual",
        plot_bgcolor="white",
        paper_bgcolor="white",
        width=700,
        height=500,
        margin=dict(t=80, b=60)
    )
    fig.update_xaxes(showgrid=True, gridcolor="lightgrey",
                     zeroline=False, showline=False)
    fig.update_yaxes(showgrid=True, gridcolor="lightgrey",
                     zeroline=False, showline=False)

    return fig


In [None]:
# Render the calibration plot with the default number of quantile bins.
calibration_plot(train_df=train_pandas_df, test_df=test_pandas_df)