# Import libraries and dependencies

In [6]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from analysis_helpers import assign_combined_bucket 
from analysis_helpers import analyze_stocks
import pandas as pd
import utils
import numpy as np

# Load and preprocess data

In [7]:
# Set the path to the file you'd like to load
file_path = "sp500_data.csv"

# Load the latest version of the dataset as a pandas DataFrame.
# `dataset_load` is the current kagglehub entry point; the older
# `load_dataset` name is a deprecated alias that emits a warning on each call.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "shannanl/sp500-dataset",
    file_path,
)

# Give the first column a stable name so it can be referenced as "day".
df.rename(columns={df.columns[0]: "day"}, inplace=True)

# Positional row number -> value of the "day" column for that row.
date_lookup = dict(enumerate(df["day"]))

# One row per stock with "vol_bucket" / "liq_bucket" labels.
# NOTE(review): the index is presumably the ticker symbol, since the lists
# built below are later passed around as ticker lists -- confirm in
# analysis_helpers.analyze_stocks.
bucket_df = analyze_stocks(df)


def _first_in_bucket(buckets, vol, liq, limit=5):
    """Return up to `limit` index labels whose rows match both bucket labels.

    Parameters
    ----------
    buckets : DataFrame with "vol_bucket" and "liq_bucket" columns.
    vol, liq : Labels to match in the respective columns (e.g. "high", "low").
    limit : Maximum number of labels to return, taken in row order.
    """
    selected = buckets[
        (buckets["vol_bucket"] == vol) & (buckets["liq_bucket"] == liq)
    ]
    return selected.index.tolist()[:limit]


# First five entries of each volatility / liquidity combination.
HV_HL = _first_in_bucket(bucket_df, "high", "high")
HV_LL = _first_in_bucket(bucket_df, "high", "low")
LV_HL = _first_in_bucket(bucket_df, "low", "high")
LV_LL = _first_in_bucket(bucket_df, "low", "low")

  df = kagglehub.load_dataset(
  returns = df[close_col].pct_change()


# Train Linear Regression

In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
import matplotlib.pyplot as plt

# -------------------------------------------------------------------
# Helper: run model on one ticker
# -------------------------------------------------------------------
def run_single_stock_pipeline(df, ticker, window_size=20):
    """
    Fit and score a Linear Regression model for a single ticker.

    The ticker's data is pulled with ``utils.load_comp_data`` and turned into
    features by ``utils.compute_features`` (``w=window_size``, ``cut=False``).
    The first 80% of the feature rows train the model; the remaining rows are
    used for scoring. The target column is ``log_next_day_max_return``; the
    ``timestep`` column is excluded from the inputs.

    Returns a dict with keys ``ticker``, ``mse``, ``mae``, ``explained_var``
    and ``r2``. Returns ``None`` when no data is loaded, when fewer than 50
    feature rows are available, or when any step raises (the error is
    printed rather than propagated so a sweep over many tickers keeps going).
    """
    target_col = "log_next_day_max_return"

    try:
        raw = utils.load_comp_data(df, ticker)
        if raw is None or len(raw) == 0:
            return None

        features = utils.compute_features(raw, w=window_size, cut=False)

        # Too few rows to train and evaluate on: skip this ticker.
        if features.shape[0] < 50:
            return None

        inputs = features.drop(columns=["timestep", target_col])
        target = features[target_col]

        # Row-order split: leading 80% for fitting, trailing 20% held out.
        cutoff = int(0.8 * len(features))
        regressor = LinearRegression()
        regressor.fit(inputs.iloc[:cutoff], target.iloc[:cutoff])

        held_out = target.iloc[cutoff:]
        predicted = regressor.predict(inputs.iloc[cutoff:])

        # Score the held-out predictions with each metric in turn.
        report = {"ticker": ticker}
        for label, scorer in (
            ("mse", mean_squared_error),
            ("mae", mean_absolute_error),
            ("explained_var", explained_variance_score),
            ("r2", r2_score),
        ):
            report[label] = scorer(held_out, predicted)
        return report

    except Exception as e:
        print(f"Error processing {ticker}: {e}")
        return None


# -------------------------------------------------------------------
# Helper: run over entire bucket and average results
# -------------------------------------------------------------------
def evaluate_bucket(df, ticker_list, bucket_name, window_size=20):
    """
    Run the single-stock pipeline for every ticker in a bucket and average.

    Each ticker in ``ticker_list`` is passed to ``run_single_stock_pipeline``;
    tickers for which it returns ``None`` are left out. The per-ticker metrics
    and their averages are printed.

    Returns a dict with keys ``bucket``, ``tickers_used``, ``avg_mse``,
    ``avg_mae``, ``avg_explained_var`` and ``avg_r2``, or ``None`` when no
    ticker produced metrics.
    """
    print(f"\n=== Running bucket: {bucket_name} ===")

    outcomes = (
        run_single_stock_pipeline(df, symbol, window_size)
        for symbol in ticker_list
    )
    collected = [entry for entry in outcomes if entry is not None]

    if not collected:
        print("No valid results for bucket.")
        return None

    per_ticker = pd.DataFrame(collected)

    # Show the individual rows so the averages can be sanity-checked.
    print(per_ticker)

    summary = {"bucket": bucket_name, "tickers_used": len(per_ticker)}
    for out_key, column in (
        ("avg_mse", "mse"),
        ("avg_mae", "mae"),
        ("avg_explained_var", "explained_var"),
        ("avg_r2", "r2"),
    ):
        summary[out_key] = per_ticker[column].mean()

    print("\n>> Average Metrics:")
    print(pd.DataFrame([summary]))

    return summary


# -------------------------------------------------------------------
# MAIN PIPELINE
# -------------------------------------------------------------------

# Evaluate the four volatility / liquidity buckets in a fixed order; each
# entry is the averaged-metrics dict from evaluate_bucket, or None when the
# bucket produced no usable tickers.
bucket_results = [
    evaluate_bucket(df, tickers, label)
    for tickers, label in (
        (HV_HL, "High Vol / High Liquidity"),
        (HV_LL, "High Vol / Low Liquidity"),
        (LV_HL, "Low Vol / High Liquidity"),
        (LV_LL, "Low Vol / Low Liquidity"),
    )
]

# Final summary: one row per bucket that returned metrics.
print("\n================ FINAL BUCKET SUMMARY ================")
final_df = pd.DataFrame(
    list(filter(lambda entry: entry is not None, bucket_results))
)
print(final_df)



=== Running bucket: High Vol / High Liquidity ===
Error processing KEY: 'str' object has no attribute 'shape'


  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)


  ticker       mse       mae  explained_var        r2
0   VIAC  0.000717  0.016241       0.021616  0.008814
1    CCL  0.001237  0.018376       0.032149  0.011342
2   AAPL  0.000242  0.011104       0.022148 -0.033379
3   GILD  0.000244  0.012181      -0.047737 -0.266955

>> Average Metrics:
                      bucket  tickers_used  avg_mse   avg_mae  \
0  High Vol / High Liquidity             4  0.00061  0.014476   

   avg_explained_var    avg_r2  
0           0.007044 -0.070045  

=== Running bucket: High Vol / Low Liquidity ===


  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)


  ticker       mse       mae  explained_var        r2
0     RL  0.000465  0.013444       0.029295  0.011112
1    CMG  0.000365  0.011433       0.005361  0.004780
2   GNRC  0.000521  0.015448      -0.014511 -0.086817
3    KSU  0.000340  0.011510      -0.047637 -0.065906
4   ODFL  0.000204  0.010132       0.005939 -0.011703

>> Average Metrics:
                     bucket  tickers_used   avg_mse   avg_mae  \
0  High Vol / Low Liquidity             5  0.000379  0.012393   

   avg_explained_var    avg_r2  
0          -0.004311 -0.029707  

=== Running bucket: Low Vol / High Liquidity ===
Error processing PG: 'str' object has no attribute 'shape'


  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)


  ticker       mse       mae  explained_var        r2
0    JNJ  0.000087  0.006258       0.052464  0.050653
1    PEP  0.000100  0.006163       0.046386  0.046267
2     KO  0.000109  0.006449       0.038485  0.038311
3    NEE  0.000148  0.007317       0.031117  0.027796

>> Average Metrics:
                     bucket  tickers_used   avg_mse   avg_mae  \
0  Low Vol / High Liquidity             4  0.000111  0.006547   

   avg_explained_var    avg_r2  
0           0.042113  0.040757  

=== Running bucket: Low Vol / Low Liquidity ===


  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)
  result.fillna(method='ffill', inplace=True)


  ticker       mse       mae  explained_var        r2
0    WEC  0.000128  0.006632       0.040392  0.035797
1    AEE  0.000133  0.006759       0.048391  0.043671
2    DTE  0.000154  0.007263       0.043711  0.043655
3   VRSK  0.000191  0.008201       0.021269 -0.008055
4    MKC  0.000149  0.007034       0.039020  0.026460

>> Average Metrics:
                    bucket  tickers_used   avg_mse   avg_mae  \
0  Low Vol / Low Liquidity             5  0.000151  0.007178   

   avg_explained_var    avg_r2  
0           0.038557  0.028306  

                      bucket  tickers_used   avg_mse   avg_mae  \
0  High Vol / High Liquidity             4  0.000610  0.014476   
1   High Vol / Low Liquidity             5  0.000379  0.012393   
2   Low Vol / High Liquidity             4  0.000111  0.006547   
3    Low Vol / Low Liquidity             5  0.000151  0.007178   

   avg_explained_var    avg_r2  
0           0.007044 -0.070045  
1          -0.004311 -0.029707  
2           0.042113  0.040757  
3           0.038557  0.028306  

  result.fillna(method='ffill', inplace=True)
