In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global RNG so that every draw below is repeatable.
np.random.seed(10)

# Total number of observations in the synthetic dataset.
N = 80

# Latent factors shared by the correlated feature groups.
momentum_base = np.random.normal(0, 1, N)
volatility_base = np.random.normal(0, 1, N)

# Observed features derived from the latent factors. The draws are made in a
# fixed order (Momentum_1m, Volatility_3m, Volatility_1m, Sector_Return_3m)
# because each call advances the shared global RNG state.
momentum_1m = momentum_base + np.random.normal(0, 0.1, N)          # near-duplicate of Momentum_3m
volatility_3m = volatility_base + np.random.normal(0, 0.1, N)
volatility_1m = volatility_base + np.random.normal(0, 0.1, N)
sector_return_3m = 0.5 * momentum_base + np.random.normal(0, 1, N)  # loosely tied to momentum

df = pd.DataFrame(
    {
        "Momentum_3m": momentum_base,
        "Momentum_1m": momentum_1m,
        "Volatility_3m": volatility_3m,
        "Volatility_1m": volatility_1m,
        "Sector_Return_3m": sector_return_3m,
    }
)

# Independent standard-normal columns that play no part in the target.
# Column order matters: it fixes both the frame layout and the RNG sequence.
for noise_col in (
    "Interest_Rate",
    "Consumer_Confidence",
    "Inflation_Rate",
    "GDP_Growth",
    "Unemployment_Rate",
    "Market_Volatility_Index",
    "PE_Ratio",
    "Exchange_Rate",
    "Credit_Spread",
):
    df[noise_col] = np.random.normal(0, 1, N)

# Target = 5 * Momentum_3m - 3 * Volatility_3m + 2 * Sector_Return_3m + noise,
# where the additive noise has standard deviation 2.
noise = np.random.normal(0, 2, N)
signal = 5 * df["Momentum_3m"] - 3 * df["Volatility_3m"] + 2 * df["Sector_Return_3m"]
df["Target"] = signal + noise

# Sanity check: the paired momentum / volatility features should be almost
# perfectly correlated, confirming the deliberate multicollinearity.
corr_mom = df["Momentum_3m"].corr(df["Momentum_1m"])
print(f"Correlation (Momentum_3m vs Momentum_1m): {corr_mom:.3f}")
corr_vol = df["Volatility_3m"].corr(df["Volatility_1m"])
print(f"Correlation (Volatility_3m vs Volatility_1m): {corr_vol:.3f}")

# Preview the first five rows of the dataset.
print(df.head())

Correlation (Momentum_3m vs Momentum_1m): 0.996
Correlation (Volatility_3m vs Volatility_1m): 0.993
   Momentum_3m  Momentum_1m  Volatility_3m  Volatility_1m  Sector_Return_3m  \
0     1.331587     1.453401       1.783313       2.095198          1.128179   
1     0.715279     0.709627       1.798868       1.782026         -0.862216   
2    -1.545400    -1.499311      -2.000415      -1.944532         -0.580127   
3    -0.008384     0.057854      -0.383659      -0.106969          0.431258   
4     0.621336     0.391826      -0.166505      -0.186213         -1.324276   

   Interest_Rate  Consumer_Confidence  Inflation_Rate  GDP_Growth  \
0       1.027840            -0.424211        0.514070   -1.325366   
1      -0.802890            -0.050850       -0.234667    0.018548   
2      -0.428303            -1.123754       -1.085579    0.754555   
3      -0.755416            -1.857358        0.951746   -1.024803   
4       0.220497            -0.216525       -0.230392    0.569934   

   Unemplo

In [2]:
# Write the generated dataset to "data.csv" (path relative to the working
# directory); index=False leaves the row index out of the file.
df.to_csv("data.csv", index=False)

In [3]:
# Bare expression as the last line of the cell: the notebook renders the
# DataFrame (features plus the Target column) as this cell's output.
df

Unnamed: 0,Momentum_3m,Momentum_1m,Volatility_3m,Volatility_1m,Sector_Return_3m,Interest_Rate,Consumer_Confidence,Inflation_Rate,GDP_Growth,Unemployment_Rate,Market_Volatility_Index,PE_Ratio,Exchange_Rate,Credit_Spread,Target
0,1.331587,1.453401,1.783313,2.095198,1.128179,1.027840,-0.424211,0.514070,-1.325366,0.021043,-0.372416,-0.968989,0.090173,-2.462223,4.976272
1,0.715279,0.709627,1.798868,1.782026,-0.862216,-0.802890,-0.050850,-0.234667,0.018548,0.156372,-0.967326,0.864171,-0.037449,-2.411232,-2.098668
2,-1.545400,-1.499311,-2.000415,-1.944532,-0.580127,-0.428303,-1.123754,-1.085579,0.754555,1.037654,-1.207669,-1.044383,-2.333174,-0.995947,0.630936
3,-0.008384,0.057854,-0.383659,-0.106969,0.431258,-0.755416,-1.857358,0.951746,-1.024803,0.045666,-2.007975,1.027839,-0.066950,1.527890,0.865355
4,0.621336,0.391826,-0.166505,-0.186213,-1.324276,0.220497,-0.216525,-0.230392,0.569934,1.107336,0.469496,-0.196828,-1.453109,0.912010,-0.254416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,2.467651,2.512553,0.138807,0.137712,-0.863966,-0.049997,-0.640832,-1.891645,-0.702644,-0.103564,0.084983,-0.894145,0.238817,0.420191,11.041550
76,-1.508321,-1.536922,-0.700640,-0.810132,-0.583866,-0.149906,-0.481395,0.840366,-0.060600,1.042892,0.132168,-3.317669,0.350525,0.350443,-5.602609
77,0.620601,0.534325,0.530024,0.410895,-0.447529,-0.035328,-0.810912,-0.225199,0.217228,0.784856,0.946299,0.895280,-0.646656,2.448005,1.840717
78,-1.045133,-1.119315,0.848429,0.915850,0.445244,0.033617,-0.108874,-0.499420,-1.525034,-0.741489,0.224227,1.199629,0.714838,-0.783775,-7.438411


In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Compare ordinary least squares with Ridge regression on a deliberately small
# training set. The cell previously stopped after constructing `ridge_model`:
# nothing was fitted or evaluated, so it could not produce the MSE lines shown
# in its output. The missing steps are filled in below.

# Split the data into training and testing sets (30 train / 50 test rows;
# random_state pins the shuffle so the split is repeatable).
train_df, test_df = train_test_split(df, train_size=30, test_size=50, random_state=42)

# Separate features and target: every column except "Target" is a predictor.
X_train = train_df.drop(columns="Target")
y_train = train_df["Target"]
X_test = test_df.drop(columns="Target")
y_test = test_df["Target"]

# Fit OLS Linear Regression
ols_model = LinearRegression()
ols_model.fit(X_train, y_train)

# Fit Ridge Regression (using default alpha=1.0 here for demonstration)
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

# Evaluate performance: mean squared error on the rows each model was fitted
# on (train) and on the held-out rows (test).
ols_train_mse = mean_squared_error(y_train, ols_model.predict(X_train))
ols_test_mse = mean_squared_error(y_test, ols_model.predict(X_test))
ridge_train_mse = mean_squared_error(y_train, ridge_model.predict(X_train))
ridge_test_mse = mean_squared_error(y_test, ridge_model.predict(X_test))

# Display results (the "Test MSE:" labels carry two spaces so the numbers line
# up with the "Train MSE:" rows).
print(f"OLS Train MSE: {ols_train_mse:.3f}")
print(f"OLS Test MSE:  {ols_test_mse:.3f}")
print(f"Ridge Train MSE: {ridge_train_mse:.3f}")
print(f"Ridge Test MSE:  {ridge_test_mse:.3f}")



OLS Train MSE: 2.733
OLS Test MSE:  7.822
Ridge Train MSE: 3.365
Ridge Test MSE:  5.890
