In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm

# Load Data
data = pd.read_parquet("train.parquet")

num_stocks = data["stock_id"].nunique()
num_dates = data["date_id"].nunique()
num_updates = data["seconds_in_bucket"].nunique()

print(f"# stocks         : {num_stocks}")
print(f"# dates          : {num_dates}")
print(f"# updates per day: {num_updates}")

# Initialise 3D return arrays
stock_returns = np.zeros((num_stocks, num_dates, num_updates))
index_returns = np.zeros((num_stocks, num_dates, num_updates))

# Calculate Returns Per Stock-Date
for (stock_id, date_id), frame in tqdm(data.groupby(["stock_id", "date_id"])):
    frame["stock_return"] = ((frame["wap"] / frame["wap"].shift(6)).shift(-6) - 1) * 10_000
    frame["index_return"] = frame["stock_return"] - frame["target"]

    stock_returns[stock_id, date_id] = frame["stock_return"].values
    index_returns[stock_id, date_id] = frame["index_return"].values

# Estimate Synthetic Index Return
index_return = np.mean(index_returns, axis=0)

# Train Linear Regression
lr = LinearRegression()
y = index_return.reshape(-1)
X = stock_returns.reshape((num_stocks, -1)).T

# Removes rows where the target `y` or any feature in `X` contains missing (`NaN`) values to ensure clean data for model training.
mask = ~((np.isnan(y) | np.isnan(X).any(axis=1)))
X, y = X[mask], y[mask]

lr.fit(X, y)

print(" Fit ".center(80, ">"))
print("Coef:", lr.coef_)
print("Intercept:", lr.intercept_)
print("R2:", r2_score(y, lr.predict(X)))

lr.coef_ = lr.coef_.round(3)
lr.intercept_ = 0.0
print(" Round with 3 digits ".center(80, ">"))
print("Coef:", lr.coef_)
print("Sum of Coef:", lr.coef_.sum())
print("R2:", r2_score(y, lr.predict(X)))

# stocks         : 200
# dates          : 481
# updates per day: 55


100%|██████████| 95236/95236 [00:23<00:00, 4027.67it/s]


>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> Fit >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Coef: [0.004      0.00099987 0.00200041 0.00599891 0.00400068 0.00399949
 0.00200014 0.0059992  0.00600029 0.00200012 0.00200072 0.00800012
 0.00600039 0.00200007 0.0080014  0.00600051 0.00199993 0.0059999
 0.00400013 0.00199974 0.00399933 0.00099993 0.00599955 0.00399971
 0.00199876 0.00200008 0.00400097 0.0020008  0.00400021 0.00400005
 0.00100001 0.00100016 0.00199924 0.00199993 0.00599995 0.00399917
 0.00400017 0.00399906 0.00599977 0.00200015 0.0020007  0.03999955
 0.0020001  0.00199981 0.00399968 0.03999983 0.00200033 0.00099959
 0.00600002 0.00399944 0.00399979 0.00600017 0.00099921 0.00399993
 0.00399936 0.0019985  0.00599995 0.00400028 0.00599981 0.00400045
 0.00600029 0.00399988 0.00200012 0.00100003 0.00200041 0.00400043
 0.00200011 0.00799989 0.00400056 0.00400011 0.00199956 0.00399921
 0.00599994 0.00199992 0.00399979 0.00400047 0.00200035 0.00399996
 0.00400003 0.00399989 0.00100029 0.0019999