In [1]:
import polars as pl
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Folder holding the competition files; the training set is read from the
# train.parquet entry inside it.
DATA_PATH = "jane-street-real-time-market-data-forecasting"
train_parquet_path = f"{DATA_PATH}/train.parquet"
data = pl.read_parquet(train_parquet_path)

Train Test Split Method
In the training data, we keep symbol_id 0~30 and date_id 0~1697 (every date before 1698).
In the testing data, we simply use the last date, date_id 1698, for all symbol_ids (including those not used for training).

In [3]:
# Training rows: symbols with id below 31, on dates strictly before 1698.
symbol_mask = pl.col("symbol_id").lt(31)
date_mask = pl.col("date_id").lt(1698)
train = data.filter(symbol_mask & date_mask)

In [4]:
# Test rows: every symbol on the single date_id 1698.
holdout_day = pl.col("date_id").eq(1698)
test = data.filter(holdout_day)

In [5]:
# Discard every training row that has a null in any column
# (subset=None means all columns are checked).
train = train.drop_nulls(subset=None)

In [6]:
# Split the training frame into a list of frames, one per symbol_id,
# keeping the groups in order of first appearance.
train_grouped = train.partition_by("symbol_id", maintain_order=True)

In [None]:
# Display the first per-symbol partition as a sanity check.
train_grouped[0]

In [None]:
# Model inputs: every column whose name starts with "feature".
# The names are read from the Polars `train` frame, which already exists at
# this point. `train_pd` is only created in a later cell, so referencing it
# here raises NameError on a fresh Restart & Run All.
feature = [col for col in train.columns if col.startswith('feature')]
# Name of the column used as the regression target.
responder = 'responder_6'

In [9]:
# Hyper-parameters handed to LightGBM when training
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    learning_rate=0.01,    # deliberately small step size
    num_leaves=31,
    min_data_in_leaf=20,   # smallest number of samples allowed in a leaf
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
)

In [None]:
import lightgbm as lgb

# Train one LightGBM regressor per symbol partition and collect the results
# as (symbol_id, booster) pairs. The unused `predictions` / `true_values`
# accumulators were dropped; `predictions` is built as a DataFrame later on.
models = []
for training_df in train_grouped:
    # LightGBM is fed pandas data, so convert the Polars partition first.
    training_df = training_df.to_pandas()
    X_train = training_df[feature]
    y_train = training_df[responder]
    # Create LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)
    # Train the model. The only "validation" set is the training data itself,
    # so the reported metric is in-sample and says nothing about generalisation.
    model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[train_data])
    # Every row of a partition shares one symbol_id; read it positionally so
    # the lookup does not depend on the pandas index labels.
    models.append((training_df["symbol_id"].iloc[0], model))
    print(training_df.shape)

In [7]:
import pickle
import pandas as pd

# Persist the trained per-symbol models. A later cell reloads them from
# "models.pkl"; without this dump that cell only works when a file from an
# earlier session happens to be on disk.
with open("models.pkl", "wb") as f:
    pickle.dump(models, f)

# Convert Polars DataFrames to Pandas DataFrames for pickling
train_pd = train.to_pandas()
test_pd = test.to_pandas()

# Save the DataFrames to pickle files
with open('train.pkl', 'wb') as f:
    pickle.dump(train_pd, f)

# train_grouped is pickled as-is (a list of Polars frames, one per symbol).
with open('train_grouped.pkl', 'wb') as f:
    pickle.dump(train_grouped, f)

with open('test.pkl', 'wb') as f:
    pickle.dump(test_pd, f)

Use a Gaussian Mixture Model to cluster the data. The number of clusters is 30.

In [None]:
import pickle

# Restore the cached pandas training frame and the per-symbol partitions.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open('train.pkl', 'rb') as train_file:
    train_pd = pickle.load(train_file)

with open('train_grouped.pkl', 'rb') as grouped_file:
    train_grouped = pickle.load(grouped_file)

In [None]:
# Feature matrix (pandas) of the partition stored at index 30.
partition_30_pd = train_grouped[30].to_pandas()
X_train = partition_30_pd[feature]

In [8]:
from sklearn.mixture import GaussianMixture

# Number of mixture components used for this trial fit
n_clusters = 2

# Build the estimator first, then fit it on X_train. fit() returns the
# estimator itself, so `gmm` is the fitted model.
mixture = GaussianMixture(n_components=n_clusters, random_state=42)
gmm = mixture.fit(X_train)


In [9]:
# Fit one single-component Gaussian mixture per symbol partition and keep
# the results as (symbol_id, fitted_model) pairs.
n_clusters = 1
gmm_list = []
for training_df in train_grouped:
    symbol = training_df["symbol_id"][0]
    X_train = training_df.to_pandas()[feature]
    mixture = GaussianMixture(n_components=n_clusters, random_state=42)
    gmm = mixture.fit(X_train)
    gmm_list.append((symbol, gmm))

In [10]:
# Write the per-symbol mixture models to disk.
with open("gmm_list.pkl", "wb") as gmm_file:
    pickle.dump(gmm_list, gmm_file)

In [None]:

# Predict the cluster for each sample.
# `train` is a Polars DataFrame, which does not support `train['cluster'] = ...`,
# and a single `gmm` / `X_train` pair only covers the last symbol fitted.
# Each partition is therefore labelled with its own mixture model
# (gmm_list was built in the same order as train_grouped) and the labelled
# partitions are stacked back into one frame.
train_clustered = pl.concat(
    [
        group.with_columns(
            pl.Series("cluster", model.predict(group.to_pandas()[feature]))
        )
        for group, (_, model) in zip(train_grouped, gmm_list)
    ]
)

# Display the first few rows with the cluster assignments
train_clustered.head()

In [13]:
# Reload the per-symbol LightGBM models from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("models.pkl", "rb") as models_file:
    models = pickle.load(models_file)

In [None]:
import pandas as pd

# Run every per-symbol model over the full training feature matrix; each
# model contributes one column named after its symbol id.
predictions = pd.DataFrame(
    {f'prediction_{i}': model.predict(train_pd[feature]) for i, model in models},
    index=train_pd.index,
)

In [15]:
# Cache the prediction table on disk.
with open("predictions.pkl", "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)



In [16]:
# First entry of gmm_list: a single (symbol_id, fitted GMM) tuple, not a
# collection of models.
gmm_models = gmm_list[0]

In [None]:
# Inspect the (symbol_id, fitted GMM) pair stored at index 30.
gmm_list[30]

In [None]:
# Show the first row of the current X_train feature frame (kept as a
# one-row DataFrame rather than a Series).
X_train.iloc[0:1]

In [40]:
# One-row feature frame used to probe the fitted mixture models.
gmm_test = X_train.iloc[:1]

In [None]:
# Shape of the component means of the fitted GMM held in gmm_models
# (index 1 of the (symbol_id, model) pair).
gmm_models[1].means_.shape

In [None]:
import numpy as np

# Component membership probabilities of the one-row probe under the GMM
# stored at index 3 of gmm_list.
gmm_list[3][1].predict_proba(gmm_test)

In [49]:
# Attach the true target and the symbol id next to the model predictions.
for target_col, source_col in (("answer", responder), ("symbol_id", "symbol_id")):
    predictions[target_col] = train_pd[source_col]

In [60]:
# Group the prediction table by symbol for the per-symbol regressions.
predictions_grouped = predictions.groupby(by="symbol_id")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# For each symbol, regress the true answer on the prediction columns twice:
# once with Lasso (alpha=0.1) and once with ordinary least squares.
# Results are stored as (symbol_id, fitted_model) pairs.
lasso_list = []
lm_list = []
for symbol_id, dat in predictions_grouped:
    y = dat["answer"]
    X = dat.drop(columns=["answer", "symbol_id"])
    clf = linear_model.Lasso(alpha=0.1)
    clf.fit(X, y)
    reg = LinearRegression()
    reg.fit(X, y)
    lasso_list.append((symbol_id, clf))
    lm_list.append((symbol_id, reg))

In [None]:
# Score every test row under each symbol's fitted GMM, then assign the row
# to the symbol whose mixture gives it the highest log-likelihood.
#
# * Iterate over gmm_list (all (symbol_id, model) pairs). gmm_models is one
#   such pair, so unpacking its elements as `i, model` fails.
# * Use score_samples (per-row log-likelihood) instead of predict. predict
#   returns the component label, which is always 0 for a one-component
#   mixture and makes the idxmax below meaningless.
# * The test frame was never null-filtered and the mixtures were fitted on
#   null-free rows, so rows with missing features are skipped here.
#   NOTE(review): confirm that dropping (rather than imputing) is intended.
test_features = test_pd[feature].dropna()
prob_df = pd.DataFrame(index=test_features.index)
for i, model in gmm_list:
    prob_df[f'prob_{i}'] = model.score_samples(test_features)

# Column name ("prob_<symbol_id>") of the best-scoring mixture for each row.
final_group = prob_df.idxmax(axis=1)
