In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
import seaborn as sns
import pandas as pd
import pickle 


In [2]:
#loads cleaned data
df = pd.read_parquet("/Users/tobiasshin/Downloads/cleaned_data.parquet")

df_test = pd.read_parquet("/Users/tobiasshin/Downloads/cleaned_test_data.parquet")

lightgbm_model = joblib.load("lightgbm_model.pkl")



In [2]:
""" Run cell 2 and 3 once, just to clean data
    From then on, just reload cleaned data from cell 1
    Will be commented out after cleaned


df = pd.read_parquet("/Users/tobiasshin/Downloads/drw-crypto-market-prediction/train.parquet")

print(df.head())
print(df.shape)
"""

                     bid_qty  ask_qty  buy_qty  sell_qty   volume        X1  \
2023-03-01 00:00:00   15.283    8.425  176.405    44.984  221.389  0.181844   
2023-03-01 00:01:00   38.590    2.336  525.846   321.950  847.796  0.489497   
2023-03-01 00:02:00    0.442   60.250  159.227   136.369  295.596  0.260121   
2023-03-01 00:03:00    4.865   21.016  335.742   124.963  460.705  0.099976   
2023-03-01 00:04:00   27.158    3.451   98.411    44.407  142.818  0.270893   

                           X2        X3        X4        X5  ...      X772  \
2023-03-01 00:00:00 -0.637860  0.006652  0.136870  0.116698  ...  0.333753   
2023-03-01 00:01:00 -0.075619  0.431594  0.522400  0.475255  ...  0.333657   
2023-03-01 00:02:00 -0.444684  0.100695  0.224729  0.203282  ...  0.333667   
2023-03-01 00:03:00 -0.666728 -0.123858  0.019197  0.014459  ...  0.333174   
2023-03-01 00:04:00 -0.325973  0.116336  0.234311  0.214073  ...  0.333171   

                         X773      X774      X775      X

In [3]:
"""
missing = df.isnull().mean().sort_values(ascending=False)
print(missing[missing > 0])

# drop all features that are all constant (variance 0)
zero_var = df.loc[:, df.nunique() == 1]
df = df.drop(columns=zero_var)
print(zero_var.columns)

# save cleaned data set
df.to_parquet("/Users/tobiasshin/Downloads/cleaned_data.parquet") 
df.to_csv("cleaned_data.csv", index=False)
"""

Series([], dtype: float64)
Index([], dtype='object')


In [9]:
# Compute correlation with the label
correlations = df.corr(numeric_only=True)["label"].drop("label")  # drop label itself

# Sort by absolute correlation
top_corr_features = correlations.abs().sort_values(ascending=False).head(20).index.tolist()

In [4]:
def clip_outliers(train_df, test_df, lower=0.01, upper=0.99):
    lower_bounds = train_df.quantile(lower)
    upper_bounds = train_df.quantile(upper)
    return (
        train_df.clip(lower=lower_bounds, upper=upper_bounds, axis=1),
        test_df.clip(lower=lower_bounds, upper=upper_bounds, axis=1)
    )


In [5]:
#Now let's try gradient boosting on the full data set


# 1. Split data chronologically
split_index = int(0.8 * len(df))
train = df.iloc[:split_index]
test = df.iloc[split_index:]

# 2. Prepare features and target
X_train = train.drop(columns=["label"])
y_train = train["label"]
X_test = test.drop(columns=["label"])
y_test = test["label"]

# 3. Clip the training and test data
X_train_clipped, X_test_clipped = clip_outliers(X_train, X_test)


In [6]:
# The above cell takes way too long, we use light GBM instead

model = LGBMRegressor(
    n_estimators=1000,        # allow early stopping to cut it off early
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,            # randomly sample rows
    colsample_bytree=0.8,     # randomly sample features
    n_jobs=-1,                # use all CPU cores
    random_state=42
)

# Fit the model with early stopping on test set

model.fit(
    X_train_clipped, y_train,
    eval_set=[(X_test_clipped, y_test)],
    eval_metric="l2",
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(50)]
)

# Predictions and evaluation
y_pred = model.predict(X_test_clipped)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.204758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200173
[LightGBM] [Info] Number of data points in the train set: 420708, number of used features: 785
[LightGBM] [Info] Start training from score 0.025639
Training until validation scores don't improve for 50 rounds
[50]	valid_0's l2: 1.18424
Early stopping, best iteration is:
[5]	valid_0's l2: 1.08265
MSE: 1.0826
R²: -0.0025


In [7]:
print(y_test.var())

1.0799601349846195


In [8]:
correlation = np.corrcoef(y_test, y_pred)[0, 1]
print(f"Correlation between prediction and label: {correlation:.4f}")

Correlation between prediction and label: 0.0430


In [9]:
#Let's apply above model to the test data

df_test = pd.read_parquet("/Users/tobiasshin/Downloads/drw-crypto-market-prediction/test.parquet")
print(df_test.shape)
print(df_test.head())



(538150, 786)
    bid_qty  ask_qty  buy_qty  sell_qty   volume        X1        X2  \
ID                                                                     
1     0.317    8.102   13.164    10.272   23.436 -0.341229  0.041851   
2     2.608    2.111  123.562    40.163  163.725 -1.029564 -1.382505   
3     2.768   10.787  126.137   118.266  244.403 -2.594090 -5.486158   
4     0.948   12.157   16.069    31.723   47.792  0.240745  0.997585   
5     1.084    3.493   32.679    37.327   70.006  0.067189  0.772852   

          X3        X4        X5  ...      X772      X773      X774      X775  \
ID                                ...                                           
1  -0.020094 -0.206221 -0.297124  ... -0.147911 -0.043417  1.521787  1.548965   
2  -1.214935 -1.020241 -0.960397  ... -0.126703 -0.077090 -0.703054 -0.716951   
3  -4.744466 -3.930152 -3.275324  ... -0.147750 -0.030627 -0.703514 -0.717525   
4   1.028965  1.081052  0.811895  ... -0.136737 -0.033380  1.521167  1.55177

In [10]:
"""
# drop all features that are all constant (variance 0)
zero_var = df_test.loc[:, df_test.nunique() == 1]
df = df_test.drop(columns=zero_var)
print(zero_var.columns)

# save cleaned data set
df_test.to_parquet("/Users/tobiasshin/Downloads/cleaned_test_data.parquet") 
df_test.to_csv("cleaned_test_data.csv", index=False)
"""

Index(['label'], dtype='object')


In [11]:
X_test = df_test.drop(columns=["label"])
y_test = df_test["label"]

In [12]:
X_train = train.drop(columns=["label"])
y_train = train["label"]

In [23]:
X_train_clipped, X_test_clipped = clip_outliers(X_train, X_test)

In [24]:
feature_columns = X_train_clipped.columns.tolist()
X_test = X_test[feature_columns]

X_train_clipped, X_test_clipped = clip_outliers(X_train, X_test)

In [25]:
#Retrain model on all training data WITHOUT validation set

model = LGBMRegressor(
    n_estimators=1000,  # if early stopping told you 180 rounds, set n_estimators=180
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)

model.fit(X_train_clipped, y_train)

import joblib
joblib.dump(model, "lightgbm_model.pkl")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.172087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 200173
[LightGBM] [Info] Number of data points in the train set: 420708, number of used features: 785
[LightGBM] [Info] Start training from score 0.025639


['lightgbm_model.pkl']

In [26]:
print(X_train_clipped.shape)
print(X_test_clipped.shape)

(420708, 785)
(538150, 785)


In [27]:
y_pred = model.predict(X_test_clipped)


In [28]:
print(y_pred.shape)

(538150,)


In [29]:
print(X_test_clipped.shape[0])

538150


In [30]:
print("y_pred std dev:", np.std(y_pred))
print("y_pred unique values:", np.unique(y_pred))

y_pred std dev: 0.7919130449233909
y_pred unique values: [-5.10374362 -4.75894121 -4.66580793 ...  6.45280346  6.50889712
  6.55739305]


In [31]:
print(df_test.columns)

Index(['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'X1', 'X2', 'X3',
       'X4', 'X5',
       ...
       'X772', 'X773', 'X774', 'X775', 'X776', 'X777', 'X778', 'X779', 'X780',
       'label'],
      dtype='object', length=786)


In [32]:
sample_sub = pd.read_csv('/Users/tobiasshin/Downloads/sample_submission.csv')
print(sample_sub.columns)

Index(['ID', 'prediction'], dtype='object')


In [34]:
# Prepare submission dataframe, for example if there's an ID column 'id'
submission = pd.DataFrame({
    'ID': sample_sub['ID'],  # or the correct ID column
    'prediction': y_pred     # or whatever the competition target is called
})

# Save to CSV for submission
submission.to_csv('submission2.csv', index=False)

In [None]:
# OK so they changed the training set so now we have to re-clean and re-explore and then re-train