In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import gc
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor, Pool
import seaborn as sns
import joblib
import lightgbm as lgb
from sklearn.inspection import permutation_importance
import os
import wandb
import time
import lightgbm as lgb
from wandb.integration.lightgbm import wandb_callback, log_summary

In [2]:
df_list = []
for part_in in ["6","7","8","9"]:
    part_id = part_in
    data_dir = f"train.parquet/partition_id={part_id}/part-0.parquet"
    df_list.append(pl.read_parquet(data_dir))
    gc.collect()

responder_cols = [f"responder_{col}" for col in range(9)]

In [3]:
train_df = pl.concat(df_list)
del df_list
gc.collect()

0

In [4]:
def create_daily_stats(train_df, target_col):
    daily_avg = (
        train_df
        .group_by(["symbol_id", "date_id"], maintain_order = True)
        .agg([pl.col(target_col).mean().alias("daily_avg"), 
            pl.col(target_col).std().alias("daily_std"),
            pl.col(target_col).min().alias("daily_min"),
            pl.col(target_col).max().alias("daily_max"),
            pl.col(target_col).median().alias("daily_median"),
            pl.col(target_col).skew().alias("daily_skew"),
            pl.col(target_col).kurtosis().alias("daily_kurtosis"),
            pl.col(target_col).last().alias("last_value"),
            pl.col(target_col).first().alias("first_value"),
            (pl.col(target_col).max() - pl.col(target_col).min()).alias("daily_range"),
            pl.col(target_col).sum().alias("target_sum"),
            pl.col(target_col).count().alias("daily_count")
            ]
            )
    )
    daily_avg = daily_avg.sort(["symbol_id", "date_id"])

    columns_to_shift = ["daily_avg", "daily_std", "daily_min", "daily_max", "daily_median", "daily_skew", "daily_kurtosis", "last_value", "first_value",
    "daily_range", "target_sum", "date_id", "daily_count"]

    daily_avg = daily_avg.with_columns([
        pl.col(col_name)
        .shift(1)
        .over("symbol_id")
        .alias(f"lag_1_{col_name}_{target_col}")
        for col_name in columns_to_shift
    ])

    daily_avg = daily_avg.with_columns(
    (pl.col("date_id") - pl.col("lag_1_date_id_responder_6"))
    .alias("days_since_lag_1")
    )

    s1 = [f"lag_1_{col_name}_{target_col}" for col_name in columns_to_shift if col_name != "date_id"]

    selected_cols = ["symbol_id","date_id", "days_since_lag_1"] + s1
    daily_avg = daily_avg.select(selected_cols)

    train_df = train_df.join(daily_avg,
              on=["symbol_id", "date_id"],
              how="left")

    return train_df

def create_daily_stats2(train_df, target_col):
    daily_avg = (
        train_df
        .group_by(["symbol_id", "date_id"], maintain_order = True)
        .agg([pl.col(target_col).mean().alias("daily_avg"), 
            pl.col(target_col).std().alias("daily_std"),
            pl.col(target_col).min().alias("daily_min"),
            pl.col(target_col).max().alias("daily_max"),
            pl.col(target_col).median().alias("daily_median"),
            pl.col(target_col).last().alias("last_value"),
            (pl.col(target_col).max() - pl.col(target_col).min()).alias("daily_range"),
            pl.col(target_col).sum().alias("target_sum"),
            ]
            )
    )
    daily_avg = daily_avg.sort(["symbol_id", "date_id"])

    columns_to_shift = ["daily_avg", "daily_std", "daily_min", "daily_max", "daily_median", "last_value","daily_range", "target_sum"]

    daily_avg = daily_avg.with_columns([
        pl.col(col_name)
        .shift(1)
        .over("symbol_id")
        .alias(f"lag_1_{col_name}_{target_col}")
        for col_name in columns_to_shift
    ])


    s1 = [f"lag_1_{col_name}_{target_col}" for col_name in columns_to_shift if col_name != "date_id"]

    selected_cols = ["symbol_id","date_id"] + s1
    daily_avg = daily_avg.select(selected_cols)

    train_df = train_df.join(daily_avg,
              on=["symbol_id", "date_id"],
              how="left")

    train_df = train_df.sort(["date_id", "time_id", "symbol_id"])
    gc.collect()
    return train_df

In [5]:
%%time
for col_name in ["responder_6"]:
    train_df = create_daily_stats(train_df, col_name)
gc.collect()

for col_name in ["responder_0","responder_1","responder_2","responder_3","responder_4","responder_5","responder_7","responder_8"]:
    train_df = create_daily_stats2(train_df, col_name)
gc.collect()

CPU times: user 2min 21s, sys: 2min 11s, total: 4min 33s
Wall time: 27.9 s


0

In [6]:
train_df = train_df.with_columns([
    (2 * np.pi * pl.col("time_id") / 967).sin().alias("sin_time_id").cast(pl.Float32),
    (2 * np.pi * pl.col("time_id") / 967).cos().alias("cos_time_id").cast(pl.Float32),
    (2 * np.pi * pl.col("time_id") / 483).sin().alias("sin_time_id_halfday").cast(pl.Float32),
    (2 * np.pi * pl.col("time_id") / 483).cos().alias("cos_time_id_halfday").cast(pl.Float32),
])
gc.collect()

0

In [7]:
%%time
default_features = [f"feature_{idx:02d}" for idx in range(79)]
train_df = train_df.with_columns(null_count = pl.sum_horizontal([pl.col(col).is_null() for col in default_features]))
gc.collect()

CPU times: user 1.99 s, sys: 4.47 s, total: 6.46 s
Wall time: 543 ms


0

In [8]:
polars_train_df = train_df
train_df = polars_train_df.to_pandas()
del polars_train_df
gc.collect()

0

In [9]:
train_df["symbol_id"] = train_df["symbol_id"].astype("category")
train_df["feature_10"] = train_df["feature_10"].astype("category")
train_df["feature_11"] = train_df["feature_11"].astype("category")
train_df["feature_09"] = train_df["feature_09"].astype("category")
gc.collect()

0

In [10]:
X_train = train_df[train_df["date_id"] < 1550].copy()
X_test = train_df[train_df["date_id"] >= 1550].copy()
del train_df
gc.collect()

0

In [11]:
y_train = X_train["responder_6"].copy()
y_test = X_test["responder_6"].copy()
test_weights = X_test["weight"].values.copy()
train_weights = X_train["weight"].values.copy()
X_train.drop(columns = responder_cols, axis=1, inplace = True)
X_test.drop(columns = responder_cols, axis=1, inplace = True)
X_train.drop(columns = ["weight"], axis = 1, inplace = True)
X_test.drop(columns = ["weight"], axis = 1, inplace = True)
gc.collect()

0

In [12]:
feature_cols = joblib.load("model_cols/lgb_v7_model_cols.pkl")
feature_cols = feature_cols 
len(feature_cols)

83

In [None]:
lgb_params = {
    "objective": "rmse",
    "random_state": 16,
    "max_cat_to_onehot": 64,
    "learning_rate": 0.01,
    "max_depth": 10,
    "verbosity": -1,
    "metric": "rmse",
    "n_estimators" : 500,
    "n_jobs" : 16,
    "force_row_wise" : True,
}

In [14]:
def weighted_r2_metric(y_true, y_pred, weights=None):
    if weights is None:
        weights = np.ones_like(y_true)

    numerator = np.sum(weights * (y_true - y_pred)**2)
    y_weighted_mean = np.sum(weights * y_true) / np.sum(weights)
    denominator = np.sum(weights * (y_true - y_weighted_mean)**2)

    score = 1 - (numerator / denominator)
    return 'weighted_r2', score, True

lgb_model = lgb.LGBMRegressor(objective = "rmse",
                                 random_state = 16,
                                 max_cat_to_onehot = 64,
                                 n_estimators = 500,
                                 learning_rate = 0.01,
                                 verbosity = -1,
                                 max_depth = 10,
                                 force_row_wise = True,
                                 histogram_pool_size = 30_000)

lgb_model.fit(X_train, y_train, 
                eval_set = [(X_test, y_test), (X_train, y_train)],
                eval_metric = weighted_r2_metric,
                eval_names = ["test", "train"],
                callbacks=[lgb.log_evaluation(50)], 
                sample_weight = train_weights,
                eval_sample_weight = [test_weights])

y_pred_lgb = np.clip(lgb_model.predict(X_test), -5, 5)

r_score_lgb = r2_score(y_test, y_pred_lgb, sample_weight = test_weights)
print(f"R2 Score: {r_score_lgb:.5f}")
gc.collect()

[50]	train's rmse: 0.815762	train's weighted_r2: 0.00650369	test's rmse: 0.800965	test's weighted_r2: 0.00261528
[100]	train's rmse: 0.814169	train's weighted_r2: 0.0103792	test's rmse: 0.800419	test's weighted_r2: 0.0039735
[150]	train's rmse: 0.812929	train's weighted_r2: 0.0133919	test's rmse: 0.80007	test's weighted_r2: 0.00484244
[200]	train's rmse: 0.811805	train's weighted_r2: 0.0161187	test's rmse: 0.799856	test's weighted_r2: 0.00537484
[250]	train's rmse: 0.810677	train's weighted_r2: 0.0188507	test's rmse: 0.799677	test's weighted_r2: 0.00581905
[300]	train's rmse: 0.809553	train's weighted_r2: 0.021568	test's rmse: 0.799566	test's weighted_r2: 0.00609543
[350]	train's rmse: 0.808543	train's weighted_r2: 0.024008	test's rmse: 0.799462	test's weighted_r2: 0.0063535
[400]	train's rmse: 0.80771	train's weighted_r2: 0.0260193	test's rmse: 0.799399	test's weighted_r2: 0.00651183
[450]	train's rmse: 0.806876	train's weighted_r2: 0.0280289	test's rmse: 0.79936	test's weighted_r2: 0

4

In [14]:
short_id = int(time.time()) % 100_000

run_name = f"lgb_run_{short_id}"

lgb_params = {
    "objective": "rmse",
    "random_state": 16,
    "max_cat_to_onehot": 64,
    "learning_rate": 0.01,
    "max_depth": 10,
    "verbosity": -1,
    "metric": "rmse",
    "n_estimators" : 500,
    "n_jobs" : 16,
    "force_row_wise" : True,
}

wandb.init(project='js_lgbm', 
        name = run_name,
        tags = ["exp", "lgb_v8"], 
        config=lgb_params)

def weighted_r2_metric(y_pred, dataset):
    y_true = dataset.get_label()
    weight = dataset.get_weight()
    ss_res = ((weight * (y_true - y_pred) ** 2).sum())
    ss_tot = ((weight * (y_true - y_true.mean()) ** 2).sum())
    r2 = 1 - (ss_res / ss_tot)
    return "weighted_r2", r2, True  

train_data = lgb.Dataset(
    data=X_train[feature_cols],
    label=y_train, 
    weight=train_weights, 
    categorical_feature=["symbol_id","feature_09","feature_10", "feature_11"],
)
test_data = lgb.Dataset(
    data=X_test[feature_cols], 
    label=y_test, 
    weight=test_weights, 
    categorical_feature=["symbol_id","feature_09","feature_10","feature_11"],
    reference = train_data)


lgb_model = lgb.train(
    params=lgb_params,
    train_set=train_data,
    valid_sets=[train_data,test_data], 
    valid_names=["train","test"],
    feval=weighted_r2_metric,  
    callbacks=[lgb.log_evaluation(50),
                wandb_callback()] 
    )

best_test_r2 = lgb_model.best_score["test"]["weighted_r2"]
best_train_r2 = lgb_model.best_score["train"]["weighted_r2"]

wandb.run.summary["best_test_r2"] = best_test_r2
wandb.run.summary["best_train_r2"] = best_train_r2

log_summary(lgb_model, save_model_checkpoint=True)
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mturkenm[0m. Use [1m`wandb login --relogin`[0m to force relogin




[50]	train's rmse: 0.827328	train's weighted_r2: 0.00552862	test's rmse: 0.800932	test's weighted_r2: 0.0027052
[100]	train's rmse: 0.825996	train's weighted_r2: 0.00872725	test's rmse: 0.800409	test's weighted_r2: 0.00400766
[150]	train's rmse: 0.825052	train's weighted_r2: 0.0109913	test's rmse: 0.80006	test's weighted_r2: 0.00487487
[200]	train's rmse: 0.82424	train's weighted_r2: 0.012938	test's rmse: 0.799796	test's weighted_r2: 0.00553064
[250]	train's rmse: 0.823543	train's weighted_r2: 0.0146055	test's rmse: 0.799608	test's weighted_r2: 0.00599804
[300]	train's rmse: 0.822894	train's weighted_r2: 0.0161597	test's rmse: 0.799449	test's weighted_r2: 0.00639347
[350]	train's rmse: 0.822288	train's weighted_r2: 0.0176072	test's rmse: 0.799318	test's weighted_r2: 0.00671971
[400]	train's rmse: 0.82165	train's weighted_r2: 0.0191315	test's rmse: 0.799211	test's weighted_r2: 0.00698654
[450]	train's rmse: 0.821056	train's weighted_r2: 0.0205482	test's rmse: 0.799122	test's weighted_r2

VBox(children=(Label(value='1.776 MB of 1.776 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
test_rmse,█▇▇▆▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁
test_weighted_r2,▁▂▂▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇██████████
train_rmse,█▇▇▇▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁
train_weighted_r2,▁▂▂▂▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████

0,1
best_iteration,0.0
best_test_r2,0.00739
best_train_r2,0.0218
iteration,499.0
test_weighted_r2,0.00739
train_weighted_r2,0.0218


In [15]:
lgb_model.save_model("lgb_model_results/lgb_v8_train.txt")

<lightgbm.basic.Booster at 0x75fd08199f30>

In [None]:
y_train = X_train["responder_6"].copy()
y_test = X_test["responder_6"].copy()
test_weights = X_test["weight"].values.copy()
train_weights = X_train["weight"].values.copy()
X_train.drop(columns = responder_cols, axis=1, inplace = True)
X_test.drop(columns = responder_cols, axis=1, inplace = True)
X_train.drop(columns = ["weight"], axis = 1, inplace = True)
X_test.drop(columns = ["weight"], axis = 1, inplace = True)
gc.collect()

In [14]:
y_train = train_df["responder_6"].copy()
train_weights = train_df["weight"].values.copy()
train_df.drop(columns=[col for col in train_df.columns if col not in feature_cols], inplace=True)
gc.collect()

17

In [20]:
short_id = int(time.time()) % 100_000

run_name = f"lgb_run_{short_id}"

lgb_params = {
    "objective": "rmse",
    "random_state": 16,
    "max_cat_to_onehot": 64,
    "learning_rate": 0.01,
    "max_depth": 10,
    "verbosity": -1,
    "metric": "rmse",
    "n_estimators" : 750,
    "n_jobs" : 16,
    "force_row_wise" : True,
}

inference_data = lgb.Dataset(
    data=train_df[feature_cols],
    label=y_train, 
    weight=train_weights, 
    categorical_feature=["symbol_id","feature_09","feature_10", "feature_11"],
)

wandb.init(project='js_lgbm', 
        name = run_name,
        tags = ["inference", "lgb_v8"], 
        config=lgb_params)

lgb_model = lgb.train(
    params=lgb_params,
    train_set=inference_data,
    valid_sets=[inference_data], 
    valid_names=["train"],
    feval=weighted_r2_metric,  
    callbacks=[lgb.log_evaluation(50),
                wandb_callback()] 
    )

best_train_r2 = lgb_model.best_score["train"]["weighted_r2"]
wandb.run.summary["best_train_r2"] = best_train_r2
log_summary(lgb_model)
wandb.finish()



[50]	train's rmse: 0.821027	train's weighted_r2: 0.00478899
[100]	train's rmse: 0.819917	train's weighted_r2: 0.00747854
[150]	train's rmse: 0.819107	train's weighted_r2: 0.00943679
[200]	train's rmse: 0.818458	train's weighted_r2: 0.0110077
[250]	train's rmse: 0.817844	train's weighted_r2: 0.012489
[300]	train's rmse: 0.817194	train's weighted_r2: 0.0140602
[350]	train's rmse: 0.816559	train's weighted_r2: 0.0155919
[400]	train's rmse: 0.815984	train's weighted_r2: 0.0169773
[450]	train's rmse: 0.815454	train's weighted_r2: 0.0182533
[500]	train's rmse: 0.814917	train's weighted_r2: 0.0195451
[550]	train's rmse: 0.814402	train's weighted_r2: 0.0207844
[600]	train's rmse: 0.813905	train's weighted_r2: 0.0219783
[650]	train's rmse: 0.813462	train's weighted_r2: 0.0230433
[700]	train's rmse: 0.813097	train's weighted_r2: 0.0239197
[750]	train's rmse: 0.812644	train's weighted_r2: 0.0250087


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_rmse,██▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁
train_weighted_r2,▁▁▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇█████

0,1
best_iteration,0.0
best_train_r2,0.02501
iteration,749.0
train_weighted_r2,0.02501


In [21]:
lgb_model.save_model("lgb_model_results/lgb_v8.txt")
joblib.dump(feature_cols, "model_cols/lgb_v8_model_cols.pkl")

['model_cols/lgb_v8_model_cols.pkl']