In [1]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

# Name of the dataset directory; later cells read from f'../{datananme}/...'.
# NOTE: the identifier is misspelled ("datananme") but is kept as-is because
# the other cells of this notebook reference it under this exact name.
datananme = 'js'

In [2]:
# Load the training set into a pandas DataFrame.

# Show up to 200 columns whenever a frame is displayed.
# (A row limit is intentionally left disabled:
#  pd.set_option('display.max_rows', 1000))
pd.set_option('display.max_columns', 200)

# The data sits in the parent directory, inside the folder named by `datananme`.
train_pd = pd.read_parquet(path=f'../{datananme}/train.parquet/')
train_pd.shape

(47127338, 93)

In [None]:
# Load the same training set a second time, now as a polars DataFrame.

train_pl = pl.read_parquet(source=f'../{datananme}/train.parquet/')
train_pl.shape

### Data checking/visualizing

In [None]:
# Share of missing values per column, restricted to the rows whose target
# 'responder_6' is present.

from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter

supervised_usable = train_pl.filter(pl.col('responder_6').is_not_null())

# One row per column of the filtered frame, holding its null count ...
null_counts = supervised_usable.null_count().transpose(
    include_header=True,
    header_name='feature',
    column_names=['null_count'],
)
# ... ordered from most to least missing, plus the fraction of rows that are null.
missing_count = (
    null_counts
    .sort('null_count', descending=True)
    .with_columns((pl.col('null_count') / len(supervised_usable)).alias('null_ratio'))
)

bar_positions = np.arange(len(missing_count))
null_ratio = missing_count.get_column('null_ratio')

# Stacked horizontal bars: the missing share first, the available share after it.
fig, ax = plt.subplots(figsize=(6, 20))
ax.set_title(f'Missing values over the {len(supervised_usable)} samples which have a target')
ax.barh(bar_positions, null_ratio, color='coral', label='missing')
ax.barh(bar_positions, 1 - null_ratio, left=null_ratio,
        color='darkseagreen', label='available')
ax.set_yticks(bar_positions)
ax.set_yticklabels(missing_count.get_column('feature'))
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=0))
ax.set_xlim(0, 1)
ax.legend()
plt.show()

In [None]:
# Number of rows whose 'responder_6' value is missing.
train_pd.loc[:, 'responder_6'].isnull().sum()

In [None]:
# Inspect the date_id=0 partition of lags.parquet.

lags_pd = pd.read_parquet(path=f'../{datananme}/lags.parquet/date_id=0')
lags_pd

In [None]:
# Inspect the sample test file shipped for date_id=0.

test_pd = pd.read_parquet(path=f'../{datananme}/test.parquet/date_id=0/part-0.parquet')
test_pd

### Data pre-processing

In [3]:
# Keep 'responder_6' as the only responder column (responder_0 .. responder_8
# minus index 6 are removed) and discard 'partition_id' as well.

col_to_drop = [f'responder_{i}' for i in (0, 1, 2, 3, 4, 5, 7, 8)]
col_to_drop.append('partition_id')

train_pd1 = train_pd.drop(columns=col_to_drop)

In [4]:
# Discard every row that contains at least one missing value.
# Possibly unnecessary: xgboost should be able to handle NA by default.

train_pd2 = train_pd1.dropna(axis='index', how='any')
print(train_pd2.shape)

(35370822, 84)


In [5]:
# Column roles: sample weight, target, and predictors. All predictors are
# treated as numeric since there are no categorical features.
# Note that col_num is "every column except the target", so it still contains
# 'weight' (and any id columns such as 'date_id') alongside the features.

col_weight = ['weight']
col_y = ['responder_6']
target_names = set(col_y)
col_num = [name for name in train_pd2.columns if name not in target_names]

### Model training

In [6]:
def calculate_r2(y_true, y_pred, weights):
    """Sample-weighted, zero-mean R^2 score.

    Computes ``1 - sum(w * (y_true - y_pred) ** 2) / sum(w * y_true ** 2)``.
    The denominator is the weighted raw second moment of ``y_true`` (its mean
    is not subtracted).

    All three inputs are flattened to 1-D before the arithmetic. Without this,
    mixing a column-shaped ``(n, 1)`` array (e.g. ``DataFrame.to_numpy()``)
    with a flat ``(n,)`` prediction vector silently broadcasts to an
    ``(n, n)`` matrix and yields a wrong score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Predicted targets; must hold as many values as ``y_true``.
    weights : array-like
        Per-sample weights; must hold as many values as ``y_true``.

    Returns
    -------
    float
        The weighted R^2. If the weighted sum of squares of ``y_true`` is
        zero, the division yields a non-finite value (nan or -inf).

    Raises
    ------
    ValueError
        If the three inputs do not contain the same number of values.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    weights = np.asarray(weights).ravel()

    if not (y_true.shape == y_pred.shape == weights.shape):
        raise ValueError(
            "y_true, y_pred and weights must contain the same number of values, "
            f"got {y_true.size}, {y_pred.size} and {weights.size}"
        )

    numerator = np.sum(weights * (y_true - y_pred) ** 2)
    denominator = np.sum(weights * (y_true ** 2))
    r2_score = 1 - (numerator / denominator)
    return r2_score

In [None]:
# xgboost
# K-fold cross-validation on dates: the folds are contiguous blocks of
# date_ids, so all rows sharing a date_id land on the same side of a split.
import xgboost as xgb

# Booster settings are identical for every fold, so they are defined once.
XGB_PARAMS = {
    'eval_metric': 'rmse',
    'learning_rate': 0.5,
    'max_depth': 12,
    'min_child_weight': 1.5,
    'subsample': 0.8555,
    'colsample_bytree': 0.85555555,
    'random_state': 42,
    'tree_method': 'hist',
    'device': 'cuda'
}

# Sort the ids so that every fold is a contiguous date range regardless of
# the order in which the rows happen to be stored.
date_ids = np.sort(train_pd2['date_id'].unique())
k = 5
fold_size = date_ids.shape[0] // k
if fold_size == 0:
    raise ValueError(f"Need at least {k} distinct date_id values, got {date_ids.shape[0]}")

# NOTE(review): col_num contains every column except the target, so 'weight'
# and 'date_id' are also used as model features - confirm this is intended.
fold_scores = []
for i in range(k):
    start = i * fold_size
    # The last fold absorbs the dates left over by the integer division.
    end = (i + 1) * fold_size if i != k - 1 else len(date_ids)
    valid_dates = date_ids[start:end]

    # Every row's date_id is in date_ids, so the training rows are exactly
    # the complement of the validation rows; compute the mask only once.
    is_valid = train_pd2['date_id'].isin(valid_dates)
    is_train = ~is_valid

    X_train = train_pd2.loc[is_train, col_num]
    y_train = train_pd2.loc[is_train, col_y]
    w_train = train_pd2.loc[is_train, col_weight]

    X_valid = train_pd2.loc[is_valid, col_num]
    y_valid = train_pd2.loc[is_valid, col_y]
    w_valid = train_pd2.loc[is_valid, col_weight]

    dtrain = xgb.DMatrix(X_train, label=y_train, weight=w_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, weight=w_valid)

    # Early stopping monitors the last entry of `evals`, i.e. the validation fold.
    model = xgb.train(
        XGB_PARAMS,
        dtrain,
        num_boost_round=1000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=100,
        verbose_eval=50,
    )

    # xgb.train returns the booster from the last round, not the best one, so
    # restrict prediction to the trees up to the best iteration.
    y_valid_pred = model.predict(dvalid, iteration_range=(0, model.best_iteration + 1))

    # Score against the targets (not the features), and flatten the
    # single-column frames so they line up with the 1-D prediction vector
    # instead of broadcasting to an (n, n) matrix.
    r2_score = calculate_r2(
        y_valid.to_numpy().ravel(),
        y_valid_pred,
        w_valid.to_numpy().ravel(),
    )
    fold_scores.append(r2_score)
    print(f"Fold {i} validation R2 score: {r2_score}")

print(f"Mean validation R2 score over {k} folds: {np.mean(fold_scores)}")



[0]	train-rmse:0.77472	valid-rmse:0.82183


#### Evaluation 