The first attempt kept crashing the kernel, so this is a more "optimized" version that avoids doing so.

In [1]:
import gc
import warnings
warnings.filterwarnings('ignore')
%load_ext cudf.pandas
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

In [2]:
def regression_imputer(data: pd.DataFrame, target: str, drops: 'list[str] | None' = None) -> pd.DataFrame:
    """Predict the missing values of ``target`` with a linear regression.

    A ``LinearRegression`` is fitted on the rows where ``target`` is present
    and then used to predict ``target`` for the rows where it is missing.
    Every column of ``data`` that is not excluded is used as a feature, so the
    remaining columns must be something ``LinearRegression`` can consume.

    Args:
        data: Frame containing ``target`` and the feature columns.
        target: Name of the column whose missing values are predicted.
        drops: Columns to exclude from the features. ``target`` itself is
            always excluded, whether or not it is listed here.

    Returns:
        A single-column frame named ``target`` holding the predictions,
        indexed by the rows that were missing ``target`` and had no missing
        feature values. The frame is empty when there is no such row.
    """
    # The target must never be one of its own features: in the rows to predict
    # it is NaN by definition, so leaving it in would discard every one of them.
    excluded = list(dict.fromkeys([*(drops or []), target]))
    features = data.drop(excluded, axis=1)

    missing = data[target].isna()
    complete = features.notna().all(axis=1)

    X_test = features[missing & complete]
    if X_test.empty:
        # Nothing to impute; the estimator would raise on a zero-row input.
        return pd.DataFrame(columns=[target], index=X_test.index, dtype=float)

    # Boolean masks keep X and y aligned row-for-row even if index values repeat,
    # and restrict training to rows without missing features (same rule as X_test).
    train_rows = ~missing & complete
    model = LinearRegression()
    model.fit(features[train_rows], data.loc[train_rows, target])
    y_test = model.predict(X_test)
    return pd.DataFrame(y_test, columns=[target], index=X_test.index)

In [3]:
def preprocess(data: pd.DataFrame, target: str, imputes: 'list[str] | None' = None, skips: 'list[str] | None' = None, lags: int = 0) -> pd.DataFrame:
    """Impute, min-max normalise and add lagged copies of the feature columns.

    Steps, in order:
      1. Each column in ``imputes`` has its missing values filled from
         ``regression_imputer``; whatever is still missing is interpolated.
      2. Every column except ``skips`` and ``target`` is min-max scaled.
      3. For each ``i`` in ``1..lags`` a copy of the scaled features shifted
         by ``i`` rows is added, with columns named ``<col>_lag_<i>0s``.
      4. Rows with any missing scaled/lagged value are dropped and the
         untouched ``skips``/``target`` columns are joined back in front.

    Args:
        data: Input frame; it is copied, not modified.
        target: Name of the target column (never scaled or lagged).
        imputes: Columns to impute by regression. Defaults to none.
        skips: Extra columns to pass through unscaled. Defaults to none.
        lags: Number of lagged copies to add.

    Returns:
        Frame with ``skips`` and ``target`` first, then the scaled features,
        then the lagged features.
    """
    imputes = [] if imputes is None else list(imputes)
    skips = [] if skips is None else list(skips)

    temp = data.copy()
    # impute
    for col in imputes:
        imp = regression_imputer(temp, col, drops=[*imputes, target])
        temp = temp.fillna(imp)
    temp = temp.interpolate()
    # normalize
    passthrough = [*skips, target]
    skipped = temp[passthrough]
    temp = temp.drop(passthrough, axis=1)
    low = temp.min()
    span = temp.max() - low
    # A constant column has zero range; dividing by it would turn the whole
    # column into NaN and the dropna below would then discard every row.
    # Dividing by 1 instead maps such a column to 0.
    span = span.where(span != 0, 1)
    temp = (temp - low) / span
    # lag features
    lagged = [temp.shift(i).add_suffix(f'_lag_{i}0s') for i in range(1, lags + 1)]
    # recombine
    if lagged:
        temp = temp.join(lagged, how='inner')
    temp = temp.dropna(axis=0)
    return skipped.join(temp, how='inner')

In [4]:
# Load the training CSV and index it by (stock_id, date_id, seconds_in_bucket),
# sorted so that each stock's rows can be selected with data.loc[stock_id].
data = (
    pd.read_csv('./.data/train.csv')
    .set_index(['stock_id', 'date_id', 'seconds_in_bucket'])
    .sort_index()
)

In [6]:
# Preprocess one stock at a time so that imputation, scaling and lagging are
# all done within a single stock's rows. The stock ids are read from the index
# instead of assuming exactly 200 consecutive ids starting at 0.
stock_ids = data.index.get_level_values('stock_id').unique()
preprocessed = []
for done, stock_id in enumerate(stock_ids):
    print(f'Preprocessing {100 * done / len(stock_ids):04.1f}% Complete', end='\r')
    stock_data = pd.DataFrame(data.loc[stock_id])
    stock_data = preprocess(stock_data, 'target', imputes=['far_price', 'near_price'], skips=['time_id', 'row_id'], lags=6)
    # Selecting by stock_id removed that index level; keep the id as a column.
    stock_data['stock_id'] = stock_id
    preprocessed.append(stock_data)
print('Preprocessing 100.0% Complete')

Preprocessing 100.0% Complete


In [7]:
# Force a garbage-collection pass; per the author's note the kernel crashes without it.
gc.collect() # kernel crashes otherwise
# presumably `pass` is only here so the cell does not echo gc.collect()'s return value — confirm
pass

In [8]:
# Stack the per-stock frames into one frame, then move the index levels back
# into ordinary columns so the result has a fresh positional index.
data = pd.concat(preprocessed)
data = data.reset_index()

In [9]:
# One-hot encode stock_id into one `stock_id_<id>` column per category.
enc = OneHotEncoder(handle_unknown='ignore')
enc_id = enc.fit_transform(np.array(data.stock_id).reshape(-1, 1))
# toarray() already has one column per fitted category, so no reshape to a
# hard-coded width is needed; the column count follows enc.categories_.
# Reusing data's index keeps the rows aligned for the join that follows.
encoded = pd.DataFrame(
    enc_id.toarray(),
    columns=[f'stock_id_{id_}' for id_ in enc.categories_[0]],
    index=data.index,
)

In [10]:
# Swap the raw stock_id column for its one-hot columns (aligned on the row
# index), then make row_id the index of the final frame.
data = (
    data.drop(columns='stock_id')
    .join(encoded, how='inner')
    .set_index('row_id')
)

In [11]:
# Bare expression so the notebook renders the final frame as the cell output.
data

Unnamed: 0_level_0,date_id,seconds_in_bucket,time_id,target,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,...,stock_id_190,stock_id_191,stock_id_192,stock_id_193,stock_id_194,stock_id_195,stock_id_196,stock_id_197,stock_id_198,stock_id_199
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
33_250_0,33,250,1840,21.890402,0.090838,0.0,0.335663,0.334303,0.044176,0.020911,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33_260_0,33,260,1841,36.779640,0.095902,0.0,0.288033,0.334422,0.043642,0.020141,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33_270_0,33,270,1842,31.839610,0.100920,0.0,0.270222,0.334546,0.043019,0.020009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33_280_0,33,280,1843,46.720505,0.117195,0.0,0.264247,0.345710,0.039841,0.017835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33_290_0,33,290,1844,18.559694,0.039685,1.0,0.299983,0.428781,0.046246,0.025922,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417_340_199,417,340,22969,-3.769994,0.015103,1.0,0.446547,0.056509,0.037899,0.016869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
417_360_199,417,360,22971,-10.169745,0.014564,1.0,0.446547,0.056753,0.037706,0.016511,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
417_420_199,417,420,22977,0.560284,0.013667,1.0,0.416613,0.057179,0.037367,0.015899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
417_440_199,417,440,22979,0.230074,0.013656,1.0,0.431601,0.057183,0.036932,0.015899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
