In [22]:
import os
import pandas as pd
import polars as pl
import dproc, sgml, sgutil

import joblib

In [2]:
# Directory holding the competition files, and the path of every file used below.
data_path = 'data'
files = {
    key: os.path.join(data_path, filename)
    for key, filename in [
        ('train', 'train.csv'),
        ('test', 'test.csv'),
        ('vars', 'vars.pkl'),
    ]
}

In [3]:
if (not os.path.isfile(files['train'])) and (not os.path.isfile(files['vars'])):
    !kaggle competitions download -c playground-series-s5e1
    if not os.path.exists(data_path):
        !mkdir data
    !unzip playground-series-s5e1.zip -d data
    !rm playground-series-s5e1.zip

In [4]:
# Scan each split lazily, summarise its columns with dproc.get_type_df, and
# merge the per-split summaries into a single table.
df_type = dproc.merge_type_df(
    [dproc.get_type_df(pl.scan_csv(files[split])) for split in ('train', 'test')]
)

In [5]:
# Build the dtype mapping later passed to pl.read_csv as `schema_overrides`;
# the second argument requests pl.Datetime for the 'date' column.
pl_type = dproc.get_type_pl(df_type, {'date': pl.Datetime})

In [21]:
# Display the merged per-column type summary for train and test.
df_type

Unnamed: 0_level_0,min,max,na,count,n_unique,dtype,f32,i32,i16,i8
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
country,,,0.0,328680.0,6.0,String,True,True,True,True
date,,,0.0,328680.0,1826.0,String,True,True,True,True
id,0.0,328679.0,0.0,328680.0,164340.0,Int64,True,True,False,False
num_sold,5.0,5939.0,8871.0,221259.0,4038.0,Float64,True,True,True,False
product,,,0.0,328680.0,5.0,String,True,True,True,True
store,,,0.0,328680.0,3.0,String,True,True,True,True


In [6]:
# Read both splits eagerly, applying the same dtype overrides to each.
df_train, df_test = (
    pl.read_csv(files[split], schema_overrides=pl_type)
    for split in ('train', 'test')
)

In [24]:
def pl_dt_select(x):
    """Select calendar features derived from the ``date`` column.

    Written as a named function rather than a lambda bound to a name, so it
    carries a real ``__name__`` in tracebacks and can be pickled by reference.

    Parameters
    ----------
    x
        A polars frame with a temporal ``date`` column; only its ``.select``
        method is used.

    Returns
    -------
    The result of ``x.select(...)`` with exactly four columns: ``year``,
    ``month``, ``day`` and ``weekday`` (polars' ISO weekday, Monday=1 … Sunday=7).
    """
    date = pl.col('date')
    return x.select(
        year=date.dt.year(),
        month=date.dt.month(),
        day=date.dt.day(),
        weekday=date.dt.weekday(),
    )

In [25]:
# Serialize the date-feature selector to 'test.joblib' in the working directory.
# NOTE(review): this looks like a scratch check that the selector can be
# serialized; the file is not read back in any visible cell — confirm whether
# this cell (and the file it leaves behind) is still needed.
joblib.dump(pl_dt_select, 'test.joblib')

['test.joblib']

In [20]:
# Apply the date-feature selector to the training frame under the 'dt' tag.
# The Korean labels read year / month / day / day-of-week; how
# dproc.apply_select uses them is not visible here (the displayed frame keeps
# the selector's own column names).
dproc.apply_select(
    df_train,
    [(pl_dt_select, ['연', '월', '일', '요일'])],
    'dt',
)

(shape: (230_130, 10)
 ┌────────┬──────────────┬───────────┬──────────────────────┬───┬──────┬───────┬─────┬─────────┐
 │ id     ┆ date         ┆ country   ┆ store                ┆ … ┆ year ┆ month ┆ day ┆ weekday │
 │ ---    ┆ ---          ┆ ---       ┆ ---                  ┆   ┆ ---  ┆ ---   ┆ --- ┆ ---     │
 │ i32    ┆ datetime[μs] ┆ cat       ┆ cat                  ┆   ┆ i32  ┆ i8    ┆ i8  ┆ i8      │
 ╞════════╪══════════════╪═══════════╪══════════════════════╪═══╪══════╪═══════╪═════╪═════════╡
 │ 0      ┆ 2010-01-01   ┆ Canada    ┆ Discount Stickers    ┆ … ┆ 2010 ┆ 1     ┆ 1   ┆ 5       │
 │        ┆ 00:00:00     ┆           ┆                      ┆   ┆      ┆       ┆     ┆         │
 │ 1      ┆ 2010-01-01   ┆ Canada    ┆ Discount Stickers    ┆ … ┆ 2010 ┆ 1     ┆ 1   ┆ 5       │
 │        ┆ 00:00:00     ┆           ┆                      ┆   ┆      ┆       ┆     ┆         │
 │ 2      ┆ 2010-01-01   ┆ Canada    ┆ Discount Stickers    ┆ … ┆ 2010 ┆ 1     ┆ 1   ┆ 5       │
 │      

In [23]:
# Display the typed training frame (polars truncates the rendering to the first and last rows).
df_train

id,date,country,store,product,num_sold
i32,datetime[μs],cat,cat,cat,f32
0,2010-01-01 00:00:00,"""Canada""","""Discount Stickers""","""Holographic Goose""",
1,2010-01-01 00:00:00,"""Canada""","""Discount Stickers""","""Kaggle""",973.0
2,2010-01-01 00:00:00,"""Canada""","""Discount Stickers""","""Kaggle Tiers""",906.0
3,2010-01-01 00:00:00,"""Canada""","""Discount Stickers""","""Kerneler""",423.0
4,2010-01-01 00:00:00,"""Canada""","""Discount Stickers""","""Kerneler Dark Mode""",491.0
…,…,…,…,…,…
230125,2016-12-31 00:00:00,"""Singapore""","""Premium Sticker Mart""","""Holographic Goose""",466.0
230126,2016-12-31 00:00:00,"""Singapore""","""Premium Sticker Mart""","""Kaggle""",2907.0
230127,2016-12-31 00:00:00,"""Singapore""","""Premium Sticker Mart""","""Kaggle Tiers""",2299.0
230128,2016-12-31 00:00:00,"""Singapore""","""Premium Sticker Mart""","""Kerneler""",1242.0
