In [70]:
import os
import polars as pl
import pandas as pd
from functools import partial
import dproc

In [12]:
# Directory that holds every raw CSV and every derived pickle used below.
data_path = 'data'
os.makedirs(data_path, exist_ok=True)

# Logical name -> file name; all of them live directly under data_path.
_file_names = {
    'train': 'train.csv',
    'test': 'test.csv',
    'org_train': 'train_org.csv',
    'org_test': 'test_org.csv',
    'train_pkl': 'train.pkl',
    'org_pkl': 'org.pkl',
    'test_pkl': 'test.pkl',
    'var_pkl': 'var.pkl',
}

# Registry of full paths, keyed by logical name.
files = {key: os.path.join(data_path, name) for key, name in _file_names.items()}

if not os.path.isfile(files['train']):
    !kaggle competitions download -c playground-series-s4e7
    !mkdir data
    !unzip playground-series-s4e7.zip -d data
    !rm playground-series-s4e7.zip
    !kaggle datasets download -d annantkumarsingh/health-insurance-cross-sell-prediction-data
    !unzip health-insurance-cross-sell-prediction-data.zip
    !mv train.csv data/train_org.csv
    !mv test.csv data/test_org.csv
    !rm health-insurance-cross-sell-prediction-data.zip

In [45]:
# Collect a per-file type summary from each CSV (scanned lazily) and merge the
# four summaries into a single table.
csv_keys = ['train', 'test', 'org_train', 'org_test']
per_file_types = [dproc.get_type_df(pl.scan_csv(files[key])) for key in csv_keys]
df_type = dproc.merge_type_df(per_file_types)

Cast `Region_Code` and `Policy_Sales_Channel` to the categorical type.

In [16]:
# Columns whose polars dtype is forced to Categorical instead of whatever
# dproc.get_type_pl would pick from df_type.
categorical_overrides = {
    'Region_Code': pl.Categorical,
    'Policy_Sales_Channel': pl.Categorical,
}
pl_type = dproc.get_type_pl(df_type, categorical_overrides)

In [79]:
# String form of each column's chosen polars dtype, as a Series named 'type'.
s_type = pd.Series(pl_type, name='type').astype(str)

# Reduce df_type to the variable table: unique counts, a source tag ('org' marks
# columns that come straight from the raw files), an empty description, and the
# dtype name joined on the index.
df_type = (
    df_type[['n_unique']]
    .assign(src='org', Description='')
    .join(s_type)
)
df_type

Unnamed: 0_level_0,n_unique,src,Description,type
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Age,66.0,org,,Int8
Annual_Premium,46734.25,org,,Float32
Driving_License,2.0,org,,Int8
Gender,2.0,org,,Categorical
Policy_Sales_Channel,150.75,org,,Categorical
Previously_Insured,2.0,org,,Int8
Region_Code,53.25,org,,Categorical
Response,2.0,org,,Int8
Vehicle_Age,3.0,org,,Categorical
Vehicle_Damage,2.0,org,,Categorical


In [52]:
# Load the competition training set, passing pl_type as the column dtypes.
# The path is taken from the shared `files` registry instead of a literal so it
# stays in sync with data_path.
dfl_train = pl.read_csv(files['train'], dtypes=pl_type)

In [83]:
# Statistics of Annual_Premium, taken from the training frame: its most frequent
# value (first entry returned by mode()) and the mean of all other values.
mode_AnnualPremium = dfl_train['Annual_Premium'].mode()[0]
mean_Annual_Premium_without_mode = (
    dfl_train
    .filter(pl.col('Annual_Premium') != mode_AnnualPremium)
    ['Annual_Premium']
    .mean()
)

# Row-wise flag: does Annual_Premium equal the mode computed above?
is_premium_mode = pl.col('Annual_Premium') == mode_AnnualPremium

# Each entry is (operation tag, new column name, expression, description).
# The descriptions are Korean runtime strings and are kept verbatim; in English:
#   1. "whether Annual_Premium equals its mode"
#   2. "log of Annual_Premium (the mode is replaced by the mean)"
annual_premium_ops = [
    (
        'is_mode',
        'is_Annual_Premium_mode',
        pl.when(is_premium_mode).then(1).otherwise(0).cast(pl.Int8),
        'Annual_Premium의 최빈값 여부',
    ),
    (
        'log_transform',
        'log_Annual_Premium',
        # The bare string given to otherwise() is a column reference, not a literal.
        pl.when(is_premium_mode)
        .then(mean_Annual_Premium_without_mode)
        .otherwise('Annual_Premium')
        .log(),
        'Annual_Premium의 로그(최빈값은 평균으로 대체합니다.)',
    ),
]

# Processor pipeline applied to every dataset below; currently a single step.
dprocs = [partial(dproc.with_columns_opr, proc_list=annual_premium_ops)]

In [85]:
# Apply the processors to the training frame; the second return value is
# rebound to df_type.
dfl_train, df_type = dproc.apply_procs(dfl_train, dprocs, df_type)

# Convert to pandas, release the polars frame, then index by id and persist.
df_train = dfl_train.to_pandas()
del dfl_train
df_train = df_train.set_index('id')
df_train.to_pickle(files['train_pkl'])
del df_train

In [87]:
# Load the competition test set with the same dtypes as the training set and
# run the same processors over it. The path comes from the shared `files`
# registry rather than a literal; the second return value of apply_procs is
# not needed here and is discarded.
dfl_test = pl.read_csv(files['test'], dtypes=pl_type)
dfl_test, _ = dproc.apply_procs(dfl_test, dprocs)

In [23]:
# Convert the processed test frame to pandas, release the polars frame, then
# index by id and persist.
df_test = dfl_test.to_pandas()
del dfl_test
df_test = df_test.set_index('id')
df_test.to_pickle(files['test_pkl'])
del df_test

In [88]:
def _load_org_split(path, is_train):
    """Read one CSV of the original dataset and return it as a pandas frame.

    The file is read with the dtypes in ``pl_type`` and passed through the
    processors in ``dprocs``. The ``id`` column is negated before it becomes the
    index (presumably so these rows cannot share an id with the competition
    rows -- TODO confirm), and an ``is_train`` column records which split the
    rows came from.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    is_train : bool
        Value stored in the ``is_train`` column for every row.

    Returns
    -------
    pandas.DataFrame
        Processed rows indexed by the negated ``id``.
    """
    frame, _ = dproc.apply_procs(pl.read_csv(path, dtypes=pl_type), dprocs)
    return (
        frame
        .with_columns(id=-pl.col('id'))
        .to_pandas()
        .set_index('id')
        .assign(is_train=is_train)
    )


# Stack both splits of the original dataset and persist them as one pickle.
# Paths come from the shared `files` registry rather than string literals.
pd.concat([
    _load_org_split(files['org_train'], True),
    _load_org_split(files['org_test'], False),
], axis=0).to_pickle(files['org_pkl'])

In [89]:
# Persist the variable table next to the data pickles.
var_pkl_path = files['var_pkl']
df_type.to_pickle(var_pkl_path)