In [1]:
import os
import polars as pl
import pandas as pd
import numpy as np
from functools import partial
import dproc

In [2]:
data_path = 'data'
if not os.path.isdir(data_path):
    os.mkdir(data_path)
files = {
    k: os.path.join(p, f)
    for k, p, f in [
        ('train', data_path, 'train.csv'),
        ('test', data_path, 'test.csv'),
        ('org_train', data_path, 'train_org.csv'),
        ('org_test', data_path, 'test_org.csv'),
        ('train_pkl', data_path, 'train.pkl'),
        ('org_pkl', data_path, 'org.pkl'),
        ('test_pkl', data_path, 'test.pkl'),
        ('var_pkl', data_path, 'var.pkl')
    ]
}

if not os.path.isfile(files['train']):
    !kaggle competitions download -c playground-series-s4e7
    !mkdir data
    !unzip playground-series-s4e7.zip -d data
    !rm playground-series-s4e7.zip
    !kaggle datasets download -d annantkumarsingh/health-insurance-cross-sell-prediction-data
    !unzip health-insurance-cross-sell-prediction-data.zip
    !mv train.csv data/train_org.csv
    !mv test.csv data/test_org.csv
    !rm health-insurance-cross-sell-prediction-data.zip

In [3]:
from io import StringIO
# Data dictionary of the competition columns as (name, type, Description) records.
# Explicit records avoid depending on literal tab characters inside a string and
# avoid binding the built-in name `vars` at notebook scope.
# NOTE(review): the target row is named 'Response (Dependent Feature)'; this
# presumably does not line up with a column called 'Response' when the table is
# later joined on `name` — confirm whether that is intended.
var_records = [
    ('Age', '(continous)', 'Age of the Customer.'),
    ('Gender', '(dichotomous)', 'Gender of the Customer.'),
    ('Driving_License', '(dichotomous)', '0 for customer not having DL, 1 for customer having DL.'),
    ('Region_Code', '(nominal)', 'Unique code for the region of the customer.'),
    ('Previously_Insured', '(dichotomous)', '0 for customer not having vehicle insurance, 1 for customer having vehicle insurance.'),
    ('Vehicle_Age', '(nominal)', 'Age of the vehicle.'),
    ('Vehicle_Damage', '(dichotomous)', "Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past."),
    ('Annual_Premium', '(continous)', 'The amount customer needs to pay as premium in the year.'),
    ('Policy_Sales_Channel', '(nominal)', 'Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.'),
    ('Vintage', '(continous)', 'Number of Days, Customer has been associated with the company.'),
    ('Response (Dependent Feature)', '(dichotomous)', '1 for Customer is interested, 0 for Customer is not interested.'),
]
df_var = pd.DataFrame(var_records, columns=['name', 'type', 'Description'])
df_var.head()

Unnamed: 0,name,type,Description
0,Age,(continous),Age of the Customer.
1,Gender,(dichotomous),Gender of the Customer.
2,Driving_License,(dichotomous),"0 for customer not having DL, 1 for customer h..."
3,Region_Code,(nominal),Unique code for the region of the customer.
4,Previously_Insured,(dichotomous),"0 for customer not having vehicle insurance, 1..."


In [4]:
# Scan each raw CSV lazily, run it through dproc.get_type_df, and pass the
# per-file results to dproc.merge_type_df to obtain a single type table.
csv_keys = ['train', 'test', 'org_train', 'org_test']
df_type = dproc.merge_type_df(
    [dproc.get_type_df(pl.scan_csv(files[key])) for key in csv_keys]
)

Cast Region_Code, Policy_Sales_Channel, and Previously_Insured to categorical.

In [5]:
# Columns that are forced to a categorical polars dtype when the CSVs are read.
categorical_overrides = {
    'Region_Code': pl.Categorical,
    'Policy_Sales_Channel': pl.Categorical,
    'Previously_Insured': pl.Categorical,
}
pl_type = dproc.get_type_pl(df_type, categorical_overrides)

# String form of the chosen dtypes, named 'dtype' so it joins as a column.
s_type = pd.Series(pl_type).astype(str).rename('dtype')

# Variable catalogue: descriptions indexed by name, plus the type statistics
# from df_type (without its own 'dtype' column) and the chosen dtype.
df_var = (
    df_var
    .set_index('name')
    .rename(columns={'type': 'var_type'})
    .join(df_type.drop(columns='dtype'))
    .join(s_type)
)
df_var

Unnamed: 0_level_0,var_type,Description,min,max,na,count,n_unique,f32,i32,i16,i8,dtype
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Age,(continous),Age of the Customer.,20.0,85.0,0.0,19682810.0,66.0,True,True,True,True,Int8
Gender,(dichotomous),Gender of the Customer.,,,0.0,19682810.0,2.0,True,True,True,True,Categorical
Driving_License,(dichotomous),"0 for customer not having DL, 1 for customer h...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Int8
Region_Code,(nominal),Unique code for the region of the customer.,0.0,52.0,0.0,19682810.0,53.25,True,True,True,True,Categorical
Previously_Insured,(dichotomous),"0 for customer not having vehicle insurance, 1...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Categorical
Vehicle_Age,(nominal),Age of the vehicle.,,,0.0,19682810.0,3.0,True,True,True,True,Categorical
Vehicle_Damage,(dichotomous),Customer got his/her vehicle damaged in the pa...,,,0.0,19682810.0,2.0,True,True,True,True,Categorical
Annual_Premium,(continous),The amount customer needs to pay as premium in...,2630.0,540165.0,0.0,19682810.0,46734.25,True,True,False,False,Float32
Policy_Sales_Channel,(nominal),Anonymized Code for the channel of outreaching...,1.0,163.0,0.0,19682810.0,150.75,True,True,True,False,Categorical
Vintage,(continous),"Number of Days, Customer has been associated w...",10.0,299.0,0.0,19682810.0,290.0,True,True,True,False,Int16


In [6]:
# Read the competition training set with the dtypes chosen in pl_type. The path
# is taken from `files` so it cannot drift from data_path.
dfl_train = pl.read_csv(files['train'], dtypes = pl_type)

In [7]:
# Transform steps collected here are later run through dproc.apply_procs.
dprocs = []

# First value returned by mode() for Annual_Premium in the training data.
mode_AnnualPremium = dfl_train['Annual_Premium'].mode()[0]
# Mean of Annual_Premium over the rows whose premium differs from that mode.
mean_Annual_Premium_without_mode = (
    dfl_train
    .filter(pl.col('Annual_Premium') != mode_AnnualPremium)
    ['Annual_Premium']
    .mean()
)

# Int8 flag expression: 1 where Annual_Premium equals the training mode, else 0.
is_mode_expr = (
    pl.when(pl.col('Annual_Premium') == mode_AnnualPremium)
    .then(1)
    .otherwise(0)
    .cast(pl.Int8)
)
dprocs.append(
    partial(
        dproc.with_columns_opr,
        proc_list=[
            ('is_mode', 'is_Annual_Premium_mode', is_mode_expr, 'Annual_Premium의 최빈값 여부'),
        ],
    )
)

In [8]:
def repl_nearest(cat_vals, x):
    """Return the position of the entry of ``cat_vals`` that is closest to ``x``.

    Parameters
    ----------
    cat_vals : array-like of numbers
        Candidate values. Any array-like is accepted (ndarray, list, tuple,
        pandas Index); it is converted with ``np.asarray`` so that plain Python
        sequences work as well.
    x : number
        Value to match.

    Returns
    -------
    numpy integer
        Index into ``cat_vals`` of the smallest absolute difference to ``x``.
        When several entries are equally close, the first one wins
        (``np.argmin`` semantics).

    Raises
    ------
    ValueError
        If ``cat_vals`` is empty (raised by ``np.argmin``).
    """
    distances = np.abs(np.asarray(cat_vals) - x)
    return np.argmin(distances)

In [9]:
# Run the registered polars transforms (df_var is passed through and returned),
# then continue in pandas with `id` as the index.
dfl_train, df_var = dproc.apply_procs(dfl_train, dprocs, df_var)
df_train = dfl_train.to_pandas().set_index('id')
del dfl_train

# --- Annual_Premium_c: one category per distinct premium value ---------------
df_train['Annual_Premium_c'] = df_train['Annual_Premium'].astype('category')
# Cumulative share of rows, walking the premium values from most to least frequent.
s_cat_count = (
    df_train['Annual_Premium_c']
    .value_counts(normalize=True)
    .sort_values(ascending=False)
    .cumsum()
)
premium_levels = df_train['Annual_Premium_c'].cat.categories.to_series()
within_99 = s_cat_count.loc[s_cat_count <= 0.99].index
beyond_99 = s_cat_count.loc[s_cat_count > 0.99].index
# repl: levels outside the first 99% of cumulative frequency (to be replaced).
repl = premium_levels.loc[~premium_levels.isin(within_99)].values
# cat_vals: levels that are not in the >99% tail (the ones that are kept).
cat_vals = premium_levels.loc[~premium_levels.isin(beyond_99)].values
# Map every replaced level to the kept level with the smallest absolute difference.
nearest_kept = {rare: cat_vals[repl_nearest(cat_vals, rare)] for rare in repl}
df_train['Annual_Premium_c'] = dproc.replace_cat(
    df_train['Annual_Premium_c'].cat.as_ordered(),
    nearest_kept,
)

# --- Age / Region_Code / Vintage ----------------------------------------------
df_train['Age_c'] = df_train['Age'].astype('category').cat.as_ordered()
# Category labels are converted with float() through dproc.replace_cat.
df_train['Region_Code'] = dproc.replace_cat(df_train['Region_Code'], lambda x: float(x))
df_train['Vintage_c'] = df_train['Vintage'].astype('category').cat.as_ordered()

# --- Policy_Sales_Channel: float labels, then marked as ordered ---------------
df_train['Policy_Sales_Channel'] = dproc.replace_cat(df_train['Policy_Sales_Channel'], lambda x: float(x))
df_train['Policy_Sales_Channel'] = df_train['Policy_Sales_Channel'].cat.as_ordered()

# Append the three derived columns to the variable catalogue.
df_var = pd.concat(
    [
        df_var,
        dproc.get_type_vars([
            ('categorize', df_train['Annual_Premium_c'], 'Annual_Premium을 범주화', 'Categorical'),
            ('categorize', df_train['Age_c'], 'Age를 범주화', 'Categorical'),
            ('categorize', df_train['Vintage_c'], 'Vintange를 범주화', 'Categorical'),
        ]),
    ],
    axis=0,
)

# Training-time pandas dtype of every categorical column, reused later to put
# other frames on the same categories.
cat_cols = ['Annual_Premium_c', 'Age_c', 'Vintage_c'] + df_var.loc[df_var['dtype'] == 'Categorical'].index.tolist()
cat_dtypes = {col: df_train[col].dtype for col in cat_cols}

In [10]:
# (new column name, list of source columns) for every combined categorical
# feature. Each spec lists the source columns separated by single spaces and is
# split into a list when comb_cat_list is built.
comb_cat_list = [
    (comb_name, member_cols.split())
    for comb_name, member_cols in (
        # two-column combinations
        ('VA', 'Vehicle_Damage Annual_Premium_c'),
        ('VAge', 'Vehicle_Damage Age_c'),
        ('PA', 'Previously_Insured Annual_Premium_c'),
        ('VV', 'Vehicle_Damage Vintage_c'),
        ('PAge', 'Previously_Insured Age_c'),
        ('VP', 'Vehicle_Damage Policy_Sales_Channel'),
        ('PVc', 'Previously_Insured Vintage_c'),
        ('PP', 'Previously_Insured Policy_Sales_Channel'),
        ('RV', 'Region_Code Vehicle_Damage'),
        ('V2V', 'Vehicle_Age Vehicle_Damage'),
        ('RP', 'Region_Code Previously_Insured'),
        ('PV2', 'Previously_Insured Vehicle_Age'),
        ('PV', 'Previously_Insured Vehicle_Damage'),
        ('GV2', 'Gender Vehicle_Damage'),
        ('GP', 'Gender Previously_Insured'),
        ('AA', 'Annual_Premium_c Age_c'),
        ('AV', 'Annual_Premium_c Vintage_c'),
        # four- and six-column combinations
        ('VPAV', 'Vehicle_Damage Previously_Insured Age_c Vintage_c'),
        ('VPPV', 'Vehicle_Damage Previously_Insured Policy_Sales_Channel Vintage_c'),
        ('VPPA', 'Vehicle_Damage Previously_Insured Policy_Sales_Channel Age_c'),
        ('VPVGAP', 'Vehicle_Damage Previously_Insured Vehicle_Age Gender Age_c Policy_Sales_Channel'),
        ('VPVGAV', 'Vehicle_Damage Previously_Insured Vehicle_Age Gender Age_c Vintage_c'),
        ('VPVGAR', 'Vehicle_Damage Previously_Insured Vehicle_Age Gender Age_c Region_Code'),
        ('VPRV', 'Vehicle_Damage Previously_Insured Region_Code Vintage_c'),
        ('VPVA', 'Vehicle_Damage Previously_Insured Vehicle_Age Annual_Premium_c'),
        ('VPPAc', 'Vehicle_Damage Previously_Insured Policy_Sales_Channel Annual_Premium_c'),
        ('VPRAc', 'Vehicle_Damage Previously_Insured Region_Code Annual_Premium_c'),
    )
]
# Create one combined column per comb_cat_list entry via dproc.combine_cat
# (called with '_' as its second argument) and collect a catalogue record for each.
# The loop variables are named so that the built-in `vars` is not shadowed.
comb_var_list = []
for comb_name, comb_cols in comb_cat_list:
    df_train[comb_name] = dproc.combine_cat(df_train[comb_cols], '_')
    comb_var_list.append(
        ('combine_categories', df_train[comb_name], ', '.join(comb_cols) + ' were combined', 'Categorical')
    )

# Append the combined columns to the variable catalogue.
df_var = pd.concat([
    df_var,
    dproc.get_type_vars(comb_var_list)
], axis=0)

# Persist the processed training frame, then drop the name from the namespace.
df_train.to_pickle(files['train_pkl'])
del df_train

In [11]:
# Preprocess the competition test set so that its columns and category dtypes
# line up with the pickled training frame. The path comes from `files`.
dfl_test = pl.read_csv(files['test'], dtypes = pl_type)
# The training frame and the original dataset are both passed through `dprocs`;
# do the same here so the derived columns (e.g. is_Annual_Premium_mode) exist in
# the test frame as well.
dfl_test, _ = dproc.apply_procs(dfl_test, dprocs)
df_test = dfl_test.to_pandas().set_index('id')
del dfl_test

# Categorised numeric columns are re-expressed on the training dtypes stored in
# cat_dtypes. repl_nearest is handed to dproc.rearrange_cat — presumably to map
# values unseen in training onto the closest training level (confirm in dproc).
df_test['Annual_Premium_c'] = df_test['Annual_Premium'].astype('category')
df_test['Annual_Premium_c'] = dproc.rearrange_cat(df_test['Annual_Premium_c'].cat.as_ordered(), cat_dtypes['Annual_Premium_c'], repl_nearest)
df_test['Age_c'] = df_test['Age'].astype('category')
df_test['Age_c'] = dproc.rearrange_cat(df_test['Age_c'].cat.as_ordered(), cat_dtypes['Age_c'], repl_nearest)
df_test['Region_Code'] = dproc.replace_cat(df_test['Region_Code'], lambda x: float(x))
df_test['Region_Code'] = dproc.rearrange_cat(df_test['Region_Code'], cat_dtypes['Region_Code'], repl_nearest)

# Vintage is cast directly to the training categorical dtype.
df_test['Vintage_c'] = df_test['Vintage'].astype(cat_dtypes['Vintage_c'])

df_test['Policy_Sales_Channel'] = dproc.replace_cat(df_test['Policy_Sales_Channel'], lambda x: float(x))
df_test['Policy_Sales_Channel'] = dproc.rearrange_cat(df_test['Policy_Sales_Channel'].cat.as_ordered(), cat_dtypes['Policy_Sales_Channel'], repl_nearest)

# Remaining categoricals take the training dtype as-is.
for i in ['Previously_Insured', 'Gender', 'Vehicle_Damage', 'Vehicle_Age']:
    df_test[i] = df_test[i].astype(cat_dtypes[i])
# Same combined columns as for the training frame; loop variables are named so
# that the built-in `vars` is not shadowed.
for comb_name, comb_cols in comb_cat_list:
    df_test[comb_name] = dproc.combine_cat(df_test[comb_cols], '_')

# Persist the processed test frame, then drop the name from the namespace.
df_test.to_pickle(files['test_pkl'])
del df_test

In [12]:
def _load_org_frame(path, is_train):
    """Load one CSV of the original dataset as a pandas frame ready for stacking.

    The file is read with the shared ``pl_type`` dtypes and passed through
    ``dprocs``. Its ``id`` column is negated before becoming the index, and an
    ``is_train`` column records which original file the rows came from.

    Parameters
    ----------
    path : str
        CSV path, normally an entry of ``files``.
    is_train : bool
        Value stored in the ``is_train`` column.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the negated ``id``.
    """
    dfl_org, _ = dproc.apply_procs(pl.read_csv(path, dtypes=pl_type), dprocs)
    return (
        dfl_org
        .with_columns(id=-pl.col('id'))
        .to_pandas()
        .set_index('id')
        .assign(is_train=is_train)
    )


# Stack the original train and test files (paths taken from `files`) and store
# them together in a single pickle.
pd.concat([
    _load_org_frame(files['org_train'], True),
    _load_org_frame(files['org_test'], False),
], axis=0).to_pickle(files['org_pkl'])

In [13]:
# Drop fully identical rows from the variable catalogue, then pickle the result.
var_catalogue = df_var.drop_duplicates()
var_catalogue.to_pickle(files['var_pkl'])

In [14]:
# Show the variable catalogue as this cell's output.
df_var

Unnamed: 0,var_type,Description,min,max,na,count,n_unique,f32,i32,i16,i8,dtype,src
Age,(continous),Age of the Customer.,20.0,85.0,0.0,19682810.0,66.0,True,True,True,True,Int8,
Gender,(dichotomous),Gender of the Customer.,,,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Driving_License,(dichotomous),"0 for customer not having DL, 1 for customer h...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Int8,
Region_Code,(nominal),Unique code for the region of the customer.,0.0,52.0,0.0,19682810.0,53.25,True,True,True,True,Categorical,
Previously_Insured,(dichotomous),"0 for customer not having vehicle insurance, 1...",0.0,1.0,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Vehicle_Age,(nominal),Age of the vehicle.,,,0.0,19682810.0,3.0,True,True,True,True,Categorical,
Vehicle_Damage,(dichotomous),Customer got his/her vehicle damaged in the pa...,,,0.0,19682810.0,2.0,True,True,True,True,Categorical,
Annual_Premium,(continous),The amount customer needs to pay as premium in...,2630.0,540165.0,0.0,19682810.0,46734.25,True,True,False,False,Float32,
Policy_Sales_Channel,(nominal),Anonymized Code for the channel of outreaching...,1.0,163.0,0.0,19682810.0,150.75,True,True,True,False,Categorical,
Vintage,(continous),"Number of Days, Customer has been associated w...",10.0,299.0,0.0,19682810.0,290.0,True,True,True,False,Int16,
