# 0. data import 

In [28]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the session.
# NOTE: this also hides library deprecation notices.
warnings.filterwarnings("ignore")

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [29]:
from catboost.datasets import titanic

# The loader's return value is unpacked into two objects, used below as the
# training set and the test set.
train, test = titanic()

# 1. ColumnTransformer를 이용하지 않았을 경우 
### train 데이터 변환

In [30]:
# Target column name, kept as a list so it can be used directly for selection.
ycol = ["Survived"]
# Every remaining training column, in original order, is a candidate feature.
xcols = train.columns[~train.columns.isin(ycol)].tolist()

In [31]:
# Numeric feature names, taken from the dtypes of the training frame.
num_cols = list(train[xcols].select_dtypes(include=np.number).columns)
# Categorical features: whatever is left after removing numeric columns, the
# target, and the three free-text columns.
cat_cols = [
    name
    for name in xcols
    if name not in num_cols
    and name not in ycol
    and name not in ("Name", "Ticket", "Cabin")
]

- OneHot Encoding

In [32]:
# Dense one-hot encoder for the categorical features.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
cat_enc = OneHotEncoder(sparse_output=False)
# Learn the category vocabulary from the training data only.
cat_enc.fit(train[cat_cols])

In [33]:
# One-hot encode the categorical training columns with the already-fitted encoder.
train_cat_encoded = cat_enc.transform(train[cat_cols])

- Standard Scaling

In [34]:
# Scaler for the numeric features; its statistics are learned from the
# training columns only.
num_enc = StandardScaler()
num_enc.fit(train[num_cols])

In [35]:
# Standardize the numeric training columns with the fitted scaler.
train_num_encoded = num_enc.transform(train[num_cols])

In [36]:
# Place the one-hot block and the scaled numeric block side by side
# (categorical columns first).
train_encoded = np.hstack((train_cat_encoded, train_num_encoded))

### test 데이터 변환

In [37]:
# Reuse the transformers fitted on the training data; nothing is re-fitted here.
test_cat_encoded, test_num_encoded = (
    cat_enc.transform(test[cat_cols]),
    num_enc.transform(test[num_cols]),
)
# Same column layout as the training matrix: categorical block, then numeric block.
test_encoded = np.hstack((test_cat_encoded, test_num_encoded))

# 2. ColumnTransformer 활용 (pandas)

In [38]:
# Name of the label column, wrapped in a list.
ycol = ["Survived"]
# Feature candidates: all training columns except the label, order preserved.
xcols = list(filter(lambda name: name not in ycol, train.columns))

In [39]:
# Names of the numeric feature columns in the training frame.
num_cols = train[xcols].select_dtypes(include=np.number).columns.to_list()
# Remaining features are treated as categorical, minus the label and the
# free-text columns that are not encoded.
cat_cols = [
    name
    for name in xcols
    if not (
        name in num_cols
        or name in ycol
        or name in ("Name", "Ticket", "Cabin")
    )
]

In [40]:
# One transformer that routes columns by name: categorical features go to a
# dense one-hot encoder, numeric features to a standard scaler.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
trans = ColumnTransformer([
    ('cat_cols', OneHotEncoder(sparse_output=False), cat_cols),
    ('num_cols', StandardScaler(), num_cols)])

In [41]:
# Fit on the training features, then encode train and test with the same
# fitted transformer.
train_encoded = trans.fit(train[xcols]).transform(train[xcols])
test_encoded = trans.transform(test[xcols])

In [42]:
# Show the fitted transformer as the cell output.
trans

# 3. ColumnTransformer (array)

In [43]:
# Label column name as a one-element list.
ycol = ["Survived"]
# Keep every training column that is not the label, in the original order.
is_feature = ~train.columns.isin(ycol)
xcols = train.columns[is_feature].tolist()
del is_feature  # temporary mask only; do not leave it in the namespace

In [44]:
# Column roles are expressed as integer positions within `xcols`, because the
# data is passed to the transformer as a plain array.
# Roles are decided from the training frame, which is the data the transformer
# is fitted on, rather than from the test frame.
cat_cols = [
    i
    for i, col in enumerate(xcols)
    if train[col].dtype == "object" and col not in ("Name", "Ticket", "Cabin")
]
# Everything that is neither categorical nor a free-text column is numeric.
num_cols = [
    i
    for i, col in enumerate(xcols)
    if i not in cat_cols and col not in ("Name", "Ticket", "Cabin")
]

In [45]:
# Convert both feature frames to arrays, keeping the column order of `xcols`.
train_numpy, test_numpy = (frame[xcols].to_numpy() for frame in (train, test))

In [46]:
# Same routing as before, but `cat_cols` / `num_cols` are integer column
# positions here, so the transformer can work on array input.
# `sparse_output` replaces the `sparse` keyword, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
trans = ColumnTransformer([
    ('cat_cols', OneHotEncoder(sparse_output=False), cat_cols),
    ('num_cols', StandardScaler(), num_cols)])

In [47]:
# Fit on the training array, then encode both arrays with the fitted transformer.
train_encoded = trans.fit(train_numpy).transform(train_numpy)
test_encoded = trans.transform(test_numpy)

In [48]:
# Show the fitted transformer as the cell output.
trans

# 4. make_column_selector 활용 (pandas)

In [49]:
# Label column name as a one-element list.
ycol = ["Survived"]
# Drop the label and the free-text columns up front, so dtype-based selectors
# only ever see columns that should be encoded.
xcols = train.columns[
    ~train.columns.isin(ycol + ["Name", "Ticket", "Cabin"])
].tolist()

In [50]:
# Columns are routed by dtype at fit time: non-numeric columns are one-hot
# encoded, numeric columns are standardized.
trans = ColumnTransformer(
    transformers=[
        (
            'cat_cols',
            OneHotEncoder(sparse_output=False),
            make_column_selector(dtype_exclude=np.number),
        ),
        (
            'num_cols',
            StandardScaler(),
            make_column_selector(dtype_include=np.number),
        ),
    ]
)

In [51]:
# Fit on the training features, then encode train and test with the same
# fitted transformer.
train_encoded = trans.fit(train[xcols]).transform(train[xcols])
test_encoded = trans.transform(test[xcols])

In [52]:
# Show the fitted transformer as the cell output.
trans