### Loading Packages

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# from sklearn.preprocessing import Imputer    # 옛날 코드
from sklearn.impute import SimpleImputer   # Null Value 처리

from sklearn.preprocessing import PolynomialFeatures   # Numeric 피쳐를 곱해서 Feature Engineering을 편하게 하기위해
from sklearn.preprocessing import StandardScaler

# Selection: 너무많은 Feature를 생성 시에 다중 공선성 등 문제가 발생 -> 중요한 Feature를 선택
from sklearn.feature_selection import VarianceThreshold   # 변수의 Variance가 작다는 것은 정보를 적게 가지고 있다는 것, 이를 가지고 selection
from sklearn.feature_selection import SelectFromModel   # 모델마다 변수에 중요도를 매김, 이를 가지고 selection
# Selection 시 하나의 모델로만 변수를 선택하게 되면 모델별로 중요한 변수가 치우칠 수 있음, 여러 알고리즘을 가지고 해보는게 좋다. (트리, Linear, SVM 등)
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier

# Show up to 100 columns and 100 rows when a DataFrame is displayed.
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
)

### Column 설명
- 컬럼명 끝에 붙는 접미사(suffix)의 의미
    - bin = 바이너리(binary) 피처
    - cat = 카테고리(categorical) 피처
    - 접미사가 없는 컬럼 = continuous 또는 ordinal 피처
- Missing Value는 -1로 표기되어 있음
- Target 값은 0 또는 1

In [23]:
# Debug switch: when True, the row-limit cell caps how many rows are loaded.
DEBUG = True

In [29]:
# Debugging tip: with a very large data set, iteration is slow, so cap the
# number of rows that are read while DEBUG is on. None leaves the row count
# uncapped.
NROWS = 500000 if DEBUG else None

### EDA할 때 데이터가 너무 많으면 5만개~10만개, 100만개 정도로만 하자. PC사양에 따라 적절하게

In [32]:
# Read the competition files; NROWS caps the number of rows taken from each
# file (None means no cap).
train = pd.read_csv('/kaggle/input/porto-seguro-safe-driver-prediction/train.csv', nrows=NROWS)
test = pd.read_csv('/kaggle/input/porto-seguro-safe-driver-prediction/test.csv', nrows=NROWS)

# Besides capping rows with NROWS, a random fraction of the rows can be drawn
# like this. random_state pins the draw so that every rerun works on the same
# rows and the outputs in the following cells stay reproducible.
# NOTE(review): this 10% sampling runs regardless of DEBUG — confirm that is
# intended before doing a full-data run.
train = train.sample(frac=0.1, random_state=42)

In [None]:
# If class imbalance is a concern, sample with StratifiedKFold as sketched
# below instead of sampling purely at random.
# NOTE: recent scikit-learn releases raise an error when random_state is given
# without shuffle=True, hence shuffle=True in the sketch.
# NOTE(review): with n_splits=10, train_idx covers about 9/10 of the rows and
# val_idx about 1/10 — confirm which of the two is the intended sample.

# from sklearn.model_selection import StratifiedKFold
# fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=38)
# for train_idx, val_idx in fold.split(train, train['target']):
#     break
# train = train.iloc[train_idx]

In [33]:
# Preview the first rows of the sampled training frame.
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
216267,540294,0,1,1,3,0,0,1,0,0,0,0,0,0,0,0,7,0,0,1,0.2,0.1,-1.0,7,1,1,0,1,0,1,1,0,1,87,3,0.316228,0.68203,0.294449,3.162278,0.0,0.6,0.3,4,2,9,4,9,1,8,8,0,2,6,1,1,0,1,1,0
353894,884256,0,0,1,8,0,0,1,0,0,0,0,0,0,0,0,5,0,1,0,0.9,0.2,0.56734,7,1,0,0,1,11,1,1,2,1,7,3,0.374166,0.810379,0.35426,3.316625,0.6,0.7,0.6,0,1,7,2,8,4,8,8,2,5,7,0,1,1,0,1,1
420872,1052029,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,6,1,0,0,0.0,0.0,0.707107,7,1,-1,0,-1,11,1,0,2,1,76,3,0.316228,0.853045,0.379473,3.741657,0.8,0.1,0.2,3,2,6,1,9,1,7,5,1,6,5,0,0,0,0,1,0
54298,136243,0,4,1,2,0,0,0,0,0,1,0,0,0,0,0,10,1,0,0,0.9,0.0,1.173137,10,1,-1,0,1,11,1,1,0,1,3,2,0.424264,1.167271,0.434971,3.464102,0.1,0.0,0.1,1,2,10,1,11,0,6,11,2,0,7,0,0,0,0,0,1
230356,575778,0,0,1,2,0,0,1,0,0,0,0,0,0,0,0,7,1,0,0,0.9,0.2,0.54658,7,1,0,0,0,1,1,1,2,1,10,2,0.316228,0.737891,0.325576,3.162278,0.5,0.5,0.2,1,0,9,2,10,3,11,4,0,5,7,0,1,1,1,0,1


In [38]:
# Pick out only the categorical features among the columns.
# Categorical columns are marked with a "_cat" suffix, so match on the suffix
# rather than on any occurrence of "cat" inside the name.
cat_cols = [col for col in train.columns if col.endswith('_cat')]
cat_cols

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [67]:
# Look at the distribution of each categorical feature: whether it is integer
# coded and how many unique classes it has.
# ps_car_11_cat has 104 classes (high cardinality), so use an encoding other
# than one-hot for it.

for col in cat_cols:
    values = train[col]
    print(col, values.nunique(), values.unique())

ps_ind_02_cat 5 [ 1  3  2  4 -1]
ps_ind_04_cat 3 [ 0  1 -1]
ps_ind_05_cat 8 [ 0  3  6  4  1 -1  2  5]
ps_car_01_cat 13 [ 7 10 11  4  9  6  0  5  2  8  3  1 -1]
ps_car_02_cat 2 [1 0]
ps_car_03_cat 3 [ 1  0 -1]
ps_car_04_cat 10 [0 4 1 8 2 9 6 3 7 5]
ps_car_05_cat 3 [ 1 -1  0]
ps_car_06_cat 18 [ 0 11  1 14  3 10 16 17  7  4 15  6 13  2  9  8 12  5]
ps_car_07_cat 3 [ 1  0 -1]
ps_car_08_cat 2 [1 0]
ps_car_09_cat 6 [ 0  2  1  3  4 -1]
ps_car_10_cat 3 [1 0 2]
ps_car_11_cat 104 [ 87   7  76   3  10  85  31  34  22  29  99 104  53  80  44  45  82  74
  16  83  36  64  38  88  18  32  46  78 103  90 101  68  39  72  66  12
  30  98   5  25   1   2  49  37  14  52  77  97  42 100  19  20  48  51
  62  84  65  75  56  95  11  91  86  27  23  94  28  93  17 102  41  67
   4  26  60  47   6  89  57  55  40  92  24  63  61  58  33  15  70  35
  43  21  96  59  69  79   9   8  50  73  81  54  71  13]


In [68]:
# Check the training data for duplicated rows.
# drop_duplicates() returns a new frame and leaves train untouched, so the
# result must be assigned back; otherwise the shape shown below is that of the
# unmodified input and says nothing about duplicates.
# NOTE(review): every column, including id, takes part in the comparison —
# confirm whether id should be excluded when looking for duplicates.

train = train.drop_duplicates()
train.shape

(50000, 59)

In [69]:
# Check the column info: all columns are int or float; there are no object
# (i.e. string) columns.
# Null counts are listed as well; none show up here because missing values
# have already been replaced with -1.

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 216267 to 319875
Data columns (total 59 columns):
id                50000 non-null int64
target            50000 non-null int64
ps_ind_01         50000 non-null int64
ps_ind_02_cat     50000 non-null int64
ps_ind_03         50000 non-null int64
ps_ind_04_cat     50000 non-null int64
ps_ind_05_cat     50000 non-null int64
ps_ind_06_bin     50000 non-null int64
ps_ind_07_bin     50000 non-null int64
ps_ind_08_bin     50000 non-null int64
ps_ind_09_bin     50000 non-null int64
ps_ind_10_bin     50000 non-null int64
ps_ind_11_bin     50000 non-null int64
ps_ind_12_bin     50000 non-null int64
ps_ind_13_bin     50000 non-null int64
ps_ind_14         50000 non-null int64
ps_ind_15         50000 non-null int64
ps_ind_16_bin     50000 non-null int64
ps_ind_17_bin     50000 non-null int64
ps_ind_18_bin     50000 non-null int64
ps_reg_01         50000 non-null float64
ps_reg_02         50000 non-null float64
ps_reg_03         50000