In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

/kaggle/input/big-data-analytics-certification/t2-2-y_train.csv
/kaggle/input/big-data-analytics-certification/t1-data1.csv
/kaggle/input/big-data-analytics-certification/t2-2-X_train.csv
/kaggle/input/big-data-analytics-certification/t2-1-sample_submission.csv
/kaggle/input/big-data-analytics-certification/t1-data2.csv
/kaggle/input/big-data-analytics-certification/t2-1-test.csv
/kaggle/input/big-data-analytics-certification/t2-2-X_test.csv
/kaggle/input/big-data-analytics-certification/t2-1-train.csv


In [3]:
# Load the training and test tables (train first, test second).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/big-data-analytics-certification/t2-1-train.csv',
        '/kaggle/input/big-data-analytics-certification/t2-1-test.csv',
    )
)

In [4]:
# Print the column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1490 non-null   int64  
 1   Age                  1490 non-null   int64  
 2   Employment Type      1490 non-null   object 
 3   GraduateOrNot        1490 non-null   object 
 4   AnnualIncome         1486 non-null   float64
 5   FamilyMembers        1490 non-null   int64  
 6   ChronicDiseases      1490 non-null   int64  
 7   FrequentFlyer        1490 non-null   object 
 8   EverTravelledAbroad  1490 non-null   object 
 9   TravelInsurance      1490 non-null   int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 116.5+ KB


In [5]:
# Show the distinct values of every training column, separated by a rule line.
for col in train:
    uniques = train[col].unique()
    print(f'{col} unique values :', uniques)
    print('-' * 100)

id unique values : [10000 10001 10002 ... 11487 11488 11489]
----------------------------------------------------------------------------------------------------
Age unique values : [28 31 29 33 30 26 25 27 34 32 35]
----------------------------------------------------------------------------------------------------
Employment Type unique values : ['Private Sector/Self Employed' 'Government Sector']
----------------------------------------------------------------------------------------------------
GraduateOrNot unique values : ['Yes' 'No']
----------------------------------------------------------------------------------------------------
AnnualIncome unique values : [1250000. 1200000.  650000.  800000.  550000. 1400000. 1150000. 1050000.
  400000. 1000000.      nan 1450000. 1100000. 1500000.  300000.  500000.
  600000. 1300000.  350000.  900000. 1350000. 1750000.  700000.  750000.
  950000.  850000. 1700000.  450000. 1800000. 1550000. 1650000.]
---------------------------------------

데이터 요약
- id : 이산형, 개별 데이터 id
- Age : 이산형
- Employment Type : 명목형 이진 데이터, 숫자 매핑 필요(0,1 매핑 or labelencoder)
- GraduateOrNot  : 명목형 이진 데이터, 숫자 매핑 필요(0,1 매핑 or labelencoder)
- AnnualIncome : 연속형. 수입 데이터는 대체로 정규분포를 따르지 않으므로 최소-최대 정규화를 적용할 예정이었으나, Shapiro-Wilk 검정 또는 Q-Q 플롯으로 먼저 확인해 볼 것. 확인 결과 정규분포를 따르므로 표준화와 정규화 모두 사용 가능
- FamilyMembers : 이산형
- ChronicDiseases : 이산형 이진 데이터
- FrequentFlyer : 명목형 이진 데이터, 숫자 매핑 필요(0,1 매핑 or labelencoder)
- EverTravelledAbroad : 명목형 이진 데이터, 숫자 매핑 필요(0,1 매핑 or labelencoder)
- TravelInsurance : 이산형 이진 데이터, 타겟 데이터

In [6]:
# Count missing values per column in the training frame.
train.isna().sum()

id                     0
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           4
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
TravelInsurance        0
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test.isna().sum()

id                     0
Age                    0
Employment Type        0
GraduateOrNot          0
AnnualIncome           3
FamilyMembers          0
ChronicDiseases        0
FrequentFlyer          0
EverTravelledAbroad    0
dtype: int64

In [8]:
# Impute missing AnnualIncome with the median (continuous feature).
# The median is computed on the training set only and reused for the test set.
median = train['AnnualIncome'].median()
train = train.fillna({'AnnualIncome': median})
test = test.fillna({'AnnualIncome': median})

In [9]:
# Label-encode the binary nominal columns (string -> 0/1).
from sklearn.preprocessing import LabelEncoder

# Each column gets its own encoder that is fitted on the TRAINING data only and
# then applied to the test data. Re-fitting on the test set (as was done before)
# only yields the same mapping when the test column happens to contain exactly
# the same set of labels; otherwise the same string would silently map to a
# different integer. With transform(), an unseen test label raises ValueError
# instead of being mis-encoded.
for col in ['Employment Type', 'GraduateOrNot', 'FrequentFlyer', 'EverTravelledAbroad']:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

In [10]:
# Min-max normalize the continuous AnnualIncome feature.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training data only ...
train['AnnualIncome'] = scaler.fit_transform(train[['AnnualIncome']]).ravel()
# ... and apply the SAME scaling to the test data. Previously the test column was
# left in raw units, so the model saw train and test incomes on different scales.
test['AnnualIncome'] = scaler.transform(test[['AnnualIncome']]).ravel()

In [11]:
# Preview the first five preprocessed training rows.
train.head(n=5)

Unnamed: 0,id,Age,Employment Type,GraduateOrNot,AnnualIncome,FamilyMembers,ChronicDiseases,FrequentFlyer,EverTravelledAbroad,TravelInsurance
0,10000,28,1,1,0.633333,6,1,0,0,0
1,10001,31,1,1,0.633333,7,1,0,0,0
2,10002,29,1,1,0.6,7,0,0,0,1
3,10003,33,0,1,0.233333,6,1,0,0,1
4,10004,28,1,1,0.333333,6,0,0,1,1


In [12]:
# Split the data: feature matrix X, target vector y, and the test ids.
X = train.drop(columns=['id', 'TravelInsurance']).to_numpy()
y = train.loc[:, 'TravelInsurance'].to_numpy()
# pop() removes 'id' from `test` in place, leaving only feature columns there.
test_id = test.pop('id')

In [13]:
# Create the model object
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Fix the seed so the cross-validation score and the submitted predictions are
# reproducible across kernel restarts (an unseeded random forest differs per run).
model = RandomForestClassifier(random_state=42)
# Alternative: soft-voting ensemble (disabled; kept for reference).
# rf = RandomForestClassifier()
# bg = BaggingClassifier()
# xgb = XGBClassifier()
# lgbm = LGBMClassifier(verbose=-1)
# catboost = CatBoostClassifier(verbose=False)

# estimators = [
#     ('rg', rf), ('bg', bg), ('xgb', xgb),
#     ('lgbm', lgbm), ('catboost', catboost)
# ]
# voting = VotingClassifier(estimators=estimators, voting='soft')

In [14]:
# Evaluation metric
from sklearn.metrics import roc_auc_score

In [15]:
# # Stratified K-fold cross-validation (disabled; kept for reference)
# from sklearn.model_selection import StratifiedKFold
# random_state = 42
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

# roc_auc_lst = []

# for train_idx, val_idx in skf.split(X, y):
#     X_train, X_val = X[train_idx], X[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]
    
#     voting.fit(X_train, y_train)
    
#     preds_proba = voting.predict_proba(X_val)[:, 1]
#     roc_auc = roc_auc_score(y_val, preds_proba)
#     roc_auc_lst.append(roc_auc)

# print('Mean roc auc', np.mean(roc_auc_lst))

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
# Cross-validated ROC AUC of the model (default CV splitting); show the mean score.
cv_result = cross_val_score(estimator=model, X=X, y=y, scoring='roc_auc')
cv_result.mean()

0.8045201085615593

In [18]:
# Fit on the full training set, then predict probabilities for the test set.
# voting.fit(X, y)
# preds_proba = voting.predict_proba(test)[:, 1]
model.fit(X, y)
# Select the test columns by name, in the same order as the training features,
# and pass a NumPy array just like X. Feeding the raw DataFrame to a model that
# was fitted on an array triggers scikit-learn's feature-name warning and relies
# on the test columns happening to be in the right order.
feature_cols = train.columns.drop(['id', 'TravelInsurance'])
# Column 1 of predict_proba is the probability of the positive class.
preds_proba = model.predict_proba(test[feature_cols].to_numpy())[:, 1]



In [19]:
# Build the submission table (id + predicted probability) and write it to disk.
submission = test_id.to_frame(name='id').assign(TravelInsurance=preds_proba)
submission.to_csv('submission.csv', index=False)