In [31]:
import pandas as pd
pd.set_option('display.max_columns',None)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings
import datawig
from category_encoders.ordinal import OrdinalEncoder

# Data load

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

### 결측치 처리

In [3]:
train.fillna('NaN', inplace=True) 
test.fillna('NaN', inplace=True)

## 데이터 전처리 / 파생변수 생성

&nbsp;

#### binary class 

In [4]:
# binary transform (0, 1)

train.gender = train.gender.replace(['F','M'],[0,1])  
test.gender = test.gender.replace(['F','M'],[0,1])

train.car = train.car.replace(['N','Y'],[0,1])  
test.car = test.car.replace(['N','Y'],[0,1])

train.reality = train.reality.replace(['N','Y'],[0,1])  
test.reality = test.reality.replace(['N','Y'],[0,1])

&nbsp;

#### days 변수 

양수변환

In [5]:
# 음수 -> 양수 변환

train.DAYS_BIRTH = -train.DAYS_BIRTH
test.DAYS_BIRTH = -test.DAYS_BIRTH

train.DAYS_EMPLOYED = -train.DAYS_EMPLOYED
test.DAYS_EMPLOYED = -test.DAYS_EMPLOYED

train.begin_month = -train.begin_month
test.begin_month = -test.begin_month

이상치 처리

In [6]:
# 365243 인 값 0으로 바꾸기

train.DAYS_EMPLOYED = train.DAYS_EMPLOYED.replace([-365243],[0])
test.DAYS_EMPLOYED = test.DAYS_EMPLOYED.replace([-365243],[0])

나이, 일한 기간 변수로 변환

In [7]:
# 360으로 나눠서 나이, 일한 년수 계산

train['EMPLOYED']= train.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
train['age']= train.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
train['begin_month']= train.begin_month.apply(lambda x: abs(x) ).astype(int)

test['EMPLOYED']= test.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
test['age']= test.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
test['begin_month']= test.begin_month.apply(lambda x: abs(x) ).astype(int)

&nbsp;

#### child num, family size

In [8]:
# child_num이 14, 19인 행 지우기
idx_child_drop = train[(train['child_num'] == 14)|(train['child_num'] == 19)].index
train = train.drop(idx_child_drop)

In [9]:
# child_num이 family_size보다 큰 행도 지우기

idx_child_drop2 = train[train['family_size'] < train['child_num']].index
train = train.drop(idx_child_drop2)

**child_num, faimily size PCA 변수 생성**

In [12]:
train_pca = train[['child_num', 'family_size']]
#train_pca_target = train['credit']       <-  이거는 혹시 왜 필요한가요?

test_pca = test[['child_num', 'family_size']]
#test_pca_target = test['credit']

In [13]:
train_pca = StandardScaler().fit_transform(train_pca)
test_pca  = StandardScaler().fit_transform(test_pca)

In [14]:
from sklearn.decomposition import PCA
pca_train = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_train = pca_train.fit_transform(train_pca)
principalDf_train = pd.DataFrame(data=printcipalComponents_train, columns = ['principal component'])

pca_test = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents_test = pca_test.fit_transform(test_pca)
principalDf_test = pd.DataFrame(data=printcipalComponents_test, columns = ['principal component'])

In [16]:
train['cf_pca'] = principalDf_train # 기존 데이터 셋에 넣어주기
test['cf_pca'] = principalDf_test

&nbsp;

#### income / family size 변수 생성

In [21]:
train['if_ratio'] = train['income_total'] / train['family_size']
test['if_ratio'] = test['income_total'] / test['family_size']

#### 통합 핸드폰 관련 변수 생성

In [23]:
def try_t(data):
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 0
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 1
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 2
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 3
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 4
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 5
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 6
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 7

try_t(train)
try_t(test)

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 25 columns):
index            26452 non-null int64
gender           26452 non-null int64
car              26452 non-null int64
reality          26452 non-null int64
child_num        26452 non-null int64
income_total     26452 non-null float64
income_type      26452 non-null object
edu_type         26452 non-null object
family_type      26452 non-null object
house_type       26452 non-null object
DAYS_BIRTH       26452 non-null int64
DAYS_EMPLOYED    26452 non-null int64
FLAG_MOBIL       26452 non-null int64
work_phone       26452 non-null int64
phone            26452 non-null int64
email            26452 non-null int64
occyp_type       26452 non-null object
family_size      26452 non-null float64
begin_month      26452 non-null int32
credit           26452 non-null float64
EMPLOYED         26452 non-null float64
age              26452 non-null int32
cf_pca           26447 non-null float64
if

&nbsp;

#### 대체된 열 / 미사용 변수 삭제

In [27]:
train = train.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

test = test.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 
                   'work_phone', 'phone', 'email'],axis=1)

&nbsp;

### 스케일링 / 인코딩

log scailing - income_total

In [None]:
# train에 로그변환한 변수로 바꾸기
tr_it=train['income_total']
tr_it_log = np.log1p(tr_it)
train['income_total']= tr_it_log
# test set
te_it=test['income_total']
te_it_log = np.log1p(te_it)
test['income_total']= te_it_log

ordinal encoding

In [28]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  10
Number of Categorical features:  5


In [32]:
encoder = OrdinalEncoder(categorical_feats)
train[categorical_feats] = encoder.fit_transform(train[categorical_feats], train['credit'])
test[categorical_feats] = encoder.transform(test[categorical_feats])

standard scailing

In [33]:
#numerical_feats.remove('income_total')
scaler = StandardScaler()
train[numerical_feats] = scaler.fit_transform(train[numerical_feats])
test[numerical_feats] = scaler.transform(test[numerical_feats])