In [20]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings
import datawig

In [26]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import SMOTENC

&nbsp;

# data load

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

&nbsp;

## binary class transformation

In [3]:
# binary transform (0, 1)

train.gender = train.gender.replace(['F','M'],[0,1])  
test.gender = test.gender.replace(['F','M'],[0,1])

train.car = train.car.replace(['N','Y'],[0,1])  
test.car = test.car.replace(['N','Y'],[0,1])

train.reality = train.reality.replace(['N','Y'],[0,1])  
test.reality = test.reality.replace(['N','Y'],[0,1])

&nbsp;

## days 변수 양수변환, 360나누기

In [4]:
# 음수 -> 양수 변환

train.DAYS_BIRTH = -train.DAYS_BIRTH
test.DAYS_BIRTH = -test.DAYS_BIRTH

train.DAYS_EMPLOYED = -train.DAYS_EMPLOYED
test.DAYS_EMPLOYED = -test.DAYS_EMPLOYED

train.begin_month = -train.begin_month
test.begin_month = -test.begin_month

In [5]:
# 365243 인 값 0으로 바꾸기

train.DAYS_EMPLOYED = train.DAYS_EMPLOYED.replace([-365243],[0])
test.DAYS_EMPLOYED = test.DAYS_EMPLOYED.replace([-365243],[0])

In [6]:
# 360으로 나눠서 나이, 일한 년수 계산

train['EMPLOYED']= train.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
train['age']= train.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
train['begin_month']= train.begin_month.apply(lambda x: abs(x) ).astype(int)

test['EMPLOYED']= test.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
test['age']= test.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
test['begin_month']= test.begin_month.apply(lambda x: abs(x) ).astype(int)

&nbsp;

## log transformation

income total

In [16]:
# train에 로그변환한 변수로 바꾸기
tr_it=train['income_total']
tr_it_log = np.log1p(tr_it)
train['income_total']= tr_it_log
# test set도
te_it=test['income_total']
te_it_log = np.log1p(te_it)
test['income_total']= te_it_log

days_employed

In [11]:
train['EMPLOYED']= np.log1p(train['EMPLOYED'])
# 테스트셋에 적용한 부분이 없는 것 같아서 일단 임의로 추가했습니다!
test['EMPLOYED']= np.log1p(test['EMPLOYED'])

&nbsp;

## 이상치, 오류 제거

In [12]:
# child_num이 14, 19인 행 지우기
idx_child_drop = train[(train['child_num'] == 14)|(train['child_num'] == 19)].index
train = train.drop(idx_child_drop)

In [13]:
# child_num이 family_size보다 큰 행도 지우기

idx_child_drop2 = train[train['family_size'] < train['child_num']].index
train = train.drop(idx_child_drop2)

In [14]:
# test도 드랍
idx_child_drop_test = test[test['family_size'] < test['child_num']].index
test = test.drop(idx_child_drop_test)

&nbsp;

## child_num, family_size 파생변수 생성

In [17]:
train['cf_ratio'] = train['child_num'] / train['family_size']
test['cf_ratio'] = test['child_num'] / test['family_size']

train['ic_ratio'] = train['income_total'] / train['child_num']
test['ic_ratio'] = test['income_total'] / test['child_num']

train['if_ratio'] = train['income_total'] / train['family_size']
test['if_ratio'] = test['income_total'] / test['family_size']

&nbsp;

## 결측치 처리

days_employed가 0인 행들은 직업값을 미고용으로 변환

In [19]:
def unemployed(data):
    data.loc[(data['DAYS_EMPLOYED'] == 0), 'occyp_type'] = 'unemployed'

#train에 적용
unemployed(train)
#test에 적용
unemployed(test)

#### 카테고리 변수에 DATAWig적용

우선 명목형 변수를 수치형으로 변환해서 결측치를 대치할 columm과 상관성이 높은 column들을 확인해줘야 함
> labeling encoding

In [21]:
train_label = train.copy()
test_label = test.copy()
label_encoder=preprocessing.LabelEncoder()

# 소득 형태 변환
train_label['income_type']=label_encoder.fit_transform(train['income_type'])
test_label['income_type']=label_encoder.transform(test['income_type'])

# 학력 수준 변환
train_label['edu_type']=label_encoder.fit_transform(train['edu_type'])
test_label['edu_type']=label_encoder.transform(test['edu_type'])

# 가족 형태 변환
train_label['family_type']=label_encoder.fit_transform(train['family_type'])
test_label['family_type']=label_encoder.transform(test['family_type'])

# 가족 형태 변환
train_label['house_type']=label_encoder.fit_transform(train['house_type'])
test_label['house_type']=label_encoder.transform(test['house_type'])

# 직업 형태 변환
train_label['occyp_type']=label_encoder.fit_transform(train['occyp_type'].astype(str))
test_label['occyp_type']=label_encoder.fit_transform(test['occyp_type'].astype(str))

상관계수 확인

In [22]:
train_label.corr()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,...,email,occyp_type,family_size,begin_month,credit,EMPLOYED,age,cf_ratio,ic_ratio,if_ratio
index,1.0,-0.006306,0.004171,0.006656,0.001048,0.004074,0.009495,-0.003191,0.009164,-0.002158,...,0.005336,-7.3e-05,-0.00289,-0.002248,-0.00785,-0.002506,0.008844,0.003113,0.004624,0.004174
gender,-0.006306,1.0,0.362132,-0.048955,0.076737,0.232167,0.102934,0.006226,-0.102035,0.067504,...,0.000167,-0.136227,0.111463,-0.007852,0.001433,0.095093,-0.201519,0.066962,-0.029085,-0.101108
car,0.004171,0.362132,1.0,-0.016613,0.103048,0.230868,0.04822,-0.105696,-0.124334,0.018668,...,0.018958,-0.126614,0.150871,0.029828,0.00765,0.111774,-0.156892,0.094127,-0.046457,-0.152147
reality,0.006656,-0.048955,-0.016613,1.0,-0.001586,0.03623,-0.048522,0.011502,0.02803,-0.177957,...,0.051105,0.051685,-0.008507,-0.003865,-0.009422,-0.076737,0.13074,-0.002811,0.003664,0.015964
child_num,0.001048,0.076737,0.103048,-0.001586,1.0,0.03771,0.106968,-0.052706,-0.169425,0.0248,...,0.017641,-0.147514,0.886867,0.00741,0.002038,0.176491,-0.343496,0.954081,-0.952372,-0.593247
income_total,0.004074,0.232167,0.230868,0.03623,0.03771,1.0,-0.055405,-0.225854,0.000435,-0.000211,...,0.091373,-0.134836,0.022806,0.023876,0.005347,0.195031,-0.108004,0.04741,0.076187,0.021422
income_type,0.009495,0.102934,0.04822,-0.048522,0.106968,-0.055405,1.0,0.056578,-0.050915,0.033068,...,-0.020006,-0.213458,0.107587,-0.002777,-0.008284,0.314223,-0.211611,0.108785,-0.052183,-0.092695
edu_type,-0.003191,0.006226,-0.105696,0.011502,-0.052706,-0.225854,0.056578,1.0,0.008633,-0.039632,...,-0.095605,0.089266,-0.043676,-0.013846,0.013695,-0.079305,0.166826,-0.060239,-0.012125,0.023538
family_type,0.009164,-0.102035,-0.124334,0.02803,-0.169425,0.000435,-0.050915,0.008633,1.0,0.007227,...,-0.015389,0.062558,-0.557782,-0.030677,-0.005317,-0.108789,0.10426,-0.125419,0.093436,0.78324
house_type,-0.002158,0.067504,0.018668,-0.177957,0.0248,-0.000211,0.033068,-0.039632,0.007227,1.0,...,0.013499,-0.068808,0.005789,-0.030553,-0.008985,0.048701,-0.209918,0.035281,0.036811,0.007778


In [23]:
# data split
train_notnull = train[train.occyp_type.notnull()]
train_null = train[train.occyp_type.isnull()]

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['EMPLOYED','age','income_type'], # column(s) containing information about the column we want to impute
    output_column='occyp_type', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=train_notnull, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(train_null)

2021-05-25 11:14:44,270 [INFO]  CategoricalEncoder for column occyp_type                                found only 88 occurrences of value Secretaries
2021-05-25 11:14:44,271 [INFO]  CategoricalEncoder for column occyp_type                                found only 62 occurrences of value Realty agents
2021-05-25 11:14:44,272 [INFO]  CategoricalEncoder for column occyp_type                                found only 57 occurrences of value HR staff
2021-05-25 11:14:44,273 [INFO]  CategoricalEncoder for column occyp_type                                found only 38 occurrences of value IT staff
2021-05-25 11:14:44,818 [INFO]  
2021-05-25 11:14:51,275 [INFO]  Epoch[0] Batch [0-639]	Speed: 1590.78 samples/sec	cross-entropy=1.726312	occyp_type-accuracy=0.405859
2021-05-25 11:14:57,646 [INFO]  Epoch[0] Train-cross-entropy=1.697577
2021-05-25 11:14:57,647 [INFO]  Epoch[0] Train-occyp_type-accuracy=0.412167
2021-05-25 11:14:57,648 [INFO]  Epoch[0] Time cost=12.826
2021-05-25 11:14:57,654 [INFO

2021-05-25 11:17:37,416 [INFO]  Epoch[12] Validation-occyp_type-accuracy=0.427817
2021-05-25 11:17:43,709 [INFO]  Epoch[13] Batch [0-639]	Speed: 1627.50 samples/sec	cross-entropy=1.654349	occyp_type-accuracy=0.421289
2021-05-25 11:17:49,950 [INFO]  Epoch[13] Train-cross-entropy=1.645640
2021-05-25 11:17:49,951 [INFO]  Epoch[13] Train-occyp_type-accuracy=0.421313
2021-05-25 11:17:49,952 [INFO]  Epoch[13] Time cost=12.535
2021-05-25 11:17:49,957 [INFO]  Saved checkpoint to "imputer_model\model-0013.params"
2021-05-25 11:17:50,522 [INFO]  Epoch[13] Validation-cross-entropy=1.625137
2021-05-25 11:17:50,523 [INFO]  Epoch[13] Validation-occyp_type-accuracy=0.429137
2021-05-25 11:17:56,816 [INFO]  Epoch[14] Batch [0-639]	Speed: 1627.31 samples/sec	cross-entropy=1.652852	occyp_type-accuracy=0.421777
2021-05-25 11:18:03,105 [INFO]  Epoch[14] Train-cross-entropy=1.644421
2021-05-25 11:18:03,106 [INFO]  Epoch[14] Train-occyp_type-accuracy=0.420824
2021-05-25 11:18:03,107 [INFO]  Epoch[14] Time co

2021-05-25 11:21:06,917 [INFO]  Epoch[26] Validation-occyp_type-accuracy=0.429577
2021-05-25 11:21:14,924 [INFO]  Epoch[27] Batch [0-639]	Speed: 1279.25 samples/sec	cross-entropy=1.644251	occyp_type-accuracy=0.421094
2021-05-25 11:21:22,820 [INFO]  Epoch[27] Train-cross-entropy=1.635134
2021-05-25 11:21:22,821 [INFO]  Epoch[27] Train-occyp_type-accuracy=0.421264
2021-05-25 11:21:22,822 [INFO]  Epoch[27] Time cost=15.905
2021-05-25 11:21:22,835 [INFO]  Saved checkpoint to "imputer_model\model-0027.params"
2021-05-25 11:21:23,405 [INFO]  Epoch[27] Validation-cross-entropy=1.619920
2021-05-25 11:21:23,406 [INFO]  Epoch[27] Validation-occyp_type-accuracy=0.430018
2021-05-25 11:21:31,258 [INFO]  Epoch[28] Batch [0-639]	Speed: 1304.53 samples/sec	cross-entropy=1.643862	occyp_type-accuracy=0.421387
2021-05-25 11:21:39,118 [INFO]  Epoch[28] Train-cross-entropy=1.634688
2021-05-25 11:21:39,119 [INFO]  Epoch[28] Train-occyp_type-accuracy=0.421802
2021-05-25 11:21:39,120 [INFO]  Epoch[28] Time co

2021-05-25 11:24:33,988 [INFO]  Epoch[40] Validation-occyp_type-accuracy=0.431338
2021-05-25 11:24:40,294 [INFO]  Epoch[41] Batch [0-639]	Speed: 1624.21 samples/sec	cross-entropy=1.638714	occyp_type-accuracy=0.420215
2021-05-25 11:24:46,540 [INFO]  Epoch[41] Train-cross-entropy=1.629309
2021-05-25 11:24:46,540 [INFO]  Epoch[41] Train-occyp_type-accuracy=0.421166
2021-05-25 11:24:46,541 [INFO]  Epoch[41] Time cost=12.552
2021-05-25 11:24:46,546 [INFO]  Saved checkpoint to "imputer_model\model-0041.params"
2021-05-25 11:24:47,092 [INFO]  Epoch[41] Validation-cross-entropy=1.615722
2021-05-25 11:24:47,093 [INFO]  Epoch[41] Validation-occyp_type-accuracy=0.429577
2021-05-25 11:24:53,355 [INFO]  Epoch[42] Batch [0-639]	Speed: 1635.39 samples/sec	cross-entropy=1.638116	occyp_type-accuracy=0.420898
2021-05-25 11:24:59,632 [INFO]  Epoch[42] Train-cross-entropy=1.628928
2021-05-25 11:24:59,633 [INFO]  Epoch[42] Train-occyp_type-accuracy=0.421802
2021-05-25 11:24:59,634 [INFO]  Epoch[42] Time co

<datawig.simple_imputer.SimpleImputer at 0x29f00d8c730>

  return np.log(probas)


데이터셋 병합

In [29]:
imputed = imputed.drop(['occyp_type','occyp_type_imputed_proba'],axis=1)

imputed = imputed.rename(columns = {'occyp_type_imputed':'occyp_type'})

train_imputed = pd.concat([train_notnull,imputed],axis = 0)

train_imputed.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  train_imputed = pd.concat([train_notnull,imputed],axis = 0)


Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,EMPLOYED,FLAG_MOBIL,age,begin_month,car,cf_ratio,child_num,credit,...,house_type,ic_ratio,if_ratio,income_total,income_type,index,occyp_type,phone,reality,work_phone
1,11380,1540,0.979643,1,31,5,0,0.333333,1,1.0,...,House / apartment,2.596684,0.865561,2.596684,Commercial associate,1,Laborers,0,1,0
2,19087,4434,1.277878,1,53,22,1,0.0,0,2.0,...,House / apartment,inf,1.320136,2.640271,Working,2,Managers,1,1,0
3,15088,2092,1.071089,1,41,37,0,0.0,0,0.0,...,House / apartment,inf,1.290809,2.581617,Commercial associate,3,Sales staff,1,1,0
4,15037,2105,1.072899,1,41,26,1,0.0,0,2.0,...,House / apartment,inf,1.281211,2.562422,State servant,4,Managers,0,1,0
5,13413,4996,1.308297,1,37,18,0,0.5,2,1.0,...,House / apartment,1.301574,0.650787,2.603147,Working,5,High skill tech staff,0,1,0


테스트셋에도 적용

In [24]:
# data split
test_notnull = test[test.occyp_type.notnull()]
test_null = test[test.occyp_type.isnull()]

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['DAYS_EMPLOYED','DAYS_BIRTH','income_type'], # column(s) containing information about the column we want to impute
    output_column='occyp_type', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=test_notnull, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed_2 = imputer.predict(test_null)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<datawig.simple_imputer.SimpleImputer at 0x29f00df8040>

In [25]:
imputed_2 = imputed_2.drop(['occyp_type','occyp_type_imputed_proba'],axis=1)

imputed_2 = imputed_2.rename(columns = {'occyp_type_imputed':'occyp_type'})

test_imputed = pd.concat([test_notnull,imputed_2],axis = 0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  test_imputed = pd.concat([test_notnull,imputed_2],axis = 0)


&nbsp;

## 원핫인코딩, 임베딩

In [30]:
import tensorflow as tf
from tensorflow import feature_column

In [31]:
family_em_train = feature_column.embedding_column(train_imputed['family_type'],dimension=8)
income_em_train = feature_column.embedding_column(train_imputed['income_type'],dimension=8)
house_em_train = feature_column.embedding_column(train_imputed['house_type'],dimension=8)

In [32]:
family_em_test = feature_column.embedding_column(test_imputed['family_type'],dimension=8)
income_em_test = feature_column.embedding_column(test_imputed['income_type'],dimension=8)
house_em_test = feature_column.embedding_column(test_imputed['house_type'],dimension=8)

In [33]:
edu_train = pd.get_dummies(train_imputed.edu_type)
# edu_test = pd.get_dummies(test_imputed.edu_type)

In [36]:
train_imputed['family_type'] = family_em_train
train_imputed['income_type'] = income_em_train
train_imputed['house_type'] = house_em_train

test_imputed['family_type'] = family_em_test
test_imputed['income_type'] = income_em_test
test_imputed['house_type'] = house_em_test

ValueError: Length of values does not match length of index

## 대체된 열 삭제

In [None]:
train_imputed = train_imputed.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED'])
test_imputed = test_imputed.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED'])