In [7]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings

# Data Load

In [8]:
# Load the three CSV files (training set, test set, sample submission) from
# the current working directory in one pass.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [9]:
# Keep independent copies of the frames as loaded, so the raw data stays
# available after the in-place recoding done in later cells.
train_original, test_original = train.copy(), test.copy()

## binary class 변환

In [10]:
# Two-valued string columns and the (old values, new values) pair used to
# recode each of them:
#   gender  : F / M -> 0 / 1
#   car     : N / Y -> 0 / 1   (owns a car)
#   reality : N / Y -> 0 / 1   (owns real estate)
binary_recodings = {
    'gender': (['F', 'M'], [0, 1]),
    'car': (['N', 'Y'], [0, 1]),
    'reality': (['N', 'Y'], [0, 1]),
}

# Apply the same recoding to both the training and the test frame.
for frame in (train, test):
    for column, (old_values, new_values) in binary_recodings.items():
        frame[column] = frame[column].replace(old_values, new_values)

&nbsp;

&nbsp;

&nbsp;

## 결측값 처리: occyp_type

> Datawig

In [5]:
import datawig

- 심층신경망(DNN)을 사용하여 데이터프레임에 존재하는 결측값을 채우도록 머신러닝 모델을 훈련하는 알고리즘  
- 다른 대체법에 비하여 상당히 정확한 성능을 보임  
- 오래걸림

impute를 실행할 column과 그와 상관성이 높은 column 또는 다수의 column을 지정해줘야 한다.

In [55]:
# Build label-encoded copies of train/test so that every column is numeric
# and can be fed to DataFrame.corr() in the next cell.
train_label = train.copy()
test_label = test.copy()
label_encoder = preprocessing.LabelEncoder()

# income type, education level, family type, house type:
# fit the encoder on the training column, then reuse the same mapping for the
# test column so that a given category gets the same integer in both frames.
for column in ['income_type', 'edu_type', 'family_type', 'house_type']:
    train_label[column] = label_encoder.fit_transform(train[column])
    test_label[column] = label_encoder.transform(test[column])

# occupation type: cast to str first so that missing values become the
# literal label 'nan' and are encoded as a category of their own.
# The test column is encoded with transform(), not fit_transform(): re-fitting
# on test would build a separate mapping, so the same occupation could get
# different codes in train_label and test_label. transform() raises
# ValueError if test contains an occupation never seen in train.
train_label['occyp_type'] = label_encoder.fit_transform(train['occyp_type'].astype(str))
test_label['occyp_type'] = label_encoder.transform(test['occyp_type'].astype(str))

In [56]:
# Pairwise Pearson correlation between all columns of the label-encoded
# training frame (Pearson is the pandas default; spelled out for clarity).
train_label.corr(method='pearson')

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
index,1.0,-0.006261,0.00421,0.006645,0.001012,0.009848,0.009505,-0.003183,0.009149,-0.002161,-0.008901,0.004259,,-0.005575,-0.00179,0.005332,-0.000384,-0.002867,0.002181,-0.007841
gender,-0.006261,1.0,0.36232,-0.048927,0.076731,0.198455,0.103083,0.006351,-0.101969,0.067437,0.201138,-0.17299,,0.064316,-0.027078,0.000108,-0.130283,0.11106,0.007707,0.001562
car,0.00421,0.36232,1.0,-0.016599,0.101846,0.213432,0.048361,-0.105567,-0.12428,0.018617,0.156883,-0.15926,,0.026124,-0.01373,0.018904,-0.121225,0.149463,-0.029955,0.007761
reality,0.006645,-0.048927,-0.016599,1.0,0.000435,0.035377,-0.048468,0.011535,0.028144,-0.177957,-0.130241,0.097947,,-0.208648,-0.065553,0.051085,0.047347,-0.006667,0.003863,-0.009387
child_num,0.001012,0.076731,0.101846,0.000435,1.0,0.032186,0.106498,-0.049002,-0.161222,0.02312,0.332816,-0.228159,,0.051521,-0.010555,0.01612,-0.135173,0.89053,-0.007229,0.004081
income_total,0.009848,0.198455,0.213432,0.035377,0.032186,1.0,-0.075175,-0.225893,-0.005802,-0.004514,0.064155,-0.166478,,-0.034207,0.019013,0.089882,-0.093062,0.023839,-0.018047,0.008555
income_type,0.009505,0.103083,0.048361,-0.048468,0.106498,-0.075175,1.0,0.056688,-0.05079,0.033013,0.211466,-0.361347,,0.161857,0.004436,-0.02006,-0.198431,0.107698,0.00275,-0.008163
edu_type,-0.003183,0.006351,-0.105567,0.011535,-0.049002,-0.225893,0.056688,1.0,0.008717,-0.039668,-0.166666,0.120944,,-0.023778,-0.045179,-0.09564,0.084669,-0.041345,0.013824,0.01378
family_type,0.009149,-0.101969,-0.12428,0.028144,-0.161222,-0.005802,-0.05079,0.008717,1.0,0.007189,-0.104556,0.125207,,-0.065483,-0.014388,-0.015427,0.056889,-0.545149,0.030711,-0.00523
house_type,-0.002161,0.067437,0.018617,-0.177957,0.02312,-0.004514,0.033013,-0.039668,0.007189,1.0,0.210021,-0.105835,,0.031381,-0.019003,0.013517,-0.064466,0.00501,0.030556,-0.009023


label encoding된 데이터셋의 상관계수를 바탕으로, 결측값을 채울 occyp_type과 상관성이 가장 높은 DAYS_EMPLOYED, DAYS_BIRTH, income_type을 참고 column(`input_columns`)으로 지정했습니다.

In [64]:
# Split the training rows by whether occyp_type is present: rows with a known
# occupation train the imputer, rows without one are the prediction targets.
occupation_missing = train['occyp_type'].isna()
train_notnull = train[~occupation_missing]
train_null = train[occupation_missing]

# Set up the datawig imputer.
imputer = datawig.SimpleImputer(
    # columns carrying information about the column to impute
    input_columns=['DAYS_EMPLOYED', 'DAYS_BIRTH', 'income_type'],
    # column whose missing values are to be filled
    output_column='occyp_type',
    # directory where model data and metrics are stored
    output_path='imputer_model',
)

# Train the imputer on the rows whose occupation is known.
imputer.fit(train_df=train_notnull, num_epochs=50)

# Predict the missing occupations; the result is the input frame with the
# prediction columns added.
imputed = imputer.predict(train_null)

  _warn_prf(average, modifier, msg_start, len(result))


<datawig.simple_imputer.SimpleImputer at 0x20419ddc580>

  return np.log(probas)


결측값 대치된 데이터

In [65]:
# Show the original (missing) occupation next to the imputed value and the
# probability reported for it.
imputed.loc[:, ['occyp_type', 'occyp_type_imputed', 'occyp_type_imputed_proba']]

Unnamed: 0,occyp_type,occyp_type_imputed,occyp_type_imputed_proba
0,,Laborers,0.214560
8,,Laborers,0.163709
14,,Security staff,1.000000
18,,Security staff,1.000000
19,,Laborers,0.339406
...,...,...,...
26439,,Security staff,1.000000
26441,,Security staff,1.000000
26443,,Security staff,1.000000
26449,,Laborers,0.319186


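`occyp_type_imputed_proba`는 모델이 대치값(`occyp_type_imputed`)으로 선택한 클래스에 부여한 예측 확률, 즉 해당 대치값에 대한 모델의 신뢰도입니다. 실제로 정답일 확률을 보장하는 값은 아니며, 위 출력의 0.16~0.34처럼 값이 낮은 행은 대치 결과의 불확실성이 크다는 점에 유의해야 합니다.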

데이터셋 병합

In [73]:
# Replace the (all-missing) original occyp_type of the imputed rows with the
# predicted value and discard the probability column, so the frame has the
# same columns as train_notnull.
# NOTE: this rebinds `imputed`; to re-run this cell, re-run the prediction
# cell first, otherwise the columns dropped here no longer exist.
imputed = (
    imputed
    .drop(['occyp_type', 'occyp_type_imputed_proba'], axis=1)
    .rename(columns={'occyp_type_imputed': 'occyp_type'})
)

# Both parts must carry exactly the same columns; otherwise the concat below
# would silently introduce NaN-filled columns.
assert set(imputed.columns) == set(train_notnull.columns), \
    "imputed rows and non-null rows have different columns"

# Stack the rows that already had an occupation on top of the imputed rows.
# The two frames list their columns in different orders (occyp_type ends up
# last in `imputed`), and how concat orders the result in that case depends
# on the pandas version, so pin the column order to the original layout.
train_imputed = (
    pd.concat([train_notnull, imputed], axis=0)
    .reindex(columns=train_notnull.columns)
)

In [91]:
# Preview the first three rows of the merged training frame.
train_imputed.iloc[:3]

# Count the remaining missing values in each column.
train_imputed.isna().sum()

Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,begin_month,car,child_num,credit,edu_type,email,family_size,family_type,gender,house_type,income_total,income_type,index,occyp_type,phone,reality,work_phone
1,-11380,-1540,1,-5.0,0,1,1.0,Secondary / secondary special,1,3.0,Civil marriage,0,House / apartment,247500.0,Commercial associate,1,Laborers,0,1,0
2,-19087,-4434,1,-22.0,1,0,2.0,Higher education,0,2.0,Married,1,House / apartment,450000.0,Working,2,Managers,1,1,0
3,-15088,-2092,1,-37.0,0,0,0.0,Secondary / secondary special,0,2.0,Married,0,House / apartment,202500.0,Commercial associate,3,Sales staff,1,1,0


DAYS_BIRTH       0
DAYS_EMPLOYED    0
FLAG_MOBIL       0
begin_month      0
car              0
child_num        0
credit           0
edu_type         0
email            0
family_size      0
family_type      0
gender           0
house_type       0
income_total     0
income_type      0
index            0
occyp_type       0
phone            0
reality          0
work_phone       0
dtype: int64

train 데이터의 결측값을 모두 대치했습니다.