<a href="https://colab.research.google.com/github/teamgaon/recommendation_algorithms_know/blob/main/20220117_hj_know_2019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install optuna



In [2]:
import optuna
import warnings
import gc
import os
import random
import sys

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from datetime import datetime
from glob import glob
from tqdm import tqdm
from IPython.display import Image


In [3]:
# Put the warnings machinery into 'ignore' mode for the rest of the session,
# so that unneeded warning messages are no longer printed under the cells.
warnings.filterwarnings("ignore")

In [4]:
# Each yearly frame starts out as an empty-list placeholder and is only
# replaced when a CSV exists at the matching position of the sorted listing.
train_2017, train_2018, train_2019, train_2020 = [], [], [], []

# The position in the alphabetically sorted listing decides the year:
# 0 -> 2017, 1 -> 2018, 2 -> 2019, every later file -> 2020.
for position, csv_path in enumerate(sorted(glob("/content/drive/MyDrive/KNOW_data/train/*.csv"))):
    print(csv_path)
    loaded_frame = pd.read_csv(csv_path)
    if position == 0:
        train_2017 = loaded_frame
    elif position == 1:
        train_2018 = loaded_frame
    elif position == 2:
        train_2019 = loaded_frame
    else:
        train_2020 = loaded_frame

test_2017, test_2018, test_2019, test_2020 = [], [], [], []

# Same positional convention for the test split.
for position, csv_path in enumerate(sorted(glob("/content/drive/MyDrive/KNOW_data/test/*.csv"))):
    print(csv_path)
    loaded_frame = pd.read_csv(csv_path)
    if position == 0:
        test_2017 = loaded_frame
    elif position == 1:
        test_2018 = loaded_frame
    elif position == 2:
        test_2019 = loaded_frame
    else:
        test_2020 = loaded_frame

/content/drive/MyDrive/KNOW_data/train/KNOW_2017.csv
/content/drive/MyDrive/KNOW_data/train/KNOW_2018.csv
/content/drive/MyDrive/KNOW_data/train/KNOW_2019.csv
/content/drive/MyDrive/KNOW_data/train/KNOW_2020.csv
/content/drive/MyDrive/KNOW_data/test/KNOW_2017_test.csv
/content/drive/MyDrive/KNOW_data/test/KNOW_2018_test.csv
/content/drive/MyDrive/KNOW_data/test/KNOW_2019_test.csv
/content/drive/MyDrive/KNOW_data/test/KNOW_2020_test.csv


1.1 공백으로 구성된 결측치 값을 np.nan으로 변경

In [5]:
# Normalise the missing-value marker: every cell that holds a single blank
# (' ') becomes np.nan, which makes the isnull()-based checks below work.
#
# DataFrame.replace already scans all columns of the frame, so one call per
# frame is sufficient. The previous version wrapped the same two whole-frame
# calls in a loop over the columns (never using the loop variable), which
# repeated identical work once per column.
train_2019 = train_2019.replace(' ', np.nan)
test_2019 = test_2019.replace(' ', np.nan)

In [6]:
# Per-column count of missing values, computed once for each split.
train_null_counts = train_2019.isnull().sum()
test_null_counts = test_2019.isnull().sum()

# Walk both count tables in parallel (zip stops at the shorter one), print the
# train / test / combined counts and remember every column with a missing value
# in either split.
have_nan_columns = []
for (index, value), (tindex, tvalue) in zip(train_null_counts.items(), test_null_counts.items()):
    print(f"{index} : {value} / {tindex} : {tvalue} / {value + tvalue}")
    if value > 0 or tvalue > 0:
        have_nan_columns.append(index)

idx : 0 / idx : 0 / 0
sq1 : 0 / sq1 : 0 / 0
sq2 : 0 / sq2 : 0 / 0
sq3 : 0 / sq3 : 0 / 0
sq4 : 0 / sq4 : 0 / 0
sq5 : 0 / sq5 : 0 / 0
sq6 : 0 / sq6 : 0 / 0
sq7 : 0 / sq7 : 0 / 0
sq8 : 0 / sq8 : 0 / 0
sq9 : 0 / sq9 : 0 / 0
sq10 : 0 / sq10 : 0 / 0
sq11 : 0 / sq11 : 0 / 0
sq12 : 0 / sq12 : 0 / 0
sq13 : 0 / sq13 : 0 / 0
sq14 : 0 / sq14 : 0 / 0
sq15 : 0 / sq15 : 0 / 0
sq16 : 0 / sq16 : 0 / 0
kq1_1 : 0 / kq1_1 : 0 / 0
kq1_2 : 0 / kq1_2 : 0 / 0
kq2_1 : 0 / kq2_1 : 0 / 0
kq2_2 : 0 / kq2_2 : 0 / 0
kq3_1 : 0 / kq3_1 : 0 / 0
kq3_2 : 0 / kq3_2 : 0 / 0
kq4_1 : 0 / kq4_1 : 0 / 0
kq4_2 : 0 / kq4_2 : 0 / 0
kq5_1 : 0 / kq5_1 : 0 / 0
kq5_2 : 0 / kq5_2 : 0 / 0
kq6_1 : 0 / kq6_1 : 0 / 0
kq6_2 : 0 / kq6_2 : 0 / 0
kq7_1 : 0 / kq7_1 : 0 / 0
kq7_2 : 0 / kq7_2 : 0 / 0
kq8_1 : 0 / kq8_1 : 0 / 0
kq8_2 : 0 / kq8_2 : 0 / 0
kq9_1 : 0 / kq9_1 : 0 / 0
kq9_2 : 0 / kq9_2 : 0 / 0
kq10_1 : 0 / kq10_1 : 0 / 0
kq10_2 : 0 / kq10_2 : 0 / 0
kq11_1 : 0 / kq11_1 : 0 / 0
kq11_2 : 0 / kq11_2 : 0 / 0
kq12_1 : 0 / kq12_1 : 0 / 0
kq12

In [7]:
# Apart from idx and knowcode, the frame consists of 154 feature columns
# (count as stated by the original author; it is not re-derived in this cell).
# Preview the first rows of the training frame.
train_2019.head()

Unnamed: 0,idx,sq1,sq2,sq3,sq4,sq5,sq6,sq7,sq8,sq9,sq10,sq11,sq12,sq13,sq14,sq15,sq16,kq1_1,kq1_2,kq2_1,kq2_2,kq3_1,kq3_2,kq4_1,kq4_2,kq5_1,kq5_2,kq6_1,kq6_2,kq7_1,kq7_2,kq8_1,kq8_2,kq9_1,kq9_2,kq10_1,kq10_2,kq11_1,kq11_2,kq12_1,...,bq14_5,bq15,bq16_1,bq16_2,bq16_3,bq16_4,bq16_5,bq17,bq18_1,bq18_2,bq18_3,bq18_4,bq18_5,bq18_6,bq18_7,bq18_8,bq18_9,bq18_10,bq19,bq20,bq20_1,bq21_1,bq21_2,bq21_3,bq22,bq23,bq24,bq25,bq26,bq27,bq27_1,bq28,bq28_1,bq28_2,bq29,bq30,bq31_1,bq31_2,bq31_3,knowcode
0,18569,4,4,4,3,4,4,4,4,4,4,4,5,4,4,4,4,3,3,2,2,1,0,2,2,2,2,3,4,4,5,1,0,4,5,4,5,4,5,4,...,5.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,사람은 줄고 일은 많으니까 업무가 많다,3.0,4,수송수요가 늘어날다,4.0,4.0,3,,,,1,35.0,4,기계공학과,1.0,1.0,,1.0,40.0,3500.0,3000.0,,812301
1,18570,4,3,4,4,4,4,3,4,5,4,3,5,4,4,1,2,4,6,3,4,4,5,5,5,3,4,5,6,1,0,1,0,1,0,1,0,1,0,1,...,3.0,3.0,4.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,3,운송시장이 큰 변화 없을듯해서,2.0,3.0,3,"컨테이너 대여사업,랜탈사업",,,1,63.0,3,경영학,2.0,,5.0,,40.0,,,5000.0,15201
2,18571,2,3,2,2,2,2,2,2,3,3,3,2,2,2,2,2,2,2,1,0,3,4,2,3,2,2,1,0,1,0,3,3,1,0,1,0,1,0,1,...,4.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,1.0,3,,4.0,4.0,4,숙박업,,,2,62.0,2,,2.0,,6.0,,30.0,,,3000.0,901101
3,18572,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,7,5,6,5,6,5,6,4,5,4,5,4,5,5,6,3,3,3,3,2,...,4.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,4,편리하게 일처리을 하기위해서 문서대행 업무를 맡길거 같아서,2.0,3.0,3,,,,2,33.0,4,행정학,1.0,1.0,,1.0,40.0,3600.0,2400.0,,29903
4,18573,1,4,4,1,1,2,4,3,4,4,4,5,4,3,1,1,2,2,2,2,2,2,2,2,4,5,1,0,1,0,1,0,1,0,1,0,1,0,1,...,3.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2.0,2,일거리가 줄어들고 있다,3.0,2.0,2,용역회사(청소원),,,1,67.0,1,,2.0,,6.0,,45.0,,,1500.0,561401


bq4_1a ~ 1c 자격증 종류

In [8]:
# Where bq4 == 2 (presumably "holds no certificate" -- confirm against the
# questionnaire) and a certificate column is empty, record the answer '없음'.
for survey_frame in (train_2019, test_2019):
    answered_two = survey_frame['bq4'] == 2
    for certificate_column in ('bq4_1a', 'bq4_1b', 'bq4_1c'):
        is_empty = survey_frame[certificate_column].isnull()
        survey_frame.loc[answered_two & is_empty, certificate_column] = '없음'

In [9]:
# Frequency of every distinct bq4_1a value in the training frame.
train_2019.bq4_1a.value_counts()

없음           4434
정보처리기사         87
전기기사           81
의사면허증          65
토목기사           63
             ... 
용접              1
교통기술사 자격증       1
국가공인자격증         1
안과의사전문면허증       1
지게차조정면허         1
Name: bq4_1a, Length: 1559, dtype: int64

In [10]:
# Frequency of every distinct bq4_1a value in the test frame.
test_2019.bq4_1a.value_counts()

없음                 4427
정보처리기사               94
전기기사                 76
의사면허증                57
의사면허                 52
                   ... 
중등교감                  1
사업용조종사/운송용조종사         1
중식조리사자격증              1
사업용조종사 육상 단, 다발       1
마사지 자격증               1
Name: bq4_1a, Length: 1559, dtype: int64

In [11]:
# Rows with bq4 == 1 whose bq4_1a is still empty receive '정보처리기사',
# the most frequent value after '없음' in the value counts displayed above.
for survey_frame in (train_2019, test_2019):
    needs_fill = (survey_frame['bq4'] == 1) & survey_frame['bq4_1a'].isnull()
    survey_frame.loc[needs_fill, 'bq4_1a'] = '정보처리기사'

In [12]:
# Whatever is still empty in bq4_1b / bq4_1c is recorded as '없음',
# regardless of the bq4 answer.
for survey_frame in (train_2019, test_2019):
    for certificate_column in ('bq4_1b', 'bq4_1c'):
        is_empty = survey_frame[certificate_column].isnull()
        survey_frame.loc[is_empty, certificate_column] = '없음'

In [13]:
# Re-print the per-column missing-value counts (train / test / combined).
train_null_counts = train_2019.isnull().sum()
test_null_counts = test_2019.isnull().sum()
for (index, value), (tindex, tvalue) in zip(train_null_counts.items(), test_null_counts.items()):
    print(f"{index} : {value} / {tindex} : {tvalue} / {value + tvalue}")

idx : 0 / idx : 0 / 0
sq1 : 0 / sq1 : 0 / 0
sq2 : 0 / sq2 : 0 / 0
sq3 : 0 / sq3 : 0 / 0
sq4 : 0 / sq4 : 0 / 0
sq5 : 0 / sq5 : 0 / 0
sq6 : 0 / sq6 : 0 / 0
sq7 : 0 / sq7 : 0 / 0
sq8 : 0 / sq8 : 0 / 0
sq9 : 0 / sq9 : 0 / 0
sq10 : 0 / sq10 : 0 / 0
sq11 : 0 / sq11 : 0 / 0
sq12 : 0 / sq12 : 0 / 0
sq13 : 0 / sq13 : 0 / 0
sq14 : 0 / sq14 : 0 / 0
sq15 : 0 / sq15 : 0 / 0
sq16 : 0 / sq16 : 0 / 0
kq1_1 : 0 / kq1_1 : 0 / 0
kq1_2 : 0 / kq1_2 : 0 / 0
kq2_1 : 0 / kq2_1 : 0 / 0
kq2_2 : 0 / kq2_2 : 0 / 0
kq3_1 : 0 / kq3_1 : 0 / 0
kq3_2 : 0 / kq3_2 : 0 / 0
kq4_1 : 0 / kq4_1 : 0 / 0
kq4_2 : 0 / kq4_2 : 0 / 0
kq5_1 : 0 / kq5_1 : 0 / 0
kq5_2 : 0 / kq5_2 : 0 / 0
kq6_1 : 0 / kq6_1 : 0 / 0
kq6_2 : 0 / kq6_2 : 0 / 0
kq7_1 : 0 / kq7_1 : 0 / 0
kq7_2 : 0 / kq7_2 : 0 / 0
kq8_1 : 0 / kq8_1 : 0 / 0
kq8_2 : 0 / kq8_2 : 0 / 0
kq9_1 : 0 / kq9_1 : 0 / 0
kq9_2 : 0 / kq9_2 : 0 / 0
kq10_1 : 0 / kq10_1 : 0 / 0
kq10_2 : 0 / kq10_2 : 0 / 0
kq11_1 : 0 / kq11_1 : 0 / 0
kq11_2 : 0 / kq11_2 : 0 / 0
kq12_1 : 0 / kq12_1 : 0 / 0
kq12

In [14]:
# Rows with bq5 == 2 and an empty bq5_1 get the value 0.
for survey_frame in (train_2019, test_2019):
    needs_zero = (survey_frame['bq5'] == 2) & survey_frame['bq5_1'].isnull()
    survey_frame.loc[needs_zero, 'bq5_1'] = 0

In [15]:
# For respondents who chose option 2 on question 5, an empty 5-2 answer
# is recorded as '없음'.
for survey_frame in (train_2019, test_2019):
    needs_fill = (survey_frame['bq5'] == 2) & survey_frame['bq5_2'].isnull()
    survey_frame.loc[needs_fill, 'bq5_2'] = '없음'

In [16]:
# Many missing values still remain in bq5_2. Because it is a free-text answer,
# simply treat them as '없음' (done in the next cell).
# Number of missing bq5_2 values left in the training frame:
train_2019.bq5_2.isnull().sum()

717

In [17]:
# Every bq5_2 cell that is still empty is recorded as '없음'.
for survey_frame in (train_2019, test_2019):
    is_empty = survey_frame['bq5_2'].isnull()
    survey_frame.loc[is_empty, 'bq5_2'] = '없음'

bq18_10, bq20_1, bq22, bq23, bq24 제거 (서술형 문장, 처리 굉장히 어려움)

bq27_1은 2017년 bq38_1과 동일하게 처리합니다

In [18]:
# Every empty bq27_1 cell is recorded as '없음'.
for survey_frame in (train_2019, test_2019):
    is_empty = survey_frame['bq27_1'].isnull()
    survey_frame.loc[is_empty, 'bq27_1'] = '없음'

In [19]:
# Turn the text marker '없음' in bq28_1 / bq28_2 back into np.nan.
#
# The result is assigned back to the column explicitly. The previous form,
# frame[col].replace(..., inplace=True), mutates a column selection and is not
# guaranteed to write through to the frame (with pandas Copy-on-Write it
# silently leaves the frame unchanged).
for survey_frame in (train_2019, test_2019):
    for column in ('bq28_1', 'bq28_2'):
        survey_frame[column] = survey_frame[column].replace('없음', np.nan)

In [20]:
# Replace every missing bq28_1 / bq28_2 value with 0.
#
# The result is assigned back to the column explicitly. The previous form,
# frame[col].replace(..., inplace=True), mutates a column selection and is not
# guaranteed to write through to the frame (with pandas Copy-on-Write it
# silently leaves the frame unchanged).
for survey_frame in (train_2019, test_2019):
    for column in ('bq28_1', 'bq28_2'):
        survey_frame[column] = survey_frame[column].replace(np.nan, 0)

In [21]:
# Sanity check: number of missing bq28_1 values left in the test frame.
test_2019.bq28_1.isnull().sum()

0

In [22]:
# Replace every missing bq29 value with 0.
#
# The result is assigned back to the column explicitly. The previous form,
# frame['bq29'].replace(..., inplace=True), mutates a column selection and is
# not guaranteed to write through to the frame (with pandas Copy-on-Write it
# silently leaves the frame unchanged).
for survey_frame in (train_2019, test_2019):
    survey_frame['bq29'] = survey_frame['bq29'].replace(np.nan, 0)

In [23]:
# Training rows whose bq27 cell holds the name of a major (the displayed sample
# rows hold a small integer code in bq27 and the major name in bq27_1).
#
# The previous version chained six `==` tests with `|` and appended
# `& (bq27 != '1') ... & (bq27 != '6')`. Because `&` binds tighter than `|`,
# those clauses only qualified the last comparison, where they are always
# true, so the selection was exactly "bq27 is one of these six names".
# isin() states that directly, and the mask is built once instead of twice.
misplaced_majors = ['컴퓨터공학', '지능로봇학', '모바일인터넷', '영상영화디자인과', '방사선과', '신문방송']
is_misplaced = train_2019['bq27'].isin(misplaced_majors)

# Independent copy of the affected rows, so that relabelling its columns later
# never operates on a view of train_2019.
bq27_1_2019 = train_2019[is_misplaced].copy()

# Index labels of the wrongly entered rows.
wrong_index = bq27_1_2019.index

# Remove the wrongly entered rows from the original data.
train_2019 = train_2019.drop(wrong_index)

In [24]:
# Swap the labels of the bq27 and bq27_1 columns so that each value ends up
# under the column it belongs to.
#
# rename() looks up every label in the mapping independently, so a two-way
# mapping swaps both names in a single pass; no temporary 'bq27-1' label is
# needed. The result is re-bound instead of using inplace=True, so a frame
# obtained by slicing another frame is never mutated in place.
bq27_1_2019 = bq27_1_2019.rename(columns={'bq27': 'bq27_1', 'bq27_1': 'bq27'})

In [25]:
# Merge the data: append the relabelled rows to the end of the training frame.
train_2019 = pd.concat([train_2019,bq27_1_2019])

# Remove abnormal values (original author's stated intent).
# NOTE(review): replace() returns a new Series and the result is not assigned
# back, so this line only displays the replaced values as the cell output;
# train_2019 itself is left unchanged. Confirm whether an assignment was intended.
train_2019['bq27_1'].replace('2','이')

0          기계공학과
1            경영학
2             없음
3            행정학
4             없음
          ...   
614       모바일인터넷
3309    영상영화디자인과
3361        방사선과
7251       컴퓨터공학
7758        신문방송
Name: bq27_1, Length: 8555, dtype: object

In [26]:
# num to str: every bq27 value is replaced by its str() form
# (any missing value would become the text 'nan').
train_2019['bq27'] = train_2019['bq27'].map(str)

In [35]:
# Test-data preprocessing: turn every bq27 value into its str() form first.
test_2019['bq27'] = test_2019['bq27'].map(str)

# Throw away the rows whose bq27 text is '60' and then those where it is '2900'.
for unwanted_value in ('60', '2900'):
    drop_index = test_2019.loc[test_2019['bq27'] == unwanted_value].index
    test_2019.drop(drop_index, axis=0, inplace=True)

In [41]:
# Test rows whose bq27 cell holds the name of a major instead of the usual code.
#
# The previous version chained two `==` tests with `|` and appended
# `& (bq27 != '1') ... & (bq27 != '6')`. Because `&` binds tighter than `|`,
# those clauses only qualified the last comparison, where they are always
# true, so the selection was exactly "bq27 is one of these two names".
# isin() states that directly, and the mask is built once instead of twice.
misplaced_majors = ['건축', '지질학과']
is_misplaced = test_2019['bq27'].isin(misplaced_majors)

# Independent copy of the affected rows, so that relabelling its columns later
# never operates on a view of test_2019.
bq27_1_2019 = test_2019[is_misplaced].copy()

# Index labels of the wrongly entered rows.
wrong_index = bq27_1_2019.index

# Remove the wrongly entered rows from the original data.
test_2019 = test_2019.drop(wrong_index)

In [42]:
# Swap the labels of the bq27 and bq27_1 columns so that each value ends up
# under the column it belongs to.
#
# rename() looks up every label in the mapping independently, so a two-way
# mapping swaps both names in a single pass; no temporary 'bq27-1' label is
# needed. The result is re-bound instead of using inplace=True, so a frame
# obtained by slicing another frame is never mutated in place.
bq27_1_2019 = bq27_1_2019.rename(columns={'bq27': 'bq27_1', 'bq27_1': 'bq27'})

In [43]:
# Merge the data: append the relabelled rows to the end of the test frame.
test_2019 = pd.concat([test_2019, bq27_1_2019], axis=0)

bq_31 1~3

In [30]:
# Stack the training rows on top of the test rows into one combined frame
# (row labels are kept as they are) and preview its first rows.
total_2019 = pd.concat([train_2019, test_2019], axis=0)
total_2019.head()

Unnamed: 0,idx,sq1,sq2,sq3,sq4,sq5,sq6,sq7,sq8,sq9,sq10,sq11,sq12,sq13,sq14,sq15,sq16,kq1_1,kq1_2,kq2_1,kq2_2,kq3_1,kq3_2,kq4_1,kq4_2,kq5_1,kq5_2,kq6_1,kq6_2,kq7_1,kq7_2,kq8_1,kq8_2,kq9_1,kq9_2,kq10_1,kq10_2,kq11_1,kq11_2,kq12_1,...,bq14_5,bq15,bq16_1,bq16_2,bq16_3,bq16_4,bq16_5,bq17,bq18_1,bq18_2,bq18_3,bq18_4,bq18_5,bq18_6,bq18_7,bq18_8,bq18_9,bq18_10,bq19,bq20,bq20_1,bq21_1,bq21_2,bq21_3,bq22,bq23,bq24,bq25,bq26,bq27,bq27_1,bq28,bq28_1,bq28_2,bq29,bq30,bq31_1,bq31_2,bq31_3,knowcode
0,18569,4,4,4,3,4,4,4,4,4,4,4,5,4,4,4,4,3,3,2,2,1,0,2,2,2,2,3,4,4,5,1,0,4,5,4,5,4,5,4,...,5.0,4.0,4.0,4.0,4.0,3.0,4.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,사람은 줄고 일은 많으니까 업무가 많다,3,4,수송수요가 늘어날다,4.0,4.0,3,,,,1,35,4,기계공학과,1,1,0,1,40.0,3500.0,3000.0,,812301.0
1,18570,4,3,4,4,4,4,3,4,5,4,3,5,4,4,1,2,4,6,3,4,4,5,5,5,3,4,5,6,1,0,1,0,1,0,1,0,1,0,1,...,3.0,3.0,4.0,2.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,1,3,운송시장이 큰 변화 없을듯해서,2.0,3.0,3,"컨테이너 대여사업,랜탈사업",,,1,63,3,경영학,2,0,5,0,40.0,,,5000.0,15201.0
2,18571,2,3,2,2,2,2,2,2,3,3,3,2,2,2,2,2,2,2,1,0,3,4,2,3,2,2,1,0,1,0,3,3,1,0,1,0,1,0,1,...,4.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,1,3,,4.0,4.0,4,숙박업,,,2,62,2,없음,2,0,6,0,30.0,,,3000.0,901101.0
3,18572,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,7,5,6,5,6,5,6,4,5,4,5,4,5,5,6,3,3,3,3,2,...,4.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2,4,편리하게 일처리을 하기위해서 문서대행 업무를 맡길거 같아서,2.0,3.0,3,,,,2,33,4,행정학,1,1,0,1,40.0,3600.0,2400.0,,29903.0
4,18573,1,4,4,1,1,2,4,3,4,4,4,5,4,3,1,1,2,2,2,2,2,2,2,2,4,5,1,0,1,0,1,0,1,0,1,0,1,0,1,...,3.0,3.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,,2,2,일거리가 줄어들고 있다,3.0,2.0,2,용역회사(청소원),,,1,67,1,없음,2,0,6,0,45.0,,,1500.0,561401.0


In [31]:
# Distinct bq1 values that occur in the combined frame; used as the grouping
# key of the imputation loop that follows.
bq1_list = total_2019.bq1.unique()
bq1_list

array([15,  8,  1, 19,  3, 17, 20,  6, 18, 10,  9, 11, 13, 16,  7,  4,  5,
        2, 14, 12, 21])

In [32]:
def _rounded_group_mean(frame, bq1_value, column):
    """Return the rounded mean of ``column`` over the rows of ``frame`` with bq1 == ``bq1_value``.

    Missing entries are skipped and every remaining value is converted with
    int() before averaging. If the group has no observed value the mean is NaN
    and round() may raise ValueError; callers decide how to handle that.
    """
    observed = frame.loc[(frame.bq1 == bq1_value) & frame[column].notnull(), column]
    return round(observed.apply(int).mean())


# Fill the missing bq31_1 / bq31_2 / bq31_3 values of both splits with the
# rounded mean of the same bq1 group, computed on the combined frame.
for bq1 in bq1_list:
    mean_bq31_1 = _rounded_group_mean(total_2019, bq1, 'bq31_1')
    mean_bq31_2 = _rounded_group_mean(total_2019, bq1, 'bq31_2')

    # bq31_3 falls back to the bq31_2 mean when its own mean cannot be computed.
    # Only conversion/rounding failures are caught; the previous bare `except:`
    # also swallowed unrelated errors such as KeyboardInterrupt.
    try:
        mean_bq31_3 = _rounded_group_mean(total_2019, bq1, 'bq31_3')
    except (ValueError, TypeError):
        mean_bq31_3 = mean_bq31_2

    group_fill_values = {'bq31_1': mean_bq31_1, 'bq31_2': mean_bq31_2, 'bq31_3': mean_bq31_3}
    for survey_frame in (train_2019, test_2019):
        in_group = survey_frame.bq1 == bq1
        for column, fill_value in group_fill_values.items():
            survey_frame.loc[in_group & survey_frame[column].isnull(), column] = fill_value

나머지

In [33]:
from sklearn.impute import SimpleImputer

# Fill every remaining missing value with the most frequent value of its
# column. The imputer is fitted on the training features only and then applied
# to the test frame.
train_columns = train_2019.columns
test_columns = test_2019.columns

# The last column is the target and is kept out of the imputation.
# Its index is reset to 0..n-1 on purpose: earlier cells dropped rows and
# appended them again at the end, so the row labels of train_2019 are no
# longer in positional order, while the frame rebuilt from the imputer's array
# below gets a fresh 0..n-1 index. pd.concat(axis=1) aligns on index labels, so
# without the reset every target after the first moved row would be attached
# to the wrong feature row.
train_target = train_2019.iloc[:, -1].reset_index(drop=True)

imp = SimpleImputer(strategy='most_frequent')
train_2019_no_target = pd.DataFrame(imp.fit_transform(train_2019.iloc[:, :-1]))
test_2019 = pd.DataFrame(imp.transform(test_2019))
train_2019 = pd.concat([train_2019_no_target, train_target], axis=1)

# The imputer returns plain arrays, so the original column labels are restored.
train_2019.columns = train_columns
test_2019.columns = test_columns

In [46]:
# Write both preprocessed frames to CSV files in the current working directory.
for processed_frame, csv_name in (
    (train_2019, "220114_know2019_train.csv"),
    (test_2019, "220114_know2019_test.csv"),
):
    processed_frame.to_csv(csv_name)