# Motivation

2019 GOMS 설문조사를 바탕사회 초년생의 이직의도에 영향을 미치는 요인 파악

# Loading Library

In [9]:
#!pip install brewer2mpl
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings; 
warnings.filterwarnings(action='ignore')

# preprocessing
import numpy as np
import pandas as pd

# warning 삭제용
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

large = 22; med = 16; small = 12
params = {'axes.titlesize': large,
          'legend.fontsize': med,
          'figure.figsize': (16, 10),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.style.use('seaborn-whitegrid')
sns.set_style("white")
%matplotlib inline

## Loading Data

In [10]:
import sys
# Add the ptdraft folder path to the sys.path list
sys.path.append('../src')
from data import *

In [11]:
# data.py
df

Unnamed: 0,g181d001,g181d006,g181e001,g181pid,g181majorcat,g181sex,g181birthy,g181age,g181graduy,g181a001,...,g181q021,g181q022,g181q023,g181p001,g181p008,g181p036,g181p046,g181p041,worker_type,regular_worker
3,,,2.0,100034,1,2,1994,25.583333,2018,2018.0,...,2,5,1,1,2,5,1,2,employed,yes
4,,,2.0,100036,1,1,1997,21.750000,2018,2018.0,...,4,3,4,1,2,4,1,2,employed,yes
5,,,2.0,100047,1,1,1994,25.250000,2018,2018.0,...,4,4,4,1,2,3,1,2,employed,yes
6,1.0,2.0,2.0,100049,1,1,1995,24.500000,2018,2018.0,...,6,4,2,1,2,4,1,2,employed,yes
7,,,2.0,100061,1,1,1993,26.000000,2018,2018.0,...,4,1,4,1,2,4,3,2,employed,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18152,,,2.0,913797,7,2,1993,25.833333,2017,2016.0,...,4,4,4,1,2,5,1,2,employed,yes
18156,,,2.0,913858,7,2,1995,24.000000,2018,2018.0,...,3,2,3,1,2,2,1,2,employed,yes
18157,,,2.0,913874,7,2,1993,25.833333,2017,2018.0,...,2,2,1,1,2,4,1,2,employed,yes
18159,,,2.0,913907,7,2,1994,25.083333,2018,2018.0,...,4,4,5,1,2,3,1,2,employed,yes


In [12]:
df['g181a297'].value_counts()

0    8262
1    2263
Name: g181a297, dtype: int64

In [175]:
#import seaborn as sns
#sns.get_dataset_names()
#df = sns.load_dataset("titanic")
#df.head()

In [13]:
# train test split
from sklearn.model_selection import train_test_split

target = "g181a297"
#target = "survived"

def split(df, target, test_size=0.3):
    
    X = df.drop(target, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split(df, target)
    

In [8]:
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder
# column tranformer
from sklearn.compose import ColumnTransformer

# pipeline
from sklearn.pipeline import Pipeline

In [101]:
feature_dict =  {
      'g181pid':'id', # id 
       'g181majorcat':'majorcat', # 전공계열
       'g181sex':'sex_cat', # 성별   
       'g181birthy':'birth_date', # 출생년
       'g181age':'age_num', # 연령
       'g181graduy':'graduy_date', #졸업년
### 사업체 관련       
        'g181a001':'year_start_date', # 현일자리 시작년
        'g181a002':'month_start_date', # 현일자리 시작월
        'g181a004_10':'ind_cat', # 일자리 산업 대분류
        'g181a010':'corp_worker_cat', # 기업체 종사자 수 # 결측값문제로 categorical로 변경
        'g181a011':'biz_worker_cat', # 사업체 종사자 수
        'g181a018':'tw_hour', # 출근시간_시간
        'g181a019':'tw_min', # 출근시간_분
        'g181a116':'workday_num', # 주당 정규 근로일
        'g181a117':'worktime_num', # 주당정규 근로시간
        'g181a118':'worktime_ex_num', # 주당초과 근로시간
        'g181a119': 'holy_work_num', # 월평균 휴일근로
        'g181a020': 'biztype_cat', # 사업체형태
       'g181a022':'regular_cat',     # 정규직 비정규직 여부
### 혜택관련
        'g181a390':'voluntary_cat',     # 일자리 형태 자발, 비자발 여부
        'g181a035':'shift_cat' ,    # 교대제 여부
        'g181a038':'pension_cat',     # 퇴직금 제공 여부
        'g181a039':'payed_vc_cat',     # 제공여부 2- 유급휴가
        'g181a043':'maternity_cat',     #  6- 육아휴직
        'g181a045':'overtime_pay_cat',     # 8- 시간 외 수당
        'g181a046':'bonus_cat',     # 9- 상여금
        'g181a048':'weekly_hl_cat',     # 11- 유급주휴
        'g181a392':'baby_vc_cat',     # 12- 산전후휴가
        'g181a120':'wage_type_cat',     # 급여 형태 구분
        'g181a122':'month_wage_num',     # 월 평균 근로소득
        'g181a126':'sat_wage_num',     # 만족도-임금
        'g181a127':'sat_stable_num',     # 만족도-고용안정성
        'g181a128':'sat_work_num',     # 만족도-직무내용
        'g181a129':'sat_env_num',     # 만족도-근무환경
        'g181a130':'sat_wt_num',     # 만족도-노동시간
        'g181a131':'sat_potential_num',     # 만족도-발전가능성
        'g181a132':'sat_relation_num',     # 만족도-인간관계
        'g181a133':'sat_welfare_num',     # 만족도-복리후생
        'g181a134':'sat_hr_num',     # 만족도-인사체계
        'g181a135':'sat_rep1_num',     # 만족도-사회적평판-일
        'g181a136':'sat_auto_num',     # 만족도-자율성 및 권한
        'g181a137':'sat_rep2_num',     # 만족도-일자리-사회적 평판
        'g181a138':'sat_fit_num',     # 만족도-적성흥미일치도
        'g181a139':'sat_edu_num',     # 만족도-직무관련 교육
        'g181a140':'sat_general_num',     # 만족도-일자리_전반적만족도
        'g181a141':'sat_work-general_num',     # 만족도-업무_전반적만족도
        'g181a142':'edu-fit_num',     # 교육수준-일수준일치정도
        'g181a143':'skill-fit_num',     # 일기술수준-본인기술수준일치정도
        'g181a144':'major-fit_num',     # 주전공일치정도
        'g181a146':'major_help_num',     # 전공지식업무도움정도
        'g181a158':'ins_1_num',     # 보험-국민연금
        'g181a159':'ins_2_num',     # 보험-특수직역연금
        'g181a160':'ins_3_num',     # 보험-건강보험
        'g181a161':'ins_4_num',     # 보험-고용보험
        'g181a162':'ins_5_num',     # 보험-산재보험
        'g181a189':'seeking_time_num',     # 구직활동경험기간-개월
        'g181a283':'adjust_difficulty_cat',     # 다른일자리제의여부
        'g181a285':'job_offer_cat',     # 적응시어려움여부
        'g181a297':'turnover_intention',     # 이직준비 여부: target
        'g181g001':'graduate_cat',     # 대학원 경험유무
        'g181l001':'train_cat',     # 취업훈련경험유무
        'g181q001':'health_num',     # 현재 견강상태
        'g181q004':'smoke_cat',     # 흡연여부
        'g181q006':'drink_num,',     # 음주빈도
        'g181q015':'lifesat_personal',     # 삶의만족도-개인적 측면
        'g181q016':'lifesat_relational',     # 삶의만족도-관계적 측면
        'g181q017':'lifesat_group',     # 삶의만족도-소속집단
        'g181q018':'emg_joy_num',     # 감정빈도-즐거운
        'g181q019':'emg_happy_num',     # 감정빈도-행복한
        'g181q020':'emg_comfort_num',     # 감정빈도-편안한
        'g181q021':'emb_irr_num',     # 감정빈도-짜증나는
        'g181q022':'emb_negative_num',     # 감정빈도-부정적인
        'g181q023':'emb_spiritless',     # 감정빈도-무기력한
        'g181p001':'marriage_cat',     # 혼인여부
        'g181p008':'child_cat',     # 부양자녀 유무
        'g181p036':'parent_asset_cat',     # 부모님 자산규모
        'g181p046':'livetype_cat',     # 거주형태
        'g181p041':'support_cat',      # 경제적 지원여부
        # subset에서 추가한 변수들
        'worker_type':'worker_type_cat',
        'regular_worker':'regular_worker_cat',
        'turnover_exp':'turnover_exp_cat',
        'work_exp':'work_exp_cat'        
}


import numpy as np

class Preprocess():
    """
    X_train, X_test
    """
    
    def __init__(self):
        """
        
        """
        print("Preprocessing Class")
        self.numeric_df = None
        self.cat_df = None
        self.ordinal_col = ['biz_worker_cat','parent_asset_cat','turnover_exp_cat']
        
        
    
    def get_dtypes(self,data):
        # onehot 할 피처
        #self.numeric_df = data.select_dtypes(include=['number'])
        #self.cat_df = data.select_dtypes(exclude=['number'])
        self.cat_col = [col for col in data.columns.tolist() if col.endswith('cat') ] 
        self.onehot_col = [col for col in self.cat_col if col not in self.ordinal_col]
        self.numeric_col = [col for col in data if col not in self.onehot_col and col not in self.ordinal_col]
        
                 
    def engineer(self,data):
        
        # id 칼럼 제거
        data = data.drop(['g181pid'], axis=1)
    
        # 이직경험 빈도 추가
        turnover_cond = [
        (data['g181d001'].isnull()==True ), # 첫 직장 전 취업한적 없음(알바포함)
        (data['g181d001']==1) & (data['g181d006']==1), # 알바 경험 있음
        (data['g181d001']==1) & (data['g181d006']==2) & (data['g181e001']==2), # 전직장 있음, 1번이직
        (data['g181e001']== 1)] # 전직장 2개 이상 있음
        choices = [0, 1, 2, 3]

        data['turnover_exp'] = np.select(turnover_cond, choices,default=3)
    
        data.drop(['g181d001','g181d006','g181e001'], axis=1, inplace=True)

        # 알바이외 근로경험 여부
        data['work_exp'] = np.where(data['turnover_exp'].isin([0,1]),0,1)
    
        # 구직활동기간 결측값 0으로 처리
    
        data['g181a189'] = data['g181a189'].fillna(0)
    
        # feature name 변경 (map)
    
        data.columns = data.columns.map(feature_dict)
    
    
        data = data.drop(data[data['month_wage_num'] == -1].index) # 급여 모르는 경우 제거
    
        data['work_year'] = 2019 - data['year_start_date']  # 근무기간
    
        data['work_time_num'] = data['tw_min'] + data['tw_hour'] # 출근소요시간
    
        # 보험 수
        insurances_col = [col for col in data if col.startswith('ins')]
        data['insurances_num'] = 0
        for col in insurances_col:
            data['temp'] = np.where(data[col] == 1, 1, 0)
            data['insurances_num'] += data['temp']
    
        # 회사 전반적 만족도
        biz_sat_col = [col for col in data if col.startswith('sat')]
        data['biz_sat'] = data[biz_sat_col].sum(axis=1)
    
        # 긍정적 감정
        pos_col = [col for col in data if col.startswith('emg') ] 
        data['pos'] = data[pos_col].sum(axis=1)
    
        # 부정적 감정
        neg_col = [col for col in data if col.startswith('neg') ]
        data['neg'] = data[neg_col].sum(axis=1)
    
        # 삶의 만족도
        lifesat_col = [col for col in data if col.startswith('lifesat')]
        data['lifesat'] = data[lifesat_col].sum(axis=1)
        
        # 혜택 수
        benefit_col  =['pension_cat',     # 퇴직금 제공 여부
            'payed_vc_cat',     # 제공여부 2- 유급휴가
            'maternity_cat',     #  6- 육아휴직
            'overtime_pay_cat',     # 8- 시간 외 수당
            'bonus_cat',     # 9- 상여금
            'weekly_hl_cat',     # 11- 유급주휴
            'baby_vc_cat'] # 출산휴가

        data['benefit_num'] = 0

        for cols in benefit_col:
            data['temp'] = np.where(data[cols]==1, 1, 0)
            data['benefit_num'] += data['temp']
        
        # 결측값 0으로 처리(구직활동기간이 0이기에)    
        data['seeking_time_num'] = data['seeking_time_num'].fillna(0)
    
        data.drop(columns='temp',axis=1,inplace=True) # temp 삭제
        data.drop(data[data.month_wage_num==-1].index,inplace=True) # 급여 모르는 경우 제거
    
        return data
    
    def drop_col(self,data,remove_col):
        
        all_cols = [self.numeric_col,self.onehot_col,self.ordinal_col]
        
        for col in all_cols:
            if remove_col in col:
                col.remove(remove_col)
            else:
                continue
        
        data.drop(columns = remove_col,axis=1,inplace=True)
        
        return data
    

    
    def base_pipline(self,data):
        
      
        """
        - categorical_feature imputation 고려
        - OneHotEncoder 
            - handle_unknown = 'ignore' 옵션 -> specifically useful if you don't know all possible categories
            - sparse = False 옵션 -> 기본값은 True. 리턴값을 sparse matrix에서 array로 변환
            
                
        ohe_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=-2)),
            ('one-hot', OneHotEncoder(use_cat_names=True, handle_unknown='ignore'))
        ])
        
        ord_pipeline = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=-2)),
            ('ordinal', OrdinalEncoder(handle_unknown='ignore'))
        ])
        
        num_pipeline = Pipeline(steps=[
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler())
        ])

        """
        #make_pipeline(StandardScaler(), GaussianNB(priors=None))
        ohe_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=-2),OneHotEncoder(use_cat_names=True, handle_unknown='ignore'))
        ord_pipeline = make_pipeline(SimpleImputer(strategy='constant', fill_value=-2),OrdinalEncoder(handle_unknown='ignore'))
        num_pipeline = make_pipeline(SimpleImputer(strategy='median'),StandardScaler())
        preprocess_pipeline = ColumnTransformer(
            transformers=[
                ('num', num_pipeline, self.numeric_col),
                ('ohe', ohe_pipeline, self.onehot_col),
                ('ord', ord_pipeline,self.ordinal_col)
            ]
        )
        
        preprocessor=make_pipeline(preprocess_pipeline)
        
        preprocessor.fit(data)
    
        return preprocessor
    
    """
    
    
    
    import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [0.2, 0.11, 1.34, 1.123]})

numeric_features = ['num1']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['brand', 'category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor',  LinearRegression())])
clf.fit(df.drop('target', 1), df['target'])

clf.named_steps['preprocessor'].transformers_[1][1]\
   .named_steps['onehot'].get_feature_names(categorical_features)
    """
    
    def ce_pipeline(self,data):
        """
        category encode를 활용한 전처리 파이프라인
        
        encoder_list = [ce.backward_difference.BackwardDifferenceEncoder, 
               ce.basen.BaseNEncoder,
               ce.binary.BinaryEncoder,
                ce.cat_boost.CatBoostEncoder,
                ce.hashing.HashingEncoder,
                ce.helmert.HelmertEncoder,
                ce.james_stein.JamesSteinEncoder,
                ce.one_hot.OneHotEncoder,
                ce.leave_one_out.LeaveOneOutEncoder,
                ce.m_estimate.MEstimateEncoder,
                ce.ordinal.OrdinalEncoder,
                ce.polynomial.PolynomialEncoder,
                ce.sum_coding.SumEncoder,
                ce.target_encoder.TargetEncoder,
                ce.woe.WOEEncoder
                ]
for encoder in encoder_list:
    
    numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('woe', encoder())])
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])
    
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=500))])
    
    model = pipe.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(encoder)
    print(f1_score(y_test, y_pred, average='macro'))
        """
        pass
        
        
class PreprocessSelector():
    """
    전처리 방식 선택
    """
    def __init__(self):
        self.data=None
        self._preprocessor=Preprocess()

    def strategy(self, data, strategy_type="strategy1"):
        self.data=data
        if strategy_type=='strategy1':
            self._strategy1()
        elif strategy_type=='strategy2':
            self._strategy2()

        return self.data

    def _base_strategy(self):
        self.data=self._preprocessor.engineer(self.data)
        self._preprocessor.get_dtypes(self.data)

    def _strategy1(self):

        self._base_strategy()
        self.data=self._preprocessor.drop_col(self.data,"corp_worker_cat")
        self.data=self._preprocessor.base_pipline(self.data)

    

    def _strategy2(self):
        """
        유사한 feature 제외
        """
        self._base_strategy()
        #self.data = self._preprocessor.drop_col(self.data,"corp_worker_cat")
        self.daa = self._preprocessor.drop_col(self.data,"")
        



In [27]:
temp_selector=PreprocessSelector()

Preprocessing Class


In [3]:
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV


from sklearn.ensemble import VotingClassifier

In [4]:
class GridSearchHelper():
    def __init__(self):
        print("GridSearchHelper Created")

        self.gridSearchCV=None
        self.clf_and_params=[]
        self._initialize_clf_and_params()

    def _initialize_clf_and_params(self):
        
        clf=LogisticRegression()
        params={'penalty':['l1', 'l2'],
                'C':np.logspace(0, 4, 10)
                }
        self.clf_and_params.append((clf, params))

        clf = SVC()
        params = [ {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
        self.clf_and_params.append((clf, params))

        clf=DecisionTreeClassifier()
        params={'max_features': ['auto', 'sqrt', 'log2'],
          'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15],
          'min_samples_leaf':[1],
          'random_state':[123]}
        #Because of depricating warning for Decision Tree which is not appended.
        #But it give high competion accuracy score. You can append when you run the kernel
        self.clf_and_params.append((clf,params))

        clf = RandomForestClassifier()
        params = {'n_estimators': [4, 6, 9],
              'max_features': ['log2', 'sqrt','auto'],
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }
        #Because of depricating warning for RandomForestClassifier which is not appended.
        #But it give high competion accuracy score. You can append when you run the kernel
        self.clf_and_params.append((clf, params))

    def fit_predict_save(self, X_train, X_test, y_train,strategy_type):
        self.X_train=X_train
        self.X_test=X_test
        self.y_train=y_train
        self.strategy_type=strategy_type

        clf_and_params = self.get_clf_and_params()
        models=[]
        self.results={}
        for clf, params in clf_and_params:
            self.current_clf_name = clf.__class__.__name__
            grid_search_clf = GridSearchCV(clf, params, cv=5)
            grid_search_clf.fit(self.X_train, self.y_train)
            self.Y_pred = grid_search_clf.predict(self.X_test)
            clf_train_acc = round(grid_search_clf.score(self.X_train, self.y_train) * 100, 2)
            print(self.current_clf_name, " trained and used for prediction on test data...")
            self.results[self.current_clf_name]=clf_train_acc
            # for ensemble
            models.append(clf)

            self.save_result()
            print()
        
        
    def show_result(self):
        for clf_name, train_acc in self.results.items():
                  print("{} train accuracy is {:.3f}".format(clf_name, train_acc))
        
    def save_result(self):
        id_idx = list(range(1, len(self.Y_pred) + 1))
        Submission = pd.DataFrame({'Id': id_idx,
                                    'Survived': self.Y_pred})
        file_name="{}_{}.csv".format(self.strategy_type,self.current_clf_name.lower())
        Submission.to_csv(file_name, index=False)

        print("Submission saved file name: ",file_name)

    def get_clf_and_params(self):

        return self.clf_and_params

    def add(self,clf, params):
        self.clf_and_params.append((clf, params))
        
    

In [15]:
temp_grid_1 = GridSearchHelper()
temp_grid_1.get_clf_and_params()

GridSearchHelper Created


[(LogisticRegression(),
  {'penalty': ['l1', 'l2'],
   'C': array([1.00000000e+00, 2.78255940e+00, 7.74263683e+00, 2.15443469e+01,
          5.99484250e+01, 1.66810054e+02, 4.64158883e+02, 1.29154967e+03,
          3.59381366e+03, 1.00000000e+04])}),
 (SVC(),
  [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
   {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]),
 (DecisionTreeClassifier(),
  {'max_features': ['auto', 'sqrt', 'log2'],
   'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
   'min_samples_leaf': [1],
   'random_state': [123]}),
 (RandomForestClassifier(),
  {'n_estimators': [4, 6, 9],
   'max_features': ['log2', 'sqrt', 'auto'],
   'criterion': ['entropy', 'gini'],
   'max_depth': [2, 3, 5, 10],
   'min_samples_split': [2, 3, 5],
   'min_samples_leaf': [1, 5, 8]})]

In [86]:
from sklearn.utils.class_weight import compute_class_weight

class MyGridSearcher():
    
    def __init__(self):
        self.clf = None
        self.params = None
        self.grid_clf = None
    
    def _get_class_weight(self,y_train):
        self.y_train = y_train  
        classes = np.unique(self.y_train)
        weights = compute_class_weight('balanced', classes = classes, y=self.y_train)
        self.class_weight = dict(zip(classes, weights))
        
        
    def get_clf_and_params(self):
        """
        Voting classifier 에 넣을 classifier 들을 리스트로 반환
        init_prams 활용
        """
        
    def init_params(self,clf, params,cv = 3,scoring = 'accuracy'):
        self.clf = clf
        self.params = params
        self.cv =3 
        self.grid_clf = GridSearchCV(clf, params, cv=cv,scoring=scoring,n_jobs=-1,verbose=1)
    
    
    def fit(self, X_train, X_test, y_train):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train        
        
        self.grid_clf.fit(self.X_train, self.y_train)
        
    def get_result_df(self):
        """
        cv_result df
        """
        
    def predict(self, X_test):
        return self.grid_clf.predict_proba(self.X_test)
        

        

In [87]:
#RandomForestClassifier(class_weight=class_weights)

mygrid = MyGridSearcher()

param_dists ={
    'criterion' : ['entropy', 'gini'],
    'n_estimators' : [110, 150, 200],
    'max_depth':  [10],
    'min_samples_leaf' : [1,2,4],
    'max_features': ['auto', 'sqrt', 'log2']
}

#'criterion': ['gini', 'entropy'],
#'max_features': ['auto', 'sqrt', 'log2']



In [102]:
X_p_new = PreprocessSelector().strategy(X,strategy_type="strategy1")

Preprocessing Class


In [120]:
"""
   .named_steps['onehot'].get_feature_names(categorical_features)

"""

X_p_new[0].transformers_[0][2]

['birth_date',
 'age_num',
 'graduy_date',
 'year_start_date',
 'month_start_date',
 'tw_hour',
 'tw_min',
 'workday_num',
 'worktime_num',
 'worktime_ex_num',
 'holy_work_num',
 'month_wage_num',
 'sat_wage_num',
 'sat_stable_num',
 'sat_work_num',
 'sat_env_num',
 'sat_wt_num',
 'sat_potential_num',
 'sat_relation_num',
 'sat_welfare_num',
 'sat_hr_num',
 'sat_rep1_num',
 'sat_auto_num',
 'sat_rep2_num',
 'sat_fit_num',
 'sat_edu_num',
 'sat_general_num',
 'sat_work-general_num',
 'edu-fit_num',
 'skill-fit_num',
 'major-fit_num',
 'major_help_num',
 'ins_1_num',
 'ins_2_num',
 'ins_3_num',
 'ins_4_num',
 'ins_5_num',
 'seeking_time_num',
 'health_num',
 'drink_num,',
 'lifesat_personal',
 'lifesat_relational',
 'lifesat_group',
 'emg_joy_num',
 'emg_happy_num',
 'emg_comfort_num',
 'emb_irr_num',
 'emb_negative_num',
 'emb_spiritless',
 'work_year',
 'work_time_num',
 'insurances_num',
 'biz_sat',
 'pos',
 'neg',
 'lifesat',
 'benefit_num']

In [124]:
X_p_new[0].transformers_[1][1]


Pipeline(steps=[('imputer', SimpleImputer(fill_value=-2, strategy='constant')),
                ('one-hot',
                 OneHotEncoder(cols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                     13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26],
                               handle_unknown='ignore', use_cat_names=True))])

In [None]:
# 각 하이퍼파라미터 조합으로 만들어진 모델들을 순위별로 나열해 봅니다.
# rank_test_score: 테스트 순위
# mean_score_time: 예측에 걸리는 시간
pd.DataFrame(clf.cv_results_).sort_values(by='rank_test_score').T

In [97]:
num_train = X_train.shape[0]

X = pd.concat([X_train, X_test], axis=0)
feature_col = X.columns

X_p = PreprocessSelector().strategy(X,strategy_type="strategy1")

X_train_p = X_p[:num_train]
X_test_p = X_p[num_train:]

Preprocessing Class


In [89]:
mygrid._get_class_weight(y_train)

print(mygrid.class_weight)

{0: 0.6348672871423647, 1: 2.3536741214057506}


In [90]:
# grid

mygrid.init_params(RandomForestClassifier(class_weight=mygrid.class_weight), param_dists,cv=3,scoring='accuracy')


mygrid.fit(X_train_p, X_test_p, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


In [91]:
y_pred = mygrid.predict(X_test_p)

In [92]:
y_pred[:10]

array([[0.44281417, 0.55718583],
       [0.40801169, 0.59198831],
       [0.29898196, 0.70101804],
       [0.78098981, 0.21901019],
       [0.36134781, 0.63865219],
       [0.62092994, 0.37907006],
       [0.54130788, 0.45869212],
       [0.43095219, 0.56904781],
       [0.85523216, 0.14476784],
       [0.81782742, 0.18217258]])

In [60]:
class_wise_weights = mygrid.get_class_weight(y_train)

In [None]:
print(X_train_p.shape)
print(X_test_p.shape)
print(X_train.shape)
print(X_test.shape)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split

from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV


parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}


clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=StratifiedKFold(train['QuoteConversion_Flag'], n_folds=5, shuffle=True), 
                   scoring='roc_auc',
                   verbose=2, refit=True)


In [28]:
from time import time


def timer_func(func):
    # 실행시간 측정용
    # 실행시간을 측정하기 위해 함수를 실행하고 시작 시간과 종료 시간을 저장한다.
    def wrap_func(*args, **kwargs):
        t1 = time()
        result = func(*args, **kwargs)
        t2 = time()
        print(f'Function {func.__name__!r} executed in {(t2-t1):.4f}s')
        return result
    return wrap_func

In [None]:
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [35]:
param_dists ={
    'rf__criterion' : ['entropy'],
    'rf__n_estimators' : [110, 130],
    'rf__max_depth':  [10],
    'rf__min_samples_leaf' : [1,2,4]
}
grcv = GridSearchCV(
    pipe, 
    param_grid=param_dists, 
    cv=5, 
    scoring='f1',  
    verbose=1,
    n_jobs=-1
)

NameError: name 'pipe' is not defined

In [62]:
test_index=X_test.index.to_list()

In [46]:
X_train_p.shape

(7367, 160)

- feature selection과 classification method의 조합
- hyperparameter tuning은 greed search cv
- permutation importance의 경우 변수간 상관관계를 고려할 필요요가 있음
  
- feauture의 수를 줄일 것
- 다중공선성 검정
