In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

In [2]:
data = pd.read_csv("./data/salary2.csv")
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
data2 = data.copy()

# ydata로 EDA하기

In [4]:
# !pip install ydata-profiling
!pip install ipywidgets



In [5]:
from ydata_profiling import ProfileReport

In [6]:
profile = ProfileReport(data, title="Profiling Report") # 분석리포트 만들기
profile.to_notebook_iframe() # 주피터노트북 출력창에 결과 출력
profile.to_file("salary데이터EDA.html") # 분석결과를 html 파일로 출력

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/14 [00:00<?, ?it/s][A
 14%|███████████▊                                                                       | 2/14 [00:01<00:08,  1.36it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  8.03it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

중복행 제거, 결측값 제거

In [7]:
data = data.drop_duplicates()

In [8]:
data = data.dropna()

In [9]:
data = data.reset_index(drop=True)

In [10]:
data

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39235,53,Private,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
39236,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
39237,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
39238,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [11]:
data['class'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [12]:
data['class'] = data['class'].apply(lambda x: 1 if x == ' >50K' else 0)
data['class'].value_counts()

class
0    29265
1     9975
Name: count, dtype: int64

# 중요 컬럼 선택하기
1) EDA를 통해서 종속변수(target)과 중요한 관계가 있는 변수들만 선택
2) 수치형 변수는 상관분석 결과를 통해, 범주형 변수는 카이제곱 통계량(통계학적 방법)
3) 머신러닝 알고리즘을 통한 1차 분석 후 중요하게 사용된 변수만 선택

# tree 계열 모델의 feature_importance로 선택하기

In [13]:
X = data.drop('class', axis=1)
y = data['class']

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=10, stratify=y)

In [16]:
from sklearn.preprocessing import OneHotEncoder

In [17]:
def ohe_transform(X_train, X_test):
    train_cat_cols = X_train.select_dtypes(include='object')
    train_num_cols = X_train.select_dtypes(exclude='object')
    test_cat_cols = X_test.select_dtypes(include='object')
    test_num_cols = X_test.select_dtypes(exclude='object')
    
    ohe = OneHotEncoder(drop = 'first', sparse_output=False, handle_unknown='ignore')
    ohe.fit(train_cat_cols)
    train_temp = ohe.transform(train_cat_cols)
    test_temp = ohe.transform(test_cat_cols)
    
    ohe_cols_name = ohe.get_feature_names_out(train_cat_cols.columns)
    
    train_temp_df = pd.DataFrame(train_temp, index=train_cat_cols.index, columns=ohe_cols_name)
    test_temp_df = pd.DataFrame(test_temp, index=test_cat_cols.index, columns=ohe_cols_name)
    
    train_result = pd.concat([train_temp_df, train_num_cols], axis=1)
    test_result = pd.concat([test_temp_df, test_num_cols], axis=1)
    return train_result, test_result

In [18]:
X_train, X_test = ohe_transform(X_train, X_test)
X_test

Unnamed: 0,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 11th,education_ 12th,education_ 1st-4th,education_ 5th-6th,...,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia,age,education-num,capital-gain,capital-loss,hours-per-week
22149,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,26,9,0,0,40
21899,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,33,16,0,0,40
12634,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,41,10,0,0,50
1171,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,42,10,0,0,70
4839,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,24,5,0,0,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25349,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,50,15,0,0,40
34503,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,64,16,0,0,80
15058,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,35,9,0,0,50
28083,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,27,6,0,0,60


# 의사결정나무로 1회 분석 후 컬럼 선별하기

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [20]:
dtc = DecisionTreeClassifier(max_depth=4, random_state=10)
dtc.fit(X_train, y_train)
pred = dtc.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89     11706
           1       0.75      0.51      0.61      3990

    accuracy                           0.83     15696
   macro avg       0.80      0.73      0.75     15696
weighted avg       0.83      0.83      0.82     15696



In [21]:
dtc.feature_importances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 4.88789042e-01, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [22]:
dtc.feature_names_in_

array(['workclass_ Local-gov', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay',
       'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
       'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
       'education_ Assoc-acdm', 'education_ Assoc-voc',
       'education_ Bachelors', 'education_ Doctorate',
       'education_ HS-grad', 'education_ Masters', 'education_ Preschool',
       'education_ Prof-school', 'education_ Some-college',
       'marital-status_ Married-AF-spouse',
       'marital-status_ Married-civ-spouse',
       'marital-status_ Married-spouse-absent',
       'marital-status_ Never-married', 'marital-status_ Separated',
       'marital-status_ Widowed', 'occupation_ Armed-Forces',
       'occupation_ Craft-repair', 'occupation_ Exec-managerial',
       'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners',
       'occupation_ Machine-op-inspct'

In [23]:
feature_importances = pd.DataFrame(dtc.feature_importances_, index=dtc.feature_names_in_, columns=['importances'])

In [24]:
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
feature_importances

Unnamed: 0,importances
marital-status_ Married-civ-spouse,0.488789
capital-gain,0.252757
education-num,0.224867
capital-loss,0.024827
hours-per-week,0.008297
...,...
native-country_ South,0.000000
native-country_ Scotland,0.000000
native-country_ Yugoslavia,0.000000
native-country_ United-States,0.000000


In [25]:
feature_importances[feature_importances['importances'] > 0]

Unnamed: 0,importances
marital-status_ Married-civ-spouse,0.488789
capital-gain,0.252757
education-num,0.224867
capital-loss,0.024827
hours-per-week,0.008297
age,0.000462


In [26]:
dtc_feature_importances = feature_importances[feature_importances['importances'] > 0].index
dtc_feature_importances

Index(['marital-status_ Married-civ-spouse', 'capital-gain', 'education-num',
       'capital-loss', 'hours-per-week', 'age'],
      dtype='object')

# 랜덤 포레스트로 분석하기

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [28]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=4, n_jobs=10, random_state=10)
rfc.fit(X_train, y_train)
pred = rfc.predict(X_test)
print(classification_report(y_test, pred))

feature_importances = pd.DataFrame(rfc.feature_importances_, index=rfc.feature_names_in_, columns=['importances'])
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
feature_importances[feature_importances['importances'] > 0.01]

              precision    recall  f1-score   support

           0       0.80      0.99      0.89     11706
           1       0.92      0.28      0.43      3990

    accuracy                           0.81     15696
   macro avg       0.86      0.64      0.66     15696
weighted avg       0.83      0.81      0.77     15696



Unnamed: 0,importances
marital-status_ Married-civ-spouse,0.217282
capital-gain,0.164975
education-num,0.109109
marital-status_ Never-married,0.087167
age,0.06594
hours-per-week,0.042754
relationship_ Not-in-family,0.042214
relationship_ Own-child,0.041527
occupation_ Exec-managerial,0.033154
sex_ Male,0.033119


In [29]:
rfc_feature_importances = feature_importances[feature_importances['importances'] > 0.01].index
rfc_feature_importances

Index(['marital-status_ Married-civ-spouse', 'capital-gain', 'education-num',
       'marital-status_ Never-married', 'age', 'hours-per-week',
       'relationship_ Not-in-family', 'relationship_ Own-child',
       'occupation_ Exec-managerial', 'sex_ Male', 'capital-loss',
       'occupation_ Prof-specialty', 'education_ Bachelors',
       'relationship_ Unmarried', 'education_ Masters',
       'education_ Prof-school', 'occupation_ Other-service'],
      dtype='object')

# XGBoost로 컬럼 중요도 출력

In [30]:
!pip install xgboost lightgbm catboost



In [31]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [32]:
xgb = RandomForestClassifier(n_estimators=200, max_depth=4, n_jobs=10, random_state=10)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(classification_report(y_test, pred))

feature_importances = pd.DataFrame(xgb.feature_importances_, index=xgb.feature_names_in_, columns=['importances'])
feature_importances = feature_importances.sort_values(by='importances', ascending=False)
feature_importances[feature_importances['importances'] > 0.01]

              precision    recall  f1-score   support

           0       0.80      0.99      0.89     11706
           1       0.92      0.28      0.43      3990

    accuracy                           0.81     15696
   macro avg       0.86      0.64      0.66     15696
weighted avg       0.83      0.81      0.77     15696



Unnamed: 0,importances
marital-status_ Married-civ-spouse,0.217282
capital-gain,0.164975
education-num,0.109109
marital-status_ Never-married,0.087167
age,0.06594
hours-per-week,0.042754
relationship_ Not-in-family,0.042214
relationship_ Own-child,0.041527
occupation_ Exec-managerial,0.033154
sex_ Male,0.033119


In [33]:
xgb_feature_importances = feature_importances[feature_importances['importances'] > 0.01].index
xgb_feature_importances

Index(['marital-status_ Married-civ-spouse', 'capital-gain', 'education-num',
       'marital-status_ Never-married', 'age', 'hours-per-week',
       'relationship_ Not-in-family', 'relationship_ Own-child',
       'occupation_ Exec-managerial', 'sex_ Male', 'capital-loss',
       'occupation_ Prof-specialty', 'education_ Bachelors',
       'relationship_ Unmarried', 'education_ Masters',
       'education_ Prof-school', 'occupation_ Other-service'],
      dtype='object')

In [34]:
important_cols = list(dtc_feature_importances) + list(rfc_feature_importances) + list(xgb_feature_importances)
important_cols

['marital-status_ Married-civ-spouse',
 'capital-gain',
 'education-num',
 'capital-loss',
 'hours-per-week',
 'age',
 'marital-status_ Married-civ-spouse',
 'capital-gain',
 'education-num',
 'marital-status_ Never-married',
 'age',
 'hours-per-week',
 'relationship_ Not-in-family',
 'relationship_ Own-child',
 'occupation_ Exec-managerial',
 'sex_ Male',
 'capital-loss',
 'occupation_ Prof-specialty',
 'education_ Bachelors',
 'relationship_ Unmarried',
 'education_ Masters',
 'education_ Prof-school',
 'occupation_ Other-service',
 'marital-status_ Married-civ-spouse',
 'capital-gain',
 'education-num',
 'marital-status_ Never-married',
 'age',
 'hours-per-week',
 'relationship_ Not-in-family',
 'relationship_ Own-child',
 'occupation_ Exec-managerial',
 'sex_ Male',
 'capital-loss',
 'occupation_ Prof-specialty',
 'education_ Bachelors',
 'relationship_ Unmarried',
 'education_ Masters',
 'education_ Prof-school',
 'occupation_ Other-service']

In [35]:
important_cols = set(important_cols)
important_cols

{'age',
 'capital-gain',
 'capital-loss',
 'education-num',
 'education_ Bachelors',
 'education_ Masters',
 'education_ Prof-school',
 'hours-per-week',
 'marital-status_ Married-civ-spouse',
 'marital-status_ Never-married',
 'occupation_ Exec-managerial',
 'occupation_ Other-service',
 'occupation_ Prof-specialty',
 'relationship_ Not-in-family',
 'relationship_ Own-child',
 'relationship_ Unmarried',
 'sex_ Male'}

In [36]:
important_cols = ['age', 'capital-gain', 'capital-loss', 'education-num',
                  'hours-per-week', 'marital-status', 'native-country', 
                  'occupation', 'sex']

In [37]:
len(important_cols)

9

In [38]:
final_df = data2[important_cols]
final_df

Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,marital-status,native-country,occupation,sex
0,25,0,0,7,40,Never-married,United-States,Machine-op-inspct,Male
1,38,0,0,9,50,Married-civ-spouse,United-States,Farming-fishing,Male
2,28,0,0,12,40,Married-civ-spouse,United-States,Protective-serv,Male
3,44,7688,0,10,40,Married-civ-spouse,United-States,Machine-op-inspct,Male
4,18,0,0,10,30,Never-married,United-States,,Female
...,...,...,...,...,...,...,...,...,...
48837,27,0,0,12,38,Married-civ-spouse,United-States,Tech-support,Female
48838,40,0,0,9,40,Married-civ-spouse,United-States,Machine-op-inspct,Male
48839,58,0,0,9,40,Widowed,United-States,Adm-clerical,Female
48840,22,0,0,9,20,Never-married,United-States,Adm-clerical,Male


In [39]:
X2 = data[important_cols]
y2 = data['class']

In [40]:
X2

Unnamed: 0,age,capital-gain,capital-loss,education-num,hours-per-week,marital-status,native-country,occupation,sex
0,25,0,0,7,40,Never-married,United-States,Machine-op-inspct,Male
1,38,0,0,9,50,Married-civ-spouse,United-States,Farming-fishing,Male
2,28,0,0,12,40,Married-civ-spouse,United-States,Protective-serv,Male
3,44,7688,0,10,40,Married-civ-spouse,United-States,Machine-op-inspct,Male
4,34,0,0,6,30,Never-married,United-States,Other-service,Male
...,...,...,...,...,...,...,...,...,...
39235,53,0,0,14,40,Married-civ-spouse,United-States,Exec-managerial,Male
39236,22,0,0,10,40,Never-married,United-States,Protective-serv,Male
39237,27,0,0,12,38,Married-civ-spouse,United-States,Tech-support,Female
39238,58,0,0,9,40,Widowed,United-States,Adm-clerical,Female


In [41]:
y2

0        0
1        0
2        1
3        1
4        0
        ..
39235    1
39236    0
39237    0
39238    0
39239    1
Name: class, Length: 39240, dtype: int64

In [42]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.4, random_state=10, stratify=y)

In [43]:
X2_train, X2_test = ohe_transform(X2_train, X2_test)
X2_test

Unnamed: 0,marital-status_ Married-AF-spouse,marital-status_ Married-civ-spouse,marital-status_ Married-spouse-absent,marital-status_ Never-married,marital-status_ Separated,marital-status_ Widowed,native-country_ Canada,native-country_ China,native-country_ Columbia,native-country_ Cuba,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,sex_ Male,age,capital-gain,capital-loss,education-num,hours-per-week
22149,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,26,0,0,9,40
21899,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,33,0,0,16,40
12634,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,41,0,0,10,50
1171,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,42,0,0,10,70
4839,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,24,0,0,5,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25349,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,50,0,0,15,40
34503,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,64,0,0,16,80
15058,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,35,0,0,9,50
28083,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,27,0,0,6,60


In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [45]:
dtc = DecisionTreeClassifier(max_depth=4, random_state=10)
dtc.fit(X2_train, y2_train)
pred2 = dtc.predict(X2_test)
print(classification_report(y2_test, pred2))

              precision    recall  f1-score   support

           0       0.85      0.94      0.89     11706
           1       0.75      0.51      0.61      3990

    accuracy                           0.83     15696
   macro avg       0.80      0.73      0.75     15696
weighted avg       0.83      0.83      0.82     15696

