# 데이터, 라이브러리 로드

In [1]:
import pandas as pd
import numpy as np

In [58]:
data_dir = 'C:/Users/joyh1/Desktop/GitRepo/data/'
data = pd.read_csv(data_dir+'adult.csv')

In [59]:
data.shape

(32561, 15)

In [60]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


# 데이터 전처리

In [61]:
# 사용할 feature들 : age, education.num, sex, race, capital.gain, capital.loss, hours.per.week, native.country
# y값 : income

# 필요한 칼럼들만 남기기
columns = ['age', 'education.num', 'sex', 'race', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
          'income']

data = data[columns].copy()
data.shape

(32561, 9)

In [62]:
# 결측치 확인
data.isnull().sum()

age               0
education.num     0
sex               0
race              0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

## 각 Feature별 value들 확인하고 전처리

### 'age' feature

In [63]:
# age feature 값들 확인 => 그대로 연속적인 값으로 써도 괜찮을 듯!
data['age'].unique()


array([90, 82, 66, 54, 41, 34, 38, 74, 68, 45, 52, 32, 51, 46, 57, 22, 37,
       29, 61, 21, 33, 49, 23, 59, 60, 63, 53, 44, 43, 71, 48, 73, 67, 40,
       50, 42, 39, 55, 47, 31, 58, 62, 36, 72, 78, 83, 26, 70, 27, 35, 81,
       65, 25, 28, 56, 69, 20, 30, 24, 64, 75, 19, 77, 80, 18, 17, 76, 79,
       88, 84, 85, 86, 87], dtype=int64)

### 'education.num' feature

In [64]:
# education.num 값들 확인 => 그대로 사용해도 괜찮을 듯!
data['education.num'].unique()

array([ 9, 10,  4,  6, 16, 15, 13, 14,  7, 12, 11,  2,  3,  8,  5,  1],
      dtype=int64)

### 'sex' feature

In [65]:
# sex 값들 확인
data['sex'].unique()

array(['Female', 'Male'], dtype=object)

In [66]:
# Female, male 을 여자면 0, 남자면 1로 설정해주자
data.loc[data['sex'] == 'Female', 'sex'] = 0
data.loc[data['sex'] == 'Male', 'sex'] = 1
data['sex'].value_counts()

1    21790
0    10771
Name: sex, dtype: int64

### 'race' feature 

In [67]:
# race에 여러가지 값들이 있어서 우선 사용할 feature에서 유보.. 어떤 인종에다가 가중치를 두어야 하나!?
data['race'].value_counts()

White                 27816
Black                  3124
Asian-Pac-Islander     1039
Amer-Indian-Eskimo      311
Other                   271
Name: race, dtype: int64

### 'capital.gain', 'capital.loss', 'hours.per.week' feature

In [68]:
data['capital.gain'].unique()

array([    0, 99999, 41310, 34095, 27828, 25236, 25124, 22040, 20051,
       18481, 15831, 15024, 15020, 14344, 14084, 13550, 11678, 10605,
       10566, 10520,  9562,  9386,  8614,  7978,  7896,  7688,  7443,
        7430,  7298,  6849,  6767,  6723,  6514,  6497,  6418,  6360,
        6097,  5721,  5556,  5455,  5178,  5060,  5013,  4934,  4931,
        4865,  4787,  4687,  4650,  4508,  4416,  4386,  4101,  4064,
        3942,  3908,  3887,  3818,  3781,  3674,  3471,  3464,  3456,
        3432,  3418,  3411,  3325,  3273,  3137,  3103,  2993,  2977,
        2964,  2961,  2936,  2907,  2885,  2829,  2653,  2635,  2597,
        2580,  2538,  2463,  2414,  2407,  2387,  2354,  2346,  2329,
        2290,  2228,  2202,  2176,  2174,  2105,  2062,  2050,  2036,
        2009,  1848,  1831,  1797,  1639,  1506,  1471,  1455,  1424,
        1409,  1173,  1151,  1111,  1086,  1055,   991,   914,   594,
         401,   114], dtype=int64)

In [69]:
data['capital.loss'].unique()

array([4356, 3900, 3770, 3683, 3004, 2824, 2754, 2603, 2559, 2547, 2489,
       2472, 2467, 2457, 2444, 2415, 2392, 2377, 2352, 2339, 2282, 2267,
       2258, 2246, 2238, 2231, 2206, 2205, 2201, 2179, 2174, 2163, 2149,
       2129, 2080, 2057, 2051, 2042, 2002, 2001, 1980, 1977, 1974, 1944,
       1902, 1887, 1876, 1848, 1844, 1825, 1816, 1762, 1755, 1741, 1740,
       1735, 1726, 1721, 1719, 1672, 1669, 1668, 1651, 1648, 1628, 1617,
       1602, 1594, 1590, 1579, 1573, 1564, 1539, 1504, 1485, 1411, 1408,
       1380, 1340, 1258, 1138, 1092,  974,  880,  810,  653,  625,  419,
        323,  213,  155,    0], dtype=int64)

In [70]:
data['hours.per.week'].unique()

array([40, 18, 45, 20, 60, 35, 55, 76, 50, 42, 25, 32, 90, 48, 15, 70, 52,
       72, 39,  6, 65, 12, 80, 67, 99, 30, 75, 26, 36, 10, 84, 38, 62, 44,
        8, 28, 59,  5, 24, 57, 34, 37, 46, 56, 41, 98, 43, 63,  1, 47, 68,
       54,  2, 16,  9,  3,  4, 33, 23, 22, 64, 51, 19, 58, 53, 96, 66, 21,
        7, 13, 27, 11, 14, 77, 31, 78, 49, 17, 85, 87, 88, 73, 89, 97, 94,
       29, 82, 86, 91, 81, 92, 61, 74, 95], dtype=int64)

- 모두 연속적인 실수값이며 이상한 문자열이 들어가지 않았으므로 feature로 사용 괜찮음

### 'native.country' feature

In [71]:
data['native.country'].value_counts()
# 미국이 압도적으로 많으므로 미국이면 0, 다른나라면 1로 바꾸어도 무방할듯!

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
Greece                      

In [72]:
# 미국이면 0, 미국이 아니면 1로 바꾸어주기
data.loc[data['native.country'] == 'United-States', 'native.country'] = 0
data.loc[data['native.country'] != 0, 'native.country'] = 1
data['native.country'].value_counts()

0    29170
1     3391
Name: native.country, dtype: int64

### 'income' feature(y label)

In [73]:
data['income'].value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [77]:
# income $50k(5만달러)가 넘으면 1, 같거나 넘지않으면 0으로 label붙여주기
data.loc[data['income'] == '<=50K', 'income'] = 0
data.loc[data['income'] != 0, 'income'] = 1

In [78]:
data['income'].value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [79]:
# 인종 feature은 native.country와 비슷한 성격의 feature이므로 race 칼럼 제거
del data['race']
final_data = data.copy()
final_data.head()

Unnamed: 0,age,education.num,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,9,0,0,4356,40,0,0
1,82,9,0,0,4356,18,0,0
2,66,10,0,0,4356,40,0,0
3,54,4,0,0,3900,40,0,0
4,41,10,0,0,3900,40,0,0


In [80]:
# feature별 칼럼별로 dtypes 확인
final_data.dtypes

age                int64
education.num      int64
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [85]:
final_data['income'] = final_data['income'].astype(int)
final_data.dtypes

age                int64
education.num      int64
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income             int32
dtype: object

# Modeling Logistic regression

- Logisitc Regression 사용시에는 분류할 값(y값)을 ``int형``으로 바꾸어주어야 함!!

In [82]:
# KFold-validation 방법 사용
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

In [87]:
features = ['age','education.num','sex','capital.gain',
            'capital.loss','hours.per.week','native.country']

# KFold-validation에서 Fold개수 5개로 설정후 Shuffle(마구섞기)허용
kf = KFold(n_splits=5, shuffle=True)

# KFold 5번하기 떄문에 한번할 때마다 정확도 담을 리스트 할당
accrs = []
# Fold 횟수 출력
fold_idx = 1

# split train/test data 
for train_idx, test_idx in kf.split(final_data):
    print(f'Fold num : {fold_idx}')
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]
    
    # train 데이터에서 x,y값 할당
    train_x = train_d[features]
    train_y = train_d['income']
    
    # test 데이터에서 x,y값 할당
    test_x = test_d[features]
    test_y = test_d['income']
    
    # 모델 정의 
    model = LogisticRegression() #lbgfs는 최적화알고리즘
    # train 데이터로 학습시키기
    model.fit(train_x, train_y)
    
    # 모델한번 학습하고 test한 후 accuracy 측정
    mean_accr = model.score(test_x, test_y)
    # KFold 할때마다 모델 accuracy 측정해서 accrs리스트에 담기
    accrs.append(mean_accr)
    
    # Fold 횟수 한 번 끝날때마다 1씩 증가
    fold_idx += 1

# 5번 KFold한 정확도 5개값의 평균값 출력
print(np.average(accrs))

Fold num : 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold num : 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold num : 3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold num : 4


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Fold num : 5
0.8124440968752346


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# 결과값

- KFold 5번 시행한 accuracy의 평균 정확도가 81%로 성능이 좋은 편이 아님
- Feature의 차원을 줄여보자!

## PCA로 Feature dimension reduction 후 사용

In [88]:
# PCA 임포트
from sklearn.decomposition import PCA

features = ['age','education.num','sex','capital.gain',
            'capital.loss','hours.per.week','native.country']

# KFold-validation에서 Fold개수 5개로 설정후 Shuffle(마구섞기)허용
kf = KFold(n_splits=5, shuffle=True)

# KFold 5번하기 떄문에 한번할 때마다 정확도 담을 리스트 할당
accrs = []
# Fold 횟수 출력
fold_idx = 1

# split train/test data 
for train_idx, test_idx in kf.split(final_data):
    print(f'Fold num : {fold_idx}')
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]
    
    # PCA 할당, n_components : 몇개의 feature로 줄일건지
    pca = PCA(n_components=4)
    
    # train 데이터에서 x,y값 할당
        # x = feature에다가 PCA의 fit_transform 적용
    train_x = pca.fit_transform(train_d[features])
    train_y = train_d['income']
    
    # test 데이터에서 x,y값 할당
        # test의 x(feature)에는 PCA의 transform만 적용!
    test_x = pca.transform(test_d[features])
    test_y = test_d['income']
    
    # 모델 정의 
    model = LogisticRegression() #lbgfs는 최적화알고리즘
    # train 데이터로 학습시키기
    model.fit(train_x, train_y)
    
    # 모델한번 학습하고 test한 후 accuracy 측정
    mean_accr = model.score(test_x, test_y)
    # KFold 할때마다 모델 accuracy 측정해서 accrs리스트에 담기
    accrs.append(mean_accr)
    
    # Fold 횟수 한 번 끝날때마다 1씩 증가
    fold_idx += 1

# 5번 KFold한 정확도 5개값의 평균값 출력
print(np.average(accrs))

Fold num : 1
Fold num : 2
Fold num : 3
Fold num : 4
Fold num : 5
0.8002211038139182


- 오히려 평균성능이 1% 낮아짐..