대회는 총 3개의 파일 제공합니다.

train.csv : training set. 훈련용 데이터
test.csv : test set, 제출용 데이터
submission_example.csv : 제출파일 예시
데이터 세부 설명
train/test는 14개의 columns으로 구성되어 있고, train은 예측해야 하는 target 값 feature까지 1개가 추가로 있습니다. 각 데이터는 다음을 의미합니다.

id
age : 나이
workclass : 고용 형태
fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
education : 교육 수준
education_num : 교육 수준 수치
marital_status: 결혼 상태
occupation : 업종
relationship : 가족 관계
race : 인종
sex : 성별
capital_gain : 양도 소득
capital_loss : 양도 손실
hours_per_week : 주당 근무 시간
native_country : 국적
income : 수익 (예측해야 하는 값)
>50K : 1
<=50K : 0

In [1]:
import pandas as pd
import numpy as np
import os

# Random Forest Classifier
# Gradient Boosting Classifier
# XGBoost Classifier
# Support Vector Machine(SVC)
from sklearn.ensemble import VotingClassifier

In [2]:
path = 'kakr-4th-competition/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
submission = pd.read_csv(path + 'sample_submission.csv')

In [3]:
train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K


In [4]:
train.shape

(26049, 16)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26049 entries, 0 to 26048
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26049 non-null  int64 
 1   age             26049 non-null  int64 
 2   workclass       26049 non-null  object
 3   fnlwgt          26049 non-null  int64 
 4   education       26049 non-null  object
 5   education_num   26049 non-null  int64 
 6   marital_status  26049 non-null  object
 7   occupation      26049 non-null  object
 8   relationship    26049 non-null  object
 9   race            26049 non-null  object
 10  sex             26049 non-null  object
 11  capital_gain    26049 non-null  int64 
 12  capital_loss    26049 non-null  int64 
 13  hours_per_week  26049 non-null  int64 
 14  native_country  26049 non-null  object
 15  income          26049 non-null  object
dtypes: int64(7), object(9)
memory usage: 3.2+ MB


In [6]:
# for문으로 자료형이 object면 인코딩을 진행한다.

In [7]:
train['id'].dtype == 'int64'

True

## 1-1. 범주형변수 변환 (인코딩)

In [8]:
pd.factorize(train['workclass'])

(array([0, 0, 0, ..., 2, 3, 2], dtype=int64),
 Index(['Private', 'State-gov', '?', 'Self-emp-not-inc', 'Local-gov',
        'Federal-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
       dtype='object'))

In [9]:
for col in list(train.columns):
    if train[col].dtype == object:
        # pd.factorize는 해당값과 각 숫자별 의미하는 실제값이 부여됨
        train.loc[:, col] = pd.factorize(train[col])[0].reshape(-1,1)

In [10]:
train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,0,40,0,168538,0,9,0,0,0,0,0,0,0,60,0,0
1,1,17,0,101626,1,5,1,1,1,0,0,0,0,20,0,1
2,2,18,0,353358,2,10,1,2,1,0,0,0,0,16,0,1
3,3,21,0,151158,2,10,1,3,1,0,1,0,0,25,0,1
4,4,24,0,122234,2,10,1,4,2,1,1,0,0,20,1,1


## 1-2. 데이터 분리


In [11]:
y = train['income']

X = train.copy()
del X['income']
del X['id']

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [14]:
y_train

11482    1
4373     1
12160    0
2694     1
16938    1
        ..
21575    1
5390     1
860      0
15795    1
23654    1
Name: income, Length: 19536, dtype: int64

## 1-3. 데이터 스케일링

In [15]:
# 개념은 스케일러를 train에 맞춰두고 test에 그대로 진행
# 각자 진행하면 오류발생 (엄청난 영향을 준다.)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
# x_test_scaled[0]

In [16]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [17]:
model1 = SVC(kernel = 'rbf', C=1, gamma = 0.1, probability=True) #'linear'
model2 = LogisticRegression()
model3 = KNeighborsClassifier() #deafult 5
model4 = GaussianNB()
model5 = RandomForestClassifier()
model6 = AdaBoostClassifier(n_estimators=100, random_state=0)
model7 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
models = [model1, model2, model3, model4, model5, model6, model7]

In [18]:
def train_and_test(model):
    model.fit(x_train_scaled,y_train) # 훈련 세트(x_train)를 타겟값(y_train)에 대하여 학습
    prediction = model.predict(x_test_scaled) # 테스트(x_test) 데이터 세트에 대해 타겟값 예측
    
    # 선형회귀의 경우 proba가 없음
    predict_proba = model.predict_proba(x_test_scaled) # 각각의 결과값 도출
#     print(predict_proba)
    
    accuracy = round(model.score(x_train_scaled,y_train) * 100, 2) # 10번 시행시 평균 정확도
    print("정확도 : ", accuracy, "%")
    return prediction

In [24]:
train_and_test(model6)

정확도 :  86.52 %


array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [20]:
train_and_test(model1)

정확도 :  83.33 %


array([1, 1, 1, ..., 1, 1, 1], dtype=int64)