# Python Final Exam

####  국민대 빅데이터경영MBA / U2016040 / 김우현

#### 데이터를 이용하여 각 사람의 income을 분류/예측하는 머신러닝 수행

### 데이터 불러오기

In [1]:
import pandas as pd
from sklearn import metrics

In [2]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,age,workclass,education,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,under50k
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,under50k
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,under50k
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,under50k
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,under50k


In [3]:
train.shape

(24999, 12)

In [4]:
train.columns

Index(['age', 'workclass', 'education', 'marital', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

#### 종속변수

In [5]:
y = train['income']
y.head()

0    under50k
1    under50k
2    under50k
3    under50k
4    under50k
Name: income, dtype: object

#### 독립변수

In [6]:
conti_var = train.columns[train.dtypes != 'object']
conti_var

Index(['age', 'education', 'capital_gain', 'capital_loss', 'hours_per_week'], dtype='object')

In [7]:
cate_var = train.columns[train.dtypes == 'object'].difference(['income'])
cate_var

Index(['marital', 'occupation', 'race', 'relationship', 'sex', 'workclass'], dtype='object')

In [8]:
# 범주형 변수를 dummy 변수로 변환

In [9]:
dummy_var = pd.get_dummies(train[cate_var])

In [10]:
X = pd.concat([train[conti_var], dummy_var], axis=1)
X.head()

Unnamed: 0,age,education,capital_gain,capital_loss,hours_per_week,marital_Divorced,marital_Married-AF-spouse,marital_Married-civ-spouse,marital_Married-spouse-absent,marital_Never-married,...,relationship_Wife,sex_Female,sex_Male,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,39,13,2174,0,40,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,50,13,0,0,13,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,38,9,0,0,40,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,53,7,0,0,40,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,28,13,0,0,40,0,0,1,0,0,...,1,1,0,0,0,1,0,0,0,0


### 데이터 분할

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [13]:
X_train.shape

(22499, 46)

In [14]:
X_test.shape

(2500, 46)

### 모형 평가 출력 함수

In [15]:
def model_performance(y_test, y_pred):    
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    print('accuracy : {}'.format(metrics.accuracy_score(y_test, y_pred).round(3)))
    print('precision : {}'.format(metrics.precision_score(y_test, y_pred, pos_label='over50k').round(3)))
    print('recall : {}'.format(metrics.recall_score(y_test, y_pred, pos_label='over50k').round(3)))
    print('F1 : {}'.format(metrics.f1_score(y_test, y_pred, pos_label='over50k').round(3)))

## 1. Logistic Regression (Lasso / Ridge)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
def run_lr_model(penalties, Clist):
    for p, c in zip(penalties, Clist):
        print('---------- penalty : {}, C : {} ----------'.format(p, c))
        model = LogisticRegression(penalty=p, C=c)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

In [18]:
# penalty : l1 = Lasso / l2 = Ridge
# Lasso (L1 regularization) : 특정 변수의 coef를 0으로 만듬. automatic feature selection.
# Ridge (L2 regularization) : coef를 조절하지만 0으로 만들지는 않음.
# C > 1 : 오차를 주로 줄임. 가능한 training set에 맞춤 / C < 1 : coef 주로 줄임

In [19]:
plist = ['l1','l1','l1','l1','l1','l2','l2','l2','l2','l2',]

In [20]:
clist = [0.001, 0.01, 0.1, 1, 100, 0.001, 0.01, 0.1, 1, 100]

In [21]:
run_lr_model(plist, clist)

---------- penalty : l1, C : 0.001 ----------
confusion matrix
[[ 167  462]
 [  55 1816]]
accuracy : 0.793
precision : 0.752
recall : 0.266
F1 : 0.392


---------- penalty : l1, C : 0.01 ----------
confusion matrix
[[ 338  291]
 [ 107 1764]]
accuracy : 0.841
precision : 0.76
recall : 0.537
F1 : 0.629


---------- penalty : l1, C : 0.1 ----------
confusion matrix
[[ 376  253]
 [ 125 1746]]
accuracy : 0.849
precision : 0.75
recall : 0.598
F1 : 0.665


---------- penalty : l1, C : 1 ----------
confusion matrix
[[ 378  251]
 [ 129 1742]]
accuracy : 0.848
precision : 0.746
recall : 0.601
F1 : 0.665


---------- penalty : l1, C : 100 ----------
confusion matrix
[[ 379  250]
 [ 129 1742]]
accuracy : 0.848
precision : 0.746
recall : 0.603
F1 : 0.667


---------- penalty : l2, C : 0.001 ----------
confusion matrix
[[ 190  439]
 [  59 1812]]
accuracy : 0.801
precision : 0.763
recall : 0.302
F1 : 0.433


---------- penalty : l2, C : 0.01 ----------
confusion matrix
[[ 338  291]
 [  99 1772]]
accu

## 2. Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [23]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)

confusion matrix
[[ 417  212]
 [ 268 1603]]
accuracy : 0.808
precision : 0.609
recall : 0.663
F1 : 0.635


## 3. Random Forest

In [25]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
def run_rf_model(n_estimators, n_jobs):
    for ne, nj in zip(n_estimators, n_jobs):
        print('---------- n_estimators : {}, n_jobs : {} ----------'.format(ne, nj))
        model = RandomForestClassifier(n_estimators=ne, n_jobs=nj)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

In [27]:
# n_estimators = number of trees

In [28]:
n_estimators = [10, 10, 10, 100, 100, 100, 1000, 1000, 1000]

In [29]:
# n_jobs = number of jobs to run in parallel for both fit and predict.

In [30]:
n_jobs = [1, 10, 100, 1, 10, 100, 1, 10, 100]

In [31]:
run_rf_model(n_estimators, n_jobs)

---------- n_estimators : 10, n_jobs : 1 ----------
confusion matrix
[[ 409  220]
 [ 197 1674]]
accuracy : 0.833
precision : 0.675
recall : 0.65
F1 : 0.662


---------- n_estimators : 10, n_jobs : 10 ----------
confusion matrix
[[ 416  213]
 [ 185 1686]]
accuracy : 0.841
precision : 0.692
recall : 0.661
F1 : 0.676


---------- n_estimators : 10, n_jobs : 100 ----------
confusion matrix
[[ 415  214]
 [ 196 1675]]
accuracy : 0.836
precision : 0.679
recall : 0.66
F1 : 0.669


---------- n_estimators : 100, n_jobs : 1 ----------
confusion matrix
[[ 394  235]
 [ 155 1716]]
accuracy : 0.844
precision : 0.718
recall : 0.626
F1 : 0.669


---------- n_estimators : 100, n_jobs : 10 ----------
confusion matrix
[[ 391  238]
 [ 157 1714]]
accuracy : 0.842
precision : 0.714
recall : 0.622
F1 : 0.664


---------- n_estimators : 100, n_jobs : 100 ----------
confusion matrix
[[ 401  228]
 [ 159 1712]]
accuracy : 0.845
precision : 0.716
recall : 0.638
F1 : 0.675


---------- n_estimators : 1000, n_jobs 

## 4. Gradient Boosting Tree

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

### loss function : ‘deviance’ = logistic regression (default)

In [33]:
def run_gbt_model(n_estimators, l_rate):
    for ne, lr in zip(n_estimators, l_rate):
        print('---------- n_estimators : {}, learning_rate : {} ----------'.format(ne, lr))
        model = GradientBoostingClassifier(n_estimators=ne, learning_rate=lr)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

In [34]:
# n_estimators = number of boosting stages to perform

In [35]:
n_estimators = [100, 100, 100, 1000, 1000, 1000]

In [36]:
l_rate = [0.1, 0.3, 0.5, 0.1, 0.3, 0.5]

In [37]:
run_gbt_model(n_estimators, l_rate)

---------- n_estimators : 100, learning_rate : 0.1 ----------
confusion matrix
[[ 379  250]
 [ 102 1769]]
accuracy : 0.859
precision : 0.788
recall : 0.603
F1 : 0.683


---------- n_estimators : 100, learning_rate : 0.3 ----------
confusion matrix
[[ 407  222]
 [ 111 1760]]
accuracy : 0.867
precision : 0.786
recall : 0.647
F1 : 0.71


---------- n_estimators : 100, learning_rate : 0.5 ----------
confusion matrix
[[ 414  215]
 [ 119 1752]]
accuracy : 0.866
precision : 0.777
recall : 0.658
F1 : 0.713


---------- n_estimators : 1000, learning_rate : 0.1 ----------
confusion matrix
[[ 420  209]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.668
F1 : 0.718


---------- n_estimators : 1000, learning_rate : 0.3 ----------
confusion matrix
[[ 423  206]
 [ 133 1738]]
accuracy : 0.864
precision : 0.761
recall : 0.672
F1 : 0.714


---------- n_estimators : 1000, learning_rate : 0.5 ----------
confusion matrix
[[ 415  214]
 [ 135 1736]]
accuracy : 0.86
precision : 0.755
recall : 0.66

### loss function : ‘exponential’ = AdaBoost algorithm.

In [38]:
def run_gbtExp_model(n_estimators, l_rate):
    for ne, lr in zip(n_estimators, l_rate):
        print('---------- n_estimators : {}, learning_rate : {} ----------'.format(ne, lr))
        model = GradientBoostingClassifier(n_estimators=ne, learning_rate=lr, loss='exponential')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

In [39]:
run_gbt_model(n_estimators, l_rate)

---------- n_estimators : 100, learning_rate : 0.1 ----------
confusion matrix
[[ 379  250]
 [ 102 1769]]
accuracy : 0.859
precision : 0.788
recall : 0.603
F1 : 0.683


---------- n_estimators : 100, learning_rate : 0.3 ----------
confusion matrix
[[ 407  222]
 [ 111 1760]]
accuracy : 0.867
precision : 0.786
recall : 0.647
F1 : 0.71


---------- n_estimators : 100, learning_rate : 0.5 ----------
confusion matrix
[[ 414  215]
 [ 119 1752]]
accuracy : 0.866
precision : 0.777
recall : 0.658
F1 : 0.713


---------- n_estimators : 1000, learning_rate : 0.1 ----------
confusion matrix
[[ 420  209]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.668
F1 : 0.718


---------- n_estimators : 1000, learning_rate : 0.3 ----------
confusion matrix
[[ 423  206]
 [ 133 1738]]
accuracy : 0.864
precision : 0.761
recall : 0.672
F1 : 0.714


---------- n_estimators : 1000, learning_rate : 0.5 ----------
confusion matrix
[[ 416  213]
 [ 135 1736]]
accuracy : 0.861
precision : 0.755
recall : 0.6

In [40]:
# feature importance

In [41]:
model = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=1000, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [42]:
varDic = {'var':X_train.columns, 'importance':model.feature_importances_}
impVar = pd.DataFrame(varDic)
impVar.sort_values(by='importance', ascending=False)[1:11]

Unnamed: 0,importance,var
4,0.136053,hours_per_week
1,0.109483,education
3,0.104032,capital_loss
2,0.089615,capital_gain
7,0.031682,marital_Married-civ-spouse
43,0.021958,workclass_Self-emp-not-inc
42,0.020802,workclass_Self-emp-inc
15,0.01836,occupation_Exec-managerial
36,0.017918,relationship_Wife
23,0.016956,occupation_Sales


## 5. SVM

In [43]:
# SVM Classification 의 경우 kernel='linear'로 설정시 22499 건의 데이터를 훈련시키는데 엄청난 시간이 걸림.

In [44]:
# 범주형 변수를 제외하고 연속형 변수만으로 Classification을 하는 경우도 시간이 많이 소요됨.

In [45]:
from sklearn.svm import SVC

In [46]:
model = SVC(C=10)

In [47]:
model.fit(X_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [48]:
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)

confusion matrix
[[ 397  232]
 [ 119 1752]]
accuracy : 0.86
precision : 0.769
recall : 0.631
F1 : 0.693


In [49]:
# Scaling

In [50]:
from sklearn.preprocessing import MinMaxScaler

In [51]:
scaler = MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [52]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [53]:
model = SVC(C=10)
model.fit(X_train_scaled, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [54]:
y_pred = model.predict(X_test_scaled)
model_performance(y_test, y_pred)

confusion matrix
[[ 369  260]
 [ 122 1749]]
accuracy : 0.847
precision : 0.752
recall : 0.587
F1 : 0.659


## 6. k-NN

In [55]:
from sklearn.neighbors import KNeighborsClassifier

In [56]:
neighbors = range(1,17,2)  # 최근접이웃 갯수.

In [57]:
def run_knn_model(n_neighbors):
    for nn in n_neighbors:
        print('---------- knn : ' + str(nn) + ' ----------')
        model = KNeighborsClassifier(n_neighbors=nn)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

In [58]:
run_knn_model(neighbors)

---------- knn : 1 ----------
confusion matrix
[[ 411  218]
 [ 245 1626]]
accuracy : 0.815
precision : 0.627
recall : 0.653
F1 : 0.64


---------- knn : 3 ----------
confusion matrix
[[ 414  215]
 [ 189 1682]]
accuracy : 0.838
precision : 0.687
recall : 0.658
F1 : 0.672


---------- knn : 5 ----------
confusion matrix
[[ 406  223]
 [ 160 1711]]
accuracy : 0.847
precision : 0.717
recall : 0.645
F1 : 0.679


---------- knn : 7 ----------
confusion matrix
[[ 392  237]
 [ 152 1719]]
accuracy : 0.844
precision : 0.721
recall : 0.623
F1 : 0.668


---------- knn : 9 ----------
confusion matrix
[[ 394  235]
 [ 155 1716]]
accuracy : 0.844
precision : 0.718
recall : 0.626
F1 : 0.669


---------- knn : 11 ----------
confusion matrix
[[ 400  229]
 [ 148 1723]]
accuracy : 0.849
precision : 0.73
recall : 0.636
F1 : 0.68


---------- knn : 13 ----------
confusion matrix
[[ 393  236]
 [ 151 1720]]
accuracy : 0.845
precision : 0.722
recall : 0.625
F1 : 0.67


---------- knn : 15 ----------
confusion ma

## 7. Neural Network

In [59]:
from sklearn.neural_network import MLPClassifier

In [60]:
# 1. adam : stochastic gradient-based optimizer

In [61]:
model = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(100,), max_iter=2000)

In [62]:
model.fit(X_train, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [63]:
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)

confusion matrix
[[ 378  251]
 [ 153 1718]]
accuracy : 0.838
precision : 0.712
recall : 0.601
F1 : 0.652


In [64]:
# 2. sgd : stochastic gradient descent

In [65]:
model = MLPClassifier(solver='sgd', activation='logistic', hidden_layer_sizes=(200,), max_iter=2000)

In [66]:
model.fit(X_train, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='sgd', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [67]:
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)

confusion matrix
[[ 275  354]
 [ 122 1749]]
accuracy : 0.81
precision : 0.693
recall : 0.437
F1 : 0.536


In [68]:
# Scaling

In [69]:
from sklearn.preprocessing import MinMaxScaler

In [70]:
scaler = MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [71]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [72]:
model = MLPClassifier(solver='adam', activation='logistic', hidden_layer_sizes=(100,), max_iter=2000)
model.fit(X_train_scaled, y_train)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [73]:
y_pred = model.predict(X_test_scaled)
model_performance(y_test, y_pred)

confusion matrix
[[ 399  230]
 [ 148 1723]]
accuracy : 0.849
precision : 0.729
recall : 0.634
F1 : 0.679


# 제출용 예측

In [74]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,age,workclass,education,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week
0,38,Private,9,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40
1,34,Local-gov,11,Divorced,Protective-serv,Own-child,Asian-Pac-Islander,Male,0,0,40
2,51,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,7298,0,50
3,48,Private,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,42
4,63,Private,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50


In [75]:
dummy_final = pd.get_dummies(test[cate_var])

In [76]:
X_final = pd.concat([test[conti_var], dummy_final], axis=1)
X_final.head()

Unnamed: 0,age,education,capital_gain,capital_loss,hours_per_week,marital_Divorced,marital_Married-AF-spouse,marital_Married-civ-spouse,marital_Married-spouse-absent,marital_Never-married,...,relationship_Wife,sex_Female,sex_Male,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,38,9,0,0,40,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
1,34,11,0,0,40,1,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,51,9,7298,0,50,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
3,48,9,0,0,42,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,63,14,0,0,50,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0


In [77]:
# 최종 모델 선택

In [78]:
# Gradient Boosting Tree. n_estimators : 1000, learning_rate : 0.1 적용

In [79]:
selected_model = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1)
selected_model.fit(X_train, y_train)
y_pred = selected_model.predict(X_test)
model_performance(y_test, y_pred)

confusion matrix
[[ 420  209]
 [ 121 1750]]
accuracy : 0.868
precision : 0.776
recall : 0.668
F1 : 0.718


In [80]:
# 예측

In [81]:
y_final = selected_model.predict(X_final)

In [82]:
y_final

array(['under50k', 'under50k', 'over50k', ..., 'under50k', 'under50k',
       'over50k'], dtype=object)

In [83]:
# 데이터 저장

In [84]:
import numpy
numpy.savetxt('final.csv', y_final, fmt='%s')