# Python Final Exam

#### 데이터를 이용하여 각 사람의 income을 분류/예측하는 머신러닝 수행

### 데이터 불러오기

In [1]:
import pandas as pd
from sklearn import metrics

In [2]:
# Load the training data; `income` is the label to be classified.
train = pd.read_csv('train.csv')
train.head()  # preview the first rows

Unnamed: 0,age,workclass,education,marital,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,under50k
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,under50k
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,under50k
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,under50k
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,under50k


In [3]:
train.shape

(24999, 12)

In [4]:
train.columns

Index(['age', 'workclass', 'education', 'marital', 'occupation',
       'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
       'hours_per_week', 'income'],
      dtype='object')

#### 종속변수

In [5]:
# Target (dependent variable): the `income` class label of every row.
y = train['income']
y.head()

0    under50k
1    under50k
2    under50k
3    under50k
4    under50k
Name: income, dtype: object

#### 독립변수

In [6]:
# Continuous features: every column whose dtype is not `object`.
conti_var = train.columns[train.dtypes != 'object']
conti_var

Index(['age', 'education', 'capital_gain', 'capital_loss', 'hours_per_week'], dtype='object')

In [7]:
# Categorical features: the object-dtype columns minus the target `income`.
# Index.difference returns the names sorted, hence the alphabetical order.
cate_var = train.columns[train.dtypes == 'object'].difference(['income'])
cate_var

Index(['marital', 'occupation', 'race', 'relationship', 'sex', 'workclass'], dtype='object')

In [8]:
# 범주형 변수를 dummy 변수로 변환

In [9]:
# One-hot encode the categorical columns into `<column>_<category>` indicators.
dummy_var = pd.get_dummies(train[cate_var])

In [10]:
# Feature matrix: continuous columns side by side with the dummy columns.
X = pd.concat([train[conti_var], dummy_var], axis=1)
X.head()

Unnamed: 0,age,education,capital_gain,capital_loss,hours_per_week,marital_Divorced,marital_Married-AF-spouse,marital_Married-civ-spouse,marital_Married-spouse-absent,marital_Never-married,...,relationship_Wife,sex_Female,sex_Male,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,39,13,2174,0,40,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
1,50,13,0,0,13,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
2,38,9,0,0,40,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,53,7,0,0,40,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,28,13,0,0,40,0,0,1,0,0,...,1,1,0,0,0,1,0,0,0,0


### 데이터 분할

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 10% of the rows for evaluation; random_state is fixed so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [13]:
X_train.shape

(22499, 46)

In [14]:
X_test.shape

(2500, 46)

### 모형 평가 출력 함수

In [15]:
def model_performance(y_test, y_pred, pos_label='over50k'):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels, aligned with ``y_test``.
        pos_label: Label treated as the positive class when computing
            precision, recall and F1. Defaults to ``'over50k'``.

    Returns:
        None. The results are printed, each score rounded to 3 decimals.
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))

    # Built-in round() is used instead of the `.round()` method so that this
    # does not depend on the score being a NumPy scalar; a plain Python float
    # has no `.round()` attribute.
    accuracy = metrics.accuracy_score(y_test, y_pred)
    precision = metrics.precision_score(y_test, y_pred, pos_label=pos_label)
    recall = metrics.recall_score(y_test, y_pred, pos_label=pos_label)
    f1 = metrics.f1_score(y_test, y_pred, pos_label=pos_label)

    print('accuracy : {}'.format(round(accuracy, 3)))
    print('precision : {}'.format(round(precision, 3)))
    print('recall : {}'.format(round(recall, 3)))
    print('F1 : {}'.format(round(f1, 3)))

## 1. Logistic Regression (Lasso / Ridge)

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
def run_lr_model(penalties, Clist):
    """Fit and evaluate one logistic regression per (penalty, C) pair.

    The two sequences are walked in lockstep with ``zip``, so the i-th
    penalty is combined with the i-th C value and any surplus entries of
    the longer sequence are ignored. Each model is trained on the
    module-level ``X_train``/``y_train`` and scored on ``X_test``/``y_test``
    through ``model_performance``.

    Args:
        penalties: Iterable of penalty names (``'l1'`` or ``'l2'``).
        Clist: Iterable of ``C`` values, one per penalty.
    """
    for p, c in zip(penalties, Clist):
        print('---------- penalty : {}, C : {} ----------'.format(p, c))
        # The solver is pinned because the library's current default
        # ('lbfgs') rejects penalty='l1'; 'liblinear' accepts both
        # 'l1' and 'l2'.
        model = LogisticRegression(penalty=p, C=c, solver='liblinear')
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        model_performance(y_test, y_pred)
        print('\n')

penalty : l1 = Lasso / l2 = Ridge

Lasso (L1 regularization) : 특정 변수의 coef를 0으로 만듦. automatic feature selection.

Ridge (L2 regularization) : coef를 조절하지만 0으로 만들지는 않음.

C는 규제 강도의 역수. C > 1 : 규제가 약해져 오차를 주로 줄임. 가능한 training set에 맞춤 / C < 1 : 규제가 강해져 coef를 주로 줄임

In [18]:
# Penalty for each run: five Lasso ('l1') fits followed by five Ridge ('l2') fits.
plist = ['l1'] * 5 + ['l2'] * 5

In [19]:
# C values tried for each penalty; the same grid is repeated for 'l1' and 'l2'.
clist = [0.001, 0.01, 0.1, 1, 100] * 2

In [20]:
# Evaluate every (penalty, C) combination listed in plist / clist.
run_lr_model(plist, clist)

---------- penalty : l1, C : 0.001 ----------
confusion matrix
[[ 168  461]
 [  55 1816]]
accuracy : 0.794
precision : 0.753
recall : 0.267
F1 : 0.394


---------- penalty : l1, C : 0.01 ----------
confusion matrix
[[ 337  292]
 [ 106 1765]]
accuracy : 0.841
precision : 0.761
recall : 0.536
F1 : 0.629


---------- penalty : l1, C : 0.1 ----------
confusion matrix
[[ 375  254]
 [ 124 1747]]
accuracy : 0.849
precision : 0.752
recall : 0.596
F1 : 0.665


---------- penalty : l1, C : 1 ----------
confusion matrix
[[ 378  251]
 [ 128 1743]]
accuracy : 0.848
precision : 0.747
recall : 0.601
F1 : 0.666


---------- penalty : l1, C : 100 ----------
confusion matrix
[[ 379  250]
 [ 129 1742]]
accuracy : 0.848
precision : 0.746
recall : 0.603
F1 : 0.667


---------- penalty : l2, C : 0.001 ----------
confusion matrix
[[ 190  439]
 [  59 1812]]
accuracy : 0.801
precision : 0.763
recall : 0.302
F1 : 0.433


---------- penalty : l2, C : 0.01 ----------
confusion matrix
[[ 338  291]
 [  99 1772]]
ac

## 2. SVM

In [16]:
from sklearn.svm import SVC

In [18]:
# Support vector classifier with a linear kernel and C=1.
# NOTE(review): the cell that fits this model has no execution count, so it
# may never have finished; the features are not scaled before fitting, which
# presumably makes the fit slow -- TODO confirm and consider scaling the data.
model = SVC(kernel='linear', C=1)

In [None]:
# Fit the SVM on the training split, predict the held-out split and print
# the evaluation metrics.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
model_performance(y_test, y_pred)

## k-NN

In [106]:
from sklearn.neighbors import KNeighborsClassifier

In [107]:
neighbors = range(1, 17, 2)  # candidate numbers of nearest neighbours (odd k: 1, 3, ..., 15)

In [108]:
def run_knn_model(n_neighbors):
    """Fit and evaluate a k-NN classifier for each neighbour count given.

    Every classifier is trained on the module-level ``X_train``/``y_train``
    and its predictions for ``X_test`` are reported via
    ``model_performance``.

    Args:
        n_neighbors: Iterable of values passed as ``n_neighbors``.
    """
    for k in n_neighbors:
        print('---------- knn : {} ----------'.format(k))
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_test)
        model_performance(y_test, predictions)
        print('\n')

In [121]:
# Evaluate k-NN for every neighbour count in `neighbors`.
run_knn_model(neighbors)

---------- knn : 1 ----------
confusion matrix
[[ 411  218]
 [ 245 1626]]
accuracy : 0.815
precision : 0.627
recall : 0.653
F1 : 0.64


---------- knn : 3 ----------
confusion matrix
[[ 414  215]
 [ 189 1682]]
accuracy : 0.838
precision : 0.687
recall : 0.658
F1 : 0.672


---------- knn : 5 ----------
confusion matrix
[[ 406  223]
 [ 160 1711]]
accuracy : 0.847
precision : 0.717
recall : 0.645
F1 : 0.679


---------- knn : 7 ----------
confusion matrix
[[ 392  237]
 [ 152 1719]]
accuracy : 0.844
precision : 0.721
recall : 0.623
F1 : 0.668


---------- knn : 9 ----------
confusion matrix
[[ 394  235]
 [ 155 1716]]
accuracy : 0.844
precision : 0.718
recall : 0.626
F1 : 0.669


---------- knn : 11 ----------
confusion matrix
[[ 400  229]
 [ 148 1723]]
accuracy : 0.849
precision : 0.73
recall : 0.636
F1 : 0.68


---------- knn : 13 ----------
confusion matrix
[[ 393  236]
 [ 151 1720]]
accuracy : 0.845
precision : 0.722
recall : 0.625
F1 : 0.67


---------- knn : 15 ----------
confusion ma

## Naive Bayes