## 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv('assignment_2.csv')
data.head()

Unnamed: 0,Label,bias,experience,salary
0,1,1,0.7,48000
1,0,1,1.9,48000
2,1,1,2.5,60000
3,0,1,4.2,63000
4,0,1,6.0,76000


### Train Test 데이터 나누기

In [2]:
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], random_state = 0)

In [3]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150,), (50,))

## 데이터 스케일링

experience와 salary를 스케일링한다.

In [4]:
X_train

Unnamed: 0,bias,experience,salary
71,1,5.3,48000
124,1,8.1,66000
184,1,3.9,60000
97,1,0.2,45000
149,1,1.1,66000
...,...,...,...
67,1,6.7,64000
192,1,4.8,73000
117,1,7.0,86000
47,1,7.6,78000


In [5]:
scaler = StandardScaler()
bias_train = X_train["bias"]
bias_train = bias_train.reset_index()["bias"]
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train["bias"] = bias_train
X_train.head()

Unnamed: 0,bias,experience,salary
0,1,0.187893,-1.143335
1,1,1.185555,0.043974
2,1,-0.310938,-0.351795
3,1,-1.629277,-1.34122
4,1,-1.3086,0.043974


In [6]:
y_train = y_train.reset_index()["Label"]
y_test = y_test.reset_index()["Label"]

In [7]:
bias_test = X_test["bias"]
bias_test = bias_test.reset_index()["bias"]
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
X_test["bias"] = bias_test
X_test.head()

Unnamed: 0,bias,experience,salary
0,1,-1.344231,-0.615642
1,1,0.50857,0.307821
2,1,-0.310938,0.571667
3,1,1.363709,1.956862
4,1,-0.987923,-0.747565


## sigmoid

In [8]:
import random

In [9]:
X_train = np.array(X_train[["bias", "experience", "salary"]])

In [10]:
beta = np.array([random.random(), random.random(), random.random()]) # 임의의 beta값 생성
beta

array([0.1643293 , 0.54444896, 0.09515524])

sigmoid는 개별 $x_i$(bias, experience, salary) 입력값에 대해 $y$가 1일 확률을 계산하는 함수이다.  
$p = \frac{1}{1+exp(-ax+b)}$  
$e$의 승수(multiplier)는 $\sum\beta_i x_i$ 이다.

In [11]:
def sigmoid(x, beta) :
    multiplier = 0
    for i in range(x.size):
        multiplier += x[i]*beta[i]
    p = 1.0/(1.0+np.exp(-multiplier))
    return p
sigmoid(X_train[0], beta)

0.5393765784138813

## log likelihood

$l(p) = \sum[y_i\log(p)+(1-y_i)\log(1-p)]$

In [12]:
#개별 likelihood, 각각의 x입력값에 대한 p의 값 산정
def lg_likelihood_i(x, y, beta, j) :
    p_hat = 0
    p = sigmoid(x[j], beta)
    p_hat += y[j]*np.log(p) + (1-y[j])*np.log(1-p)
    return p_hat
lg_likelihood_i(X_train, y_train, beta, 0)

-0.6173412907730486

In [13]:
def lg_likelihood(x, y, beta) :
    log_p_hat = 0
    for i in range(y.size) :
        log_p_hat += lg_likelihood_i(x, y, beta, i) # log p 의 추정값에 계속 더해준다.
        
    return log_p_hat
lg_likelihood(X_train, y_train, beta)

-108.29537832987057

## gradient Ascent

get_gradients는 cost function(log likelihood)상에서 각각의 beta 계수들로 편미분했을 때, 각각의 기울기를 구하는 함수이다.

In [14]:
# gradients 한 번 구하기
def get_gradients(x, y, beta):
    gradients = []
    
    for i in range(x[0].size) :
        gradient = 0                                  # 각 계수별 기울기
        for j in range(y.size) :
            p = sigmoid(x[j], beta)
            gradient += (y[j] - p)*x[j][i]            # 개별 데이터 x에 대한 값을 합산
               
        gradient = gradient/y.size                    # 전체 n 값으로 나누기
        gradients.append(gradient)
    
    gradients = np.array(gradients)
    
    return gradients

gradients = np.array(get_gradients(X_train, y_train, beta))
gradients

array([-0.25682068, -0.02336487, -0.20990434])

step은 구한 기울기를 바탕으로 다음 학습을 진행할 지점을 지정하는 함수이다.

In [15]:
def step(beta, gradients, stepsize=np.array([0.01,0.01,0.01])) : #stepsize:학습률, 기본값은 0.01
    beta = beta + stepsize*gradients
    return beta

In [16]:
step(beta, gradients)

array([0.16176109, 0.54421531, 0.0930562 ])

In [17]:
#max_cycle:최대 학습 횟수
#tolerance:이 값보다 step의 변화율이 낮으면 학습을 종료함
#theta_0:학습 이전의 계수
#theta:학습 이후의 계수

def gradientAscent(x, y, beta, max_cycle = 200000, tolerance = 0.000001, stepsize=np.array([0.01,0.01,0.01])) :
    theta_0 = beta
    i = 0
    cost = lg_likelihood(x, y, theta_0)/y.size
    gradients = np.array([])
    while i < max_cycle:
        gradients = get_gradients(x, y, theta_0)
        theta = step(theta_0, gradients, stepsize)
        temp = theta_0 - theta
        theta_0 = theta
        
        if i % 1000 == 0:
            print(gradients)
            #print(theta_0)
            #print(theta)
            #print(np.abs(temp.sum()))
        if np.abs(temp.sum()) < tolerance :
            print("stop")
            break
        i += 1
    return theta_0

In [18]:
beta.sum()

0.8039334964464181

## Step Size 결정

Step Size를 고르는 기법으로는 다음 세가지 방법이 있다.

- Fixed step size
- Backtracking line search
- Exact line search

참고 : https://wikidocs.net/18088

In [19]:
# 학습률 0.01은 진행속도가 느려 학습결과에 큰 차이가 없는 0.1로 설정함.
beta = gradientAscent(X_train, y_train, beta, stepsize=np.array([0.1,0.1,0.1]))
beta

[-0.25682068 -0.02336487 -0.20990434]
[-0.00303449  0.01014112 -0.00956026]
[-0.00118079  0.00385945 -0.00358887]
[-0.00055369  0.00179925 -0.00166399]
[-0.00027865  0.0009033  -0.00083326]
[-0.00014482  0.00046893 -0.00043202]
[-7.64753115e-05  2.47482537e-04 -2.27849032e-04]
[-4.07166588e-05  1.31723367e-04 -1.21230852e-04]
[-2.17717307e-05  7.04228308e-05 -6.48011614e-05]
stop


array([-1.86332466,  4.25483244, -4.02439011])

In [20]:
lg_likelihood(X_train, y_train, beta) # 수렴한 우도

-44.73076832310222

## 예측

In [21]:
X_test = np.array(X_test[["bias", "experience", "salary"]])

In [22]:
Label_predict = []
for i in range(y_test.size) :
    p = sigmoid(X_test[i], beta)  # 학습한 beta 값으로 p를 추정한다.
    if p > 0.5 :
        Label_predict.append(1) # p값이 0.5보다 크면 1로 분류한다.
    else :
        Label_predict.append(0)
Label_predict = np.array(Label_predict)
Label_predict

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

## confusion_matrix

In [23]:
from sklearn.metrics import *
tn, fp, fn, tp = confusion_matrix(y_test, Label_predict).ravel()
confusion_matrix(y_test, Label_predict)

array([[38,  2],
       [ 1,  9]], dtype=int64)

In [24]:
#Accuracy
Accuracy = (tp+tn)/(tp+fn+fp+tn)
Accuracy

0.94