# 1. Import Package

In [1]:
import pandas as pd
import numpy as np

# 2. Read Dataset

In [2]:
dataset = np.loadtxt('../dataset/watermelon_data_set_3_alpha.csv', delimiter='|')

dataset

array([[ 0.   ,  0.697,  0.46 ,  1.   ],
       [ 1.   ,  0.774,  0.376,  1.   ],
       [ 2.   ,  0.634,  0.264,  1.   ],
       [ 3.   ,  0.608,  0.318,  1.   ],
       [ 4.   ,  0.556,  0.215,  1.   ],
       [ 5.   ,  0.403,  0.237,  1.   ],
       [ 6.   ,  0.481,  0.149,  1.   ],
       [ 7.   ,  0.437,  0.221,  1.   ],
       [ 8.   ,  0.666,  0.091,  0.   ],
       [ 9.   ,  0.243,  0.267,  0.   ],
       [10.   ,  0.245,  0.057,  0.   ],
       [11.   ,  0.343,  0.099,  0.   ],
       [12.   ,  0.639,  0.161,  0.   ],
       [13.   ,  0.657,  0.198,  0.   ],
       [14.   ,  0.36 ,  0.37 ,  0.   ],
       [15.   ,  0.593,  0.042,  0.   ],
       [16.   ,  0.719,  0.103,  0.   ]])

In [3]:
x = dataset[:,1:3]
y = dataset[:,3]

x.shape, y.shape

((17, 2), (17,))

# 3. Logistic Regression

In [4]:
# logistic Func
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [5]:
def likelihood(A, y):
    """
    Params:
        A : array - [probs of y_hat]
        y : array - [values of y]
    """
    likehood = np.sum(-y*A + np.log(1 + np.exp(A)))
    return likehood

In [6]:
def logistic_regression_gradient_descent(x, y, iteration, learning_rate):
    """
    Params:
        x : array - [values of x]
        y : array - [values of y]
        iteration : int - number of iteration
        learning_rate : float - value of eta
    """
    input_x = x.T
    input_y = y.reshape(1, y.shape[0])
    
    m = input_x.shape[1]
    w = np.zeros((input_x.shape[0], input_y.shape[0]))
    b = 0
    
    costs = []
    params = {}
    
    for i in range(iteration):
        # forward propagation
        A = sigmoid(np.dot(w.T, input_x) + b)
        cost = likelihood(A, input_y)
#         print(A.shape, input_y.shape, input_x.shape)

        # backward propagation
        dw = 1/m * np.dot(input_x, (A - input_y).T)
        db = 1/m * np.sum(A - input_y)
        
        w -= (learning_rate * dw)
        b -= (learning_rate * db)
        
        costs.append(cost)
        
        params['dw'] = dw
        params['db'] = db
        params['w'] = w
        params['b'] = b
    
    return A, costs, params

In [7]:
def newton(w, x, y):
    n = x.shape[0]

    d1 = np.matrix(np.zeros(n), dtype='float')
    d2 = np.zeros((n, n), dtype='float')

    for i in range(x.shape[1]):
        x_hat = x.T[i]
        Z = np.dot(w.T, x_hat.T)
        
        p = float(1 - 1./(1 + np.exp(Z)))
        
        d1 += - x_hat * (float(y.T[i]) - p)
        d2 += np.matrix(x_hat).T * np.matrix(x_hat) * p * (1 - p)
        
    w -= np.dot(np.linalg.inv(d2), d1.T)
    
    return w

In [8]:
def logistic_regression_newton(x, y, iteration):
    """
    Params:
        x : array - [values of x]
        y : array - [values of y]
        iteration : int - number of iteration
        learning_rate : float - value of eta
    """
    input_x = x.T
    input_y = y.reshape(1, y.shape[0])
    
    m = input_x.shape[1]
    w = np.zeros((input_x.shape[0], input_y.shape[0]))
    b = 0
    
    costs = []
    params = {}
    
    for i in range(iteration):
        # forward propagation
        A = sigmoid(np.dot(w.T, input_x))
        cost = likelihood(A, input_y)

        w = newton(w, input_x, input_y)
        
        costs.append(cost)
        params['w'] = w
        
    return A, costs, params

# 4. Predict

In [33]:
predict, costs, params = logistic_regression_gradient_descent(x, y, iteration=1000, learning_rate=0.1)

In [34]:
predict

array([[0.66022428, 0.62973004, 0.53254994, 0.55913036, 0.4849282 ,
        0.46319321, 0.42812033, 0.46130172, 0.4352763 , 0.4448041 ,
        0.32522597, 0.36870256, 0.47123988, 0.49780861, 0.53394557,
        0.39020518, 0.45450087]])

In [35]:
np.where(predict > 0.5, 1, 0)

array([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]])

In [36]:
y.astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
predict, costs, params = logistic_regression_newton(x, y, iteration=100)

In [38]:
predict

array([[0.81601526, 0.6596925 , 0.55439553, 0.66520798, 0.51748936,
        0.65657439, 0.44725959, 0.60774165, 0.24198664, 0.78651793,
        0.4385844 , 0.44908532, 0.36512229, 0.41827642, 0.85117721,
        0.21351284, 0.23189091]])

In [39]:
np.where(predict > 0.5, 1, 0)

array([[1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]])

In [40]:
y.astype(int)

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])