# Problem 3.3 Implementation of Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1. Load data

In [2]:
pwd

'/home/yuconglin/projects/Machine-Learning-Zhou_Zhihua/Linear/LogisticRegression'

In [3]:
# Each row is one sample. Per the array printed in the next cell: column 0 is a
# running sample id, columns 1-2 are the two numeric features, column 3 is the 0/1 label.
dataset = np.loadtxt('../../data/watermelon_3a.csv', delimiter=",")

In [4]:
dataset

array([[ 1.    ,  0.697 ,  0.46  ,  1.    ],
       [ 2.    ,  0.774 ,  0.376 ,  1.    ],
       [ 3.    ,  0.634 ,  0.264 ,  1.    ],
       [ 4.    ,  0.608 ,  0.318 ,  1.    ],
       [ 5.    ,  0.556 ,  0.215 ,  1.    ],
       [ 6.    ,  0.403 ,  0.237 ,  1.    ],
       [ 7.    ,  0.481 ,  0.149 ,  1.    ],
       [ 8.    ,  0.437 ,  0.211 ,  1.    ],
       [ 9.    ,  0.666 ,  0.091 ,  0.    ],
       [10.    ,  0.243 ,  0.0267,  0.    ],
       [11.    ,  0.245 ,  0.057 ,  0.    ],
       [12.    ,  0.343 ,  0.099 ,  0.    ],
       [13.    ,  0.639 ,  0.161 ,  0.    ],
       [14.    ,  0.657 ,  0.198 ,  0.    ],
       [15.    ,  0.36  ,  0.37  ,  0.    ],
       [16.    ,  0.593 ,  0.042 ,  0.    ],
       [17.    ,  0.719 ,  0.103 ,  0.    ]])

In [5]:
# Drop the id column (0): columns 1-2 become the feature matrix, column 3 the label vector.
X, y = dataset[:, 1:3], dataset[:, 3]

# 2. Logistic Regression from Scratch

## 2.1 Sigmoid Function

In [6]:
def sigmoid(prop):
    """
    Logistic (sigmoid) function 1 / (1 + exp(-prop)), applied element-wise.

    prop: scalar or array-like of real values (e.g. the linear score x . beta)

    Returns a NumPy scalar for scalar input, otherwise an array with the same
    shape as prop; every value lies in [0, 1].

    exp() is only ever evaluated at -|prop|, so inputs of large magnitude
    saturate to 0 or 1 instead of triggering an overflow warning.
    """
    prop = np.asarray(prop)
    decay = np.exp(-np.abs(prop))  # always in [0, 1], cannot overflow

    # prop >= 0:  1 / (1 + e^-prop)
    # prop <  0:  e^prop / (1 + e^prop)   (the same value, rewritten to stay finite)
    result = np.where(prop >= 0, 1.0, decay) / (1.0 + decay)

    # indexing with () turns a 0-d result into a NumPy scalar and leaves arrays as they are
    return result[()]

## 2.2 Maximum Likelihood 

In [7]:
def LogLikelihood(response, weights, variables):
    """
    Negative log-likelihood of a logistic regression model, i.e. the function
    to be minimized. It has three inputs:
    1. response: array, [values of y], one 0/1 label per sample
    2. weights: array, [values of beta], one entry per column of variables
    3. variables: array, [values of x], shape (n_samples, n_weights)

    Returns the scalar  sum_i( -y_i * (x_i . beta) + log(1 + exp(x_i . beta)) ).
    """

    # flatten both operands so a column-shaped input cannot broadcast into an (n, n) matrix
    response = np.ravel(response)
    prop = np.ravel(np.dot(variables, weights))

    # the function to be minimized;
    # np.logaddexp(0, prop) equals log(1 + exp(prop)) but does not overflow for large prop
    neg_log_likelihood = np.sum(-response * prop + np.logaddexp(0, prop))
    return neg_log_likelihood

In [8]:
def logistic_regression(response, variables, max_steps, learning_rate, intercept = False):
    """
    Fit logistic regression weights with full-batch gradient steps that increase
    the log-likelihood (equivalently, gradient descent on the negative
    log-likelihood). It has five inputs:
    1. response: array-like, [values of y], one 0/1 label per sample
    2. variables: array-like, [values of x], shape (n_samples, n_features)
    3. max_steps: int, the number of steps
    4. learning_rate: float, the value of eta
    5. intercept: boolean, if True a column of ones is prepended to variables
       so that weights[0] is the intercept

    Returns a 1-D float array with one weight per column of the design matrix
    (n_features entries, plus one leading entry when intercept is True).

    Raises ValueError when response and variables disagree on the number of samples.
    """

    variables = np.asarray(variables, dtype=float)
    # flatten the labels so an (n, 1) column cannot broadcast against the (n,) probabilities
    response = np.asarray(response, dtype=float).ravel()

    if response.shape[0] != variables.shape[0]:
        raise ValueError(
            "response has %d samples but variables has %d rows"
            % (response.shape[0], variables.shape[0])
        )

    if intercept:
        # keep the boolean flag intact; the ones column gets its own name
        ones_column = np.ones((variables.shape[0], 1))
        variables = np.hstack((ones_column, variables))
    weights = np.zeros(variables.shape[1])

    for _ in range(max_steps):
        probability = sigmoid(np.dot(variables, weights))

        # gradient of the log-likelihood is X^T (y - p); it is summed over the
        # samples (not averaged), so the effective step grows with the sample count
        gradient = np.dot(variables.T, response - probability)

        # update weights step by step
        weights += learning_rate * gradient

    return weights
    

In [9]:
# 600 full-batch steps with eta = 0.1; intercept=True makes weights[0] the intercept
try_weights = logistic_regression(y, X, max_steps=600, learning_rate=0.1, intercept=True)

In [10]:
try_weights

array([-2.91870431,  0.97427533, 11.60349377])

# 3. Compare with sklearn

In [11]:
from sklearn.linear_model import LogisticRegression

# C is the inverse regularization strength: a huge value such as 1e15 makes the
# penalty term negligible, so the fit is comparable to the unregularized
# from-scratch model above.
clf = LogisticRegression(max_iter = 100, C=1e15)

In [12]:
# Fit on all samples (no train/test split). X carries no column of ones here;
# the estimator reports its intercept separately in clf.intercept_.
clf.fit(X, y)

In [13]:
clf.intercept_

array([-3.26418822])

In [14]:
clf.coef_

array([[ 0.60310695, 14.45209167]])

# 4. Result comparison

In [15]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = clf.predict(X)

In [16]:
from sklearn import metrics

In [17]:
# Confusion matrix of the sklearn model: rows are the true class (0, 1),
# columns are the predicted class.
print(metrics.confusion_matrix(y, y_pred))

[[8 1]
 [1 7]]


In [18]:
# Prepend a column of ones so the first entry of try_weights acts as the intercept
X_new = np.column_stack((np.ones(X.shape[0]), X))

In [19]:
# Linear score x . beta for every sample (before the sigmoid is applied)
score = X_new @ try_weights

In [20]:
# Predict class 1 when the estimated probability exceeds 0.5, otherwise class 0
new_result = [1 if sigmoid(s) > .5 else 0 for s in score]
    

In [21]:
new_result

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]

In [22]:
# Same in-sample confusion matrix (rows: true class, columns: predicted class)
# for the from-scratch weights.
print(metrics.confusion_matrix(y, new_result))

[[7 2]
 [2 6]]
