# Binary Classification
## Regularized Logistic Regression

### Hypothesis
$$ h_{\theta}(x_i) = g(z_i) $$
where,
$$ g(z_i) = \frac{1}{1 + e^{-z_i}} $$
and,
$$ z_i = \theta_0 + \theta_1 x_{i1} + \theta_2 x_{i2} + ... + \theta_n x_{in} $$

### Cost Function
$$ J_i(h_{\theta}(x_i)) = -y_i \log(h_{\theta}(x_i)) - (1-y_i) \log(1 - h_{\theta}(x_i)) $$
$$ J(\theta) = \frac{1}{m} \sum_{i=1}^m J_i(h_{\theta}(x_i)) + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2 $$

### Gradient Function
$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x_i) - y_i) x_{ij} $$ for $j = 0$ (with $x_{i0} = 1$)

$$ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m} \sum_{i=1}^m (h_{\theta}(x_i) - y_i) x_{ij} + \frac{\lambda}{m} \theta_j $$ for $j \geq 1$

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the comma-separated data file (no header row) and name its three columns.
# The path is a raw string: it contains backslashes followed by letters
# (\M, \m, \e) that are not valid escape sequences in a normal string literal.
data = pd.read_csv(r"Practice\Machine Learning\machine-learning-ex2\machine-learning-ex2\ex2\ex2data2.txt", sep=',', header=None)
data.columns = ['Test 1', 'Test 2', 'Accepted']
# First two columns are the features, the third is the 0/1 label.
X = data.iloc[:, 0:2]
y = data.iloc[:, 2]
# m = number of rows (examples), n = number of feature columns.
m, n = X.shape
X.head()

In [3]:
# Scatter the two test scores, drawing one series per label value:
# blue dots for y == 1 and red plus signs for y == 0.
plt.figure()
for flag, marker, name in ((1, 'b.', 'Accepted'), (0, 'r+', 'Rejected')):
    subset = X[y == flag]
    plt.plot(subset.iloc[:, 0], subset.iloc[:, 1], marker, label=name)
plt.xlabel('Microchip Test 1')
plt.ylabel('Microchip Test 2')
plt.title('Microchip Accepted/Rejected Data')
plt.legend()
plt.show()

In [4]:
def sigmoid(Z):
    """Return the logistic function 1 / (1 + e**(-Z)), applied element-wise.

    Z may be a scalar, a NumPy array or a pandas Series; array-like inputs
    keep their container type.

    The value is computed through the identity
    sigmoid(z) = 0.5 * (1 + tanh(z / 2)), which never forms e**Z explicitly.
    The direct form overflows (or divides by zero) for inputs of large
    magnitude; tanh simply saturates at -1 / +1 there.
    """
    return 0.5 * (1 + np.tanh(0.5 * Z))

In [5]:
def costFunc(theta, X, y, lbd):
    """Return the regularized logistic-regression cost for parameters theta.

    theta -- parameter vector; theta[0] is excluded from the penalty.
    X     -- design matrix exposing .dot(theta), one row per example.
    y     -- 0/1 labels, one per row of X.
    lbd   -- regularization strength (lambda).
    """
    m = y.size
    labels = np.array(y)
    H = sigmoid(X.dot(theta))
    # Cross-entropy summed over all examples (averaged below).
    log_loss = -1 * labels.T.dot(np.log(H)) - (1 - labels).T.dot(np.log(1 - H))
    # Squared norm of theta with the first (bias) component removed.
    penalty = theta.T.dot(theta) - theta[0]**2
    return 1/m*log_loss + lbd/(2*m)*penalty

In [6]:
def gradFunc(theta, X, y, lbd):
    """Return the gradient of the regularized logistic-regression cost.

    theta -- parameter vector; its first entry is not regularized.
    X     -- design matrix exposing .dot and .T, one row per example.
    y     -- 0/1 labels, one per row of X.
    lbd   -- regularization strength (lambda).

    The result has one entry per parameter and keeps whatever container
    type X.T.dot(...) produces (a pandas Series when X is a DataFrame).
    """
    m = y.size
    H = sigmoid(X.dot(theta))
    # Build the penalty as a plain array and zero its first entry up front.
    # This avoids integer indexing into the result, which is label-based
    # (not positional) when the result is a Series with non-integer labels.
    # np.array copies, so the caller's theta is left untouched.
    penalty = lbd * np.array(theta, dtype=float)
    penalty[0] = 0
    return 1/m * (X.T.dot(H - y) + penalty)


In [7]:
def mapFeatures(X):
    m,n = X.shape
    out = pd.DataFrame()
    X1, X2 = X.iloc[:, 0], X.iloc[:, 1]
    degree = 6
    for i in range(7):
        for j in range(0, i+1):
            out.insert(out.shape[1], (i-j, j), X1**(i-j) * X2**j)
    return out

In [8]:
# Replace the two raw feature columns with their polynomial expansion.
# NOTE: this rebinds X, so re-running the cell feeds the already-expanded
# frame back into mapFeatures (which reads only the first two columns).
X = mapFeatures(X)
X.head()

In [9]:
# Sanity check: cost and gradient at theta = 0 with lambda = 1.
# The parameter count is taken from X so it follows the mapped feature count.
initial_theta = np.zeros(X.shape[1])
cost = costFunc(initial_theta, X, y, lbd=1)
# NOTE: despite its name, `theta` holds the gradient returned by gradFunc.
theta = gradFunc(initial_theta, X, y, 1)
print(cost)
print(theta.iloc[0:5])

In [10]:
# Sanity check: cost and gradient at theta = 1 (all ones) with lambda = 10.
# The parameter count is taken from X so it follows the mapped feature count.
initial_theta = np.ones(X.shape[1])
cost = costFunc(initial_theta, X, y, lbd=10)
# NOTE: despite its name, `theta` holds the gradient returned by gradFunc.
theta = gradFunc(initial_theta, X, y, 10)
print(cost)
print(theta.iloc[0:5])
# Presumably the reference values for the five gradient entries printed
# above -- confirm against the exercise hand-out:
# 0.3460
#  0.1614
#  0.1948
#  0.2269
#  0.0922

In [11]:
import scipy.optimize as op
# Minimize the regularized cost with BFGS, starting from theta = 0 and
# supplying the analytic gradient. lambda = 1 is passed through `args`;
# the run is capped at 100 iterations.
# The parameter count is taken from X so it follows the mapped feature count.
initial_theta = np.zeros(X.shape[1])
res = op.fmin_bfgs(f=costFunc, x0=initial_theta, fprime=gradFunc, args=(X, y, 1), maxiter=100)
# With the default full_output=False, fmin_bfgs returns only the minimizing
# parameter vector.
print(res)

In [12]:
# Regularized cost at the optimized parameters, evaluated with lambda = 1
# (the same value passed to the optimizer).
cost = costFunc(res, X, y, 1)
print(cost)

In [13]:
# Making a prediction
# Build a one-row frame (first feature -0.25, second feature 1.5), expand it
# with the same polynomial mapping as the training data, and print the
# model's sigmoid output for it.
X_ = pd.DataFrame({0: [-0.25], 'x2': [1.5]})
X_ = mapFeatures(X_)
Z = sigmoid(X_.dot(res))
print(Z)

In [14]:
# Training-set accuracy: threshold the sigmoid outputs at 0.5 to get 0/1
# predictions, then report the percentage of rows whose prediction equals y.
Z = sigmoid(X.dot(res))
Z.loc[Z >= 0.5] = 1
Z.loc[Z < 0.5] = 0
print('Accuracy', (Z == y).sum()/y.size * 100)

In [15]:
# Plotting decision boundary
