In [1]:
# ========== Part 1: importing the relevant libraries 
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import datasets


In [11]:
# ========== Part 2: loading the iris dataset
iris = datasets.load_iris()
df = pd.DataFrame(data=iris['data'],columns=iris['feature_names'])
df['species'] = iris['target']

#df.isnull().mean() # missing values
#df['species'].value_counts() # counts of the different species

# We are going to write a binary Logistic Regression classifier,therefore we will only look
# at class 1 and 2 from now on!

# class labels right now are 1 and 2 (we will change this to 1 and 0)
df['species'] = df['species'].replace({2:0})

X = df.iloc[50:,:4].values
y = df.iloc[50:,4].values


In [15]:
# ========== Part 3: splitting data into Training/Testing set and Feature Normalization
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Feature Normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.fit_transform(X_test)

In [19]:
# ========== Part 4: Calculating the Cost Function J of theta
m = len(y_train)
X_train_padded = np.column_stack((np.ones((m,1)),X_train))
theta = np.zeros((X_train_padded.shape[1],1))

class CC(object):
    def ComputeCost(self,X,y,theta):
        m = len(y)
        # calculate the Cost J of theta
        J = 0
        hyp_x = (1.0 / (1.0 + np.exp(-X.dot(theta))))
        J = -y.dot(np.log(hyp_x)) - ((1-y).dot(np.log(1-hyp_x)))
        return J
    
print('Initial Cost J of theta is \n')
print(CC().ComputeCost(X_train_padded,y_train,theta))

Initial Cost J of theta is 

[51.98603854]


In [80]:
# ========== Part 5: Gradient Descent Algorithm to find regression coefficients that minimize cost Function J
alpha = 0.01
epochs = 3000
J_history = []
iteration_nr = []

class GD(object):
    def GradientDescent(self,alpha,epochs):
        theta = np.zeros((X_train_padded.shape[1],1))
        n = float(len(y_train))
        
        for i in range(epochs):
            J = 0
            hyp_x = (1.0 / (1.0 + np.exp(-X_train_padded.dot(theta))))
            J = -y_train.dot(np.log(hyp_x)) - ((1-y_train).dot(np.log(1-hyp_x)))
            iter = i+1
            J_history.append(J)
            iteration_nr.append(iter)
            
            #update theta values
            gradient = (1/m) * X_train_padded.T.dot(hyp_x - np.transpose([y_train]))
            theta = theta - alpha * gradient
            
        return theta
    
print('The Gradient Descent Algorithm converged on the following theta values \n')
print(GD().GradientDescent(alpha,epochs))
final_theta = GD().GradientDescent(alpha,epochs)

The Gradient Descent Algorithm converged on the following theta values 

[[ 1.44042293]
 [ 2.32178755]
 [ 2.18429106]
 [-3.52543501]
 [-3.10671126]]


In [81]:
# ========== Part 6: calculating accuracy of the Logistic Regression Model on Training Data
y_pred = (1.0 / (1.0 + np.exp(-X_train_padded.dot(final_theta))))
y_prediction = np.zeros(len(y_pred))

for i in range(len(y_pred)):
    if y_pred[i] >= 0.5:
        y_prediction[i] = 1
    else:
        y_prediction[i] = 0

correct = y_prediction == y_train
accuracy = np.mean(correct)*100

print('The Logistic Regression Classifier has an accuracy of', "{:.2f}".format(accuracy) + str("%"))

The Logistic Regression Classifier has an accuracy of 98.67%


In [82]:
# ========== Part 7: calculating the accuracy of the Logistic Regression Model on Testing Data
m = len(y_test)
X_test_padded = np.column_stack((np.ones((m,1)),X_test_std))

y_pred = (1.0 / (1.0 + np.exp(-X_test_padded.dot(final_theta))))
y_prediction = np.zeros(len(y_pred))

for i in range(len(y_pred)):
    if y_pred[i] >= 0.5:
        y_prediction[i] = 1
    else:
        y_prediction[i] = 0

correct = y_prediction == y_test
accuracy = np.mean(correct)*100

print('The Logistic Regression Classifier has an accuracy of', "{:.2f}".format(accuracy) + str("%"))

The Logistic Regression Classifier has an accuracy of 92.00%
