In [1]:
import pandas as pd
import numpy as np

# Make numpy print floats in fixed-point form instead of exponential (scientific) notation.
np.set_printoptions(suppress=True)

# Load the raw dataset; the path is relative to the notebook's working directory.
# Later cells read a 'diagnosis' column holding 'B'/'M' labels and treat every
# column from index 2 onwards as a numeric feature.
cancer_df = pd.read_csv('dataset.csv')


In [2]:
# for preprocessing dataset. note that it returns a copy of the dataset and doesn't modify the original dataset
def preprocess(df_orig, to_standardize=False):
    """Return a cleaned copy of ``df_orig``; the input frame is never modified.

    The first two columns are left untouched (the rest of the notebook uses
    them as identifier / label columns). Every remaining column is treated as
    a numeric feature.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Raw dataset.
    to_standardize : bool, default False
        When truthy, each feature column is rescaled to zero mean and unit
        (sample) standard deviation after imputation.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with NaNs in the feature columns replaced by the
        column mean and, optionally, the feature columns standardized.
    """
    df = df_orig.copy()
    feature_cols = df.columns[2:]

    # fill na values with mean in each column
    for col in feature_cols:
        df[col] = df[col].fillna(df[col].mean())

    # standardize data wrt normal distribution, only if requested
    if to_standardize:
        for col in feature_cols:
            centered = df[col] - df[col].mean()
            spread = df[col].std()
            # A constant column (std == 0) or a single-row frame (std is NaN)
            # cannot be rescaled; dividing would fill the column with NaN/inf
            # and corrupt any model trained on it, so keep it merely centered.
            df[col] = centered/spread if spread > 0 else centered

    return df

In [3]:
def shuffle_split_data(X, y, randomSeed = 0):
    """Randomly partition ``X`` and ``y`` into a train part and a test part.

    One uniform random number is drawn per row; rows whose draw falls below
    the 67th percentile of all draws go to the training part (roughly two
    thirds of the data), the remaining rows form the test part.

    A non-zero ``randomSeed`` re-seeds numpy's *global* random generator
    before drawing. With the default of 0 the generator is used in whatever
    state it currently is.

    Returns the tuple ``(X_train, y_train, X_test, y_test)``.
    """
    seed_requested = randomSeed != 0
    if seed_requested:
        np.random.seed(randomSeed)

    n_rows = X.shape[0]
    draws = np.random.rand(n_rows)
    cutoff = np.percentile(draws, 67)

    # boolean row masks: True -> training row / test row respectively
    in_train = draws < cutoff
    in_test = ~in_train

    return X[in_train], y[in_train], X[in_test], y[in_test]

# Part A - Perceptron Learning Algorithm

In [4]:
class Perceptron:
    """Binary linear classifier trained with the perceptron learning rule.

    Labels must be encoded as +1 (positive class) and -1 (negative class).
    The weight vector ``W`` has ``D + 1`` entries: one per feature followed by
    a bias weight, which is matched by a constant 1 appended to every input.
    """

    def __init__(self, D, alpha=1):
        """Create a perceptron for ``D`` features with learning rate ``alpha``.

        Weights start at zero.
        """
        self.W = np.zeros(D+1)
        self.alpha = alpha

    @staticmethod
    def _add_bias(X):
        """Return ``X`` as an array with a trailing column of ones (bias input)."""
        X = np.asarray(X)
        return np.c_[X, np.ones((X.shape[0]))]

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator/denominator, or NaN when the denominator is zero."""
        return numerator/denominator if denominator else float("nan")

    # activation function = sign function

    def step(self, x):
        """Map activations to class labels: 1 where ``x > 0``, otherwise -1.

        A scalar yields an int; array-like input of any shape yields a float
        array of the same shape.
        """
        if np.isscalar(x):
            return 1 if x > 0 else -1
        return np.where(np.asarray(x) > 0, 1.0, -1.0)

    def fit(self, X, y, epochs=100):
        """Train for at most ``epochs`` passes over the data.

        X : array-like of shape (n_samples, D); y : labels in {-1, +1}.
        Training stops early after a pass without any update, because every
        further pass would leave the weights unchanged.
        """
        # adding a column of ones in the input dataset. this is helpful for training with bias
        X = self._add_bias(X)
        for _ in np.arange(0, epochs):
            mistakes = 0
            for (x, target) in zip(X, y):
                # a point on the boundary (product == 0) also counts as misclassified
                if np.dot(x, self.W)*target <= 0:
                    self.W = self.W + self.alpha*target*x
                    mistakes += 1
            if mistakes == 0:
                break

    def linear_fit(self, X, Y, epoch_lim = 0):
        """Train until a full pass makes no update, i.e. the data is separated.

        With ``epoch_lim == 0`` (default) there is no limit, so this never
        returns for data that is not linearly separable. A positive
        ``epoch_lim`` stops training once more than ``epoch_lim`` passes have
        run. Progress is printed every 5000 passes; the reported
        "Misclassified Points" is the number of updates made during that pass,
        while accuracy/recall/precision use the weights at the end of it.
        """
        nrows = X.shape[0]
        print("Total Points:", nrows)
        X = self._add_bias(X)

        count = 1           # misclassified points count
        epoch = 0
        while count != 0:
            count = 0
            for x, y in zip(X, Y):
                if np.dot(x, self.W)*y <= 0:
                    count = count+1
                    self.W = self.W + self.alpha*y*x
            if epoch % 5000 == 0:
                temp_output = self.step(np.dot(X, self.W))
                tp, fp, tn, fn = self.checkStatistics(temp_output, Y)

                print("Epochs:", epoch, "\t\tMisclassified Points:", count, "\tAccuracy:", "{0:.5f}".format(self._ratio(tp+tn, tp+fp+tn+fn)), "\tRecall:", "{0:.5f}".format(self._ratio(tp, tp+fn)), "\tPrecision:", "{0:.5f}".format(self._ratio(tp, tp+fp)))
            epoch += 1
            if epoch_lim and epoch > epoch_lim:
                break
        # Success means the last pass was mistake-free. Comparing epoch with
        # epoch_lim would never report success when no limit is set.
        if count == 0:
            print("Successfully classified in", epoch, "epochs!")


    # X is a matrix here. Predicts for all the points in X.
    def predict(self, X):
        """Return the predicted label (1.0 / -1.0) for every row of ``X``."""
        # ensure our input is a matrix
        X = self._add_bias(np.atleast_2d(X))
        return self.step(np.dot(X, self.W))


    # x is a vector here. Predicts only for one point.
    def predictOne(self, x):
        """Return the predicted label (1 / -1) for one feature vector.

        ``x`` may be a pandas Series, a numpy array or a plain sequence.
        """
        # add 1 in the back of feature vector since we trained using bias.
        x = np.append(np.asarray(x), 1)
        return self.step(np.dot(x, self.W))

    # returns back the true positives, false positives, true negatives, false negatives.
    def checkStatistics(self, predicted, response):
        """Count (tp, fp, tn, fn), with +1 as the positive and -1 as the negative class."""
        # converting first makes plain Python lists compare element-wise too
        predicted = np.asarray(predicted)
        response = np.asarray(response)
        tp = np.sum(np.logical_and(predicted == 1, response == 1))
        fp = np.sum(np.logical_and(predicted == 1, response == -1))
        tn = np.sum(np.logical_and(predicted == -1, response == -1))
        fn = np.sum(np.logical_and(predicted == -1, response == 1))
        return tp, fp, tn, fn

## PM1

In [5]:
# Alternative (disabled): fill the blank/NaN values with the column means instead
# df = preprocess(cancer_df)

# Active choice: drop every row that contains a NaN value.
# dropna() returns a new frame, so cancer_df itself is left untouched.
df = cancer_df.dropna(axis=0).copy()

# Setting Malignant as the positive class (+1) and Benign as the negative class (-1).
# One explicit mapping avoids chaining replace() calls, which depend on pandas'
# deprecated implicit downcasting to end up with a numeric column.
df['diagnosis'] = df['diagnosis'].map({'B': -1, 'M': 1})

# columns 2.. are the features, column 1 is the response
df_input = df.iloc[:, 2:]
df_response = df.iloc[:, 1]
features_n = len(df.columns)-2

# Train until the data is linearly separated, giving up after 50,000 epochs.
PM1 = Perceptron(features_n)
PM1.linear_fit(df_input, df_response, 50_000)


# Observations when training until the data is linearly separated:
# - Unnormalized data: 21 points were still misclassified after 250,000 epochs,
#   and 23 points were still misclassified after 6,040,000 epochs.
# - Normalized data: the fit converged (no misclassified points) after 200,293 epochs.

Total Points: 563
Epochs: 0 		Misclassified Points: 168 	Accuracy: 0.77265 	Recall: 0.95735 	Precision: 0.62928
Epochs: 5000 		Misclassified Points: 52 	Accuracy: 0.90941 	Recall: 0.75829 	Precision: 1.00000
Epochs: 10000 		Misclassified Points: 45 	Accuracy: 0.91829 	Recall: 0.80095 	Precision: 0.97688
Epochs: 15000 		Misclassified Points: 45 	Accuracy: 0.90941 	Recall: 0.76303 	Precision: 0.99383
Epochs: 20000 		Misclassified Points: 41 	Accuracy: 0.90764 	Recall: 0.75829 	Precision: 0.99379
Epochs: 25000 		Misclassified Points: 37 	Accuracy: 0.92007 	Recall: 0.79147 	Precision: 0.99405
Epochs: 30000 		Misclassified Points: 35 	Accuracy: 0.92895 	Recall: 0.82464 	Precision: 0.98305
Epochs: 35000 		Misclassified Points: 38 	Accuracy: 0.90053 	Recall: 0.73460 	Precision: 1.00000
Epochs: 40000 		Misclassified Points: 38 	Accuracy: 0.89876 	Recall: 0.73460 	Precision: 0.99359
Epochs: 45000 		Misclassified Points: 35 	Accuracy: 0.93250 	Recall: 0.83886 	Precision: 0.97790
Epochs: 50000 		

## PM2

In [6]:
# PM2: perceptron on mean-imputed, unnormalized features.
df = preprocess(cancer_df)

# Malignant -> +1 (positive class), Benign -> -1 (negative class).
# One explicit mapping avoids chaining replace() calls, which depend on pandas'
# deprecated implicit downcasting to end up with a numeric column.
df['diagnosis'] = df['diagnosis'].map({'B': -1, 'M': 1})
features_n = len(df.columns)-2

# columns 2.. are the features, column 1 is the response; fixed seed for a repeatable split
df1_input, df1_response, df2_input, df2_response = shuffle_split_data(df.iloc[:, 2:], df.iloc[:, 1], 12345678)

PM2 = Perceptron(features_n)
PM2.fit(df1_input, df1_response, 1000)

# evaluate on the held-out part
df2_predicted = PM2.predict(df2_input)
tp, fp, tn, fn = PM2.checkStatistics(df2_predicted, df2_response)
print(tp, fp, tn, fn)
print("Accuracy:", (tp+tn)/(tp+tn+fp+fn))
print("Recall:", (tp)/(tp+fn))
print("Precision:", (tp)/(tp+fp))


69 1 112 6
Accuracy: 0.9627659574468085
Recall: 0.92
Precision: 0.9857142857142858


## PM3

In [7]:
# PM3: perceptron on mean-imputed, standardized features.
df = preprocess(cancer_df, True)

# Malignant -> +1 (positive class), Benign -> -1 (negative class).
# One explicit mapping avoids chaining replace() calls, which depend on pandas'
# deprecated implicit downcasting to end up with a numeric column.
df['diagnosis'] = df['diagnosis'].map({'B': -1, 'M': 1})
features_n = len(df.columns)-2

# NOTE: no seed is passed here, so the split uses numpy's global random state
# as left behind by whatever ran before this cell.
df1_input, df1_response, df2_input, df2_response = shuffle_split_data(df.iloc[:, 2:], df.iloc[:, 1])

PM3 = Perceptron(features_n)
PM3.fit(df1_input, df1_response, 1000)

# evaluate on the held-out part
df2_predicted = PM3.predict(df2_input)
tp, fp, tn, fn = PM3.checkStatistics(df2_predicted, df2_response)
print("Accuracy:", (tp+tn)/(tp+tn+fp+fn))
print("Recall:", (tp)/(tp+fn))
print("Precision:", (tp)/(tp+fp))

Accuracy: 0.9840425531914894
Recall: 0.9714285714285714
Precision: 0.9855072463768116


## PM4

In [8]:
# PM4: same setup as PM2 (mean-imputed, unnormalized), but with the feature
# columns presented to the perceptron in a shuffled order.
df = preprocess(cancer_df, False)

# Malignant -> +1 (positive class), Benign -> -1 (negative class).
# One explicit mapping avoids chaining replace() calls, which depend on pandas'
# deprecated implicit downcasting to end up with a numeric column.
df['diagnosis'] = df['diagnosis'].map({'B': -1, 'M': 1})
features_n = len(df.columns)-2

df_input = df.iloc[:, 2:]
df_response = df.iloc[:, 1]

# Permute the feature columns (axis=1). A fixed random_state makes the
# permutation repeatable instead of depending on the global RNG state.
df_input = df_input.sample(frac=1, axis = 1, random_state=12345678)

df1_input, df1_response, df2_input, df2_response = shuffle_split_data(df_input, df_response, 12345678)

PM4 = Perceptron(features_n)
PM4.fit(df1_input, df1_response, 1000)

# evaluate on the held-out part
df2_predicted = PM4.predict(df2_input)
tp, fp, tn, fn = PM4.checkStatistics(df2_predicted, df2_response)
print(tp, fp, tn, fn)
print("Accuracy:", (tp+tn)/(tp+tn+fp+fn))
print("Recall:", (tp)/(tp+fn))
print("Precision:", (tp)/(tp+fp))

69 1 112 6
Accuracy: 0.9627659574468085
Recall: 0.92
Precision: 0.9857142857142858
