# Import the Necessary Python Libraries and Components

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.metrics import confusion_matrix as cm
from sklearn.metrics import precision_score as ps
from sklearn.metrics import recall_score as rs
from sklearn.metrics import f1_score as f1s
from sklearn.metrics import accuracy_score as acc

### To Disable Convergence Warnings (For Custom Training)

In [3]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Install an "ignore" filter so ConvergenceWarning messages raised after this
# cell runs are not shown in the notebook output.
simplefilter("ignore", category=ConvergenceWarning)

# 1.) Input the Dataset

In [4]:
# Dataset Reference :- https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

# Read the CSV from the notebook's working directory into the `data` DataFrame
# that every later cell works on.
data = pd.read_csv("./data.csv")
# Bare last expression: the notebook renders the frame as this cell's output.
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


# 2.) Convert the String Labels into easily-interpretable Numerics

In [5]:
# Note :- There are many existing Encoders for converting String to Numeric Labels, but for convenience, we used Pandas.

# Boolean row masks, both computed before any value is overwritten.
condition_M = data.diagnosis == "M"
condition_B = data.diagnosis == "B"

# Encode in place: "M" -> 0 and "B" -> 1. This mutates `data`, so the frame
# displayed by the loading cell no longer reflects the current contents.
# NOTE(review): with this encoding "B" is label 1, which is the class
# scikit-learn's precision/recall/F1 treat as positive by default - confirm
# that this is the intended positive class.
data.loc[condition_M,"diagnosis"]=0
data.loc[condition_B,"diagnosis"]=1

data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,0,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,0,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,0,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,0,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,0,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,0,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,0,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,0,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


# 3.) Converting Dataframe into Numpy Arrays (Features and Labels)

In [6]:
# Labels: the encoded diagnosis column (0 / 1) as an integer NumPy array.
Y = data.diagnosis.to_numpy().astype('int')                                     # Labels

# Features: every remaining column once the identifier ("id"), the label
# ("diagnosis") and the "Unnamed: 32" column (shown empty in the preview above)
# have been dropped.
X_data = data.drop(columns=["id","diagnosis","Unnamed: 32"])
X = X_data.to_numpy()                                                           # Input Features

# 4.) Splitting the Dataset into Train and Test Portions

In [7]:
# Value passed as `test_size` (here: 30% of the rows go to the test portion).
user_prompt = 0.3
# Value passed as `shuffle`. With False, scikit-learn splits the rows in their
# original order, so the split is deterministic but not randomised.
user_enable = False

# x_* hold feature rows and y_* the matching labels; y_test is also read as a
# global by the from-scratch metric functions defined later in the notebook.
x_train,x_test,y_train,y_test = tts(X,Y,test_size=user_prompt,shuffle=user_enable)

# 5.) Model Training and Predicting

In [8]:
# Fit each classifier on the training portion and predict the held-out test
# portion; these two prediction vectors are the inputs to every metric below.

logistic_model = LR()
logistic_model.fit(x_train,y_train)
logistic_pred = logistic_model.predict(x_test)

# DecisionTreeClassifier permutes the candidate features at every split, so two
# unseeded fits can produce different trees (and different metrics). Pin the
# seed so Restart & Run All reproduces the same numbers.
decision_model = DTC(random_state=42)
decision_model.fit(x_train,y_train)
decision_pred = decision_model.predict(x_test)

# 6.) Evaluation Metrics (Inbuilt v/s Scratch)

## Confusion Matrix

In [9]:
# Reference confusion matrices from scikit-learn, one per classifier, computed
# against the same held-out labels.
inbuilt_matrix_logistic, inbuilt_matrix_decision = (
    cm(y_test, predictions) for predictions in (logistic_pred, decision_pred)
)

# Heading and matrix are printed on consecutive lines.
print("Confusion Matrix for Logistic Regression-based Predictions =>", inbuilt_matrix_logistic, sep="\n")
print("Confusion Matrix for Decision Tree-based Predictions =>", inbuilt_matrix_decision, sep="\n")


def confusion_matrix(model_preds, y_test=None):
  """Build the 2x2 confusion matrix for binary labels (0 / 1) from scratch.

  The layout is ``result[true_label][predicted_label]``, i.e. rows are the
  true labels and columns are the predictions:

      [[TN, FP],
       [FN, TP]]      (with label 1 as the positive class)

  Parameters
  ----------
  model_preds : sequence of int
      Predicted labels, one per sample.
  y_test : sequence of int, optional
      True labels. When omitted, the notebook-level ``y_test`` is looked up at
      call time, so re-running the train/test split cell is picked up without
      re-defining this function.

  Returns
  -------
  numpy.ndarray
      Integer array of shape (2, 2). Samples whose true or predicted label is
      neither 0 nor 1 are not counted.

  Raises
  ------
  ValueError
      If the predictions and the labels differ in length.
  """
  if y_test is None:
    # The parameter shadows the global of the same name, so fetch the global
    # explicitly; a `y_test=y_test` default would freeze the value at def time.
    y_test = globals()["y_test"]

  preds = np.asarray(model_preds).ravel()
  truth = np.asarray(y_test).ravel()
  if preds.size != truth.size:
    raise ValueError(
        "model_preds and y_test must have the same length, got %d and %d"
        % (preds.size, truth.size))

  confusion_mat = np.zeros((2, 2), dtype=int)
  for true_label in (0, 1):
    for pred_label in (0, 1):
      confusion_mat[true_label, pred_label] = np.count_nonzero(
          (truth == true_label) & (preds == pred_label))
  return confusion_mat


# cm_logistic =  confusion_matrix(logistic_pred, y_test)
# cm_decision = confusion_matrix(decision_pred, y_test)



  


Confusion Matrix for Logistic Regression-based Predictions =>
[[ 38   1]
 [  9 123]]
Confusion Matrix for Decision Tree-based Predictions =>
[[ 38   1]
 [ 24 108]]


## Average Accuracy

In [12]:
# Reference accuracies from scikit-learn (fractions), one per classifier.
inbuilt_acc_logistic, inbuilt_acc_decision = (
    acc(y_test, predictions) for predictions in (logistic_pred, decision_pred)
)

# Shown as percentages.
print("Accuracy for Logistic Regression-based Predictions =>", f"{inbuilt_acc_logistic * 100}%")
print("Accuracy for Decision Tree-based Predictions =>", f"{inbuilt_acc_decision * 100}%")

def avg_accuracy(model_preds):
  """Overall accuracy of `model_preds`, returned as a percentage.

  Correct predictions are the diagonal of the from-scratch confusion matrix;
  they are divided by the number of labels in the notebook-level `y_test`.
  """
  matrix = confusion_matrix(model_preds)
  correct = np.trace(matrix)  # matrix[0][0] + matrix[1][1]
  return correct / len(y_test) * 100

# acc_logistic = avg_accuracy(logistic_pred)
# acc_decision = avg_accuracy(decision_pred)

Accuracy for Logistic Regression-based Predictions => 94.15204678362574%
Accuracy for Decision Tree-based Predictions => 85.38011695906432%
94.15204678362574
85.38011695906432


## Precision

In [19]:
# Reference precision from scikit-learn (fractions), one per classifier.
inbuilt_ps_logistic, inbuilt_ps_decision = (
    ps(y_test, predictions) for predictions in (logistic_pred, decision_pred)
)

# Shown as percentages.
print("Precision for Logistic Regression-based Predictions =>", f"{inbuilt_ps_logistic * 100}%")
print("Precision for Decision Tree-based Predictions =>", f"{inbuilt_ps_decision * 100}%")

def precision(model_preds):
  """Precision for label 1, returned as a percentage: TP / (TP + FP) * 100.

  Reads the from-scratch confusion matrix, whose rows are true labels and
  whose columns are predicted labels.
  """
  (_, false_pos), (_, true_pos) = confusion_matrix(model_preds)
  predicted_pos = true_pos + false_pos
  return true_pos / predicted_pos * 100

# precision(logistic_pred)
# precision(decision_pred)


Precision for Logistic Regression-based Predictions => 99.19354838709677%
Precision for Decision Tree-based Predictions => 99.08256880733946%


## Recall

In [20]:
# Reference recall from scikit-learn (fractions), one per classifier.
inbuilt_rs_logistic, inbuilt_rs_decision = (
    rs(y_test, predictions) for predictions in (logistic_pred, decision_pred)
)

# Shown as percentages.
print("Recall for Logistic Regression-based Predictions =>", f"{inbuilt_rs_logistic * 100}%")
print("Recall for Decision Tree-based Predictions =>", f"{inbuilt_rs_decision * 100}%")

def recall(model_preds):
  """Recall for label 1, returned as a percentage: TP / (TP + FN) * 100.

  Reads the from-scratch confusion matrix, whose rows are true labels and
  whose columns are predicted labels.
  """
  _, (false_neg, true_pos) = confusion_matrix(model_preds)
  actual_pos = true_pos + false_neg
  return true_pos / actual_pos * 100


Recall for Logistic Regression-based Predictions => 93.18181818181817%
Recall for Decision Tree-based Predictions => 81.81818181818183%


## F-1 Score

In [25]:
# Reference F1 scores from scikit-learn (fractions), one per classifier.
inbuilt_f1s_logistic, inbuilt_f1s_decision = (
    f1s(y_test, predictions) for predictions in (logistic_pred, decision_pred)
)

# Shown as percentages.
print("F1-Score for Logistic Regression-based Predictions =>", f"{inbuilt_f1s_logistic * 100}%")
print("F1-Score for Decision Tree-based Predictions =>", f"{inbuilt_f1s_decision * 100}%")

def f1_score(model_preds):
  """F1 score for label 1, returned as a percentage.

  F1 is the harmonic mean of precision and recall. `precision()` and
  `recall()` already return percentages, so their harmonic mean is itself a
  percentage and must not be multiplied by 100 again.

  Returns 0.0 when precision and recall are both zero, rather than dividing
  by zero.
  """
  precs = precision(model_preds)
  rec = recall(model_preds)
  if precs + rec == 0:
    return 0.0
  # 2PR / (P + R) is the same quantity as 2 / (1/P + 1/R), but it stays
  # defined when only one of the two values is zero.
  return 2 * precs * rec / (precs + rec)

# print(f1_score(logistic_pred))
# print(f1_score(decision_pred))


F1-Score for Logistic Regression-based Predictions => 96.09375%
F1-Score for Decision Tree-based Predictions => 89.62655601659752%


## Class-Wise Accuracy

In [26]:
def class_accuracy(model_preds):
  """Mean of the two per-class accuracies, returned as a fraction (not a percentage).

  For each true label (one row of the from-scratch confusion matrix) the
  accuracy is the share of that row that was predicted correctly:
  TN / (TN + FP) for label 0 and TP / (TP + FN) for label 1.
  """
  (true_neg, false_pos), (false_neg, true_pos) = confusion_matrix(model_preds)
  label0_acc = true_neg / (true_neg + false_pos)
  label1_acc = true_pos / (true_pos + false_neg)
  return (label0_acc + label1_acc) / 2

## Sensitivity

In [27]:
def sensitivity(model_preds):
  """Sensitivity (true-positive rate) for label 1, returned as a fraction: TP / (TP + FN).

  Uses the label-1 row of the from-scratch confusion matrix (rows are true
  labels, columns are predicted labels).
  """
  _, (false_neg, true_pos) = confusion_matrix(model_preds)
  actual_pos = true_pos + false_neg
  return true_pos / actual_pos

## Specificity

In [28]:
def specificity(model_preds):
  """Specificity (true-negative rate) for label 0, returned as a fraction: TN / (TN + FP).

  Uses the label-0 row of the from-scratch confusion matrix (rows are true
  labels, columns are predicted labels).
  """
  (true_neg, false_pos), _ = confusion_matrix(model_preds)
  actual_neg = true_neg + false_pos
  return true_neg / actual_neg