In [20]:
# Install scikit-learn with the same interpreter that runs this kernel
# (`sys.executable`), rather than whichever `pip` is first on PATH.
import sys
!{sys.executable} -m pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0


In [163]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import sklearn
from sklearn.model_selection import train_test_split
import math
from collections import Counter
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the blood-transfusion data set from the working directory and display
# summary statistics of its columns.
bt_df = pd.read_csv("blood_transfusion.csv")
bt_df.describe()

Unnamed: 0,months_since_last_donation,total_number_of_donations,total_blood_donated,months_since_first_donation,class
count,748.0,748.0,748.0,748.0,748.0
mean,9.506684,5.514706,1378.676471,34.282086,0.237968
std,8.095396,5.839307,1459.826781,24.376714,0.426124
min,0.0,1.0,250.0,2.0,0.0
25%,2.75,2.0,500.0,16.0,0.0
50%,7.0,4.0,1000.0,28.0,0.0
75%,14.0,7.0,1750.0,50.0,0.0
max,74.0,50.0,12500.0,98.0,1.0


In [None]:
# Summarise `class` per value of `months_since_last_donation`.
# `months_since_last_donation_avg` has one row per month count, from 0 up to
# the largest value present in the data, with columns:
#   [:, 0] months since last donation
#   [:, 1] mean of `class` among rows with that value (0 where there are none)
#   [:, 2] number of rows with that value
months_col = bt_df['months_since_last_donation'].astype(int)
max_months = int(months_col.max())

months_since_last_donation_avg = np.zeros((max_months + 1, 3))
# Row i describes month value i, so the x column is i itself (not i + 1).
months_since_last_donation_avg[:, 0] = np.arange(max_months + 1)

# A true per-group mean and count; a running "(previous mean + x) / count"
# update does not produce the mean.
class_by_month = bt_df['class'].groupby(months_col).agg(['mean', 'count'])
observed_months = class_by_month.index.to_numpy()
months_since_last_donation_avg[observed_months, 1] = class_by_month['mean'].to_numpy()
months_since_last_donation_avg[observed_months, 2] = class_by_month['count'].to_numpy()

# Line plot: share of rows with class == 1 for each month value.
months_since_last_donation_fig = go.Figure()
months_since_last_donation_fig.add_trace(go.Scatter(
    x=months_since_last_donation_avg[:, 0],
    y=months_since_last_donation_avg[:, 1],
    mode='lines+markers',
    name='Data Points'
))
months_since_last_donation_fig.update_layout(
    title='Chances of coming back after last donation',
    xaxis_title='Months passed since last donation',
    yaxis_title='Chances of coming back',
    # Start slightly left of 0 so the month-0 marker is not clipped.
    xaxis=dict(range=[-0.5, 30.5])
)
months_since_last_donation_fig.show()

# Bar chart: how many rows fall on each month value. A single trace already
# holds every bar, so it is added once rather than once per row of the array.
distribution_fig = go.Figure()
distribution_fig.add_trace(go.Bar(
    x=months_since_last_donation_avg[:, 0],
    y=months_since_last_donation_avg[:, 2],
    marker=dict(color='red')
))
distribution_fig.update_layout(
    title='Number of donors by months since last donation',
    xaxis_title='Months passed since last donation',
    yaxis_title='Number of donors',
    xaxis=dict(range=[-0.5, 30.5])
)
distribution_fig.show()


In [None]:
# Check whether `class` is evenly spread through the file: cut the rows, in
# file order, into consecutive segments and plot the mean of `class` in each.
N_SEGMENTS = 15

# Split row positions rather than the DataFrame itself; handing a DataFrame to
# np.array_split goes through a deprecated pandas code path on recent versions.
segment_positions = np.array_split(np.arange(len(bt_df)), N_SEGMENTS)
segments_df = [bt_df.iloc[positions] for positions in segment_positions]

# Column 0: 1-based segment number, column 1: mean of `class` in that segment.
avg_class = np.zeros((N_SEGMENTS, 2))
for i, segment in enumerate(segments_df):
    avg_class[i][0] = i + 1
    avg_class[i][1] = segment['class'].sum() / len(segment)

class_distribution_fig = go.Figure()

class_distribution_fig.add_trace(go.Scatter(
    x=avg_class[:, 0],
    y=avg_class[:, 1],
    mode='lines+markers',
    name='Data Points'
))

class_distribution_fig.update_layout(
    title='Distribution of class variable across data',
    xaxis_title='Segments of Data',
    yaxis_title='Average Value of Class',
)

class_distribution_fig.show()

Task 3: Creating a train and test set

In [152]:
# Fixed seed so both splits, and every metric computed from them, are the same
# after a kernel restart.
RANDOM_STATE = 42

class_attribute = bt_df['class']
variables = bt_df.drop(columns=["class"])

# train_test_split returns (train, test) for each array in the order given:
# first the labels, then the feature columns.
class_train8020, class_test8020, variables_train8020, variables_test8020 = train_test_split(
    class_attribute, variables, test_size=0.2, train_size=0.8, random_state=RANDOM_STATE)
class_train9010, class_test9010, variables_train9010, variables_test9010 = train_test_split(
    class_attribute, variables, test_size=0.1, train_size=0.9, random_state=RANDOM_STATE)

Task 4: Classification Algorithms

In [None]:
#4.1: Manual implementation of a KNN classifier algorithm

def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Coordinates are compared by position, ``point1[i]`` against ``point2[i]``,
    for every index of ``point1``.
    """
    squared_total = sum(
        (point1[axis] - point2[axis]) ** 2 for axis in range(len(point1))
    )
    return math.sqrt(squared_total)

def get_neighbors(training_set, labels, test_instance, k):
    """Return the labels of the ``k`` training rows nearest to ``test_instance``.

    ``training_set[i]`` and ``labels[i]`` describe the same row. Distances are
    measured with ``euclidean_distance``; the returned list is ordered from
    nearest to farthest and has at most ``k`` entries.
    """
    scored = [
        (labels[row], euclidean_distance(test_instance, training_set[row]))
        for row in range(len(training_set))
    ]
    # sorted() is stable, so rows at the same distance keep their training order.
    nearest = sorted(scored, key=lambda pair: pair[1])[:k]
    return [label for label, _distance in nearest]

def predict_classification(training_set, labels, test_instance, k):
    """Predict a label for ``test_instance`` by majority vote of its neighbours.

    The ``k`` nearest training rows are found with ``get_neighbors`` and the
    label that ``Counter.most_common`` ranks first is returned.
    """
    votes = Counter(get_neighbors(training_set, labels, test_instance, k))
    top_label, _vote_count = votes.most_common(1)[0]
    return top_label

def confusion_matrix(predicted, actual):
    """Count the four outcomes of a binary (0/1) classification.

    Parameters
    ----------
    predicted : array-like
        Predicted labels; entries equal to 1 are positive, equal to 0 negative.
    actual : array-like
        True labels, aligned element-wise with ``predicted``.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the columns 'True positive',
        'False positive', 'True negative' and 'False negative'.
    """
    predicted = np.array(predicted)
    actual = np.array(actual)

    predicted_positive = predicted == 1
    predicted_negative = predicted == 0
    actual_positive = actual == 1
    actual_negative = actual == 0

    matrix_data = {
        'True positive': np.sum(predicted_positive & actual_positive),
        'False positive': np.sum(predicted_positive & actual_negative),
        'True negative': np.sum(predicted_negative & actual_negative),
        # A false negative is a real positive that was predicted as negative.
        'False negative': np.sum(predicted_negative & actual_positive),
    }
    return pd.DataFrame(matrix_data, index=[0])

def precision(predicted, actual):
    """Return precision, TP / (TP + FP), for binary 0/1 labels.

    The counts are read from ``confusion_matrix(predicted, actual)``.
    Returns 0.0 when there are no positive predictions (TP + FP == 0).
    """
    # Build the table once and take its only row; int() on a whole
    # one-element Series is deprecated in recent pandas.
    counts = confusion_matrix(predicted, actual).iloc[0]
    true_positive = int(counts['True positive'])
    false_positive = int(counts['False positive'])

    if true_positive + false_positive == 0:
        return 0.0

    return true_positive / (true_positive + false_positive)

def recall(predicted, actual):
    """Return recall, TP / (TP + FN), for binary 0/1 labels.

    The counts are read from the 'True positive' and 'False negative' columns
    of ``confusion_matrix(predicted, actual)``.
    Returns 0.0 when TP + FN == 0.
    """
    # Build the table once and take its only row; int() on a whole
    # one-element Series is deprecated in recent pandas.
    counts = confusion_matrix(predicted, actual).iloc[0]
    true_positive = int(counts['True positive'])
    false_negative = int(counts['False negative'])

    if true_positive + false_negative == 0:
        return 0.0

    return true_positive / (true_positive + false_negative)

# Number of nearest neighbours that vote on each prediction.
KNN_K = 20

# --- 80-20 split -----------------------------------------------------------
# Convert to NumPy once; the arrays are sized by the real test set instead of
# a hard-coded row count, so every test row is evaluated.
knn8020_train_features = variables_train8020.to_numpy()
knn8020_train_labels = class_train8020.to_numpy()

knn8020_correct = class_test8020.to_numpy(dtype=float)
knn8020_predicted = np.array(
    [
        predict_classification(knn8020_train_features, knn8020_train_labels, test_row, KNN_K)
        for test_row in variables_test8020.to_numpy()
    ],
    dtype=float,
)

evaluater_data = {'Predicted Class': knn8020_predicted, 'Correct Class': knn8020_correct}
knn8020_evaluater = pd.DataFrame(evaluater_data)

print(confusion_matrix(knn8020_evaluater['Predicted Class'], knn8020_evaluater['Correct Class']))

# Metrics are computed before formatting so the f-string needs no nested
# quotes (reusing the outer quote inside `{...}` requires Python 3.12+).
knn8020_precision = precision(knn8020_evaluater['Predicted Class'], knn8020_evaluater['Correct Class'])*100
knn8020_recall = recall(knn8020_evaluater['Predicted Class'], knn8020_evaluater['Correct Class'])*100
print(f'For a KNN Classifier with a 80-20 split, the precision is: {knn8020_precision}% and the recall is: {knn8020_recall}%')

# --- 90-10 split -----------------------------------------------------------
knn9010_train_features = variables_train9010.to_numpy()
knn9010_train_labels = class_train9010.to_numpy()

knn9010_correct = class_test9010.to_numpy(dtype=float)
knn9010_predicted = np.array(
    [
        predict_classification(knn9010_train_features, knn9010_train_labels, test_row, KNN_K)
        for test_row in variables_test9010.to_numpy()
    ],
    dtype=float,
)

evaluater_data = {'Predicted Class': knn9010_predicted, 'Correct Class': knn9010_correct}
knn9010_evaluater = pd.DataFrame(evaluater_data)

knn9010_precision = precision(knn9010_evaluater['Predicted Class'], knn9010_evaluater['Correct Class'])*100
knn9010_recall = recall(knn9010_evaluater['Predicted Class'], knn9010_evaluater['Correct Class'])*100
print(f'For a KNN Classifier with a 90-10 split, the precision is: {knn9010_precision}% and the recall is: {knn9010_recall}%')

In [None]:
#4.2: Naive Bayes Classifier

# --- 80-20 split -----------------------------------------------------------
nb_clf8020 = GaussianNB()
nb_clf8020.fit(variables_train8020, class_train8020)

# True labels as a float array sized by the real test set (no hard-coded
# row count), in the same row order as the predictions.
nb8020_correct = class_test8020.to_numpy(dtype=float)
nb8020_predicted = nb_clf8020.predict(variables_test8020)

evaluater_data = {'Predicted Class': nb8020_predicted, 'Correct Class': nb8020_correct}
nb8020_evaluater = pd.DataFrame(evaluater_data)

# Metrics are computed before formatting so the f-string needs no nested
# quotes or line breaks inside `{...}` (both require Python 3.12+).
nb8020_precision = int(precision(nb8020_evaluater['Predicted Class'], nb8020_evaluater['Correct Class'])*100)
nb8020_recall = int(recall(nb8020_evaluater['Predicted Class'], nb8020_evaluater['Correct Class'])*100)
print(f'For a Naive Bayes Classifier with a 80-20 split, the precision is: {nb8020_precision}% and the recall is: {nb8020_recall}%')

# --- 90-10 split -----------------------------------------------------------
nb_clf9010 = GaussianNB()
nb_clf9010.fit(variables_train9010, class_train9010)

nb9010_correct = class_test9010.to_numpy(dtype=float)
nb9010_predicted = nb_clf9010.predict(variables_test9010)

evaluater_data = {'Predicted Class': nb9010_predicted, 'Correct Class': nb9010_correct}
nb9010_evaluater = pd.DataFrame(evaluater_data)

nb9010_precision = int(precision(nb9010_evaluater['Predicted Class'], nb9010_evaluater['Correct Class'])*100)
nb9010_recall = int(recall(nb9010_evaluater['Predicted Class'], nb9010_evaluater['Correct Class'])*100)
print(f'For a Naive Bayes Classifier with a 90-10 split, the precision is: {nb9010_precision}% and the recall is: {nb9010_recall}%')


In [None]:
#4.3 Support Vector Classifier

# --- 80-20 split -----------------------------------------------------------
svc_clf8020 = SVC(gamma='auto')
svc_clf8020.fit(variables_train8020, class_train8020)

# True labels as a float array sized by the real test set (no hard-coded
# row count), in the same row order as the predictions.
svc8020_correct = class_test8020.to_numpy(dtype=float)
svc8020_predicted = svc_clf8020.predict(variables_test8020)

evaluater_data = {'Predicted Class': svc8020_predicted, 'Correct Class': svc8020_correct}
svc8020_evaluater = pd.DataFrame(evaluater_data)

# Metrics are computed before formatting so the f-string needs no nested
# quotes or line breaks inside `{...}` (both require Python 3.12+).
svc8020_precision = int(precision(svc8020_evaluater['Predicted Class'], svc8020_evaluater['Correct Class'])*100)
svc8020_recall = int(recall(svc8020_evaluater['Predicted Class'], svc8020_evaluater['Correct Class'])*100)
print(f'For a Support Vector Classifier with a 80-20 split, the precision is: {svc8020_precision}% and the recall is: {svc8020_recall}%')

# --- 90-10 split -----------------------------------------------------------
svc_clf9010 = SVC(gamma='auto')
svc_clf9010.fit(variables_train9010, class_train9010)

svc9010_correct = class_test9010.to_numpy(dtype=float)
svc9010_predicted = svc_clf9010.predict(variables_test9010)

evaluater_data = {'Predicted Class': svc9010_predicted, 'Correct Class': svc9010_correct}
svc9010_evaluater = pd.DataFrame(evaluater_data)

svc9010_precision = int(precision(svc9010_evaluater['Predicted Class'], svc9010_evaluater['Correct Class'])*100)
svc9010_recall = int(recall(svc9010_evaluater['Predicted Class'], svc9010_evaluater['Correct Class'])*100)
print(f'For a Support Vector Classifier with a 90-10 split, the precision is: {svc9010_precision}% and the recall is: {svc9010_recall}%')


In [None]:
#4.4 Multilayer Perceptron (Neural Network) Classifier

# --- 80-20 split -----------------------------------------------------------
# random_state fixes the weight initialisation and shuffling so the trained
# network, and therefore the reported metrics, are repeatable.
mlp_clf8020 = MLPClassifier(max_iter=300, random_state=42)
mlp_clf8020.fit(variables_train8020, class_train8020)

# True labels as a float array sized by the real test set (no hard-coded
# row count), in the same row order as the predictions.
mlp8020_correct = class_test8020.to_numpy(dtype=float)
mlp8020_predicted = mlp_clf8020.predict(variables_test8020)

evaluater_data = {'Predicted Class': mlp8020_predicted, 'Correct Class': mlp8020_correct}
mlp8020_evaluater = pd.DataFrame(evaluater_data)

# Metrics are computed before formatting so the f-string needs no nested
# quotes or line breaks inside `{...}` (both require Python 3.12+).
mlp8020_precision = int(precision(mlp8020_evaluater['Predicted Class'], mlp8020_evaluater['Correct Class'])*100)
mlp8020_recall = int(recall(mlp8020_evaluater['Predicted Class'], mlp8020_evaluater['Correct Class'])*100)
print(f'For a Multilayer Perceptron Classifier with a 80-20 split, the precision is: {mlp8020_precision}% and the recall is: {mlp8020_recall}%')

# --- 90-10 split -----------------------------------------------------------
mlp_clf9010 = MLPClassifier(max_iter=300, random_state=42)
mlp_clf9010.fit(variables_train9010, class_train9010)

mlp9010_correct = class_test9010.to_numpy(dtype=float)
mlp9010_predicted = mlp_clf9010.predict(variables_test9010)

evaluater_data = {'Predicted Class': mlp9010_predicted, 'Correct Class': mlp9010_correct}
mlp9010_evaluater = pd.DataFrame(evaluater_data)

mlp9010_precision = int(precision(mlp9010_evaluater['Predicted Class'], mlp9010_evaluater['Correct Class'])*100)
mlp9010_recall = int(recall(mlp9010_evaluater['Predicted Class'], mlp9010_evaluater['Correct Class'])*100)
print(f'For a Multilayer Perceptron Classifier with a 90-10 split, the precision is: {mlp9010_precision}% and the recall is: {mlp9010_recall}%')


Task 5: Evaluation of classification methods

In [158]:
#5.1 Manual implementation of confusion matrix

# Implemented in section 4.1 as the function `confusion_matrix`.

In [None]:
#5.2 Classification Report

# (title, true labels, predicted labels) for every classifier / split pair,
# in the order the reports are printed.
report_inputs = [
    ("Naive Bayes Classifier with 80-20 split", nb8020_correct, nb8020_predicted),
    ("Naive Bayes Classifier with 90-10 split", nb9010_correct, nb9010_predicted),
    ("KNN Classifier with 80-20 split", knn8020_correct, knn8020_predicted),
    ("KNN Classifier with 90-10 split", knn9010_correct, knn9010_predicted),
    ("Support Vector Classifier with 80-20 split", svc8020_correct, svc8020_predicted),
    ("Support Vector Classifier with 90-10 split", svc9010_correct, svc9010_predicted),
    ("Multilayer Perceptron Classifier with 80-20 split", mlp8020_correct, mlp8020_predicted),
    ("Multilayer Perceptron Classifier with 90-10 split", mlp9010_correct, mlp9010_predicted),
]

for report_title, correct_labels, predicted_labels in report_inputs:
    # zero_division=0 on every report: a class that is never predicted scores
    # 0 without emitting a warning, for both splits alike.
    print(f"Classification Report of {report_title}:\n",
          classification_report(correct_labels, predicted_labels, zero_division=0))

In [None]:
#5.3 Fbeta Score