## 7324 Assignment A3 : KNN vs DT
##### Name: Thang Nguyen
##### SMU ID: 48689334

## Imports

In [454]:
# The Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# Data Wrangling Tool
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Foundational libraries
import pandas as pd
import numpy as np
import time

## Loading dataset

In [455]:
# Load the assignment dataset from the notebook-relative data directory.
bio_df = pd.read_csv("../data/a3.biodata.csv")
# Check for nulls: .info() prints the non-null count and dtype of every column.
bio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   f1        210 non-null    float64
 1   f2        210 non-null    float64
 2   f3        210 non-null    float64
 3   f4        210 non-null    float64
 4   f5        210 non-null    float64
 5   f6        210 non-null    float64
 6   f7        210 non-null    float64
 7   category  210 non-null    int64  
dtypes: float64(7), int64(1)
memory usage: 13.2 KB


In [456]:
# Preview the first rows to see the raw value range of each feature
# (useful context for the scaling experiments later in the notebook).
bio_df.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,category
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [457]:
# The label lives in 'category'; every other column is a predictor.
target = bio_df['category']
features = bio_df.drop(columns='category')

## All Features: KNN

In [458]:
knn_X, knn_y = features, target

# 80/20 stratified hold-out split with a fixed seed so results are repeatable.
knn_X_train, knn_X_test, knn_y_train, knn_y_test = train_test_split(
    knn_X,
    knn_y,
    train_size=0.8,
    random_state=0,
    stratify=knn_y,
)

In [459]:
# KNN classifier voting over 5 neighbors, run on a single job.
knn = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=1,
)

In [460]:
# Train on the training split, then label the held-out test split.
# (fit() returns the estimator itself, so the calls can be chained.)
knn_y_predict = knn.fit(knn_X_train, knn_y_train).predict(knn_X_test)

In [461]:
# Report overall accuracy, the per-class precision/recall table, and the
# confusion matrix, one after another.
print(
    accuracy_score(knn_y_test, knn_y_predict),
    classification_report(knn_y_test, knn_y_predict),
    confusion_matrix(knn_y_test, knn_y_predict),
    sep="\n",
)

0.9285714285714286
              precision    recall  f1-score   support

           1       1.00      0.79      0.88        14
           2       0.93      1.00      0.97        14
           3       0.88      1.00      0.93        14

    accuracy                           0.93        42
   macro avg       0.94      0.93      0.93        42
weighted avg       0.94      0.93      0.93        42

[[11  1  2]
 [ 0 14  0]
 [ 0  0 14]]


## All Features w/Scaling: KNN

In [462]:
# Standardise the features (zero mean, unit variance) for the distance-based KNN.
scaler = StandardScaler()
# Learn the mean/std from the TRAINING split only, then scale it.
knn_X_train_scaled = scaler.fit_transform(knn_X_train)
# Apply the training statistics to the test split. Calling fit_transform here
# would re-fit the scaler on the test data, which leaks test-set information
# and puts train and test on different scales.
# NOTE: recorded outputs further down were produced with the old per-split
# fit; re-run the notebook to refresh them.
knn_X_test_scaled = scaler.transform(knn_X_test)

In [463]:
# Re-train the same classifier on the standardised training data and
# predict labels for the standardised test data.
knn_y_predict = knn.fit(knn_X_train_scaled, knn_y_train).predict(knn_X_test_scaled)

In [464]:
# Accuracy, per-class report, and confusion matrix for the scaled run.
print(
    accuracy_score(knn_y_test, knn_y_predict),
    classification_report(knn_y_test, knn_y_predict),
    confusion_matrix(knn_y_test, knn_y_predict),
    sep="\n",
)

1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        14

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

[[14  0  0]
 [ 0 14  0]
 [ 0  0 14]]


## Some Features Removed: KNN 
### Removed features were determined through trial and error

In [465]:
# Build a reduced frame without f1 and f2 (bio_df itself is left untouched).
bio_df_blah = bio_df.drop(columns=['f1', 'f2'])

# Label column vs. the remaining predictor columns of the reduced frame.
target = bio_df_blah['category']
features = bio_df_blah.drop(columns='category')

In [466]:
knn_X, knn_y = features, target

# Same 80/20 stratified split and seed as before, now on the reduced features.
knn_X_train, knn_X_test, knn_y_train, knn_y_test = train_test_split(
    knn_X,
    knn_y,
    train_size=0.8,
    random_state=0,
    stratify=knn_y,
)

In [467]:
# Fresh KNN classifier: 5 neighbors, single job.
knn = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=1,
)

In [468]:
# Train on the reduced-feature training split and predict the test split.
knn_y_predict = knn.fit(knn_X_train, knn_y_train).predict(knn_X_test)

In [469]:
# Accuracy, per-class report, and confusion matrix for the reduced-feature run.
print(
    accuracy_score(knn_y_test, knn_y_predict),
    classification_report(knn_y_test, knn_y_predict),
    confusion_matrix(knn_y_test, knn_y_predict),
    sep="\n",
)

0.9761904761904762
              precision    recall  f1-score   support

           1       1.00      0.93      0.96        14
           2       0.93      1.00      0.97        14
           3       1.00      1.00      1.00        14

    accuracy                           0.98        42
   macro avg       0.98      0.98      0.98        42
weighted avg       0.98      0.98      0.98        42

[[13  1  0]
 [ 0 14  0]
 [ 0  0 14]]


## Some Features Removed w/Scaling: KNN

In [470]:
# Standardise the reduced feature set for the distance-based KNN.
scaler = StandardScaler()
# Learn the mean/std from the TRAINING split only, then scale it.
knn_X_train_scaled = scaler.fit_transform(knn_X_train)
# Reuse the training statistics for the test split. fit_transform here would
# re-fit on the test data, leaking test-set information and scaling the two
# splits inconsistently.
# NOTE: recorded outputs further down were produced with the old per-split
# fit; re-run the notebook to refresh them.
knn_X_test_scaled = scaler.transform(knn_X_test)

In [471]:
# Re-train on the standardised reduced features, then predict the
# standardised test split.
knn_y_predict = knn.fit(knn_X_train_scaled, knn_y_train).predict(knn_X_test_scaled)

In [472]:
# Accuracy, per-class report, and confusion matrix for the scaled,
# reduced-feature run.
print(
    accuracy_score(knn_y_test, knn_y_predict),
    classification_report(knn_y_test, knn_y_predict),
    confusion_matrix(knn_y_test, knn_y_predict),
    sep="\n",
)

1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00        14

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

[[14  0  0]
 [ 0 14  0]
 [ 0  0 14]]


## All Features: Decision Trees

### Resetting bio_df

In [473]:
# Re-read the original CSV so the decision-tree experiments start from the
# full feature set. (The earlier feature removal was stored in bio_df_blah,
# not in bio_df, so this acts as a safety reset.)
bio_df = pd.read_csv("../data/a3.biodata.csv")

In [474]:
# Label column vs. all predictor columns of the freshly loaded frame.
target = bio_df['category']
features = bio_df.drop(columns='category')

In [475]:
dt_X, dt_y = features, target

# 80/20 stratified hold-out split with the same seed used for the KNN runs.
dt_X_train, dt_X_test, dt_y_train, dt_y_test = train_test_split(
    dt_X,
    dt_y,
    train_size=0.8,
    random_state=0,
    stratify=dt_y,
)

In [476]:
# Decision tree with a fixed random_state so repeated runs build the same tree.
dt = DecisionTreeClassifier(random_state=0)

In [477]:
# Train the tree on the training split and predict the held-out test split.
dt_y_predict = dt.fit(dt_X_train, dt_y_train).predict(dt_X_test)

In [478]:
# Accuracy, per-class report, and confusion matrix for the decision tree.
print(
    accuracy_score(dt_y_test, dt_y_predict),
    classification_report(dt_y_test, dt_y_predict),
    confusion_matrix(dt_y_test, dt_y_predict),
    sep="\n",
)

0.9285714285714286
              precision    recall  f1-score   support

           1       0.92      0.86      0.89        14
           2       1.00      1.00      1.00        14
           3       0.87      0.93      0.90        14

    accuracy                           0.93        42
   macro avg       0.93      0.93      0.93        42
weighted avg       0.93      0.93      0.93        42

[[12  0  2]
 [ 0 14  0]
 [ 1  0 13]]


## All Features w/Scaling: Decision Trees

## Some Features: Decision Trees

## Some Features w/Scaling: Decision Trees