In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/PythonForML/dataset

/content/drive/MyDrive/PythonForML/dataset


# Import some important libraries


In [3]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, log_loss

# Dataset Detail

In [4]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [5]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# kNN Implementation


## Train-test Split

In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1].values

X = X.to_numpy()

### Data Spliting

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Data preprocessing

In [8]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
print(X_train_sc)

[[ 1.50755225 -1.01521454 -0.40451932 ... -1.22070104 -0.98325882
  -0.04863985]
 [-0.82986389 -0.09964691 -0.61509602 ...  0.13719053 -0.62493647
  -0.88246592]
 [-1.12204091 -0.95207195  0.54307587 ...  0.0240329   0.39884168
  -0.5489355 ]
 ...
 [ 0.04666716 -0.85735805 -0.24658679 ... -0.9440935  -0.96519215
  -1.04923114]
 [ 2.09190629 -1.14149973  0.2272108  ... -0.26514771 -0.5075031
   0.11812536]
 [ 0.33884418  0.46863645  0.64836422 ... -4.04964181  0.51627505
   2.953134  ]]


### Validation without StandardScaler

#### Training

In [9]:
# model = KNeighborsClassifier(3)
# model.fit(X_train, y_train)
model = SVC(kernel = 'linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

#### Testing

In [10]:
y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import f1_score, accuracy_score, log_loss
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.6607142857142857
Accuracy Score: 0.8020833333333334
Log loss: 6.835853634235408


### Validation with StandardScaler

#### Training

In [12]:
# model = KNeighborsClassifier(3)
# model.fit(X_train, y_train)
model = SVC(kernel = 'linear')
model.fit(X_train_sc, y_train)

SVC(kernel='linear')

#### Testing

In [13]:
X_test_sc = scaler.fit_transform(X_test)
y_pred = model.predict(X_test_sc)

In [14]:
from sklearn.metrics import f1_score, accuracy_score, log_loss
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.672566371681416
Accuracy Score: 0.8072916666666666
Log loss: 6.655964173845248


## KFold Crossvalidation

In Kfold crossvalidation, we split the dataset into 4 subsets. Each time we pick up 1 subset/fold and use it as a testing subset. The data processing, training and testing are then performed similarly to how we do it in Train-test split.

In [15]:
type(X)

numpy.ndarray

In [16]:

kf = KFold(n_splits = 4, random_state = 0, shuffle=True)

f1 = []
f1_sc = []

accuracy = []
accuracy_sc = []

ll = []
ll_sc = []

for train_index, test_index in kf.split(X):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]
  # Data preprocessing
  scaler = StandardScaler()
  scaler.fit(X_train)

  # NO Standard Scaler
  model = SVC(kernel = 'linear', probability = True)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  y_pred_proba = model.predict_proba(X_test)

  f1.append(f1_score(y_test, y_pred))
  accuracy.append(accuracy_score(y_test, y_pred))
  ll.append(log_loss(y_test, y_pred_proba))

  # With Standard Scaler 

  X_train_sc = scaler.transform(X_train)
  model_sc = SVC(kernel = 'linear', probability = True)
  model_sc.fit(X_train_sc, y_train)

  X_test_sc = scaler.transform(X_test)
  y_pred_sc = model_sc.predict(X_test_sc)
  y_pred_sc_proba = model_sc.predict_proba(X_test_sc)

  f1_sc.append(f1_score(y_test, y_pred_sc))
  accuracy_sc.append(accuracy_score(y_test, y_pred_sc))
  ll_sc.append(log_loss(y_test, y_pred_sc_proba))

f1 = pd.DataFrame(f1)
f1_sc = pd.DataFrame(f1_sc)

accuracy = pd.DataFrame(accuracy)
accuracy_sc = pd.DataFrame(accuracy_sc)

ll = pd.DataFrame(ll)
ll_sc = pd.DataFrame(ll_sc)


print("Without Standard Scaler:")
print("F1 Score:", f1.mean()[0])
print("Accuracy Score:", accuracy.mean()[0])
print("Log loss:", ll.mean()[0])

print("With Standard Scaler:")
print("F1 Score:", f1_sc.mean()[0])
print("Accuracy Score:", accuracy_sc.mean()[0])
print("Log loss:", ll_sc.mean()[0])

Without Standard Scaler:
F1 Score: 0.6309110858083691
Accuracy Score: 0.7682291666666667
Log loss: 0.4816277178578242
With Standard Scaler:
F1 Score: 0.6334260725150556
Accuracy Score: 0.7708333333333333
Log loss: 0.48200989614255785


### Data preprocessing

In [None]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
print(X_train_sc)

[[ 0.63624625  0.84320386  0.18048995 ...  0.2056088   0.50150893
   1.45200457]
 [-0.84507081 -1.15198273 -0.12216568 ... -0.70280089 -0.37829557
  -0.20121412]
 [ 1.22877307  1.95164085 -0.22305089 ... -1.13105117  0.64495531
  -0.11420261]
 ...
 [ 0.33998284 -0.01187611  0.18048995 ... -0.75471001 -0.7161915
  -0.28822563]
 [-0.84507081  0.14647203 -0.42482131 ... -0.24859604 -0.38467096
   1.19097004]
 [-0.84507081 -0.89862571  0.07960474 ... -0.2096642  -0.49305268
  -0.89730619]]


### Validation without StandardScaler

#### Training

In [None]:
# model = KNeighborsClassifier(3)
# model.fit(X_train, y_train)
model = SVC(kernel = 'linear')
model.fit(X_train, y_train)

SVC(kernel='linear')

#### Testing

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, log_loss
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.6050420168067226
Accuracy Score: 0.7552083333333334
Log loss: 8.454879600596593


### Validation with StandardScaler

#### Training

In [None]:
# model = KNeighborsClassifier(3)
# model.fit(X_train, y_train)
model = SVC(kernel = 'linear')
model.fit(X_train_sc, y_train)

SVC(kernel='linear')

#### Testing

In [None]:
X_test_sc = scaler.fit_transform(X_test)
y_pred = model.predict(X_test_sc)

In [None]:

print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.6166666666666667
Accuracy Score: 0.7604166666666666
Log loss: 8.274990140206432
