# Import some important libraries


In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

# Dataset Detail

In [2]:
df = pd.read_csv("diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# kNN Implementation


## Train-test Split

### Data Spliting

In [4]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

### Data preprocessing

In [5]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
print(X_train_sc)

[[ 1.50755225 -1.01521454 -0.40451932 ... -1.22070104 -0.98325882
  -0.04863985]
 [-0.82986389 -0.09964691 -0.61509602 ...  0.13719053 -0.62493647
  -0.88246592]
 [-1.12204091 -0.95207195  0.54307587 ...  0.0240329   0.39884168
  -0.5489355 ]
 ...
 [ 0.04666716 -0.85735805 -0.24658679 ... -0.9440935  -0.96519215
  -1.04923114]
 [ 2.09190629 -1.14149973  0.2272108  ... -0.26514771 -0.5075031
   0.11812536]
 [ 0.33884418  0.46863645  0.64836422 ... -4.04964181  0.51627505
   2.953134  ]]


### Validation without StandardScaler

#### Training

In [9]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

#### Testing

In [10]:
y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import f1_score, accuracy_score, log_loss
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.6491228070175439
Accuracy Score: 0.7916666666666666
Log loss: 7.195640884155627


### Validation with StandardScaler

#### Training

In [12]:
model = LogisticRegression()
model.fit(X_train_sc, y_train)

LogisticRegression()

#### Testing

In [13]:
X_test_sc = scaler.fit_transform(X_test)
y_pred = model.predict(X_test_sc)

In [14]:
from sklearn.metrics import f1_score, accuracy_score, log_loss
print("F1 Score:", f1_score(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("Log loss:", log_loss(y_test, y_pred))

F1 Score: 0.6486486486486487
Accuracy Score: 0.796875
Log loss: 7.015743094625569


## KFold Crossvalidation

In Kfold crossvalidation, we split the dataset into 4 subsets. Each time we pick up 1 subset/fold and use it as a testing subset. The data processing, training and testing are then performed similarly to how we do it in Train-test split.

In [15]:
from sklearn.model_selection import KFold
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [16]:
k = 4
kf = KFold(n_splits = k, shuffle = True, random_state = 0)

In [17]:
X_to_numpy = X.to_numpy()

#### Validation without StandardScaler

In [18]:
f1 = []
acc = []
ll = []

for train_index, test_index in kf.split(X_to_numpy):
  X_train, X_test = X_to_numpy[train_index], X_to_numpy[test_index]
  y_train, y_test = y[train_index], y[test_index]

  model = LogisticRegression()
  model.fit(X_train, y_train)
  
  # Tesing 
  y_pred = model.predict(X_test)

  # Evaluation 
  f1.append(f1_score(y_test, y_pred, average='macro'))
  acc.append(accuracy_score(y_test, y_pred))
  prediction_prob = model.predict_proba(X_test)
  ll.append(log_loss(y_test, prediction_prob))
  

print("Validation without StandardScaler")
print("Mean f1 score: ", np.mean(f1))
print("Mean accuracy score: ", np.mean(acc))
print("Mean log loss score: ", np.mean(ll))

Validation without StandardScaler
Mean f1 score:  0.7296577654086522
Mean accuracy score:  0.7682291666666666
Mean log loss score:  0.4874982591499988


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

#### Validation with StandardScaler

In [20]:
f1 = []
acc = []
ll = []

for train_index, test_index in kf.split(X_to_numpy):
  X_train, X_test = X_to_numpy[train_index], X_to_numpy[test_index]
  y_train, y_test = y[train_index], y[test_index]

  # Apply StandardScaler
  scaler = StandardScaler()
  X_train_sc = scaler.fit_transform(X_train)
  X_test_sc = scaler.fit_transform(X_test)
  


  # Train Logictic Regression model
  model = LogisticRegression()
  model.fit(X_train_sc, y_train)

  # Tesing 
  y_pred_sc = model.predict(X_test_sc)

  # Evaluation 
  f1.append(f1_score(y_test, y_pred_sc, average='macro'))
  acc.append(accuracy_score(y_test, y_pred_sc))
  prediction_prob = model.predict_proba(X_test_sc)
  ll.append(log_loss(y_test, prediction_prob))
  

print("Validation with StandardScaler")
print("Mean f1 score: ", np.mean(f1))
print("Mean accuracy score: ", np.mean(acc))
print("Mean log loss score: ", np.mean(ll))

Validation without StandardScaler
Mean f1 score:  0.7345362700894764
Mean accuracy score:  0.7734375000000001
Mean log loss score:  0.48138820269613747
