# K-fold cross validation of Logistic Regression model

In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load dataset
data = load_breast_cancer()
X = data.data
y = data.target

print("X : ", type(X), X.shape)
print("y : ", type(y), y.shape)

# Scale features
scaler = StandardScaler()   # Z score normalization mean=0, standard_deviation=1
X_scaled = scaler.fit_transform(X)

# Set up K-Fold
kf = KFold(n_splits=5, shuffle=True, random_state=42)   # The shuffling is done only once, at the beginning, before the data is split into K folds
acc_scores = []

# Manual K-Fold CV
for train_index, val_index in kf.split(X_scaled):
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # Train logistic regression model
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    acc_scores.append(acc)

# Results
print("Manual CV Accuracy Scores:", acc_scores)
print("Average Accuracy:", np.mean(acc_scores))

X :  <class 'numpy.ndarray'> (569, 30)
y :  <class 'numpy.ndarray'> (569,)
Manual CV Accuracy Scores: [0.9736842105263158, 0.9824561403508771, 0.9649122807017544, 0.9912280701754386, 0.9734513274336283]
Average Accuracy: 0.9771464058376029


# Getting the final model

In [8]:
# Train final model on full dataset
final_model = LogisticRegression(max_iter=1000)
final_model.fit(X_scaled, y)

print("Final trained models is ready to predict on other data")

Final trained models is ready to predict on other data
