In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

Demo KFold split

In [2]:
kf = KFold(n_splits=3) #3 folds

In [3]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8]):
    print(train_index, test_index)

[3 4 5 6 7] [0 1 2]
[0 1 2 6 7] [3 4 5]
[0 1 2 3 4 5] [6 7]


Load IRIS data

In [4]:
iris = load_iris()

In [5]:
dir (iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

Split data into train and test

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.33)

Define a function to get score passing algorithm and data set details

In [7]:
def get_score(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [8]:
get_score(LogisticRegression(), X_train, y_train, X_test, y_test)

0.98

In [9]:
get_score(SVC(), X_train, y_train, X_test, y_test)

0.98

In [10]:
get_score(RandomForestClassifier(n_estimators=10), X_train, y_train, X_test, y_test)

0.98

**All seem equally fine**

Now compare it with Stratified KFold

In [11]:
import statistics
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)

Manual calculation

In [12]:
scores_lr =[]
scores_svc = []
scores_rf = []

for train_index, test_index in skf.split(iris.data, iris.target):
    
    X_train, X_test = iris.data[train_index], iris.data[test_index]
    y_train, y_test = iris.target[train_index], iris.target[test_index]

    scores_lr.append(get_score(LogisticRegression(solver='lbfgs', max_iter=150), X_train, y_train, X_test, y_test))
    scores_svc.append(get_score(SVC(), X_train, y_train, X_test, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=42), X_train, y_train, X_test, y_test))

print(f"Logistic Regression Mean: {(statistics.mean(scores_lr)):.5f}")
print(f"SVM Mean: {(statistics.mean(scores_svc)):.5f}")
print(f"Random Forest Mean: {(statistics.mean(scores_rf)):.5f}")

Logistic Regression Mean: 0.97333
SVM Mean: 0.96000
Random Forest Mean: 0.95333


**Logistic Regression seems to be the best**

Now, automating using Scikit Learn's cross_val_score

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
scores_lr_cvs = cross_val_score(LogisticRegression(solver='lbfgs', max_iter=150), iris.data, iris.target)
statistics.mean(scores_lr_cvs)

np.float64(0.9733333333333334)

In [15]:
scores_svm_cvs = cross_val_score(SVC(), iris.data, iris.target)
statistics.mean(scores_svm_cvs)

np.float64(0.9666666666666667)

In [16]:
scores_rf_cvs = cross_val_score(RandomForestClassifier(n_estimators=42), iris.data, iris.target)
statistics.mean(scores_rf_cvs)

np.float64(0.96)

**Logistic Regression is the best, seems to match manual caluclation**