In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

iris = load_iris()
X = iris.data
y = iris.target

cancer = load_breast_cancer()
X_cancer = cancer.data
y_cancer = cancer.target

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_cancer = scaler.fit_transform(X_cancer)

knn = KNeighborsClassifier(n_neighbors=3)
dt = DecisionTreeClassifier()
gnb = GaussianNB()

ratios = [0.75, 0.666]
for ratio in ratios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=42)
    knn.fit(X_train, y_train)
    dt.fit(X_train, y_train)
    gnb.fit(X_train, y_train)
    y_pred_knn = knn.predict(X_test)
    y_pred_dt = dt.predict(X_test)
    y_pred_gnb = gnb.predict(X_test)
    print(f"I. a) Training set = {ratio*100}%, Test set = {(1-ratio)*100}%")
    print(f"KNN accuracy: {accuracy_score(y_test, y_pred_knn)*100:.2f}%")
    print(f"DT accuracy: {accuracy_score(y_test, y_pred_dt)*100:.2f}%")
    print(f"GNB accuracy: {accuracy_score(y_test, y_pred_gnb)*100:.2f}%")
    print()

methods = ['hold out', 'random subsampling', 'cross-validation']
for method in methods:
    if method == 'hold out':
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    elif method == 'random subsampling':
        X_train, y_train = resample(X, y, replace=False, n_samples=int(len(X)*2/3), random_state=42)
        X_test, y_test = X[~np.in1d(np.arange(len(X)), np.where(np.isin(y, y_train))[0])], y[~np.in1d(np.arange(len(y)), np.where(np.isin(y, y_train))[0])]
    elif method == 'cross-validation':
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            knn.fit(X_train, y_train)
            dt.fit(X_train, y_train)
            gnb.fit(X_train, y_train)
            y_pred_knn = knn.predict(X_test)
            y_pred_dt = dt.predict(X_test)
            y_pred_gnb = gnb.predict(X_test)
            print(f"Method: {method}")
            print(f"KNN accuracy: {accuracy_score(y_test, y_pred_knn)*100:.2f}%")
            print(f"DT accuracy: {accuracy_score(y_test, y_pred_dt)*100:.2f}%")
            print(f"GNB accuracy: {accuracy_score(y_test, y_pred_gnb)*100:.2f}%")
            print()


I. a) Training set = 75.0%, Test set = 25.0%
KNN accuracy: 94.69%
DT accuracy: 92.92%
GNB accuracy: 95.58%

I. a) Training set = 66.60000000000001%, Test set = 33.4%
KNN accuracy: 93.00%
DT accuracy: 92.00%
GNB accuracy: 94.00%

Method: cross-validation
KNN accuracy: 100.00%
DT accuracy: 100.00%
GNB accuracy: 98.00%

Method: cross-validation
KNN accuracy: 90.00%
DT accuracy: 92.00%
GNB accuracy: 94.00%

Method: cross-validation
KNN accuracy: 96.00%
DT accuracy: 94.00%
GNB accuracy: 94.00%

