<a href="https://colab.research.google.com/github/wiroel/F1AML/blob/main/K_FOLD_cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
import os


# Download latest version
dataset_dir = kagglehub.dataset_download("fredericobreno/play-tennis")

# Find the CSV file within the directory - assuming there's only one CSV file
for filename in os.listdir(dataset_dir):
    if filename.endswith(".csv"):
        filepath = os.path.join(dataset_dir, filename)
        break  # Stop after finding the first CSV file

print("Path to dataset file:", filepath)

Downloading from https://www.kaggle.com/api/v1/datasets/download/fredericobreno/play-tennis?dataset_version_number=1...


100%|██████████| 337/337 [00:00<00:00, 312kB/s]

Extracting files...
Path to dataset file: /root/.cache/kagglehub/datasets/fredericobreno/play-tennis/versions/1/play_tennis.csv





In [None]:
df = pd.read_csv(filepath)
df

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
6,D7,Overcast,Cool,Normal,Strong,Yes
7,D8,Sunny,Mild,High,Weak,No
8,D9,Sunny,Cool,Normal,Weak,Yes
9,D10,Rain,Mild,Normal,Weak,Yes


In [None]:
print(f'rows: {df.shape[0]}, columns: {df.shape[1]}')

rows: 14, columns: 6


In [None]:
df.describe()

Unnamed: 0,day,outlook,temp,humidity,wind,play
count,14,14,14,14,14,14
unique,14,3,3,2,2,2
top,D1,Sunny,Mild,High,Weak,Yes
freq,1,5,6,7,8,9


In [None]:
df['play'].value_counts()

Unnamed: 0_level_0,count
play,Unnamed: 1_level_1
Yes,9
No,5


In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
label_encoders = {}
for col in df.columns:              #ensures it process only strings
  if df[col].dtype == 'object':
    le= LabelEncoder()
    df[col] = le.fit_transform(df[col])     # fit learns the unique categories and assign them different numbers while transform() replaces them wth those numbers
    label_encoders[col] = le                # stores encoder for reverse transformation

In [None]:
X = df.iloc[:, :-1]     # features-except last column
y = df.iloc[:, -1]      # target varieable-only last column

In [None]:
train_test_ratios = [0.8, 0.7, 0.6]

# K-fold cross  validation

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
import joblib

for ratio in train_test_ratios:
  print(f"\n Train-Test Ratio: {int(ratio*100)} - {int((1-ratio)*100)}")

  fold_accuracies = []

  for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):

    # Split data
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # adjust train-test split ratio
    train_size= int(len(X_train)*ratio)
    X_train, y_train= X_train[:train_size], y_train[:train_size]

    # train model
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    # Save the trained model for this fold and ratio
    model_filename = f"decision_tree_ratio_{int(ratio * 100)}_fold_{fold}.pkl"
    joblib.dump(model, model_filename)


    # predict and evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    fold_accuracies.append(accuracy)

    print(f"Fold {fold} Accuracy: {accuracy:.4f} (Saved as {model_filename})")



  print(f"Average accuracy: {sum(fold_accuracies)/len(fold_accuracies):4f}")


# Load and test a saved model (example: first model)
sample_model = joblib.load("decision_tree_ratio_80_fold_1.pkl")
y_pred_sample = sample_model.predict(X_test)
print("\nLoaded Model Test Accuracy:", accuracy_score(y_test, y_pred_sample))


 Train-Test Ratio: 80 - 19
Fold 1 Accuracy: 0.3333 (Saved as decision_tree_ratio_80_fold_1.pkl)
Fold 2 Accuracy: 0.3333 (Saved as decision_tree_ratio_80_fold_2.pkl)
Fold 3 Accuracy: 0.3333 (Saved as decision_tree_ratio_80_fold_3.pkl)
Fold 4 Accuracy: 0.3333 (Saved as decision_tree_ratio_80_fold_4.pkl)
Fold 5 Accuracy: 0.5000 (Saved as decision_tree_ratio_80_fold_5.pkl)
Average accuracy: 0.366667

 Train-Test Ratio: 70 - 30
Fold 1 Accuracy: 1.0000 (Saved as decision_tree_ratio_70_fold_1.pkl)
Fold 2 Accuracy: 0.3333 (Saved as decision_tree_ratio_70_fold_2.pkl)
Fold 3 Accuracy: 0.3333 (Saved as decision_tree_ratio_70_fold_3.pkl)
Fold 4 Accuracy: 0.3333 (Saved as decision_tree_ratio_70_fold_4.pkl)
Fold 5 Accuracy: 0.5000 (Saved as decision_tree_ratio_70_fold_5.pkl)
Average accuracy: 0.500000

 Train-Test Ratio: 60 - 40
Fold 1 Accuracy: 0.6667 (Saved as decision_tree_ratio_60_fold_1.pkl)
Fold 2 Accuracy: 0.3333 (Saved as decision_tree_ratio_60_fold_2.pkl)
Fold 3 Accuracy: 0.6667 (Saved as 