# TM10007 Assignment template

In [25]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [1]:
# Data loading functions. Uncomment the one you want to use
#from worcgist.load_data import load_data
from worclipo.load_data import load_data
#from worcliver.load_data import load_data
#from ecg.load_data import load_data

# Import classifiers
from sklearn import model_selection
from sklearn.model_selection import train_test_split, StratifiedKFold


data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

X = data.drop("label",axis=1)
y = data["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
    X_train_fold = X_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    X_val_fold = X_train.iloc[val_idx]
    y_val_fold = y_train.iloc[val_idx]

    print(f"Fold {fold}:")
    print(f" - Train size: {len(X_train_fold)}")
    print(f" - Validation size: {len(X_val_fold)}")


The number of samples: 115
The number of columns: 494
Fold 1:
 - Train size: 73
 - Validation size: 19
Fold 2:
 - Train size: 73
 - Validation size: 19
Fold 3:
 - Train size: 74
 - Validation size: 18
Fold 4:
 - Train size: 74
 - Validation size: 18
Fold 5:
 - Train size: 74
 - Validation size: 18


### Preprocessing

Missing data
* Part 1: Finding missing data

In [None]:
# Define missing value indicators
custom_missing = ['NA', 'N/A', '?', 'None', 'none', '-']

# Count NaNs
nan_counts = X_train.isna().sum()

# Count empty strings
empty_string_counts = (X_train == '').sum()

# Count custom missing indicators (case-insensitive match)
custom_missing_counts = X_train.apply(lambda col: col.astype(str).str.lower().isin([val.lower() for val in custom_missing]).sum())

# Compute total missing count per column
total_missing = nan_counts + empty_string_counts + custom_missing_counts

# Filter out columns where total missing is zero
total_missing_selected = total_missing[total_missing != 0]

# Print total missing counts
print(total_missing_selected)

Series([], dtype: int64)


* Part 2: Processing missing data

In [None]:
# Import
import numpy as np
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro

# Replacing missing values with NaN
custom_missing = ['NA', 'N/A', '?', 'None', 'none', '-', '']
X_train.replace(custom_missing, np.nan, inplace=True)
X_test.replace(custom_missing, np.nan, inplace=True)

# If 50% or more of the data within one feature is missing the feature is deleted
limit = len(X_train.index)*50/100
valid_columns = [col for col, count in total_missing.items() if count < limit]

# Keep only the valid columns in both X_train and X_test
X_train = X_train[valid_columns]
X_test = X_test[valid_columns]

# Imputate 

# Check if imputation is needed
if X_train.isna().sum().sum() == 0:
    pass
else:
    # Dictionary to store mean/median decision per column
    imputation_strategies = {}

    for col in X_train.select_dtypes(include=['number']).columns:  # Only numeric columns
        col_data = X_train[col].dropna()  # Remove NaN values for testing

        if len(col_data) > 3:  # Shapiro requires at least 3 non-null values
            if col_data.nunique() == 1:  # Check if all values are the same
                strategy = 'median'  # Default to median if no variability
            else:
                _, p = shapiro(col_data)
                strategy = 'mean' if p > 0.05 else 'median'
        else:
            strategy = 'median'  # Default to median if too few values

        imputation_strategies[col] = strategy

    # Create imputers for mean and median
    mean_imputer = SimpleImputer(strategy='mean')
    median_imputer = SimpleImputer(strategy='median')

    # Apply imputers for each feature
    for col, strategy in imputation_strategies.items():
        imputer = mean_imputer if strategy == 'mean' else median_imputer
        X_train[col] = imputer.fit_transform(X_train[[col]])
        X_test[col] = imputer.transform(X_test[[col]])  # Use the same imputer

Scaling data

In [29]:
import pandas as pd
from sklearn import preprocessing

scaler_robust = preprocessing.RobustScaler()

scaled_robust_array_train = scaler_robust.fit_transform(X_train)
scaled_robust_array_test = scaler_robust.transform(X_test)

X_scaled_robust_train = pd.DataFrame(scaled_robust_array_train, columns=X_train.columns)
X_scaled_robust_test = pd.DataFrame(scaled_robust_array_test, columns=X_test.columns)

# print(X_scaled_robust_train)

    PREDICT_original_sf_compactness_avg_2.5D  \
0                                   0.311125   
1                                  -0.711701   
2                                   0.909548   
3                                  -0.094719   
4                                   0.841177   
..                                       ...   
81                                 -0.882556   
82                                 -0.900222   
83                                 -0.081870   
84                                  1.175088   
85                                  0.240824   

    PREDICT_original_sf_compactness_std_2.5D  \
0                                  -0.017272   
1                                   0.162159   
2                                  -0.333623   
3                                   0.117707   
4                                  -0.563987   
..                                       ...   
81                                  0.330932   
82                                  0.5

Feature extraction

Lloyd

Inge

## Model selection
Random forest, decision tree and bagging

In [None]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Encode categorical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Define models and parameter distributions
models = {
    "Random Forest": (RandomForestClassifier(), {
        'n_estimators': randint(5, 200),
        'max_depth': randint(3, 30),
        'min_samples_split': randint(2, 20),
        'bootstrap': [True, False]
    }),
    "Bagging": (BaggingClassifier(DecisionTreeClassifier()), {
        'n_estimators': randint(5, 200),
        'estimator__max_depth': randint(3, 30),
        'estimator__min_samples_split': randint(2, 20),
        'bootstrap': [True, False]
    }),
    "Decision Tree": (DecisionTreeClassifier(), {
        'max_depth': randint(3, 30),
        'min_samples_split': randint(2, 20)
    })
}

# Perform Randomized Search and store results
best_estimators = {}
best_params = {}
best_scores = {}

for name, (model, param_dist) in models.items():
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=20,
        scoring='accuracy',
        cv=10,
        n_jobs=-1,
        random_state=42
    )
    search.fit(X_train, y_train_encoded)
    
    best_estimators[name] = search.best_estimator_
    best_params[name] = search.best_params_
    best_scores[name] = search.best_score_

# Print results
print("\n=== Model Comparison ===")
print(f"Best Random Forest: {best_rf}")
print(f"Best RF Parameters: {best_params_rf}")
print(f"Best RF Accuracy: {best_score_rf:.4f}")

print(f"\nBest Bagging Classifier: {best_bagging}")
print(f"Best Bagging Parameters: {best_params_bagging}")
print(f"Best Bagging Accuracy: {best_score_bagging:.4f}")

print(f"\nBest Decision Tree Classifier: {best_dt}")
print(f"Best Decision Tree Parameters: {best_params_dt}")
print(f"Best Decision Tree Accuracy: {best_score_dt:.4f}")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 11 candidates, totalling 110 fits

=== Model Comparison ===
Best Random Forest: RandomForestClassifier(bootstrap=False, max_depth=40, n_estimators=156)
Best RF Parameters: {'bootstrap': False, 'max_depth': 40, 'n_estimators': 156}
Best RF Accuracy: 0.7764

Best Bagging Classifier: BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=25)
Best Bagging Parameters: {'bootstrap': True, 'n_estimators': 25}
Best Bagging Accuracy: 0.8000

Best Decision Tree Classifier: DecisionTreeClassifier(max_depth=18)
Best Decision Tree Parameters: {'max_depth': 18}
Best Decision Tree Accuracy: 0.7792
