# TM10007 Assignment template

In [25]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/jveenland/tm10007_ml.git

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [28]:
# Data loading functions. Uncomment the one you want to use
#from worcgist.load_data import load_data
from worclipo.load_data import load_data
#from worcliver.load_data import load_data
#from ecg.load_data import load_data

# Import classifiers
from sklearn import model_selection

data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

X = data.drop("label",axis=1)
y = data["label"]

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)


The number of samples: 115
The number of columns: 494


### Preprocessing

Missing data
* Part 1: Finding missing data

In [29]:
# Define missing value indicators
custom_missing = ['NA', 'N/A', '?', 'None', 'none', '-']

# Count NaNs
nan_counts = X_train.isna().sum()

# Count empty strings
empty_string_counts = (X_train == '').sum()

# Count custom missing indicators (case-insensitive match)
custom_missing_counts = X_train.apply(lambda col: col.astype(str).str.lower().isin([val.lower() for val in custom_missing]).sum())

# Compute total missing count per column
total_missing = nan_counts + empty_string_counts + custom_missing_counts

# Filter out columns where total missing is zero
total_missing_selected = total_missing[total_missing != 0]

# Print only total missing counts
print(total_missing_selected)


Series([], dtype: int64)


* Part 2: Processing missing data

In [30]:
# Missing data was only find in the form of zero's, therefore only zero_counts is further used.
limit = len(X_train.index)*50/100 #If 50% or more of the data within one feature is missing the feature is deleted
valid_columns = [col for col, count in total_missing.items() if count < limit]

# Keep only the valid columns in both X_train and X_test
X_train = X_train[valid_columns]
X_test = X_test[valid_columns]

# Imputate remaining zero's
# Import
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro

# Replace zeros with NaN
X_train.replace(0, np.nan, inplace=True)
X_test.replace(0, np.nan, inplace=True)

imputation_strategies = {}  # Store mean/median decision per column

for col in X_train.select_dtypes(include=['number']).columns:  # Only numeric columns
    col_data = X_train[col].dropna()  # Remove NaN values for testing

    if len(col_data) > 3:  # Shapiro requires at least 3 non-null values
        if col_data.nunique() == 1:  # Check if all values are the same
            strategy = 'median'  # Default to median if no variability
        else:
            _, p = shapiro(col_data)
            strategy = 'mean' if p > 0.05 else 'median'
    else:
        strategy = 'median'  # Default to median if too few values

    imputation_strategies[col] = strategy

# Create imputers for mean and median
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')

for col, strategy in imputation_strategies.items():
    imputer = mean_imputer if strategy == 'mean' else median_imputer
    X_train[[col]] = imputer.fit_transform(X_train[[col]])
    X_test[[col]] = imputer.transform(X_test[[col]])  # Use the same imputer
X_train



ValueError: Columns must be same length as key

Scaling data

In [29]:
import pandas as pd
from sklearn import preprocessing

scaler_MaxAbsScaler = preprocessing.MaxAbsScaler()
scaler_Normalizer = preprocessing.Normalizer()
scaler_standard = preprocessing.StandardScaler()
scaler_minmax = preprocessing.MinMaxScaler()
scaler_robust = preprocessing.RobustScaler()

scaled_MaxAbsScaler_array_train = scaler_MaxAbsScaler.fit_transform(X_train)
scaled_MaxAbsScaler_array_test = scaler_MaxAbsScaler.transform(X_test)
scaled_Normalizer_array_train = scaler_Normalizer.fit_transform(X_train)
scaled_Normalizer_array_test = scaler_Normalizer.transform(X_test)
scaled_standard_array_train = scaler_standard.fit_transform(X_train)
scaled_standard_array_test = scaler_standard.transform(X_test)
scaled_minmax_array_train = scaler_minmax.fit_transform(X_train)
scaled_minmax_array_test = scaler_minmax.transform(X_test)
scaled_robust_array_train = scaler_robust.fit_transform(X_train)
scaled_robust_array_test = scaler_robust.transform(X_test)

X_scaled_MaxAbsScaler_train = pd.DataFrame(scaled_MaxAbsScaler_array_train, columns=X_train.columns)
X_df_scaled_Normalizer_train = pd.DataFrame(scaled_Normalizer_array_train, columns=X_train.columns)
X_scaled_Normalizer_test = pd.DataFrame(scaled_Normalizer_array_test, columns=X_test.columns)
X_scaled_standard_train = pd.DataFrame(scaled_standard_array_train, columns=X_train.columns)
X_scaled_standard_test = pd.DataFrame(scaled_standard_array_test, columns=X_test.columns)
X_scaled_minmax_train = pd.DataFrame(scaled_minmax_array_train, columns=X_train.columns)
X_scaled_minmax_test = pd.DataFrame(scaled_minmax_array_test, columns=X_test.columns)
X_scaled_robust_train = pd.DataFrame(scaled_robust_array_train, columns=X_train.columns)
X_scaled_robust_test = pd.DataFrame(scaled_robust_array_test, columns=X_test.columns)


Feature extraction

Lloyd

Inge

## Model selection
Random forest, decision tree and bagging

In [10]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Encode categorical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Define the parameter distributions for each model

param_dist_rf = {
    'n_estimators': randint(5, 200),  # Random number of trees between 5 and 200
    'max_depth': [None] + list(randint(5, 50).rvs(10)),  # None or random depths between 5 and 50
    'bootstrap': [True, False]  # Random choice of bootstrap
}

param_dist_bagging = {
    'n_estimators': randint(5, 200),  # Number of estimators (5-200)
    'bootstrap': [True, False]  # Random choice of bootstrap
}

param_dist_dt = {
    'max_depth': [None] + list(randint(5, 50).rvs(10)),  # None or depths between 5 and 50
}

# Initialize classifiers
rf = RandomForestClassifier()
bagging = BaggingClassifier(DecisionTreeClassifier())  # Homemade Random Forest
dt = DecisionTreeClassifier()

# Randomized Search for Random Forest
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    n_iter=20,  # 20 random searches
    scoring='accuracy',
    cv=10,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Randomized Search for BaggingClassifier
random_search_bagging = RandomizedSearchCV(
    estimator=bagging,
    param_distributions=param_dist_bagging,
    n_iter=20,
    scoring='accuracy',
    cv=10,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Randomized Search for DecisionTreeClassifier
random_search_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist_dt,
    n_iter=20,
    scoring='accuracy',
    cv=10,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

# Fit searches
random_search_rf.fit(X_train, y_train_encoded)
random_search_bagging.fit(X_train, y_train_encoded)
random_search_dt.fit(X_train, y_train_encoded)

# Get best models
best_rf = random_search_rf.best_estimator_
best_bagging = random_search_bagging.best_estimator_
best_dt = random_search_dt.best_estimator_

# Get best parameters and accuracy
best_params_rf = random_search_rf.best_params_
best_params_bagging = random_search_bagging.best_params_
best_params_dt = random_search_dt.best_params_

best_score_rf = random_search_rf.best_score_
best_score_bagging = random_search_bagging.best_score_
best_score_dt = random_search_dt.best_score_

# Print results
print("\n=== Model Comparison ===")
print(f"Best Random Forest: {best_rf}")
print(f"Best RF Parameters: {best_params_rf}")
print(f"Best RF Accuracy: {best_score_rf:.4f}")

print(f"\nBest Bagging Classifier: {best_bagging}")
print(f"Best Bagging Parameters: {best_params_bagging}")
print(f"Best Bagging Accuracy: {best_score_bagging:.4f}")

print(f"\nBest Decision Tree Classifier: {best_dt}")
print(f"Best Decision Tree Parameters: {best_params_dt}")
print(f"Best Decision Tree Accuracy: {best_score_dt:.4f}")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Fitting 10 folds for each of 20 candidates, totalling 200 fits




Fitting 10 folds for each of 11 candidates, totalling 110 fits

=== Model Comparison ===
Best Random Forest: RandomForestClassifier(bootstrap=False, max_depth=40, n_estimators=156)
Best RF Parameters: {'bootstrap': False, 'max_depth': 40, 'n_estimators': 156}
Best RF Accuracy: 0.7764

Best Bagging Classifier: BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=25)
Best Bagging Parameters: {'bootstrap': True, 'n_estimators': 25}
Best Bagging Accuracy: 0.8000

Best Decision Tree Classifier: DecisionTreeClassifier(max_depth=18)
Best Decision Tree Parameters: {'max_depth': 18}
Best Decision Tree Accuracy: 0.7792
