In [None]:
from typing import Dict

import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from functools import wraps
import warnings



Defining some constants

In [2]:
RANDOM_STATE = 42

ignore warnings about convergence of models, etc.

In [3]:
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

# Import dataset



In [44]:
df_breast_cancer = pd.read_csv('./data/breast-cancer-diagnostic.shuf.lrn.csv')
df_road_safety = pd.read_csv('./data/road_safety.csv')
df_phishing = pd.read_csv('./data/PhiUSIIL_Phishing_URL_Dataset.csv')
df_loan = pd.read_csv('./data/loan-10k.lrn.csv')

df_dict = {'breast_cancer': df_breast_cancer, 
           'phishing': df_phishing, 
           'road_safety': df_road_safety, 
           'loan': df_loan
           }

  df_road_safety = pd.read_csv('./data/road_safety.csv')


In [31]:
df_breast_cancer.head()

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,886452,True,13.96,17.05,91.43,602.4,0.1096,0.1279,0.09789,0.05246,...,16.39,22.07,108.1,826.0,0.1512,0.3262,0.3209,0.1374,0.3068,0.07957
1,84348301,True,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
2,9012795,True,21.37,15.1,141.3,1386.0,0.1001,0.1515,0.1932,0.1255,...,22.69,21.84,152.1,1535.0,0.1192,0.284,0.4024,0.1966,0.273,0.08666
3,894326,True,18.22,18.87,118.7,1027.0,0.09746,0.1117,0.113,0.0795,...,21.84,25.0,140.9,1485.0,0.1434,0.2763,0.3853,0.1776,0.2812,0.08198
4,867387,False,15.71,13.93,102.0,761.7,0.09462,0.09462,0.07135,0.05933,...,17.5,19.25,114.3,922.8,0.1223,0.1949,0.1709,0.1374,0.2723,0.07071


In [40]:
df_breast_cancer.describe()

Unnamed: 0,ID,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
count,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,...,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0,285.0
mean,25755170.0,13.946439,19.376246,90.756842,637.428772,0.096595,0.104231,0.085204,0.047139,0.179774,...,16.038446,25.909614,105.767088,854.987719,0.132928,0.253865,0.266263,0.112879,0.287262,0.08377
std,107390000.0,3.488308,4.278841,24.062045,340.172969,0.014748,0.05523,0.077423,0.038661,0.029706,...,4.785408,6.101124,33.468918,550.723964,0.025036,0.165161,0.210121,0.067894,0.062336,0.019355
min,8913.0,7.691,9.71,47.98,170.4,0.06251,0.01938,0.0,0.0,0.106,...,8.678,12.02,54.49,223.6,0.08125,0.03432,0.0,0.0,0.1566,0.05521
25%,868871.0,11.51,16.39,73.99,406.3,0.08588,0.06545,0.02987,0.01899,0.1601,...,12.84,21.59,82.98,506.2,0.1148,0.1432,0.1117,0.06296,0.2482,0.07055
50%,905189.0,13.14,18.9,85.24,530.6,0.09597,0.08751,0.05485,0.0311,0.1776,...,14.73,25.34,96.09,656.7,0.1312,0.2053,0.1932,0.09265,0.279,0.07944
75%,8812816.0,15.5,21.84,102.8,747.2,0.1059,0.1284,0.1155,0.06772,0.1943,...,18.13,29.94,123.5,1030.0,0.1483,0.3253,0.3853,0.1663,0.3157,0.0918
max,911296200.0,25.73,39.28,174.2,2010.0,0.1634,0.3454,0.4264,0.1913,0.304,...,33.13,44.87,229.3,3234.0,0.2226,1.058,1.17,0.291,0.6638,0.2075


In [33]:
df_breast_cancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       285 non-null    int64  
 1   class                    285 non-null    bool   
 2   radiusMean               285 non-null    float64
 3    textureMean             285 non-null    float64
 4    perimeterMean           285 non-null    float64
 5    areaMean                285 non-null    float64
 6    smoothnessMean          285 non-null    float64
 7    compactnessMean         285 non-null    float64
 8    concavityMean           285 non-null    float64
 9    concavePointsMean       285 non-null    float64
 10   symmetryMean            285 non-null    float64
 11   fractalDimensionMean    285 non-null    float64
 12   radiusStdErr            285 non-null    float64
 13   textureStdErr           285 non-null    float64
 14   perimeterStdErr         2

In [27]:
df_phishing.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


In [34]:
df_phishing.describe()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,HasObfuscation,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
count,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,...,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0,235795.0
mean,34.573095,21.470396,0.002706,78.430778,0.845508,0.260423,0.055747,2.764456,1.164758,0.002057,...,0.237007,0.023474,0.486775,26.075689,6.333111,10.522305,65.071113,2.377629,49.262516,0.571895
std,41.314153,9.150793,0.051946,28.976055,0.216632,0.251628,0.010587,0.599739,0.600969,0.045306,...,0.425247,0.151403,0.499826,79.411815,74.866296,22.312192,176.687539,17.641097,161.02743,0.494805
min,13.0,4.0,0.0,0.155574,0.0,0.0,0.001083,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,23.0,16.0,0.0,57.024793,0.68,0.005977,0.050747,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,27.0,20.0,0.0,100.0,1.0,0.079963,0.05797,3.0,1.0,0.0,...,0.0,0.0,0.0,8.0,2.0,6.0,12.0,0.0,10.0,1.0
75%,34.0,24.0,0.0,100.0,1.0,0.522907,0.062875,3.0,1.0,0.0,...,0.0,0.0,1.0,29.0,8.0,15.0,88.0,1.0,57.0,1.0
max,6097.0,110.0,1.0,100.0,1.0,0.522907,0.090824,13.0,10.0,1.0,...,1.0,1.0,1.0,8956.0,35820.0,6957.0,27397.0,4887.0,27516.0,1.0


In [35]:
df_phishing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 235795 entries, 0 to 235794
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    235795 non-null  object 
 1   URL                         235795 non-null  object 
 2   URLLength                   235795 non-null  int64  
 3   Domain                      235795 non-null  object 
 4   DomainLength                235795 non-null  int64  
 5   IsDomainIP                  235795 non-null  int64  
 6   TLD                         235795 non-null  object 
 7   URLSimilarityIndex          235795 non-null  float64
 8   CharContinuationRate        235795 non-null  float64
 9   TLDLegitimateProb           235795 non-null  float64
 10  URLCharProb                 235795 non-null  float64
 11  TLDLength                   235795 non-null  int64  
 12  NoOfSubDomain               235795 non-null  int64  
 13  HasObfuscation

In [28]:
df_road_safety.head()

Unnamed: 0.1,Unnamed: 0,Accident_Index,Vehicle_Reference_df_res,Vehicle_Type,Towing_and_Articulation,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Junction_Location,Skidding_and_Overturning,Hit_Object_in_Carriageway,...,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Pedestrian_Movement,Car_Passenger,Bus_or_Coach_Passenger,Pedestrian_Road_Maintenance_Worker,Casualty_Type,Casualty_Home_Area_Type,Casualty_IMD_Decile
0,0,201501BS70001,1,19.0,0.0,9.0,0.0,8.0,0.0,0.0,...,7.0,3,5.0,1.0,0.0,0.0,2.0,0,,
1,1,201501BS70002,1,9.0,0.0,9.0,0.0,8.0,0.0,0.0,...,5.0,3,9.0,9.0,0.0,0.0,2.0,0,1.0,3.0
2,2,201501BS70004,1,9.0,0.0,9.0,0.0,2.0,0.0,0.0,...,6.0,3,1.0,3.0,0.0,0.0,2.0,0,1.0,6.0
3,3,201501BS70005,1,9.0,0.0,9.0,0.0,2.0,0.0,0.0,...,2.0,3,5.0,1.0,0.0,0.0,2.0,0,1.0,2.0
4,4,201501BS70008,1,1.0,0.0,18.0,0.0,8.0,0.0,0.0,...,8.0,2,0.0,0.0,0.0,0.0,0.0,1,1.0,3.0


In [37]:
df_road_safety.describe()

Unnamed: 0.1,Unnamed: 0,Vehicle_Reference_df_res,Vehicle_Type,Towing_and_Articulation,Vehicle_Manoeuvre,Vehicle_Location-Restricted_Lane,Junction_Location,Skidding_and_Overturning,Hit_Object_in_Carriageway,Vehicle_Leaving_Carriageway,...,Age_Band_of_Casualty,Casualty_Severity,Pedestrian_Location,Pedestrian_Movement,Car_Passenger,Bus_or_Coach_Passenger,Pedestrian_Road_Maintenance_Worker,Casualty_Type,Casualty_Home_Area_Type,Casualty_IMD_Decile
count,363243.0,363243.0,363181.0,362864.0,363059.0,363067.0,363159.0,363067.0,363080.0,363084.0,...,357674.0,363243.0,363241.0,363241.0,362481.0,363197.0,363077.0,363243.0,323448.0,293666.0
mean,181621.0,1.696203,9.756953,0.029766,12.607326,0.109233,2.609361,0.188139,0.30748,0.366689,...,6.431284,2.875725,0.380731,0.276467,0.281027,0.066127,0.032833,7.84008,1.308186,5.107323
std,104859.366253,1.487094,8.315189,0.294127,6.218689,0.903131,3.249245,0.714243,1.595551,1.374107,...,2.15786,0.355195,1.52222,1.294574,0.591239,0.493174,0.25378,7.366436,0.657776,2.829458
min,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,90810.5,1.0,9.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,3.0
50%,181621.0,1.0,9.0,0.0,17.0,0.0,1.0,0.0,0.0,0.0,...,6.0,3.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,5.0
75%,272431.5,2.0,9.0,0.0,18.0,0.0,6.0,0.0,0.0,0.0,...,8.0,3.0,0.0,0.0,0.0,0.0,0.0,9.0,1.0,7.0
max,363242.0,37.0,98.0,5.0,18.0,9.0,8.0,5.0,12.0,8.0,...,11.0,3.0,10.0,9.0,2.0,4.0,2.0,98.0,3.0,10.0


In [36]:
df_road_safety.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363243 entries, 0 to 363242
Data columns (total 68 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   Unnamed: 0                                   363243 non-null  int64  
 1   Accident_Index                               363243 non-null  object 
 2   Vehicle_Reference_df_res                     363243 non-null  int64  
 3   Vehicle_Type                                 363181 non-null  float64
 4   Towing_and_Articulation                      362864 non-null  float64
 5   Vehicle_Manoeuvre                            363059 non-null  float64
 6   Vehicle_Location-Restricted_Lane             363067 non-null  float64
 7   Junction_Location                            363159 non-null  float64
 8   Skidding_and_Overturning                     363067 non-null  float64
 9   Hit_Object_in_Carriageway                    363080 non-nul

In [29]:
df_loan.head()

Unnamed: 0,ID,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,...,debt_settlement_flag,issue_d_month,issue_d_year,earliest_cr_line_month,earliest_cr_line_year,last_pymnt_d_month,last_pymnt_d_year,last_credit_pull_d_month,last_credit_pull_d_year,grade
0,24341,12500.0,12500.0,12500.0,36 months,7.21,387.17,< 1 year,MORTGAGE,81000.0,...,N,6,2018,6,2000,2,2019,2,2019,A
1,67534,33850.0,33850.0,33775.0,60 months,20.99,915.57,1 year,MORTGAGE,80000.0,...,N,10,2015,9,1984,2,2019,2,2019,E
2,35080,10000.0,10000.0,10000.0,60 months,20.0,264.94,< 1 year,RENT,36580.0,...,N,9,2017,10,2006,1,2018,11,2018,D
3,4828,20250.0,20250.0,20250.0,36 months,14.31,695.15,9 years,RENT,48700.0,...,N,0,2015,6,1996,6,2016,9,2017,C
4,59259,25000.0,25000.0,25000.0,36 months,14.99,866.52,1 year,MORTGAGE,85000.0,...,N,11,2016,0,2002,2,2019,2,2019,C


In [38]:
df_loan.describe()

Unnamed: 0,ID,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,...,total_bc_limit,total_il_high_credit_limit,issue_d_month,issue_d_year,earliest_cr_line_month,earliest_cr_line_year,last_pymnt_d_month,last_pymnt_d_year,last_credit_pull_d_month,last_credit_pull_d_year
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,50043.4302,15631.1525,15631.1525,15625.9925,13.216959,461.084183,82128.97,19.089188,0.3034,698.1795,...,24230.97,48294.3718,5.7767,2016.0031,5.7246,2000.0179,4.003,2017.7473,3.3342,2018.4534
std,28982.440166,9314.246117,9314.246117,9312.426843,4.855838,269.903751,67692.84,9.523219,0.841201,32.170977,...,25395.48,46981.179499,3.40482,1.587496,3.382558,7.302583,3.224257,1.463235,2.909666,0.963494
min,0.0,1000.0,1000.0,1000.0,5.31,30.12,5000.0,0.0,0.0,660.0,...,200.0,0.0,0.0,2012.0,0.0,1965.0,0.0,2012.0,0.0,2012.0
25%,24873.25,8350.0,8350.0,8343.75,9.75,263.77,50000.0,12.4375,0.0,675.0,...,8800.0,18881.25,3.0,2015.0,3.0,1996.0,2.0,2017.0,2.0,2018.0
50%,50033.5,14000.0,14000.0,14000.0,12.73,396.78,70000.0,18.33,0.0,690.0,...,17100.0,36533.5,6.0,2016.0,6.0,2001.0,2.0,2018.0,2.0,2019.0
75%,75261.5,21000.0,21000.0,21000.0,16.01,616.3375,97000.0,24.93,0.0,715.0,...,31400.0,63500.5,9.0,2017.0,9.0,2005.0,6.0,2019.0,3.0,2019.0
max,99999.0,40000.0,40000.0,40000.0,30.99,1717.63,3200000.0,168.52,16.0,845.0,...,1090700.0,768775.0,11.0,2018.0,11.0,2015.0,11.0,2019.0,11.0,2019.0


In [39]:
df_loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 92 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          10000 non-null  int64  
 1   loan_amnt                   10000 non-null  float64
 2   funded_amnt                 10000 non-null  float64
 3   funded_amnt_inv             10000 non-null  float64
 4   term                        10000 non-null  object 
 5   int_rate                    10000 non-null  float64
 6   installment                 10000 non-null  float64
 7   emp_length                  10000 non-null  object 
 8   home_ownership              10000 non-null  object 
 9   annual_inc                  10000 non-null  float64
 10  verification_status         10000 non-null  object 
 11  loan_status                 10000 non-null  object 
 12  pymnt_plan                  10000 non-null  object 
 13  purpose                     1000

## Data preparation

1. Check for missing values


In [45]:
for key in df_dict.keys():
    print(f'{key} missing values: {df_dict[key].isnull().sum().any()}')

breast_cancer missing values: False
phishing missing values: False
road_safety missing values: True
loan missing values: False


1Encode Boolean target attribute 'class' as Integer
2Drop the 'ID' attribute
3Separate the 'class' attribute into its own variable

In [46]:
df_dict['breast_cancer']['class'] = df_dict['breast_cancer']['class'].astype(int)
X = df_dict['breast_cancer'].drop(columns=['ID', 'class'])
Y = df_dict['breast_cancer']['class']


Training-test data split for holdout method

In [None]:
holdout_X_train, holdout_X_test, holdout_Y_train, holdout_Y_test = train_test_split(X, Y, test_size=0.3,
                                                                                    random_state=42)


Data split for cross-validation method
Scaling set up in pipelines for individual algorithms

In [48]:
cross_validation_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Prepare data structures and useful functions

In [None]:
def get_metrics_dict(
        accuracy: float,
        f1: float,
        precision: float,
        recall: float,
) -> Dict[str, float]:
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


def timer(func):
    """
    A decorator to measure and print the execution time of a function.

    Args:
    - func (function): The function to be wrapped by the timer decorator.

    Returns:
    - wrapper (function): A wrapped function that calculates and prints the time
                           taken to execute the original function.

    This decorator can be used to wrap functions and output their execution time
    in seconds.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        duration = end_time - start_time
        print(f"{func.__name__} executed in {duration:.4f} seconds")
        return result
    return wrapper


@timer
def find_best_estimator(
        classifier,
        param_grid: dict,
        cv: int = 5
) -> GridSearchCV:
    grid_search = GridSearchCV(
        classifier,
        param_grid=param_grid,
        cv=cv,
        scoring="accuracy"
    )
    grid_search.fit(holdout_X_train, holdout_Y_train)
    return grid_search.best_estimator_

## Random Forest

In [None]:
@timer
def run_random_forest(classifier: RandomForestClassifier | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = RandomForestClassifier()

    classifier.set_params(random_state=RANDOM_STATE)

    # Holdout method
    classifier.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = classifier.predict(holdout_X_test)
    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # Cross-validation
    cv_scores = cross_validate(classifier, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "Random Forest",
        "n_estimators": classifier.n_estimators,
        "max_depth": classifier.max_depth,
        "min_samples_split": classifier.min_samples_split,
        "min_samples_leaf": classifier.min_samples_leaf,
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]


Test random forest in various configurations

In [61]:
rf_classifiers = [
    RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1),
    RandomForestClassifier(n_estimators=200, min_samples_split=4, min_samples_leaf=1),
    RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_depth=15),
    RandomForestClassifier(n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_depth=20),
    RandomForestClassifier(n_estimators=250, min_samples_split=3, min_samples_leaf=3, max_depth=10)
]

rf_results = []
for classifier in rf_classifiers:
    rf_results.extend(run_random_forest(classifier))  # Assumes run_random_forest is defined elsewhere

rf_results_df = pd.DataFrame(rf_results)
rf_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,n_estimators,max_depth,min_samples_split,min_samples_leaf,Data Split,accuracy,f1,precision,recall
3,Random Forest,200,,4,1,Cross Validation,0.961,0.943,0.959,0.928
1,Random Forest,100,,2,1,Cross Validation,0.958,0.937,0.948,0.928
9,Random Forest,250,10.0,3,3,Cross Validation,0.954,0.933,0.938,0.928
8,Random Forest,250,10.0,3,3,Holdout,0.953,0.939,0.939,0.939
6,Random Forest,150,20.0,5,2,Holdout,0.953,0.939,0.939,0.939
2,Random Forest,200,,4,1,Holdout,0.953,0.939,0.939,0.939
7,Random Forest,150,20.0,5,2,Cross Validation,0.951,0.927,0.938,0.918
5,Random Forest,100,15.0,2,4,Cross Validation,0.951,0.926,0.948,0.908
0,Random Forest,100,,2,1,Holdout,0.942,0.923,0.938,0.909
4,Random Forest,100,15.0,2,4,Holdout,0.942,0.923,0.938,0.909


Attempt to find best configuration using GridSearchCV

In [62]:
rf_param_grid = {
    'n_estimators': [100, 150, 200, 250, 300],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 3],
}

best_rf = find_best_estimator(
    classifier=RandomForestClassifier(),
    param_grid=rf_param_grid,
    cv=5
)

best_rf_results = pd.DataFrame(run_random_forest(best_rf))
best_rf_results

Unnamed: 0,classifier,n_estimators,max_depth,min_samples_split,min_samples_leaf,Data Split,accuracy,f1,precision,recall
0,Random Forest,200,,2,1,Holdout,0.953488,0.939394,0.939394,0.939394
1,Random Forest,200,,2,1,Cross Validation,0.961404,0.942708,0.959474,0.928421



## MLP


In [63]:
def run_mlp(classifier: MLPClassifier | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = MLPClassifier()

    # create a pipeline which both scales data using standard scaler and then estimates using MLP
    classifier.set_params(random_state=RANDOM_STATE)
    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('mlp', classifier),
    ])
    # holdout method
    pipeline.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = pipeline.predict(holdout_X_test)

    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # cross validation
    cv_scores = cross_validate(pipeline, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "MLP",
        "hidden_layer_sizes": classifier.hidden_layer_sizes,
        "max_iter": classifier.max_iter,
        "activation": classifier.activation,
        "solver": classifier.solver,
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]

Test MLP in various configurations.
There are quite a lot of parameters to vary here, we can test only a limited amount.

In [64]:

mlp_classifiers = [
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=200),
    MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=200),
    MLPClassifier(hidden_layer_sizes=(200,), max_iter=300, activation="logistic"),
    MLPClassifier(hidden_layer_sizes=(100, 50, 25), max_iter=300, solver="lbfgs"),
    MLPClassifier(hidden_layer_sizes=(300,), max_iter=500, activation="identity")
]

mlp_results = []
for classifier in mlp_classifiers:
    mlp_results.extend(run_mlp(classifier))

mlp_results_df = pd.DataFrame(mlp_results)
mlp_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,hidden_layer_sizes,max_iter,activation,solver,Data Split,accuracy,f1,precision,recall
4,MLP,"(200,)",300,logistic,adam,Holdout,0.988,0.985,1.0,0.97
5,MLP,"(200,)",300,logistic,adam,Cross Validation,0.982,0.973,1.0,0.948
3,MLP,"(100, 50)",200,relu,adam,Cross Validation,0.979,0.969,0.979,0.959
0,MLP,"(100,)",200,relu,adam,Holdout,0.977,0.97,0.97,0.97
8,MLP,"(300,)",500,identity,adam,Holdout,0.977,0.97,0.97,0.97
1,MLP,"(100,)",200,relu,adam,Cross Validation,0.975,0.964,0.97,0.959
9,MLP,"(300,)",500,identity,adam,Cross Validation,0.968,0.954,0.95,0.959
2,MLP,"(100, 50)",200,relu,adam,Holdout,0.965,0.955,0.941,0.97
7,MLP,"(100, 50, 25)",300,relu,lbfgs,Cross Validation,0.965,0.948,0.95,0.948
6,MLP,"(100, 50, 25)",300,relu,lbfgs,Holdout,0.942,0.928,0.889,0.97


Attempt to find best configuration using GridSearchCV

In [65]:
mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,), (100, 50), (100, 50, 25)],
    'max_iter': [200, 300, 500],
    'activation': ['relu', 'tanh', 'logistic'],  # Optional for activation exploration
    'solver': ['adam', 'sgd'],  # Optional for solver exploration
}

best_mlp = find_best_estimator(
    classifier=MLPClassifier(),
    param_grid=mlp_param_grid,
    cv=5
)

best_mlp_results = pd.DataFrame(run_mlp(best_mlp))
best_mlp_results

Unnamed: 0,classifier,hidden_layer_sizes,max_iter,activation,solver,Data Split,accuracy,f1,precision,recall
0,MLP,"(100, 50)",300,tanh,adam,Holdout,0.965116,0.955224,0.941176,0.969697
1,MLP,"(100, 50)",300,tanh,adam,Cross Validation,0.964912,0.948421,0.948421,0.948421


## SVC

In [66]:
def run_svc(classifier: SVC | None = None) -> list[dict[str, any]]:
    if classifier is None:
        classifier = SVC()

    pipeline = Pipeline([
        ('scale', StandardScaler()),
        ('svc', classifier),
    ])
    # Holdout method
    pipeline.fit(holdout_X_train, holdout_Y_train)
    holdout_y_pred = pipeline.predict(holdout_X_test)
    holdout_results = get_metrics_dict(
        accuracy=accuracy_score(holdout_Y_test, holdout_y_pred),
        f1=f1_score(holdout_Y_test, holdout_y_pred),
        precision=precision_score(holdout_Y_test, holdout_y_pred),
        recall=recall_score(holdout_Y_test, holdout_y_pred),
    )

    # Cross-validation
    cv_scores = cross_validate(pipeline, X, Y, cv=cross_validation_split,
                               scoring=['accuracy', 'f1', 'precision', 'recall'])
    cv_results = get_metrics_dict(
        accuracy=cv_scores['test_accuracy'].mean(),
        f1=cv_scores['test_f1'].mean(),
        precision=cv_scores['test_precision'].mean(),
        recall=cv_scores['test_recall'].mean(),
    )

    common_results = {
        "classifier": "SVC",
        "kernel": classifier.kernel,
        "C": classifier.C,
        "gamma": classifier.gamma,
        "degree": classifier.degree,
        "coef0": classifier.coef0
    }

    return [
        {
            **common_results,
            "Data Split": "Holdout",
            **holdout_results
        },
        {
            **common_results,
            "Data Split": "Cross Validation",
            **cv_results
        }
    ]


Test SVC in various configurations

In [67]:
svc_classifiers = [
    SVC(kernel='linear', C=0.1, gamma='scale'),
    SVC(kernel='rbf', C=1.0, gamma=0.1),
    SVC(kernel='poly', degree=2, C=1.0, gamma='auto', coef0=0.0),
    SVC(kernel='poly', degree=3, C=10.0, gamma='scale', coef0=1.0),
    SVC(kernel='sigmoid', C=0.5, gamma=0.01, coef0=0.5)
]

svc_results = []
for classifier in svc_classifiers:
    svc_results.extend(run_svc(classifier))

svc_results_df = pd.DataFrame(svc_results)
svc_results_df.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,kernel,C,gamma,degree,coef0,Data Split,accuracy,f1,precision,recall
0,SVC,linear,0.1,scale,3,0.0,Holdout,0.988,0.985,1.0,0.97
1,SVC,linear,0.1,scale,3,0.0,Cross Validation,0.982,0.974,1.0,0.949
9,SVC,sigmoid,0.5,0.01,3,0.5,Cross Validation,0.968,0.951,1.0,0.907
2,SVC,rbf,1.0,0.1,3,0.0,Holdout,0.965,0.954,0.969,0.939
8,SVC,sigmoid,0.5,0.01,3,0.5,Holdout,0.965,0.952,1.0,0.909
7,SVC,poly,10.0,scale,3,1.0,Cross Validation,0.958,0.94,0.924,0.959
3,SVC,rbf,1.0,0.1,3,0.0,Cross Validation,0.947,0.925,0.915,0.939
6,SVC,poly,10.0,scale,3,1.0,Holdout,0.942,0.928,0.889,0.97
4,SVC,poly,1.0,auto,2,0.0,Holdout,0.826,0.706,1.0,0.545
5,SVC,poly,1.0,auto,2,0.0,Cross Validation,0.8,0.605,0.936,0.454


Attempt to find best configuration using GridSearchCV


In [68]:
svc_param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 0.1],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

best_svc = find_best_estimator(
    classifier=SVC(),
    param_grid=svc_param_grid,
    cv=5
)

best_svc_results = pd.DataFrame(run_svc(best_svc))
best_svc_results


Unnamed: 0,classifier,kernel,C,gamma,degree,coef0,Data Split,accuracy,f1,precision,recall
0,SVC,linear,1,scale,2,0.0,Holdout,0.988372,0.984615,1.0,0.969697
1,SVC,linear,1,scale,2,0.0,Cross Validation,0.978947,0.968355,0.99,0.948421


## Combining results

In [87]:
results = pd.concat(
    [rf_results_df, mlp_results_df, svc_results_df, best_rf_results, best_mlp_results, best_svc_results], join='inner')
results.sort_values(by='accuracy', ascending=False).round(3)

Unnamed: 0,classifier,Data Split,accuracy,f1,precision,recall
0,SVC,Holdout,0.988,0.985,1.0,0.97
0,SVC,Holdout,0.988,0.985,1.0,0.97
4,MLP,Holdout,0.988,0.985,1.0,0.97
5,MLP,Cross Validation,0.982,0.973,1.0,0.948
1,SVC,Cross Validation,0.982,0.974,1.0,0.949
1,SVC,Cross Validation,0.979,0.968,0.99,0.948
3,MLP,Cross Validation,0.979,0.969,0.979,0.959
0,MLP,Holdout,0.977,0.97,0.97,0.97
8,MLP,Holdout,0.977,0.97,0.97,0.97
1,MLP,Cross Validation,0.975,0.964,0.97,0.959
