In [1]:
import os, time, ast, json, csv
import pandas as pd
import numpy as np
from scipy.special import softmax
from tqdm.notebook import tqdm
import joblib
from joblib import Parallel, delayed
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, matthews_corrcoef
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import  RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Read the two imputation feature lists (NEGONE / MEDIAN) from their JSON file.
file_path = '../data/negone_median.json'
with open(file_path, 'r') as handle:
    data = json.load(handle)

# A key that is missing from the JSON yields an empty list instead of an error.
negone_features, median_features = (
    data.get(list_name, []) for list_name in ('NEGONE_FEATURES', 'MEDIAN_FEATURES')
)

print('NEGONE_FEATURES:', negone_features)
print('MEDIAN_FEATURES:', median_features)

NEGONE_FEATURES: ['feature369', 'feature265', 'feature473', 'feature420', 'feature461', 'feature415', 'feature373', 'feature433', 'feature395', 'feature443', 'feature242', 'feature258', 'feature451', 'feature366', 'feature425', 'feature46', 'feature31', 'feature6', 'feature89', 'feature52', 'feature468', 'feature286', 'feature315', 'feature191', 'feature196', 'feature91', 'feature192', 'feature27', 'feature20', 'feature111', 'feature100', 'feature90', 'feature72', 'feature255', 'feature298', 'feature50', 'feature67', 'feature136', 'feature59', 'feature92', 'feature139', 'feature60', 'feature48', 'feature145', 'feature78', 'feature115', 'feature372', 'feature472', 'feature439', 'feature382', 'feature475', 'feature353', 'feature134', 'feature132', 'feature35', 'feature42', 'feature96', 'feature249', 'feature82', 'feature18', 'feature23', 'feature123', 'feature99', 'feature32', 'feature70', 'feature68', 'feature178', 'feature401', 'feature141', 'feature294', 'feature447', 'feature95', 'fe

In [3]:
# Read the pattern file; only its top-level keys ('feature<N>') are used here.
file_path = '../data/patterns.json'
with open(file_path, 'r') as handle:
    data = json.load(handle)

# Strip the 'feature' text from every key and keep the remaining integer.
keys_list = [int(feature_name.replace('feature', '')) for feature_name in data.keys()]
drop_numbers = keys_list  # same list object, under the name the transformer reads

print('Extracted Keys:', drop_numbers)
print('Length:', len(drop_numbers))

Extracted Keys: [6, 18, 23, 32, 42, 48, 50, 59, 60, 67, 68, 70, 78, 82, 89, 92, 96, 99, 115, 123, 136, 139, 145, 178, 249, 298]
Length: 26


In [13]:
class RemoveBeforeAfterTransformer(BaseEstimator, TransformerMixin):
    """Transformer that removes the ``feature<N>`` columns listed in ``drop_numbers``.

    ``drop_numbers`` is read from module scope when ``fit`` runs, so it has to
    be defined before the transformer is fitted.
    """

    def __init__(self):
        # Populated by fit(). NOTE(review): the attribute name is deliberately
        # left unchanged; renaming it would presumably break instances that
        # were pickled with this name (e.g. a saved preprocessor) -- confirm
        # before changing.
        self.drop_cols = None

    def fit(self, X, y=None):
        """Record which columns of ``X`` must be dropped.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; only ``X.shape`` and ``X.columns`` are read.
        y : ignored

        Returns
        -------
        self
        """
        print(X.shape)
        # Build the target names once instead of re-formatting every number
        # for every column.
        drop_names = {f'feature{num}' for num in drop_numbers}
        self.drop_cols = [col for col in X.columns if col in drop_names]
        # Remember the fitted columns so get_feature_names_out() can be
        # called without arguments.
        self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        print(f"Columns to be dropped: {self.drop_cols}")
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the columns selected during ``fit``.

        Columns that were selected at fit time but are absent from ``X`` are
        skipped silently (``errors='ignore'``).
        """
        print(f"Columns being dropped: {[col for col in self.drop_cols if col in X.columns]}")
        X = X.drop(columns=self.drop_cols, errors='ignore')
        print(X.shape)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the names in ``input_features`` that survive the drop, as a list.

        When ``input_features`` is ``None`` the columns recorded by ``fit`` are
        used. Instances fitted before that attribute existed have nothing
        recorded, in which case a ``ValueError`` explains what is missing.
        """
        if input_features is None:
            input_features = getattr(self, 'feature_names_in_', None)
            if input_features is None:
                raise ValueError(
                    "input_features is required: the columns seen during fit were not recorded."
                )
        dropped = set(self.drop_cols)
        return [f for f in input_features if f not in dropped]

In [14]:
def preprocess(preprocessor, train_data, train_labels, quiet=False):
    """Fit the steps of ``preprocessor`` one at a time and apply them to the training data.

    Steps are dispatched on their name:

    * ``'initial'`` -- fitted and applied; the output columns come from the
      step's ``get_feature_names_out()`` (called without arguments).
    * ``'oversampling'`` / ``'undersampling'`` -- ``fit_resample`` is applied
      to both data and labels; the column names are carried over.
    * anything else -- fitted and applied; the output columns come from
      ``get_feature_names_out(<input columns>)``.

    Parameters
    ----------
    preprocessor : object exposing ``steps`` as an iterable of ``(name, step)`` pairs
    train_data : pandas.DataFrame
    train_labels : labels passed to (and returned by) the resampling steps
    quiet : bool, default False
        When True, no diagnostics are printed at all (step names, shapes,
        columns and timings).

    Returns
    -------
    (pandas.DataFrame, labels)
        The processed frame, every column cast to ``float`` where possible and
        to ``category`` otherwise, and the (possibly resampled) labels.
    """
    for name, step in preprocessor.steps:
        if not quiet:
            print(f"Applying step: {name}")
            print(f"Input shape: {train_data.shape}")
            print(f"Input columns: {train_data.columns}")

        start = time.time()
        if name == 'initial':
            step.fit(train_data)
            train_data = pd.DataFrame(step.transform(train_data), columns=step.get_feature_names_out())
        elif name in ('oversampling', 'undersampling'):
            input_features = train_data.columns
            train_data, train_labels = step.fit_resample(train_data, train_labels)
            train_data = pd.DataFrame(train_data, columns=input_features)
        else:
            input_features = train_data.columns
            step.fit(train_data)
            train_data = pd.DataFrame(step.transform(train_data), columns=step.get_feature_names_out(input_features))
        elapsed = time.time() - start

        if not quiet:
            print(name + ' took ' + str(elapsed) + ' to run.')
            print(f"Output shape: {train_data.shape}")
            print(f"Output columns: {train_data.columns}")
            print("---")

    for col in train_data.columns:
        try:
            train_data[col] = train_data[col].astype('float')
        except (ValueError, TypeError):
            # Not numeric: keep the column as a categorical instead. Only
            # conversion failures are caught so interrupts still propagate.
            train_data[col] = train_data[col].astype('category')

    return train_data, train_labels

def transform(test_data, preprocessor, quiet=False):
    """Apply the steps of ``preprocessor`` to ``test_data`` and return a NumPy array.

    No step is fitted here; only each step's ``transform`` is called.
    Resampling steps (named ``'oversampling'`` / ``'undersampling'``) are
    skipped. The ``'initial'`` step's output is wrapped in a DataFrame whose
    columns come from ``get_feature_names_out()``; every other step receives
    whatever the previous step returned.

    Parameters
    ----------
    test_data : pandas.DataFrame
    preprocessor : object exposing ``steps`` as an iterable of ``(name, step)`` pairs
    quiet : bool, default False
        Accepted but currently unused: this function prints nothing itself.

    Returns
    -------
    numpy.ndarray
        The transformed data, every column cast to ``float`` where possible
        and to ``category`` otherwise before conversion to an array.
    """
    for name, step in preprocessor.steps:
        if name in ('oversampling', 'undersampling'):
            continue
        if name == 'initial':
            test_data = pd.DataFrame(step.transform(test_data), columns=step.get_feature_names_out())
        else:
            test_data = step.transform(test_data)
    test_data = pd.DataFrame(test_data)

    for col in test_data.columns:
        try:
            test_data[col] = test_data[col].astype('float')
        except (ValueError, TypeError):
            # Not numeric: keep the column as a categorical instead. Only
            # conversion failures are caught so interrupts still propagate.
            test_data[col] = test_data[col].astype('category')

    return test_data.to_numpy()

In [4]:
# This is the soft-voting function used to aggregate the predictions.
def soft_vote(preds):
    """Aggregate per-model class probabilities by soft voting.

    The probabilities are summed over the models and each sample's sums are
    rescaled to add up to 1. This is the same quantity as
    ``softmax(log(summed))``, computed directly so that a summed probability
    of exactly 0 no longer goes through ``log(0)``.

    Parameters
    ----------
    preds : array-like indexed as ``[model, sample, class]``
        Any number of classes is accepted.

    Returns
    -------
    list of numpy.ndarray
        One 1-D array of class probabilities per sample.
    """
    preds = np.asarray(preds)
    summed = preds.sum(axis=0)                         # (sample, class)
    normalised = summed / summed.sum(axis=1, keepdims=True)
    return list(normalised)

# This is used to drop columns that only contain NaN values.
def drop_allnan(data):
    """Return ``data`` without the columns whose values are all missing.

    A single vectorised ``dropna`` call replaces dropping the columns one at a
    time, which copied the whole frame once per removed column. The input
    frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
    """
    return data.dropna(axis=1, how='all')

In [16]:
""" Iterate through the entire testing script for each subeset DataFrame in the dictionary. """

for number in range(1, 500):
    print(f"Processing model {number}")

    # Read the encoded test features first, then the matching training
    # features, reporting each table's shape right after it has been loaded.
    test_csv = f'../data/encoded/test/X_test_{number}_encoded.csv'
    data = pd.read_csv(test_csv, low_memory=False)
    print(f'Shape of data: {data.shape}')
    train_csv = f'../data/encoded/train/X_train_{number}_encoded.csv'
    X_train = pd.read_csv(train_csv, low_memory=False)
    print(f'Shape of X_train: {X_train.shape}')

    # Every input and output location of this iteration is derived from the model name.
    model_name = f'xgb_df{number}'
    model_path = f'../XGBoost/models/{model_name}'
    output_path = f'../XGBoost/results/{model_name}.csv'
    metrics_path = f'../XGBoost/metrics/individual/{model_name}_metrics.csv'
    report_path = f'../XGBoost/metrics/report/{model_name}_report.csv'
    matrix_path = f'../XGBoost/metrics/matrix/{model_name}_matrix.csv'

    # These lists are used to determine imputation strategies for the specified features (inherited from LoGoFunc)
    # They are rebuilt from these literals on every loop iteration and then narrowed
    # to the columns of the current X_train below.
    NEGONE_FEATURES = ['feature369', 'feature265', 'feature473', 'feature420', 'feature461', 'feature415', 'feature373', 'feature433', 'feature395', 'feature443', 'feature242', 'feature258', 'feature451', 'feature366', 'feature425', 'feature46', 'feature31', 'feature6', 'feature89', 'feature52', 'feature468', 'feature286', 'feature315', 'feature191', 'feature196', 'feature91', 'feature192', 'feature27', 'feature20', 'feature111', 'feature100', 'feature90', 'feature72', 'feature255', 'feature298', 'feature50', 'feature67', 'feature136', 'feature59', 'feature92', 'feature139', 'feature60', 'feature48', 'feature145', 'feature78', 'feature115', 'feature372', 'feature472', 'feature439', 'feature382', 'feature475', 'feature353', 'feature134', 'feature132', 'feature35', 'feature42', 'feature96', 'feature249', 'feature82', 'feature18', 'feature23', 'feature123', 'feature99', 'feature32', 'feature70', 'feature68', 'feature178', 'feature401', 'feature141', 'feature294', 'feature447', 'feature95', 'feature375', 'feature55', 'feature421', 'feature56', 'feature34', 'feature304', 'feature77', 'feature435', 'feature365', 'feature226', 'feature360', 'feature347', 'feature193', 'feature314', 'feature299', 'feature403', 'feature402', 'feature334', 'feature350', 'feature110', 'feature271', 'feature229', 'feature337', 'feature268', 'feature328', 'feature293', 'feature223', 'feature85', 'feature361', 'feature140', 'feature297', 'feature254', 'feature340', 'feature306', 'feature172', 'feature155', 'feature240', 'feature236', 'feature239', 'feature209', 'feature238', 'feature187', 'feature364', 'feature330', 'feature437', 'feature159', 'feature441', 'feature431', 'feature396', 'feature358', 'feature295', 'feature444', 'feature316', 'feature380', 'feature75', 'feature448', 'feature205', 'feature430', 'feature148', 'feature335', 'feature326', 'feature308', 'feature320', 'feature318', 'feature389', 'feature276', 'feature305', 'feature296', 'feature390', 'feature204', 'feature127', 
'feature54', 'feature408', 'feature376', 'feature413', 'feature282', 'feature338', 'feature356', 'feature385', 'feature283', 'feature245', 'feature93', 'feature362', 'feature352', 'feature264', 'feature272', 'feature333', 'feature384', 'feature86', 'feature302', 'feature429', 'feature231', 'feature152', 'feature174', 'feature349', 'feature322', 'feature379', 'feature300', 'feature252', 'feature464', 'feature200', 'feature164', 'feature194', 'feature197', 'feature218', 'feature368', 'feature423', 'feature339', 'feature387', 'feature374', 'feature391', 'feature181', 'feature311', 'feature427', 'feature213', 'feature125', 'feature184', 'feature290', 'feature284', 'feature317', 'feature406', 'feature237', 'feature281', 'feature449', 'feature310', 'feature248', 'feature177', 'feature359', 'feature394', 'feature291', 'feature416', 'feature371', 'feature327', 'feature336', 'feature355', 'feature312', 'feature182', 'feature367', 'feature325', 'feature203', 'feature263', 'feature412', 'feature214', 'feature219', 'feature377', 'feature138', 'feature79', 'feature109', 'feature195', 'feature454', 'feature405', 'feature201', 'feature463', 'feature432', 'feature210', 'feature465', 'feature484', 'feature397', 'feature381', 'feature456', 'feature462', 'feature288', 'feature438', 'feature275', 'feature383', 'feature363', 'feature212', 'feature158', 'feature392', 'feature445', 'feature498', 'feature494', 'feature492', 'feature478', 'feature493', 'feature469', 'feature489', 'feature188', 'feature168', 'feature418', 'feature118', 'feature404', 'feature167', 'feature113', 'feature142', 'feature107', 'feature147', 'feature157', 'feature257', 'feature103', 'feature114', 'feature94', 'feature189', 'feature112', 'feature76', 'feature260', 'feature161', 'feature230', 'feature143', 'feature83', 'feature175', 'feature162', 'feature105', 'feature58', 'feature227', 'feature73', 'feature228', 'feature222', 'feature351', 'feature388', 'feature126', 'feature206', 'feature440', 'feature292', 
'feature146', 'feature341', 'feature303', 'feature307', 'feature321', 'feature243', 'feature287', 'feature149', 'feature128', 'feature51', 'feature69', 'feature190', 'feature221', 'feature428', 'feature124', 'feature133', 'feature235', 'feature344', 'feature217', 'feature483', 'feature487', 'feature477', 'feature488', 'feature474', 'feature453', 'feature471', 'feature499', 'feature496', 'feature497', 'feature495', 'feature393', 'feature476', 'feature479', 'feature470', 'feature457', 'feature485', 'feature446', 'feature486', 'feature481', 'feature490', 'feature480', 'feature491', 'feature323', 'feature399', 'feature244', 'feature482', 'feature400', 'feature466', 'feature436', 'feature411', 'feature121', 'feature153', 'feature173', 'feature80', 'feature154', 'feature62', 'feature150', 'feature186', 'feature279', 'feature57', 'feature280', 'feature176', 'feature116', 'feature269', 'feature37', 'feature65', 'feature108', 'feature250', 'feature259', 'feature53', 'feature64', 'feature234', 'feature39', 'feature104', 'feature97', 'feature165', 'feature313', 'feature459', 'feature442', 'feature261', 'feature407', 'feature342', 'feature169', 'feature151', 'feature33', 'feature117', 'feature266', 'feature426', 'feature331', 'feature424', 'feature370', 'feature422', 'feature343', 'feature285', 'feature357', 'feature274', 'feature256', 'feature409', 'feature267', 'feature417', 'feature273', 'feature233']
    MEDIAN_FEATURES = ['feature40', 'feature84', 'feature378', 'feature156', 'feature458', 'feature253', 'feature120', 'feature38', 'feature87', 'feature171', 'feature199', 'feature246', 'feature324', 'feature220', 'feature467', 'feature47', 'feature170', 'feature12', 'feature15', 'feature22', 'feature49', 'feature26', 'feature61', 'feature130', 'feature4', 'feature13', 'feature45', 'feature289', 'feature24', 'feature43', 'feature329', 'feature11', 'feature348', 'feature332', 'feature21', 'feature144', 'feature180', 'feature232', 'feature10', 'feature25', 'feature160', 'feature119', 'feature224', 'feature3', 'feature66', 'feature309', 'feature241', 'feature102', 'feature19', 'feature9', 'feature7', 'feature225', 'feature8', 'feature5', 'feature131', 'feature455', 'feature354', 'feature452', 'feature460', 'feature247', 'feature101', 'feature88', 'feature17', 'feature319', 'feature216', 'feature30', 'feature44', 'feature135', 'feature208', 'feature278', 'feature198', 'feature166', 'feature202', 'feature16', 'feature129', 'feature137', 'feature179', 'feature211', 'feature71', 'feature106', 'feature122', 'feature215', 'feature270', 'feature207', 'feature81', 'feature450', 'feature41', 'feature14', 'feature345', 'feature163', 'feature63', 'feature98', 'feature36', 'feature346', 'feature185', 'feature183', 'feature251', 'feature386', 'feature74', 'feature28']

    # Filter NEGONE_FEATURES and MEDIAN_FEATURES based on current X_train columns.
    # The original list order is preserved; names absent from X_train are skipped.
    negone_features_filtered = [feature for feature in NEGONE_FEATURES if feature in X_train.columns]
    median_features_filtered = [feature for feature in MEDIAN_FEATURES if feature in X_train.columns]

    # Redefine NEGONE_FEATURES and MEDIAN_FEATURES with updated values.
    # The rebound names are what generate_preprocessor (defined right after
    # this, inside the loop) captures as its default arguments.
    NEGONE_FEATURES = negone_features_filtered
    MEDIAN_FEATURES = median_features_filtered

    print("Filtered NEGONE_FEATURES:", NEGONE_FEATURES)
    print("Filtered MEDIAN_FEATURES:", MEDIAN_FEATURES)

    def generate_preprocessor(numeric_features, categorical_features, N_JOBS, cat_encode_type, 
                            do_specificimpute, do_featureselection, 
                            do_sampling, do_pca, var_thresh, oversample_technique, 
                            negone_features=NEGONE_FEATURES, median_features=MEDIAN_FEATURES,
                            prefix='', do_feature_subset=False, max_features=1, do_removeppi=False, do_removegtex=False):
        """Assemble the (unfitted) preprocessing pipeline.

        The pipeline always starts with the steps ``'initial'`` (a
        ColumnTransformer), ``'removeba'`` and ``'variance_threshold'``;
        a resampling step and a PCA step are appended on request.

        Parameters
        ----------
        numeric_features : columns for the generic numeric branch; only used
            when ``do_specificimpute`` is falsy.
        categorical_features : columns handed to the categorical encoder.
        N_JOBS : forwarded to ``SMOTE(n_jobs=...)`` when SMOTE is selected.
        cat_encode_type : index into ``[ordinal encoder, one-hot encoder]``.
        do_specificimpute : when truthy, ``median_features`` are median-imputed
            then scaled and ``negone_features`` are scaled then filled with -1;
            otherwise ``numeric_features`` are median-imputed then scaled.
        do_sampling : 1 appends random undersampling, 2 appends oversampling;
            any other value appends no resampling step.
        do_pca : when truthy, a ``PCA()`` step is appended last.
        var_thresh : threshold passed to ``VarianceThreshold``.
        oversample_technique : index into ``[SMOTE, RandomOverSampler]``.
        negone_features, median_features : default to the lists that were
            current when this function was defined.
        do_featureselection, prefix, do_feature_subset, max_features,
        do_removeppi, do_removegtex : accepted but not used by this function.

        Returns
        -------
        sklearn.pipeline.Pipeline
        """
        def _make_ordinal_encoder():
            return OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1)

        def _make_onehot_encoder():
            # ``sparse`` was renamed to ``sparse_output`` in newer scikit-learn
            # releases; try the new keyword first and fall back to the old one
            # so both old and new installations work.
            onehot_kwargs = dict(handle_unknown='infrequent_if_exist', min_frequency=10)
            try:
                return OneHotEncoder(sparse_output=False, **onehot_kwargs)
            except TypeError:
                return OneHotEncoder(sparse=False, **onehot_kwargs)

        def _impute_median_then_scale():
            return Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler(feature_range=(0, 1), clip=True))])

        # Only the selected encoder is constructed, so an option that the
        # installed library version rejects cannot break the other choice.
        categorical_transformer = [_make_ordinal_encoder, _make_onehot_encoder][cat_encode_type]()

        if do_specificimpute:
            # Scaling runs before the constant fill here, so the -1 written by
            # the imputer is not rescaled.
            negone_transformer = Pipeline(steps=[
                ('scaler', MinMaxScaler(feature_range=(0, 1), clip=True)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
            ])
            preprocessor = ColumnTransformer(
                transformers=[
                    ('median', _impute_median_then_scale(), median_features),
                    ('negone', negone_transformer, negone_features),
                    ('cat', categorical_transformer, categorical_features),
            ])
        else:
            preprocessor = ColumnTransformer(
                transformers=[
                    ('numeric', _impute_median_then_scale(), numeric_features),
                    ('cat', categorical_transformer, categorical_features),
                ])

        vt = VarianceThreshold(threshold=var_thresh)
        steps = [('initial', preprocessor), ('removeba', RemoveBeforeAfterTransformer()), ('variance_threshold', vt)]
        if do_sampling == 1:
            steps.append(('undersampling', RandomUnderSampler(random_state=42)))
        if do_sampling == 2:
            # Same lazy construction as for the encoders above.
            oversampler_factories = [
                lambda: SMOTE(n_jobs=N_JOBS, random_state=42),
                lambda: RandomOverSampler(random_state=42),
            ]
            steps.append(('oversampling', oversampler_factories[oversample_technique]()))
        if do_pca:
            steps.append(('pca', PCA()))

        preprocessor = Pipeline(steps=steps)
        return preprocessor

    X_test = data
    y_test = pd.read_csv('../data/y_test_id.csv', low_memory=False)
    model_type = 'xgb'
    num_models = 27

    features = X_test.columns.tolist()
    print(features)

    # The training columns (after removing all-NaN ones) define which test
    # columns are kept, and in which order.
    X_train = drop_allnan(X_train)
    columns = X_train.columns.tolist()

    # Load the preprocessor saved alongside this subset's models.
    preprocessor = joblib.load(f'{model_path}/preprocessor.joblib')

    # Load every member of the ensemble.
    models = [
        joblib.load(f'{model_path}/{model_type}_model_{i}.joblib')
        for i in range(num_models)
    ]

    # Encode the IMPACT column (feature2) numerically; it is re-added after the
    # drop, so it ends up as the last column until the reordering below.
    if 'feature2' in features:
        feature2_vals = {'LOW': 0, 'MODIFIER': 1, 'MODERATE': 1.5, 'HIGH': 2}
        encoded_feature2s = [feature2_vals[imp] for imp in X_test['feature2']]
        X_test = X_test.drop(columns=['feature2']).assign(feature2=encoded_feature2s)

    # Align with the training columns, then set the ID column (feature0) aside.
    X_test = X_test[columns]
    ids = X_test['feature0'].tolist()
    X_test = X_test.drop(columns='feature0')

    # Give every test column the dtype of its training counterpart.
    for column_name in X_test.columns:
        train_dtype = X_train[column_name].dtype
        X_test[column_name] = X_test[column_name].astype(train_dtype)

    # Pre-process the test data.
    X_test = transform(X_test, preprocessor)

    # Collect each ensemble member's class probabilities.
    all_preds = [models[i].predict_proba(X_test) for i in range(num_models)]

    # Soft-vote across the ensemble, then pick the most probable class per sample.
    y_pred_proba = soft_vote(np.array(all_preds))
    y_pred = [np.argmax(sample_proba) for sample_proba in y_pred_proba]

    # Translate the string labels into the class indices used by the models.
    label_mapping = {'Neutral': 0, 'GOF': 1, 'LOF': 2}
    y_test_numeric = list(map(label_mapping.__getitem__, y_test['label']))

    # One-vs-rest MCC for each class, followed by the multi-class MCC.
    y_true_arr = np.array(y_test_numeric)
    y_pred_arr = np.array(y_pred)
    mcc_per_class = {
        class_name: matthews_corrcoef(
            (y_true_arr == class_idx).astype(int),
            (y_pred_arr == class_idx).astype(int),
        )
        for class_idx, class_name in enumerate(['Neutral', 'GOF', 'LOF'])
    }
    overall_mcc = matthews_corrcoef(y_test_numeric, y_pred)

    print("Per-class MCC:")
    for class_name, mcc_value in mcc_per_class.items():
        print(f"{class_name}: {mcc_value:.4f}")
    print(f"Overall MCC: {overall_mcc:.4f}")
    
    def format_confusion_matrix(conf_matrix):
        """Render a matrix as text: cells joined by commas, rows joined by newlines."""
        rendered_rows = []
        for row in conf_matrix:
            rendered_rows.append(",".join(str(cell) for cell in row))
        return "\n".join(rendered_rows)

    # Score the predictions with scikit-learn (weighted averages for the
    # per-class scores, one-vs-one ROC AUC on the voted probabilities).
    accuracy = accuracy_score(y_test_numeric, y_pred)
    precision, recall, f1 = (
        scorer(y_test_numeric, y_pred, average='weighted')
        for scorer in (precision_score, recall_score, f1_score)
    )
    roc_auc = roc_auc_score(y_test_numeric, y_pred_proba, multi_class='ovo')
    conf_matrix = confusion_matrix(y_test_numeric, y_pred)

    # Report the headline numbers.
    for metric_label, metric_value in (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
        ('ROC AUC Score', roc_auc),
    ):
        print(f'Model {number} - Test {metric_label}: {metric_value}')

    # Per-class report, then the same report extended with the per-class MCC values.
    report = classification_report(y_test_numeric, y_pred, target_names=['Neutral', 'GOF', 'LOF'])
    print(report)
    mcc_lines = "".join(
        f"{class_name}: {mcc_value:.4f}\n" for class_name, mcc_value in mcc_per_class.items()
    )
    custom_report = report + "\n\nPer-class MCC:\n" + mcc_lines
    print(custom_report)

    class_labels = ['Neutral', 'GOF', 'LOF']

    # One-row table holding the headline metrics of this model.
    metrics_df = pd.DataFrame([{
        'Model': number,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC Score': roc_auc,
        'Overall MCC': overall_mcc,
    }])
    metrics_df.to_csv(metrics_path, index=False)
    print(f"Metrics for model {number} saved to {metrics_path}")

    # The whole text report is stored in a single CSV cell.
    report_df = pd.DataFrame({'Classification Report': [custom_report]})
    report_df.to_csv(report_path, index=False, quoting=csv.QUOTE_MINIMAL)
    print(f"Report for model {number} saved to {report_path}")

    # Confusion matrix with labelled rows (actual) and columns (predicted).
    matrix_df = pd.DataFrame(
        conf_matrix,
        index=[f'Actual {class_label}' for class_label in class_labels],
        columns=[f'Predicted {class_label}' for class_label in class_labels],
    )
    matrix_df.to_csv(matrix_path, quoting=csv.QUOTE_MINIMAL)
    print(f"Confusion Matrix for model {number} saved to {matrix_path}")

    # Per-sample output: ID, predicted label, then the class probabilities.
    prediction_rows = [
        [ids[row_idx], class_labels[y_pred[row_idx]], *y_pred_proba[row_idx]]
        for row_idx in range(len(y_pred))
    ]
    out = pd.DataFrame(
        prediction_rows,
        columns=['feature0', 'prediction', 'LoGoFunc_Neutral', 'LoGoFunc_GOF', 'LoGoFunc_LOF'],
    )
    out.to_csv(output_path, index=None)

    print(f"Results for model {number} saved to {output_path}")
    print("---------------------------------------------------")

Processing model 256
Shape of data: (2831, 257)
Shape of X_train: (25546, 257)
Filtered NEGONE_FEATURES: ['feature242', 'feature46', 'feature31', 'feature6', 'feature89', 'feature52', 'feature191', 'feature196', 'feature91', 'feature192', 'feature27', 'feature20', 'feature111', 'feature100', 'feature90', 'feature72', 'feature255', 'feature50', 'feature67', 'feature136', 'feature59', 'feature92', 'feature139', 'feature60', 'feature48', 'feature145', 'feature78', 'feature115', 'feature134', 'feature132', 'feature35', 'feature42', 'feature96', 'feature249', 'feature82', 'feature18', 'feature23', 'feature123', 'feature99', 'feature32', 'feature70', 'feature68', 'feature178', 'feature141', 'feature95', 'feature55', 'feature56', 'feature34', 'feature77', 'feature226', 'feature193', 'feature110', 'feature229', 'feature223', 'feature85', 'feature140', 'feature254', 'feature172', 'feature155', 'feature240', 'feature236', 'feature239', 'feature209', 'feature238', 'feature187', 'feature159', 'fea

In [2]:
""" This combines all of the individual metrics into one file. """

metrics_dir = '../XGBoost/metrics/individual'

# Init list to hold the df's.
dfs = []

# Iterate over every metric file.
for i in range(1, 500):
    file_path = os.path.join(metrics_dir, f'xgb_df{i}_metrics.csv')
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dfs.append(df)

# Combine the df's.
combined_df = pd.concat(dfs, ignore_index=True)

# Sort by the 'Model' column.
combined_df = combined_df.sort_values('Model')
combined_df.set_index('Model', inplace=True)

# Save new CSV file.
output_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df.to_csv(output_path)

print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/combined_metrics.csv


In [3]:
""" This ranks the models according to accuracy. """

input_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df = pd.read_csv(input_path, index_col='Model')

# Sort ACC from low to high.
sorted_df = combined_df.sort_values('Accuracy', ascending=True)

# New df only needs 'Model' and 'Accuracy' columns.
rankings_df = pd.DataFrame({
    'Model': sorted_df.index,
    'Accuracy': sorted_df['Accuracy']
})

# Reset index so 'Model' is normal column.
rankings_df = rankings_df.reset_index(drop=True)

# Save new CSV file.
output_path = '../XGBoost/metrics/ranked_by_accuracy.csv'
rankings_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/ranked_by_accuracy.csv


In [4]:
""" This ranks the models according to precision. """

input_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df = pd.read_csv(input_path, index_col='Model')

# Sort PREC from low to high.
sorted_df = combined_df.sort_values('Precision', ascending=True)

# New df only needs 'Model' and 'Precision' columns.
rankings_df = pd.DataFrame({
    'Model': sorted_df.index,
    'Precision': sorted_df['Precision']
})

# Reset index so 'Model' is normal column.
rankings_df = rankings_df.reset_index(drop=True)

# Save new CSV file.
output_path = '../XGBoost/metrics/ranked_by_precision.csv'
rankings_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/ranked_by_precision.csv


In [5]:
""" This ranks the models according to recall. """

input_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df = pd.read_csv(input_path, index_col='Model')

# Sort REC from low to high.
sorted_df = combined_df.sort_values('Recall', ascending=True)

# New df only needs 'Model' and 'Recall' columns.
rankings_df = pd.DataFrame({
    'Model': sorted_df.index,
    'Recall': sorted_df['Recall']
})

# Reset index so 'Model' is normal column.
rankings_df = rankings_df.reset_index(drop=True)

# Save new CSV file.
output_path = '../XGBoost/metrics/ranked_by_recall.csv'
rankings_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/ranked_by_recall.csv


In [6]:
""" This ranks the models according to F1-score. """

input_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df = pd.read_csv(input_path, index_col='Model')

# Sort F1 from low to high.
sorted_df = combined_df.sort_values('F1 Score', ascending=True)

# New df only needs 'Model' and 'F1 Score' columns.
rankings_df = pd.DataFrame({
    'Model': sorted_df.index,
    'F1_Score': sorted_df['F1 Score']
})

# Reset index so 'Model' is normal column.
rankings_df = rankings_df.reset_index(drop=True)

# Save new CSV file.
output_path = '../XGBoost/metrics/ranked_by_f1.csv'
rankings_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/ranked_by_f1.csv


In [7]:
""" This ranks the models according to ROC AUC. """

input_path = '../XGBoost/metrics/combined_metrics.csv'
combined_df = pd.read_csv(input_path, index_col='Model')

# ROC AUC Score from low to high.
sorted_df = combined_df.sort_values('ROC AUC Score', ascending=True)

# New df only needs 'Model' and 'ROC AUC' columns.
rankings_df = pd.DataFrame({
    'Model': sorted_df.index,
    'ROC_AUC_Score': sorted_df['ROC AUC Score']
})

# Reset index so 'Model' is normal column.
rankings_df = rankings_df.reset_index(drop=True)

# Save new CSV.
output_path = '../XGBoost/metrics/ranked_by_roc_auc.csv'
rankings_df.to_csv(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../XGBoost/metrics/ranked_by_roc_auc.csv
