### XGBoost Cross-Validation, Training, Testing, and Evaluation

#### *** Feature names must be encoded before using this notebook! ***

In [1]:
import time, os, sys, joblib, csv, ast
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
from scipy.special import softmax
from joblib import Parallel, delayed
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import  RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.base import BaseEstimator, TransformerMixin

### Preprocessing Pipeline

In [146]:
# These lists are used to determine imputation strategies for the specified features (inherited from LoGoFunc)
NEGONE_FEATURES = ['feature487', 'feature489', 'feature491', 'feature492', 'feature493', 'feature494', 'feature495', 'feature496', 'feature497', 'feature498', 'feature499', 'feature458', 'feature4', 'feature6', 'feature10', 'feature13', 'feature31', 'feature35', 'feature36', 'feature46', 'feature47', 'feature48', 'feature49', 'feature45', 'feature50', 'feature51', 'feature52', 'feature53', 'feature54', 'feature55', 'feature56', 'feature57', 'feature58', 'feature59', 'feature60', 'feature61', 'feature62', 'feature63', 'feature64', 'feature65', 'feature66', 'feature67', 'feature68', 'feature69', 'feature70', 'feature71', 'feature72', 'feature73', 'feature74', 'feature75', 'feature76', 'feature77', 'feature84', 'feature85', 'feature86', 'feature87', 'feature88', 'feature89', 'feature90', 'feature91', 'feature92', 'feature93', 'feature94', 'feature95', 'feature96', 'feature97', 'feature98', 'feature99', 'feature100', 'feature101', 'feature102', 'feature103', 'feature105', 'feature107', 'feature108', 'feature109', 'feature110', 'feature111', 'feature112', 'feature113', 'feature114', 'feature115', 'feature116', 'feature117', 'feature118', 'feature119', 'feature120', 'feature121', 'feature122', 'feature123', 'feature124', 'feature125', 'feature126', 'feature127', 'feature128', 'feature129', 'feature130', 'feature131', 'feature132', 'feature133', 'feature134', 'feature135', 'feature136', 'feature137', 'feature138', 'feature139', 'feature140', 'feature141', 'feature142', 'feature143', 'feature144', 'feature145', 'feature146', 'feature147', 'feature148', 'feature149', 'feature150', 'feature151', 'feature152', 'feature153', 'feature154', 'feature155', 'feature156', 'feature157', 'feature158', 'feature159', 'feature160', 'feature161', 'feature162', 'feature163', 'feature164', 'feature165', 'feature166', 'feature167', 'feature168', 'feature169', 'feature170', 'feature171', 'feature172', 'feature173', 'feature174', 'feature175', 'feature176', 'feature177', 'feature178', 'feature179', 'feature180', 'feature181', 'feature182', 'feature183', 'feature184', 'feature185', 'feature186', 'feature187', 'feature188', 'feature189', 'feature190', 'feature191', 'feature192', 'feature193', 'feature194', 'feature195', 'feature196', 'feature197', 'feature198', 'feature199', 'feature200', 'feature201', 'feature202', 'feature203', 'feature204', 'feature205', 'feature206', 'feature207', 'feature208', 'feature209', 'feature210', 'feature211', 'feature212', 'feature213', 'feature214', 'feature215', 'feature216', 'feature217', 'feature218', 'feature219', 'feature220', 'feature221', 'feature222', 'feature223', 'feature224', 'feature225', 'feature226', 'feature227', 'feature228', 'feature229', 'feature230', 'feature231', 'feature232', 'feature233', 'feature234', 'feature235', 'feature236', 'feature237', 'feature238', 'feature239', 'feature240', 'feature241', 'feature242', 'feature243', 'feature244', 'feature245', 'feature246', 'feature247', 'feature248', 'feature249', 'feature250', 'feature251', 'feature252', 'feature253', 'feature254', 'feature255', 'feature256', 'feature257', 'feature258', 'feature259', 'feature260', 'feature261', 'feature262', 'feature263', 'feature264', 'feature265', 'feature266', 'feature267', 'feature268', 'feature269', 'feature270', 'feature271', 'feature272', 'feature273', 'feature274', 'feature275', 'feature276', 'feature277', 'feature278', 'feature279', 'feature280', 'feature281', 'feature282', 'feature283', 'feature284', 'feature285', 'feature286', 'feature287', 'feature288', 'feature289', 'feature290', 'feature291', 'feature292', 'feature293', 'feature294', 'feature295', 'feature296', 'feature297', 'feature298', 'feature299', 'feature300', 'feature301', 'feature302', 'feature303', 'feature304', 'feature305', 'feature306', 'feature307', 'feature308', 'feature309', 'feature310', 'feature311', 'feature312', 'feature313', 'feature314', 'feature315', 'feature316', 'feature317', 'feature318', 'feature319', 'feature320', 'feature321', 'feature322', 'feature323', 'feature324', 'feature325', 'feature326', 'feature327', 'feature328', 'feature329', 'feature330', 'feature331', 'feature332', 'feature333', 'feature334', 'feature335', 'feature336', 'feature337', 'feature338', 'feature339', 'feature340', 'feature341', 'feature342', 'feature343', 'feature344', 'feature345', 'feature346', 'feature347', 'feature348', 'feature349', 'feature355', 'feature356', 'feature357', 'feature358', 'feature359', 'feature360', 'feature361', 'feature362', 'feature363', 'feature364', 'feature365', 'feature366', 'feature367', 'feature368', 'feature369', 'feature373', 'feature374', 'feature375', 'feature376', 'feature421', 'feature422', 'feature423', 'feature424', 'feature425', 'feature426', 'feature427', 'feature428', 'feature429', 'feature430', 'feature431', 'feature432', 'feature433', 'feature434', 'feature435', 'feature436', 'feature437', 'feature438', 'feature439', 'feature440', 'feature441', 'feature442', 'feature443', 'feature444', 'feature445', 'feature446', 'feature447', 'feature448', 'feature459', 'feature460', 'feature461', 'feature462', 'feature463', 'feature464', 'feature465', 'feature466', 'feature467', 'feature471', 'feature472', 'feature473', 'feature474', 'feature475', 'feature476', 'feature477', 'feature478', 'feature479', 'feature480', 'feature481', 'feature482', 'feature483', 'feature484', 'feature485', 'feature486']
MEDIAN_FEATURES = ['feature3', 'feature5', 'feature7', 'feature8', 'feature9', 'feature11', 'feature12', 'feature14', 'feature15', 'feature16', 'feature17', 'feature18', 'feature19', 'feature20', 'feature21', 'feature22', 'feature23', 'feature24', 'feature25', 'feature26', 'feature27', 'feature28', 'feature29', 'feature30', 'feature32', 'feature33', 'feature34', 'feature37', 'feature38', 'feature39', 'feature40', 'feature41', 'feature42', 'feature43', 'feature79', 'feature80', 'feature81', 'feature82', 'feature350', 'feature351', 'feature352', 'feature353', 'feature354', 'feature370', 'feature371', 'feature372', 'feature378', 'feature379', 'feature380', 'feature381', 'feature382', 'feature383', 'feature384', 'feature385', 'feature386', 'feature387', 'feature388', 'feature389', 'feature390', 'feature391', 'feature392', 'feature393', 'feature394', 'feature395', 'feature396', 'feature397', 'feature398', 'feature399', 'feature400', 'feature401', 'feature402', 'feature403', 'feature404', 'feature405', 'feature406', 'feature407', 'feature408', 'feature409', 'feature410', 'feature411', 'feature412', 'feature413', 'feature414', 'feature415', 'feature416', 'feature417', 'feature418', 'feature419', 'feature420', 'feature449', 'feature450', 'feature451', 'feature452', 'feature453', 'feature454', 'feature455', 'feature456', 'feature457', 'feature469', 'feature470']

In [147]:
# All of this is from utils.py
def generate_preprocessor(numeric_features, categorical_features, N_JOBS, cat_encode_type, 
                            do_specificimpute, do_featureselection, 
                            do_sampling, do_pca, var_thresh, oversample_technique, 
                            negone_features=NEGONE_FEATURES, median_features=MEDIAN_FEATURES,
                            prefix='', do_feature_subset=False, max_features=1, do_removeppi=False, do_removegtex=False):
    cat_encoders = [OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1), 
                    OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', min_frequency=10)]
    categorical_transformer = cat_encoders[cat_encode_type]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler(feature_range =(0, 1), clip=True))])

    median_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler(feature_range =(0, 1), clip=True))])

    negone_transformer = Pipeline(steps=[
        ('scaler', MinMaxScaler(feature_range =(0, 1), clip=True)),
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    ])

    preprocessor = None
    if do_specificimpute:
        preprocessor = ColumnTransformer(
            transformers=[
                ('median', median_transformer, median_features),
                ('negone', negone_transformer, negone_features),
                ('cat', categorical_transformer, categorical_features),
        ])
    else:
        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features),
            ])

    vt = VarianceThreshold(threshold=var_thresh)
    steps = [('initial', preprocessor), ('removeba', RemoveBeforeAfterTransformer()), ('variance_threshold', vt)]
    if do_sampling == 1:
        steps.append(('undersampling', RandomUnderSampler(random_state=42)))
    if do_sampling == 2:
        oversamplers = [SMOTE(n_jobs=N_JOBS,random_state=42), RandomOverSampler(random_state=42)]
        steps.append(('oversampling', oversamplers[oversample_technique]))
    if do_pca:
        steps.append(('pca', PCA()))

    preprocessor = Pipeline(steps=steps)
    return preprocessor

In [148]:
# All of this is from utils.py, with additional encoding where necessary.
class RemoveBeforeAfterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.drop_cols = None

    def fit(self, X, y=None):
        print(X.shape)
        drop_numbers = [35, 36, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98]
        self.drop_cols = [col for col in X.columns if any(f'feature{num}' == col for num in drop_numbers)]
        print(f"Columns to be dropped: {self.drop_cols}")
        return self

    def transform(self, X, y=None):
        print(f"Columns being dropped: {[col for col in self.drop_cols if col in X.columns]}")
        X = X.drop(columns=self.drop_cols, errors='ignore')
        print(X.shape)
        return X

    def get_feature_names_out(self, input_features=None):
        return [f for f in input_features if f not in self.drop_cols]

In [149]:
# All of this is from utils.py
def preprocess(preprocessor, train_data, train_labels, quiet=False):
    for k, v in preprocessor.steps:
        print(f"Applying step: {k}")
        print(f"Input shape: {train_data.shape}")
        print(f"Input columns: {train_data.columns}")
        
        if k == 'initial':
            start = time.time()
            v.fit(train_data)
            train_data = pd.DataFrame(v.transform(train_data), columns=v.get_feature_names_out())
            end = time.time()
            if not quiet:
                print(k + ' took ' + str(end - start) + ' to run.')
        elif k == 'oversampling' or k == 'undersampling':
            start = time.time()
            input_features = train_data.columns
            train_data, train_labels = v.fit_resample(train_data, train_labels)
            train_data = pd.DataFrame(train_data, columns=input_features)
            end = time.time()
            if not quiet:
                print(k + ' took ' + str(end - start) + ' to run.')
        else:
            start = time.time()
            v.fit(train_data)
            input_features = train_data.columns
            train_data = pd.DataFrame(v.transform(train_data), columns=v.get_feature_names_out(input_features))
            end = time.time()
            if not quiet:
                print(k + ' took ' + str(end - start) + ' to run.')
        
        print(f"Output shape: {train_data.shape}")
        print(f"Output columns: {train_data.columns}")
        print("---")

    for col in train_data.columns:
        try:
            train_data[col] = train_data[col].astype('float')
        except:
            train_data[col] = train_data[col].astype('category')

    return train_data, train_labels

def transform(test_data, preprocessor, quiet=False):
    for k, v in preprocessor.steps:
        if k == 'initial':
            test_data = pd.DataFrame(v.transform(test_data), columns=v.get_feature_names_out())
        elif k == 'oversampling' or k == 'undersampling':
            continue
        else:
            test_data = v.transform(test_data)
    test_data = pd.DataFrame(test_data)

    for col in test_data.columns:
        try:
            test_data[col] = test_data[col].astype('float')
        except:
            test_data[col] = test_data[col].astype('category')

    return test_data.to_numpy()

### Preprocessing and Validation Split

In [150]:
# Loading the correct training data (missing `Protein_dom`).
X_train = pd.read_csv('../data/X_train_encoded.csv', low_memory=False)
y_train = pd.read_csv('../data/y_train_id.csv', low_memory=False)

# Model type and paths.
model_type = 'xgb' 
model_path = f'../models/{model_type}-validation'
output_path = f'../results/{model_type}-validation.csv'
metrics_file = f'../metrics/{model_type}-validation-metrics.csv'

##### This pre-processing snippet is directly adapted from the LoGoFunc GitLab repository.

In [151]:
# This is used to drop columns that only contain NaN values.
def drop_allnan(data):
    for col in data.columns:
        if data[col].isna().sum() == len(data):
            data = data.drop(columns=col)
    return data

# This encodes the IMPACT feature's column.
X_train = drop_allnan(X_train)
feature1_vals = {'LOW': 0, 'MODIFIER': 1, 'MODERATE': 1.5, 'HIGH': 2}
encoded_feature1s = [feature1_vals[imp] for imp in X_train['feature1']]
X_train = X_train.drop(columns=['feature1'])
X_train['feature1'] = encoded_feature1s

# Conditional imputation based on feature type.
numeric_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

In [152]:
# Pre-processor generation with multiple parameters that can be tweaked. I used the same values as LoGoFunc.
preprocessor = generate_preprocessor(numeric_features, categorical_features, 40, 0,
                            1, 0, 2, 0, 0, 1,
                            prefix='light0', do_feature_subset=True, max_features=1)

In [153]:
# Encoding the labels.
y_train_enc = []
for lab in y_train['label']:
    if lab == 'GOF':
        y_train_enc.append(1)
    elif lab == 'LOF':
        y_train_enc.append(2)
    else:
        y_train_enc.append(0)
y_train = y_train_enc

In [154]:
X_train, y_train = preprocess(preprocessor, X_train, y_train)

# This creates a directory for the preprocessor and trained models.
os.makedirs(model_path, exist_ok=True)
joblib.dump(preprocessor, f'{model_path}/preprocessor.joblib')

# This is where the train-val split happens in a 80-20 ratio.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=41)

Applying step: initial
Input shape: (25546, 500)
Input columns: Index(['feature0', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
       'feature7', 'feature8', 'feature9', 'feature10',
       ...
       'feature491', 'feature492', 'feature493', 'feature494', 'feature495',
       'feature496', 'feature497', 'feature498', 'feature499', 'feature1'],
      dtype='object', length=500)
initial took 0.5073683261871338 to run.
Output shape: (25546, 498)
Output columns: Index(['median__feature3', 'median__feature5', 'median__feature7',
       'median__feature8', 'median__feature9', 'median__feature11',
       'median__feature12', 'median__feature14', 'median__feature15',
       'median__feature16',
       ...
       'cat__feature2', 'cat__feature44', 'cat__feature78', 'cat__feature83',
       'cat__feature104', 'cat__feature106', 'cat__feature377',
       'cat__feature468', 'cat__feature488', 'cat__feature490'],
      dtype='object', length=498)
---
Applying step: removeba
Input s

### Training and Validation

In [155]:
# First, there is initialization of the XGBoost classifier.
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# This is the search space of parameters that will be assessed.
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': np.logspace(-3, 0, 4),
    'subsample': [0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1.0],
    'min_child_weight': [1, 2, 3, 4, 5],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4]
}

# Specify 5-fold cross-validation at this step.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_val_score(xgb, X_train, y_train, cv=kf, scoring='accuracy')
print(f'Cross-validation accuracy: {cv_results.mean()}')

# This is where RandomizedSearchCV is set to explore 10% of the available search space.
random_search = RandomizedSearchCV(estimator=xgb, param_distributions=param_dist, n_iter=1080, cv=3, n_jobs=-1, verbose=2, random_state=42)

# This fits the model to the randomized search algorithm.
random_search.fit(X_train, y_train)

# This is populated with the top parameters as the search completes.
best_params = random_search.best_params_
print(f'Best parameters found: {best_params}')

# This saves the total number of parameter sets that will be explored to a variable.
n_params = len(random_search.cv_results_['params'])

# This is where the 27 best params are pooled.
top_n = min(27, n_params)
top_n_indices = np.argsort(random_search.cv_results_['mean_test_score'])[-top_n:]
top_n_params = [random_search.cv_results_['params'][i] for i in top_n_indices]

# This prints the best params at the end.
print("Top 27 parameter sets:")
for i, params in enumerate(top_n_params):
    print(f"Set {i+1}:")
    for key, value in params.items():
        print(f"  {key}: {value}")
    print()

# This spits out the params to the Models directory.
os.makedirs(model_path, exist_ok=True)
with open(f'{model_path}/top_27_params.json', 'w') as f:
    json.dump(top_n_params, f, indent=2)

# This handles the training of the model ensemble.
def train_model(params, X_train, y_train):
    model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss', **params)
    model.fit(X_train, y_train)
    return model

# This takes the 27 best params and trains the models.
models = Parallel(n_jobs=-1)(delayed(train_model)(params, X_train, y_train) for params in top_n_params)

# This saves the models along with params in the Models directory.
for i, (model, params) in enumerate(zip(models, top_n_params)):
    joblib.dump(model, f'{model_path}/xgb_model_{i}.joblib')
    with open(f'{model_path}/xgb_params_{i}.json', 'w') as f:
        json.dump(params, f, indent=2)

def predict_model(model, data):
    return model.predict_proba(data)

all_preds = Parallel(n_jobs=-1)(delayed(predict_model)(model, X_val) for model in models)
all_preds = np.array(all_preds)
avg_preds = np.mean(all_preds, axis=0)
final_predictions = np.argmax(avg_preds, axis=1)

# This performs final validation of the model.
val_accuracy = accuracy_score(y_val, final_predictions)
val_precision = precision_score(y_val, final_predictions, average='weighted')
val_recall = recall_score(y_val, final_predictions, average='weighted')
val_f1 = f1_score(y_val, final_predictions, average='weighted')
val_roc_auc = roc_auc_score(y_val, avg_preds, multi_class='ovo')
val_conf_matrix = confusion_matrix(y_val, final_predictions)

# After acquiring the metrics, they are printed here.
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation Precision: {val_precision}')
print(f'Validation Recall: {val_recall}')
print(f'Validation F1 Score: {val_f1}')
print(f'Validation ROC AUC Score: {val_roc_auc}')
print(f'Confusion Matrix:\n{val_conf_matrix}')

Cross-validation accuracy: 0.9398788923896166
Fitting 3 folds for each of 1080 candidates, totalling 3240 fits
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.001, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.75; total time=  14.3s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.001, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.75; total time=  14.4s
[CV] END colsample_bytree=0.75, gamma=0.2, learning_rate=0.001, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.75; total time=  14.8s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.01, max_depth=5, min_child_weight=4, n_estimators=300, subsample=0.5; total time=  43.3s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.01, max_depth=5, min_child_weight=4, n_estimators=300, subsample=0.5; total time=  47.2s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.01, max_depth=5, min_child_weight=4, n_estimators=300, subsample=0.5; total time=  48.0



[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.75; total time=  35.1s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.01, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.75; total time=  35.5s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.75; total time=  29.3s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.75; total time=  29.6s
[CV] END colsample_bytree=1.0, gamma=0.2, learning_rate=0.1, max_depth=4, min_child_weight=4, n_estimators=300, subsample=0.75; total time=  29.5s
[CV] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100, subsample=1.0; total time=  12.2s
[CV] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.1, max_depth=5, min_child_weight=3, n_estimators=100, subsa

### Testing the model after validation

In [156]:
# Load both test and reload training data.
X_train = pd.read_csv('../data/X_train_encoded.csv', low_memory=False)
X_test = pd.read_csv('../data/X_test_encoded.csv', low_memory=False)
y_test = pd.read_csv('../data/y_test_id.csv', low_memory=False)

In [157]:
# This is the soft-voting function used to aggregate the predictions.
def soft_vote(preds):
    summed_preds = [[np.sum(preds[:, j][:, i]) for i in range(3)] for j in range(len(preds[0]))]
    return [softmax(np.log(sp)) for sp in summed_preds]

# Again, this checks for columns that only contain NaN values.
def drop_allnan(data):
    for col in data.columns:
        if data[col].isna().sum() == len(data):
            data = data.drop(columns=col)
    return data

# Input the training data and do initial pre-processing.
X_train = drop_allnan(X_train)
columns = X_train.columns.tolist()

# Invoke the pre-processing function.
preprocessor = joblib.load(f'{model_path}/preprocessor.joblib')

# Iterate over the models in the ensemble.
models = []
num_models = 27  
for i in range(num_models):
    models.append(joblib.load(f'{model_path}/{model_type}_model_{i}.joblib'))

# Feed the data into the pre-processor functions.
y_test = pd.read_csv('../data/y_test_id.csv', low_memory=False)  
feature1_vals = {'LOW': 0, 'MODIFIER': 1, 'MODERATE': 1.5, 'HIGH': 2}
encoded_feature1s = [feature1_vals[imp] for imp in X_test['feature1']]
X_test = X_test.drop(columns=['feature1'])
X_test['feature1'] = encoded_feature1s
X_test = X_test[columns]
ids = X_test['feature0'].tolist()
X_test = X_test.drop(columns='feature0')

# Make sure the data types are the same. because NumPy arrays are not tolerated in places where DataFrames are expected.
for col in X_test.columns:
    X_test[col] = X_test[col].astype(X_train[col].dtype)
X_test = transform(X_test, preprocessor)

# Pool the predictions into a list.
all_preds = []
for i in range(num_models):
    preds = models[i].predict_proba(X_test)  
    all_preds.append(preds)

# Apply the soft-voting function.
y_pred_proba = soft_vote(np.array(all_preds))
y_pred = [np.argmax(p) for p in y_pred_proba]

# Map the labels to numbers.
label_mapping = {'Neutral': 0, 'GOF': 1, 'LOF': 2}
y_test_numeric = [label_mapping[label] for label in y_test['label']]

# Perform the evaluation using Scikit-learn's metrics.
accuracy = accuracy_score(y_test_numeric, y_pred)
precision = precision_score(y_test_numeric, y_pred, average='weighted')
recall = recall_score(y_test_numeric, y_pred, average='weighted')
f1 = f1_score(y_test_numeric, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test_numeric, y_pred_proba, multi_class='ovo')
conf_matrix = confusion_matrix(y_test_numeric, y_pred)

# Print the recorded metrics.
print(f'Test Accuracy: {accuracy}')
print(f'Test Precision: {precision}')
print(f'Test Recall: {recall}')
print(f'Test F1 Score: {f1}')
print(f'Test ROC AUC Score: {roc_auc}')
print(f'Confusion Matrix:\n{conf_matrix}')

out = []
for i in range(len(y_pred)):
    out.append([ids[i], ['Neutral', 'GOF', 'LOF'][y_pred[i]], *y_pred_proba[i]])
out = pd.DataFrame(out, columns=['feature0', 'prediction', 'LoGoFunc_Neutral', 'LoGoFunc_GOF', 'LoGoFunc_LOF'])
out.to_csv(output_path, index=None)

Columns being dropped: []
(2831, 498)
Test Accuracy: 0.8940303779583186
Test Precision: 0.8925924643411418
Test Recall: 0.8940303779583186
Test F1 Score: 0.8899314155343145
Test ROC AUC Score: 0.943516559433875
Confusion Matrix:
[[1228    2  109]
 [   6   65   81]
 [  87   15 1238]]


### Evalutation against y_test

In [158]:
# This is just to manually verify the test output.
predictions_file = output_path

# Load y_test.
y_test_path = '../data/y_test_id.csv'
y_test = pd.read_csv(y_test_path)
y_true = y_test['label']

# Encode labels to numbers.
label_mapping = {'Neutral': 0, 'GOF': 1, 'LOF': 2}
y_true_numeric = [label_mapping[label] for label in y_true]

# Load predictions.
predictions = pd.read_csv(predictions_file)

# Ensure DataFrames have same number of rows.
assert len(predictions) == len(y_test)
y_pred = predictions['prediction']
y_pred_numeric = [label_mapping[label] for label in y_pred]

# Perform the evaluation using Scikit-learn's metrics.
accuracy = accuracy_score(y_true_numeric, y_pred_numeric)
precision = precision_score(y_true_numeric, y_pred_numeric, average='weighted')
recall = recall_score(y_true_numeric, y_pred_numeric, average='weighted')
f1 = f1_score(y_true_numeric, y_pred_numeric, average='weighted')
conf_matrix = confusion_matrix(y_true_numeric, y_pred_numeric)

results = [{
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'confusion_matrix': conf_matrix.tolist()  
}]

results_df = pd.DataFrame(results)
results_df.to_csv(metrics_file, index=False)
print(results_df)

   accuracy  precision  ...  f1_score                               confusion_matrix
0   0.89403   0.892592  ...  0.889931  [[1228, 2, 109], [6, 65, 81], [87, 15, 1238]]

[1 rows x 5 columns]


   accuracy  precision  ...  f1_score                               confusion_matrix
0  0.896503   0.895064  ...  0.892804  [[1230, 2, 107], [6, 68, 78], [84, 16, 1240]]

[1 rows x 5 columns]

In [2]:
input_file = '../metrics/xgb-validation-metrics.csv'
output_file = '../metrics/xgb-evaluation.csv'

def macro(confusion_matrix):
    cm = np.array(confusion_matrix)
    recalls = np.diag(cm) / np.sum(cm, axis=1)
    return np.mean(recalls)

def micro(confusion_matrix):
    cm = np.array(confusion_matrix)
    true_positives = np.diag(cm)
    total_true_positives = np.sum(true_positives)
    total_actual_positives = np.sum(cm)
    return total_true_positives / total_actual_positives

with open(input_file, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    rows = list(reader)

for row in rows:
    confusion_matrix = ast.literal_eval(row['confusion_matrix'])
    macro = macro(confusion_matrix)
    micro_recall = micro(confusion_matrix)
    row['micro_recall'] = f'{micro_recall:.4f}'
    row['macro'] = f'{macro:.4f}'

    # Remove the original 'weighted' REC column.
    if 'recall' in row:
        del row['recall']

with open(output_file, 'w', newline='') as csvfile:
    fieldnames = ['accuracy', 'precision', 'f1_score', 'micro_recall', 'macro', 'confusion_matrix']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)

print(f"Saved to {output_file}")

Saved to ../metrics/xgb-evaluation.csv
