In [1]:
import os, time, re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from scipy.special import softmax
import joblib
from joblib import Parallel, delayed
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import  RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.base import BaseEstimator, TransformerMixin

### This notebook loads all 499 ENS-ranked feature subsets and, for each subset, trains an ensemble of XGBoost models (one model per parameter set in `top_27_params.json`).

In [2]:
# Points to the training data.
train_dir = "../data/encoded/train"

# Number of subset files to load; file k is named X_train_<k>_encoded.csv.
N_SUBSETS = 499

# File names that are actually read below. (The list previously built here used a
# different naming pattern and was never used.)
csv_files = [f'X_train_{i}_encoded.csv' for i in range(1, N_SUBSETS + 1)]

# Init dictionary to store dataframes, keyed "df<k>".
dfs = {}

# Iteratively load CSVs.
for feature_number, csv_file in enumerate(tqdm(csv_files, desc="Loading CSV files"), start=1):
    df_name = f"df{feature_number}"
    file_path = os.path.join(train_dir, csv_file)
    dfs[df_name] = pd.read_csv(file_path, low_memory=False)

print(f"Loaded {len(dfs)} dataframes.")
print(dfs.keys())

Loading CSV files:   0%|          | 0/499 [00:00<?, ?it/s]

Loaded 499 dataframes.
dict_keys(['df1', 'df2', 'df3', 'df4', 'df5', 'df6', 'df7', 'df8', 'df9', 'df10', 'df11', 'df12', 'df13', 'df14', 'df15', 'df16', 'df17', 'df18', 'df19', 'df20', 'df21', 'df22', 'df23', 'df24', 'df25', 'df26', 'df27', 'df28', 'df29', 'df30', 'df31', 'df32', 'df33', 'df34', 'df35', 'df36', 'df37', 'df38', 'df39', 'df40', 'df41', 'df42', 'df43', 'df44', 'df45', 'df46', 'df47', 'df48', 'df49', 'df50', 'df51', 'df52', 'df53', 'df54', 'df55', 'df56', 'df57', 'df58', 'df59', 'df60', 'df61', 'df62', 'df63', 'df64', 'df65', 'df66', 'df67', 'df68', 'df69', 'df70', 'df71', 'df72', 'df73', 'df74', 'df75', 'df76', 'df77', 'df78', 'df79', 'df80', 'df81', 'df82', 'df83', 'df84', 'df85', 'df86', 'df87', 'df88', 'df89', 'df90', 'df91', 'df92', 'df93', 'df94', 'df95', 'df96', 'df97', 'df98', 'df99', 'df100', 'df101', 'df102', 'df103', 'df104', 'df105', 'df106', 'df107', 'df108', 'df109', 'df110', 'df111', 'df112', 'df113', 'df114', 'df115', 'df116', 'df117', 'df118', 'df119', 'df

In [3]:
# Read the tuned XGBoost parameter sets; each entry is later expanded into
# XGBClassifier keyword arguments.
import json

with open('top_27_params.json', mode='r') as f:
    top_n_params = json.load(f)

In [4]:
# Read the encoded NEGONE and MEDIAN feature-name lists from their JSON file.
file_path = '../data/negone_median.json'

with open(file_path, 'r') as file:
    data = json.load(file)

# A key that is absent from the file yields an empty list instead of raising.
negone_features, median_features = (
    data.get(list_name, []) for list_name in ('NEGONE_FEATURES', 'MEDIAN_FEATURES')
)

print('NEGONE_FEATURES:', negone_features)
print('MEDIAN_FEATURES:', median_features)

NEGONE_FEATURES: ['feature369', 'feature265', 'feature473', 'feature420', 'feature461', 'feature415', 'feature373', 'feature433', 'feature395', 'feature443', 'feature242', 'feature258', 'feature451', 'feature366', 'feature425', 'feature46', 'feature31', 'feature6', 'feature89', 'feature52', 'feature468', 'feature286', 'feature315', 'feature191', 'feature196', 'feature91', 'feature192', 'feature27', 'feature20', 'feature111', 'feature100', 'feature90', 'feature72', 'feature255', 'feature298', 'feature50', 'feature67', 'feature136', 'feature59', 'feature92', 'feature139', 'feature60', 'feature48', 'feature145', 'feature78', 'feature115', 'feature372', 'feature472', 'feature439', 'feature382', 'feature475', 'feature353', 'feature134', 'feature132', 'feature35', 'feature42', 'feature96', 'feature249', 'feature82', 'feature18', 'feature23', 'feature123', 'feature99', 'feature32', 'feature70', 'feature68', 'feature178', 'feature401', 'feature141', 'feature294', 'feature447', 'feature95', 'fe

In [5]:
# Load other encoded feature names.

file_path = '../data/patterns.json'

with open(file_path, 'r') as file:
    data = json.load(file)

# Every key has its 'feature' text removed; what remains is parsed as an integer.
feature_keys = data.keys()
keys_list = [int(name.replace('feature', '')) for name in feature_keys]
drop_numbers = keys_list

print('Extracted Keys:', drop_numbers)
print('Length:', len(drop_numbers))

Extracted Keys: [6, 18, 23, 32, 42, 48, 50, 59, 60, 67, 68, 70, 78, 82, 89, 92, 96, 99, 115, 123, 136, 139, 145, 178, 249, 298]
Length: 26


In [None]:
""" Iterate through the entire training script for each subeset DataFrame in the dictionary. """

# The label file is the same for every subset, so it is read and encoded once here
# instead of once per loop iteration.
# Encoding the labels: 'GOF' -> 1, 'LOF' -> 2, anything else -> 0.
LABEL_CODES = {'GOF': 1, 'LOF': 2}
y_train_raw = pd.read_csv('../data/y_train_id.csv')
y_train_enc = [LABEL_CODES.get(lab, 0) for lab in y_train_raw['label']]

train_dfs = dfs
for df_name, X_train in train_dfs.items():
    print(y_train_raw.shape)

    # Fresh list per subset: y_train is rebound to other objects further down the loop body.
    y_train = list(y_train_enc)
    print(len(y_train))

    print(f"Processing {df_name}")

    model_name = f"xgb_{df_name}"
    model_directory = f'../XGBoost/models/{model_name}'

    # Create the directory (no error if it already exists).
    os.makedirs(model_directory, exist_ok=True)

    print(f"Directory '{model_directory}' is ready.")

    # These lists are used to determine imputation strategies for the specified features (inherited from LoGoFunc).
    # They come from `negone_features` / `median_features`, which were loaded from
    # '../data/negone_median.json' earlier in the notebook, instead of a second hard-coded
    # copy of the same names inside the loop.
    # Only features that are columns of the current X_train are kept, in list order.
    subset_columns = set(X_train.columns)
    NEGONE_FEATURES = [feature for feature in negone_features if feature in subset_columns]
    MEDIAN_FEATURES = [feature for feature in median_features if feature in subset_columns]

    print("Filtered NEGONE_FEATURES:", NEGONE_FEATURES)
    print("Filtered MEDIAN_FEATURES:", MEDIAN_FEATURES)

    def generate_preprocessor(numeric_features, categorical_features, N_JOBS, cat_encode_type, 
                                do_specificimpute, do_featureselection, 
                                do_sampling, do_pca, var_thresh, oversample_technique, 
                                negone_features=NEGONE_FEATURES, median_features=MEDIAN_FEATURES,
                                prefix='', do_feature_subset=False, max_features=1, do_removeppi=False, do_removegtex=False):
        """Assemble the unfitted preprocessing pipeline.

        The pipeline always contains, in order, a ColumnTransformer ('initial'),
        a RemoveBeforeAfterTransformer ('removeba') and a VarianceThreshold
        ('variance_threshold'). A resampler and a PCA step are appended on request.

        Parameters
        ----------
        numeric_features : columns sent through median imputation + min-max scaling
            when ``do_specificimpute`` is falsy.
        categorical_features : columns sent through the categorical encoder.
        N_JOBS : ``n_jobs`` passed to SMOTE (only when SMOTE is selected).
        cat_encode_type : index of the categorical encoder, 0 = OrdinalEncoder,
            1 = OneHotEncoder.
        do_specificimpute : truthy -> use the ``median_features`` / ``negone_features``
            groups; falsy -> treat all ``numeric_features`` alike.
        do_sampling : 1 appends a RandomUnderSampler, 2 appends an oversampler.
        do_pca : truthy appends ``PCA()``.
        var_thresh : threshold passed to VarianceThreshold.
        oversample_technique : index of the oversampler, 0 = SMOTE, 1 = RandomOverSampler.
        negone_features, median_features : column groups used when
            ``do_specificimpute`` is truthy.
        do_featureselection, prefix, do_feature_subset, max_features, do_removeppi,
        do_removegtex : accepted so existing calls keep working; not used here.

        Returns
        -------
        sklearn.pipeline.Pipeline
            The unfitted pipeline described above.
        """
        # Encoders are built lazily: only the selected one is instantiated, so a
        # keyword that the installed library rejects on the *unused* encoder cannot
        # break the call.
        # NOTE(review): newer scikit-learn releases renamed OneHotEncoder's `sparse`
        # to `sparse_output` -- check against the pinned version before using index 1.
        cat_encoder_factories = [
            lambda: OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1, encoded_missing_value=-1),
            lambda: OneHotEncoder(sparse=False, handle_unknown='infrequent_if_exist', min_frequency=10),
        ]
        categorical_transformer = cat_encoder_factories[cat_encode_type]()

        def median_then_scale():
            # Median imputation followed by clipping min-max scaling to [0, 1].
            return Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', MinMaxScaler(feature_range =(0, 1), clip=True))])

        if do_specificimpute:
            # The constant -1 fill runs *after* the scaler, so the fill value itself
            # is not rescaled.
            negone_transformer = Pipeline(steps=[
                ('scaler', MinMaxScaler(feature_range =(0, 1), clip=True)),
                ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
            ])
            preprocessor = ColumnTransformer(
                transformers=[
                    ('median', median_then_scale(), median_features),
                    ('negone', negone_transformer, negone_features),
                    ('cat', categorical_transformer, categorical_features),
            ])
        else:
            preprocessor = ColumnTransformer(
                transformers=[
                    ('numeric', median_then_scale(), numeric_features),
                    ('cat', categorical_transformer, categorical_features),
                ])

        vt = VarianceThreshold(threshold=var_thresh)
        steps = [('initial', preprocessor), ('removeba', RemoveBeforeAfterTransformer()), ('variance_threshold', vt)]
        if do_sampling == 1:
            steps.append(('undersampling', RandomUnderSampler(random_state=42)))
        if do_sampling == 2:
            # Same lazy construction as for the encoders.
            # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's `n_jobs`
            # -- check against the pinned version before using index 0.
            oversampler_factories = [
                lambda: SMOTE(n_jobs=N_JOBS,random_state=42),
                lambda: RandomOverSampler(random_state=42),
            ]
            steps.append(('oversampling', oversampler_factories[oversample_technique]()))
        if do_pca:
            steps.append(('pca', PCA()))

        preprocessor = Pipeline(steps=steps)
        return preprocessor

    class RemoveBeforeAfterTransformer(BaseEstimator, TransformerMixin):
        """Drop the columns whose feature number appears in the notebook-level ``drop_numbers``.

        A column matches when its name, after removing an optional
        ``<transformer>__`` prefix, equals ``feature<N>`` for some N in
        ``drop_numbers``. The prefix has to be removed because this step runs on the
        output of a ColumnTransformer, whose ``get_feature_names_out()`` prefixes
        every name by default (e.g. ``negone__feature6``); comparing the full name
        with ``feature6`` would never match and nothing would be dropped.
        """

        def __init__(self):
            # Column names selected for removal; filled in by fit().
            self.drop_cols = None

        @staticmethod
        def _base_name(col):
            """Return ``col`` without a leading ``<transformer>__`` prefix (non-strings unchanged)."""
            if isinstance(col, str):
                return col.rsplit('__', 1)[-1]
            return col

        def fit(self, X, y=None):
            """Record which columns of DataFrame ``X`` are to be dropped; returns ``self``."""
            print(X.shape)
            # Set lookup instead of scanning drop_numbers once per column.
            drop_names = {f'feature{num}' for num in drop_numbers}
            self.drop_cols = [col for col in X.columns if self._base_name(col) in drop_names]
            print(f"Columns to be dropped: {self.drop_cols}")
            return self

        def transform(self, X, y=None):
            """Return ``X`` without the recorded columns; columns absent from ``X`` are ignored."""
            print(f"Columns being dropped: {[col for col in self.drop_cols if col in X.columns]}")
            X = X.drop(columns=self.drop_cols, errors='ignore')
            print(X.shape)
            return X

        def get_feature_names_out(self, input_features=None):
            """Return the names from ``input_features`` that survive ``transform``."""
            return [f for f in input_features if f not in self.drop_cols]

    def preprocess(preprocessor, train_data, train_labels, quiet=False):
        """Fit each step of ``preprocessor`` on the training data and apply it, in order.

        Parameters
        ----------
        preprocessor : object with a ``steps`` list of ``(name, step)`` pairs.
            * ``'initial'`` is fitted and applied; columns come from the step's
              ``get_feature_names_out()``.
            * ``'oversampling'`` / ``'undersampling'`` are applied with
              ``fit_resample`` and may change the labels as well.
            * every other step is fitted and applied; columns come from
              ``get_feature_names_out(<input columns>)``.
        train_data : pandas.DataFrame of training features.
        train_labels : labels passed to the resampling steps.
        quiet : when true, suppress the per-step timing line (the other progress
            lines are always printed).

        Returns
        -------
        (train_data, train_labels)
            The transformed DataFrame, with each column cast to float where possible
            and to ``category`` otherwise, and the (possibly resampled) labels.
        """
        for k, v in preprocessor.steps:
            print(f"Applying step: {k}")
            print(f"Input shape: {train_data.shape}")
            print(f"Input columns: {train_data.columns}")

            start = time.time()
            if k == 'initial':
                v.fit(train_data)
                train_data = pd.DataFrame(v.transform(train_data), columns=v.get_feature_names_out())
            elif k in ('oversampling', 'undersampling'):
                # Resamplers change the rows; the column names are carried over.
                input_features = train_data.columns
                train_data, train_labels = v.fit_resample(train_data, train_labels)
                train_data = pd.DataFrame(train_data, columns=input_features)
            else:
                v.fit(train_data)
                input_features = train_data.columns
                train_data = pd.DataFrame(v.transform(train_data), columns=v.get_feature_names_out(input_features))
            end = time.time()
            if not quiet:
                print(k + ' took ' + str(end - start) + ' to run.')

            print(f"Output shape: {train_data.shape}")
            print(f"Output columns: {train_data.columns}")
            print("---")

        for col in train_data.columns:
            try:
                train_data[col] = train_data[col].astype('float')
            except (ValueError, TypeError):
                # Only conversion failures fall back to categorical; anything else
                # (e.g. KeyboardInterrupt) is no longer swallowed.
                train_data[col] = train_data[col].astype('category')

        return train_data, train_labels

    def transform(test_data, preprocessor, quiet=False):
        """Apply the already-fitted steps of ``preprocessor`` to new data.

        Parameters
        ----------
        test_data : pandas.DataFrame of features to transform.
        preprocessor : object with a ``steps`` list of ``(name, step)`` pairs whose
            steps have been fitted beforehand; nothing is fitted here.
        quiet : accepted for symmetry with ``preprocess``; not used.

        Returns
        -------
        numpy.ndarray
            The transformed data. Each column is cast to float where possible and to
            ``category`` otherwise before conversion to an array.
        """
        for k, v in preprocessor.steps:
            if k in ('oversampling', 'undersampling'):
                # Resampling steps are skipped when transforming.
                continue
            test_data = v.transform(test_data)
            if k == 'initial':
                # Later steps receive a DataFrame with named columns.
                test_data = pd.DataFrame(test_data, columns=v.get_feature_names_out())
        test_data = pd.DataFrame(test_data)

        for col in test_data.columns:
            try:
                test_data[col] = test_data[col].astype('float')
            except (ValueError, TypeError):
                # Only conversion failures fall back to categorical.
                test_data[col] = test_data[col].astype('category')

        return test_data.to_numpy()

    # Specify the paths for model, results, and metrics.
    # model_path is the per-subset model directory built above from model_name.
    model_path = model_directory
    # NOTE(review): output_path and metrics_file are not referenced anywhere else in the
    # visible notebook, and metrics_file contains a doubled slash ('XGBoost//metrics').
    output_path = f'../XGBoost/results/{model_name}-validation.csv'
    metrics_file = f'../XGBoost//metrics/{model_name}-validation-metrics.csv'

    # This is used to drop columns that only contain NaN values.
    def drop_allnan(data):
        """Return a new DataFrame without the columns of ``data`` that are entirely NaN.

        The all-NaN columns are found with one ``isna()`` pass and removed with a
        single ``drop`` call, instead of copying the whole frame once per dropped
        column. ``data`` itself is never modified or returned.
        """
        all_nan_mask = data.isna().all(axis=0).to_numpy()
        return data.drop(columns=data.columns[all_nan_mask])

    # Drop all-NaN columns, then encode the IMPACT feature's column.
    X_train = drop_allnan(X_train)
    feature1_vals = {'LOW': 0, 'MODIFIER': 1, 'MODERATE': 1.5, 'HIGH': 2}

    for col in X_train.columns:
        if X_train[col].dtype == 'object':
            # Only object columns whose values are all keys of feature1_vals are mapped.
            # NOTE(review): unique() includes NaN, so a column with missing values fails
            # this subset test and is left unmapped, which makes the fillna(0) below
            # unreachable in practice -- confirm whether NaN should be ignored here.
            if set(X_train[col].unique()) <= set(feature1_vals.keys()):
                X_train[col] = X_train[col].map(feature1_vals).fillna(0)

    # Conditional imputation based on feature type.
    numeric_features = X_train.select_dtypes(include=['number']).columns
    categorical_features = X_train.select_dtypes(include=['object']).columns

    # Pre-processor generation with multiple parameters that can be tweaked. I used the same values as LoGoFunc
    # The settings are passed by keyword so each value is readable at the call site.
    preprocessor = generate_preprocessor(
        numeric_features, categorical_features,
        N_JOBS=40,
        cat_encode_type=0,
        do_specificimpute=1,
        do_featureselection=0,
        do_sampling=2,
        do_pca=0,
        var_thresh=0,
        oversample_technique=1,
        prefix='light0', do_feature_subset=True, max_features=1)

    if isinstance(y_train, list):
        y_train = pd.DataFrame({'label': y_train})

    X_train, y_train = preprocess(preprocessor, X_train, y_train)

    # Make sure the directory for the preprocessor and trained models exists
    # *before* anything is written into it.
    os.makedirs(model_path, exist_ok=True)
    joblib.dump(preprocessor, f'{model_path}/preprocessor.joblib')

    def train_model(params, X_train, y_train):
        """Fit and return one XGBClassifier built from ``params`` plus the fixed settings below."""
        fixed_settings = {'random_state': 42, 'use_label_encoder': False, 'eval_metric': 'mlogloss'}
        classifier = XGBClassifier(**fixed_settings, **params)
        classifier.fit(X_train, y_train)
        return classifier

    # Train the ensemble in parallel.
    models = Parallel(n_jobs=-1)(delayed(train_model)(params, X_train, y_train) for params in top_n_params)

    # Export models and params.
    for i, (model, params) in enumerate(zip(models, top_n_params)):
        joblib.dump(model, f'{model_path}/xgb_model_{i}.joblib')
        with open(f'{model_path}/xgb_params_{i}.json', 'w') as f:
            json.dump(params, f, indent=2)

    print(f"Training complete for {df_name}.")

print("Finished.")