## XGBoost

In [30]:
from multiprocessing import Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
import shap
import pandas as pd
import numpy as np
import warnings
import glob
import os
from concurrent.futures import ProcessPoolExecutor
from os import listdir 

np.int = np.int32
np.float = np.float64
np.bool = np.bool_

os.chdir ('/home/zuzannak/MB_transformation/')
os.environ['KMP_DUPLICATE_LIB_OK']='True'


def tune_model(X_train, y_train):
    
    # Initialize the XGBoost classifier
    xgb_model = XGBClassifier(device="cuda", tree_method='gpu_hist', gpu_id=0, n_jobs=1)
    
    # Set up the parameter grid
    param_grid = {
        'n_estimators': [50],# 100, 200],
        'max_depth': [3],# 5, 7],
        'learning_rate': [0.01],# 0.1, 0.3],
        'subsample': [0.8],# 1.0],
        'colsample_bytree': [0.8]}#, 1.0]
    #}
    
    # Define scoring using roc_auc_score
    scorer = make_scorer(roc_auc_score, needs_proba=True)
    strat_kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=xgb_model, 
                               param_grid=param_grid, 
                               scoring=scorer, 
                               cv=strat_kfold, 
                               n_jobs = 3,
                               verbose=1)
    
    # Perform grid search on your training data
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_model = grid_search.best_estimator_
    return best_model
    
def process_file(file):

    name = file.split('/')[-1]
    output_file_name = name.replace('.csv', '')

    data = pd.read_csv('/home/zuzannak/MB_transformation/data/' + file, index_col = [0]).dropna().iloc[:50]
    feature_names = data.drop(['target_var'], axis=1).columns

    X = data.drop(['target_var'], axis=1).values
    y = data['target_var'].values

    kf = StratifiedKFold(n_splits=2, random_state = 0, shuffle=True)

    classification_results = []
    shap_results = pd.DataFrame()
    for i, (train_index, test_index) in enumerate(kf.split(X, y)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        best_model = tune_model(X_train, y_train)
        ypred = best_model.predict(X_test)
        ypred_proba = best_model.predict_proba(X_test)[:, 1]

        precision = precision_score(y_test, ypred, zero_division=0)
        recall = recall_score(y_test, ypred, zero_division=0)
        auc = roc_auc_score(y_test, ypred_proba)

        classification_results.append({'fold_idx':i,
                                       'precision':precision,
                                       'recall':recall, 
                                       'auc':auc})

        # SHAP values anaysis
        explainer = shap.Explainer(best_model.predict, X_test)
        shap_values = explainer(X_test, max_evals=2 * X_test.shape[1] + 1)

        features_shap = np.abs(shap_values.values).mean(axis=0)
        features_shap_df = pd.DataFrame(list(zip(features_shap, feature_names)), columns = ['SHAP_value', 'feature_name'])
        features_shap_df['fold_idx'] = i
        shap_results = pd.concat([shap_results, features_shap_df])
    
    shap_results_df = pd.DataFrame(shap_results)
    shap_results_df['dataset'] = output_file_name
    classification_results_df = pd.DataFrame(classification_results)
    classification_results_df['dataset'] = output_file_name
    
    shap_results_df.to_csv(f'/home/zuzannak/MB_transformation/results/shap_values/{output_file_name}.csv')
    classification_results_df.to_csv(f'/home/zuzannak/MB_transformation/results/classification/{output_file_name}.csv')


from concurrent.futures import ProcessPoolExecutor
import os

# Define the number of workers (this can be the number of CPU cores you want to use)
num_workers = os.cpu_count()  # You can adjust this based on your system

def run_in_parallel():
    # Get the list of CSV files in the directory
    files = [f for f in listdir('/home/zuzannak/MB_transformation/data/') if f.endswith('.csv')][:2]

    # Create a pool of workers
    with Pool(processes=num_workers) as pool:
        # Map the process_file function to each file for parallel execution
        pool.map(process_file, files)

# Run the parallel processing (direct call, without the __name__ check)
run_in_parallel()

In [21]:
from concurrent.futures import ProcessPoolExecutor
import os

# Define the number of workers (this can be the number of CPU cores you want to use)
num_workers = os.cpu_count()  # You can adjust this based on your system

def run_in_parallel():
    # Get the list of CSV files in the directory
    files = [f for f in listdir('/home/zuzannak/MB_transformation/data/') if f.endswith('.csv')][:2]

    # Create a pool of workers
    with Pool(processes=num_workers) as pool:
        # Map the process_file function to each file for parallel execution
        pool.map(process_file, files)

# Run the parallel processing (direct call, without the __name__ check)
run_in_parallel()

Loky-backed parallel loops cannot be called in a multiprocessing, setting n_jobs=1


Fitting 2 folds for each of 1 candidates, totalling 2 fits

Loky-backed parallel loops cannot be called in a multiprocessing, setting n_jobs=1



Parameters: { "device" } are not used.

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Parameters: { "device" } are not used.



Process ForkPoolWorker-108:
Process ForkPoolWorker-101:
Process ForkPoolWorker-103:
Process ForkPoolWorker-104:
Traceback (most recent call last):
Process ForkPoolWorker-105:
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
Process ForkPoolWorker-95:
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 364, in get
    with self._rlock:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-102:
Process ForkPoolWorker-96:
Process ForkPoolWorker-98:
Process ForkPoolWorker-106:
Process ForkPoolWorker-107:
Process

KeyboardInterrupt: 

In [13]:
from multiprocessing.pool import ThreadPool
pool = ThreadPool(processes=10)
inputDir = listdir('/home/zuzannak/MB_transformation/data/')

res = p.map(process_file, inputDir)

KeyboardInterrupt: 