In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from xgboost import XGBClassifier

data = pd.read_csv('pseudodata_præoperation.csv')
y = np.random.choice([0,1],size = len(data))

X = data.to_numpy()

In [None]:
import numpy as np
from skopt import BayesSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from xgboost import XGBClassifier
from skopt.space import Real, Integer

# # Generate synthetic data
# X, y = make_classification(n_samples=6000, n_features=20, random_state=42)

# Split data into training and hold-out sets
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=1/6, random_state=42)

# Calculate scale_pos_weight based on training data
num_negative = np.sum(y_train == 0)
num_positive = np.sum(y_train == 1)
scale_pos_weight = num_negative / num_positive

# Initialize XGBoost classifier
model = XGBClassifier()

# Define the hyperparameter space
param_space = {
    'n_estimators': Integer(10, 5000),
    'max_depth': Integer(1, 10),
    'learning_rate': Real(0.001, 0.01, prior='log-uniform'),
    'subsample': Real(0.6, 1.0),
    'colsample_bytree': Real(0.6, 1.0),
    'gamma': Real(0, 5),
    'scale_pos_weight': Real(scale_pos_weight*0.25, scale_pos_weight*2)
}

# Define the scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Initialize BayesSearchCV
opt = BayesSearchCV(
    estimator=model,
    search_spaces=param_space,
    scoring=scoring,
    n_iter=50,  # Number of iterations for Bayesian optimization
    cv=5,  # Number of cross-validation folds
    n_jobs=-1,
    return_train_score=True,
    refit='roc_auc',  # Refit the best model on the entire training set using ROC AUC
    random_state=42
)

# Perform Bayesian optimization
opt.fit(X_train, y_train)

# Evaluate on hold-out set
y_pred_holdout = opt.predict(X_holdout)
print("Hold-Out Set Metrics:")
for metric in scoring:
    if metric in ['accuracy', 'precision', 'recall', 'f1']:
        score = globals()[f"{metric}_score"](y_holdout, y_pred_holdout)
    elif metric == 'roc_auc':
        score = roc_auc_score(y_holdout, opt.predict_proba(X_holdout)[:, 1])
    print(f"{metric.capitalize()}: {score}")

# Print the best parameters and the best score
print("Best parameters:", opt.best_params_)
print("Best ROC AUC:", opt.best_score_)


In [4]:
Real(scale_pos_weight*0.25, scale_pos_weight*2)

Real(low=0.25, high=2.0, prior='uniform', transform='identity')

In [7]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Load and preprocess data
# data = pd.read_csv('x_matricer/x_matrix_pre_4729.csv', compression='gzip').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
# y_pred = pd.read_csv('y_pred_4729.csv').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
# X = data.to_numpy()
# y = y_pred.to_numpy().flatten()

# # Standardize the data
# X = StandardScaler().fit_transform(X)

data = pd.read_csv('pseudodata_præoperation.csv')
y = np.random.choice([0,1],size = len(data))

X = data.to_numpy()

# Split data into holdout set and remaining set
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=1/6, random_state=42)

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

scale_pos_weight = num_negative / num_positive

# Define a simple parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.0001, 0.001, 0.01, 0.3],
    'n_estimators': [10, 50, 100, 500, 1000, 5000],
    'scale_pos_weight': [0.1*num_negative,num_negative,num_negative*1.5,num_negative*2]  # Adjust for class imbalance
}

# Initialize a XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Perform Grid Search with Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='f1', cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.5f}')

# Train the final model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate the final model on the holdout set
y_holdout_pred = best_model.predict(X_holdout)
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Best parameters found: {'learning_rate': 0.3, 'max_depth': 5, 'n_estimators': 1000, 'scale_pos_weight': 1327}
Best cross-validation accuracy: 0.50113
Final Model Accuracy on Holdout Set: 50.47%
Holdout Set Precision: 48.66%
Holdout Set Recall: 79.37%


## Prøv den her
Start med færre parametre

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score


# Load and preprocess data
# data = pd.read_csv('x_matricer/x_matrix_pre_4729.csv', compression='gzip').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
# y_pred = pd.read_csv('y_pred_4729.csv').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
# X = data.to_numpy()
# y = y_pred.to_numpy().flatten()

# # Standardize the data
# X = StandardScaler().fit_transform(X)

data1 = pd.read_csv('pseudodata_præoperation.csv')
data2 = pd.read_csv('pseudodata_præoperation.csv')
data3 = pd.read_csv('pseudodata_præoperation.csv')
data4 = pd.read_csv('pseudodata_præoperation.csv')
data5 = pd.read_csv('pseudodata_præoperation.csv')
data6 = pd.read_csv('pseudodata_præoperation.csv')

base = data1
phase1 = pd.concat([data1, data2], axis = 1)
phase2 = pd.concat([data1, data2, data3], axis = 1)
phase3 = pd.concat([data1, data2, data3, data4], axis = 1)
phase4 = pd.concat([data1, data2, data3, data4, data5], axis = 1)
phase5 = pd.concat([data1, data2, data3, data4, data5, data6], axis = 1)
y_pred = 
data_list = [base, phase1, phase2, phase3, phase4, phase5]

preds_log = []
models_log = []
holdout_log = []
true_log = []

for i in range(len(data_list)):
    print(i)
    data = data_list[i]

    y = np.random.choice([0,1],size = len(data))

    X = data.to_numpy()

    X = StandardScaler().fit_transform(X)

    # Split data into holdout set and remaining set
    X_train, X_holdout, y_train, y_holdout = train_test_split(X,y, test_size=500, random_state=42, stratify=y)

    holdout_set = Subset(dataset, holdout_indices)
    remaining_set = Subset(dataset, train_indices)

    num_negative = np.sum(y == 0)
    num_positive = np.sum(y == 1)
    scale_pos_weight = num_negative / num_positive

    # Create a DMatrix for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

    # Define a simple parameter grid for hyperparameter tuning
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.0001, 0.001, 0.01, 0.3],
        'n_estimators': [10, 50, 100, 500, 1000, 5000],
        'scale_pos_weight': [0.1*num_negative,num_negative,num_negative*1.5,num_negative*2],  # Adjust for class imbalance, 
    }

    # Initialize a XGBoost model
    xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, early_stopping_rounds = 10)

    scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    # Perform Grid Search with Cross-Validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='roc_auc', cv=cv, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Print the best parameters found
    print(f'Best parameters found: {grid_search.best_params_}')
    print(f'Best cross-validation accuracy: {grid_search.best_score_:.5f}')

    # Train the final model with the best parameters
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    # Evaluate the final model on the holdout set
    y_holdout_pred = best_model.predict(X_holdout)
    accuracy = accuracy_score(y_holdout, y_holdout_pred)
    precision = precision_score(y_holdout, y_holdout_pred)
    recall = recall_score(y_holdout, y_holdout_pred)
    f1 = f1_score(y_holdout, y_holdout_pred)

    models_log.append(best_model)
    #holdout_log.append(holdout_loader)
    true_log.append(y_holdout)
    preds_log.append(y_holdout_pred)

    print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
    print(f'Holdout Set Precision: {precision * 100:.2f}%')
    print(f'Holdout Set Recall: {recall * 100:.2f}%')
    print(f'Holdout Set F1 Score: {f1 * 100:.2f}%')

    with open('XGB_output.txt', 'a') as file:
        print(f"Phase {i + 1}", file=file)
        print(f'Best parameters: {grid_search.best_params_}', file=file)
        print(f'Final Model F1 Score on Holdout Set: {f1:.4f}', file=file)
        print(f'Holdout Set Accuracy: {accuracy * 100:.2f}%', file=file)
        print(f'Holdout Set Precision: {precision * 100:.2f}%', file=file)
        print(f'Holdout Set Recall: {recall * 100:.2f}%', file=file)
        print('\n', file=file)
        print('Output written to output.txt')


In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

# Split data into holdout set and remaining set
train_indices, holdout_indices = train_test_split(np.arange(len(X)), test_size=500, random_state=42, stratify=y)
X_train, X_holdout = X[train_indices], X[holdout_indices]
y_train, y_holdout = y[train_indices], y[holdout_indices]

# Check class distribution
print("Class distribution in training set:", np.bincount(y_train.astype(int)))
print("Class distribution in holdout set:", np.bincount(y_holdout.astype(int)))

# Create a DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

# Define a simple parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'scale_pos_weight': [np.sum(y_train == 0) / np.sum(y_train == 1)]  # Adjust for class imbalance
}

# Initialize a XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

# Perform Grid Search with Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.5f}')

# Train the final model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Evaluate the final model on the holdout set
y_holdout_pred = best_model.predict(X_holdout)
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score


# Split data into holdout set and remaining set
train_indices, holdout_indices = train_test_split(np.arange(len(X)), test_size=500, random_state=42, stratify=y)
X_train, X_holdout = X[train_indices], X[holdout_indices]
y_train, y_holdout = y[train_indices], y[holdout_indices]

# Check class distribution
print("Class distribution in training set:", np.bincount(y_train.astype(int)))
print("Class distribution in holdout set:", np.bincount(y_holdout.astype(int)))

# Initialize a simple XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', scale_pos_weight=np.sum(y_train == 0) / np.sum(y_train == 1))

# Perform Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation accuracy: {cv_scores.mean():.5f}')

# Train the final model on the entire training set
xgb_model.fit(X_train, y_train)

# Evaluate the final model on the holdout set
y_holdout_pred = xgb_model.predict(X_holdout)
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)
roc_auc = roc_auc_score(y_holdout, y_holdout_pred)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')
print(f'Holdout Set ROC AUC: {roc_auc * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)
