In [None]:
# evaluate xgboost algorithm for classification
from numpy import mean
from numpy import std
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
# Load the feature matrix and the labels. Both frames are sorted by IDno before
# the ID column is dropped, presumably so that row i of each refers to the same
# sample — verify that both files contain the same set of IDs.
data = pd.read_csv('x_matricer/x_matrix_pre_4729.csv', compression='gzip').sort_values(by='IDno',ignore_index=True).drop(columns='IDno')
y_pred = pd.read_csv('y_pred_4729.csv').sort_values(by='IDno',ignore_index=True).drop(columns='IDno')

# Convert to numpy arrays. The labels come from the CSV (nothing synthetic is
# generated here). ravel() turns the (n, 1) column produced by DataFrame.to_numpy()
# into the 1-D target vector that the splitter and classifier expect.
# assumes exactly one label column remains after dropping IDno — TODO confirm
X = data.to_numpy()
y = y_pred.to_numpy().ravel()
# define the model (library defaults)
model = XGBClassifier()
# evaluate the model: stratified 10-fold CV repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring = 'accuracy', cv=cv, n_jobs=-1)
# report performance as mean (standard deviation) over all folds and repeats
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:

# test regression dataset
from sklearn.datasets import make_regression
# Synthetic regression problem: 1000 samples, 20 features (15 informative),
# noise=0.1, fixed seed. Note that this rebinds the notebook-level X and y.
X, y = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    noise=0.1,
    random_state=7,
)

In [None]:
from xgboost import XGBRegressor
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
model = XGBRegressor()
# evaluate the model: 10-fold CV repeated 3 times, fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# error_score='raise' makes a failing fold raise instead of recording a placeholder score
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# The 'neg_mean_absolute_error' scorer returns the *negated* MAE (scikit-learn
# scorers follow "higher is better"), so flip the sign before reporting a value
# labelled "MAE". n_scores itself is left untouched.
mae_scores = -n_scores
# report performance as mean (standard deviation) over all folds and repeats
print('MAE: %.3f (%.3f)' % (mean(mae_scores), std(mae_scores)))

In [None]:
# explore xgboost number of trees effect on performance
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from matplotlib import pyplot



# get a list of models to evaluate
def get_models():
	"""Return a dict of XGBClassifier instances, one per ensemble size.

	Keys are the tree counts rendered as strings; each value is a classifier
	constructed with that n_estimators and otherwise default settings.
	"""
	tree_counts = (10, 50, 100, 500, 1000, 5000)
	return {str(count): XGBClassifier(n_estimators=count) for count in tree_counts}

# evaluate a given model using cross-validation
def evaluate_model(model):
	"""Cross-validate `model` on the notebook-level X and y and return the scores.

	Uses RepeatedStratifiedKFold (10 splits, 3 repeats, random_state=1) with
	'accuracy' scoring; the result of cross_val_score is returned unchanged.
	"""
	splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

# build the candidate models (one per tree count)
models = get_models()
# NOTE(review): evaluate_model reads the notebook-level X and y, so these scores
# depend on whichever cell assigned X and y last.
# score every candidate, keeping the per-fold scores and the matching labels
results = []
names = []
for name, model in models.items():
	scores = evaluate_model(model)
	names.append(name)
	results.append(scores)
	# mean and standard deviation over all folds and repeats
	summary = (name, mean(scores), std(scores))
	print('>%s %.3f (%.3f)' % summary)
# one box per candidate, with the mean marked
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
# explore xgboost tree depth effect on performance
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from matplotlib import pyplot


# get a list of models to evaluate
def get_models():
	"""Return a dict of XGBClassifier instances for max_depth 1 through 10.

	Keys are the depths rendered as strings; every other setting is the default.
	"""
	return {str(depth): XGBClassifier(max_depth=depth) for depth in range(1, 11)}

# evaluate a given model using cross-validation
def evaluate_model(model):
	"""Return the accuracy scores of `model` under repeated stratified CV.

	The data comes from the notebook-level X and y. Splitting is 10-fold,
	repeated 3 times, with random_state=1.
	"""
	folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	fold_scores = cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)
	return fold_scores


# build the candidate models (one per tree depth)
models = get_models()
# NOTE(review): evaluate_model reads the notebook-level X and y, so these scores
# depend on whichever cell assigned X and y last.
# score each candidate and remember its label for the plot
names = []
results = []
for name, model in models.items():
	scores = evaluate_model(model)
	results.append(scores)
	names.append(name)
	# mean and standard deviation over all folds and repeats
	report = (name, mean(scores), std(scores))
	print('>%s %.3f (%.3f)' % report)
# box plot of the score distributions, means marked
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
# explore xgboost learning rate effect on performance
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from matplotlib import pyplot

# get the dataset
def get_dataset():
	"""Return a synthetic classification problem as an (X, y) pair.

	Generated with make_classification: 1000 samples, 20 features of which
	15 are informative and 5 redundant, random_state=7.
	"""
	features, labels = make_classification(
		n_samples=1000,
		n_features=20,
		n_informative=15,
		n_redundant=5,
		random_state=7,
	)
	return features, labels

# get a list of models to evaluate
def get_models():
	"""Return a dict of XGBClassifier instances, one per learning rate.

	Keys are the rates formatted with five decimals, so every rate in the list
	gets a distinct, truthful label (four decimals rendered 0.00001 as
	'0.0000'). Each value is a classifier built with that learning rate and
	otherwise default settings.
	"""
	models = dict()
	rates = [0.00001, 0.001, 0.01, 0.1, 1.0]
	for r in rates:
		key = '%.5f' % r
		# learning_rate is the scikit-learn wrapper's named parameter for
		# xgboost's eta; using it avoids routing the value through **kwargs.
		models[key] = XGBClassifier(learning_rate=r)
	return models

# evaluate a given model using cross-validation
def evaluate_model(model):
	"""Run repeated stratified cross-validation for `model` and return its accuracy scores.

	Reads the notebook-level X and y. The splitter uses 10 splits, 3 repeats
	and random_state=1.
	"""
	return cross_val_score(
		model,
		X,
		y,
		scoring='accuracy',
		cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1),
		n_jobs=-1,
	)


# build the candidate models (one per learning rate)
models = get_models()
# NOTE(review): get_dataset() is defined in this cell but never called;
# evaluate_model uses whatever the notebook-level X and y currently hold.
# score each candidate, collecting scores and labels side by side
results = []
names = []
for name, model in models.items():
	scores = evaluate_model(model)
	names.append(name)
	results.append(scores)
	# mean and standard deviation over all folds and repeats
	line = '>%s %.3f (%.3f)' % (name, mean(scores), std(scores))
	print(line)
# compare the score distributions, means marked
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
# explore xgboost subsample ratio effect on performance
from numpy import arange
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBClassifier
from matplotlib import pyplot

# get the dataset
def get_dataset():
	"""Build the synthetic classification data and return it as (X, y).

	make_classification is called with 1000 samples and 20 features
	(15 informative, 5 redundant) under random_state=7.
	"""
	settings = dict(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)
	X, y = make_classification(**settings)
	return X, y

# get a list of models to evaluate
def get_models():
	"""Return a dict of XGBClassifier instances, one per subsample ratio.

	Ratios come from arange(0.1, 1.1, 0.1); keys are the ratios formatted to
	one decimal, and each classifier is otherwise default.
	"""
	ratios = arange(0.1, 1.1, 0.1)
	return {'%.1f' % ratio: XGBClassifier(subsample=ratio) for ratio in ratios}

# evaluate a given model using cross-validation
def evaluate_model(model):
	"""Return per-fold accuracy for `model` on the notebook-level X and y.

	Cross-validation is stratified 10-fold, repeated 3 times, seeded with
	random_state=1.
	"""
	cv_options = dict(n_splits=10, n_repeats=3, random_state=1)
	cv_plan = RepeatedStratifiedKFold(**cv_options)
	return cross_val_score(model, X, y, scoring='accuracy', cv=cv_plan, n_jobs=-1)



# build the candidate models (one per subsample ratio)
models = get_models()
# NOTE(review): get_dataset() is defined in this cell but never called;
# evaluate_model uses whatever the notebook-level X and y currently hold.
# score each candidate, collecting scores and labels side by side
names = []
results = []
for name, model in models.items():
	# announce the candidate before its evaluation starts
	print(name)
	scores = evaluate_model(model)
	results.append(scores)
	names.append(name)
	# mean and standard deviation over all folds and repeats
	stats = (name, mean(scores), std(scores))
	print('>%s %.3f (%.3f)' % stats)
# compare the score distributions, means marked
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score

# Load the feature matrix and the labels. Both frames are sorted by IDno before
# the ID column is dropped, presumably so that row i of each refers to the same
# sample — verify that both files contain the same set of IDs.
data = pd.read_csv('x_matricer/x_matrix_pre_4729.csv', compression='gzip').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
y_pred = pd.read_csv('y_pred_4729.csv').sort_values(by='IDno', ignore_index=True).drop(columns='IDno')
X_raw = data.to_numpy()
y = y_pred.to_numpy().flatten()  # assumes a single label column remains — TODO confirm

# Split row indices into a stratified 500-row holdout set and the training rows.
# This happens BEFORE scaling so the holdout rows cannot influence the scaler.
train_indices, holdout_indices = train_test_split(np.arange(len(X_raw)), test_size=500, random_state=42, stratify=y)

# Standardize with mean/variance estimated on the training rows only, then apply
# that transform to every row. X remains the full scaled matrix, so later cells
# that repeat the same split (same size, seed and stratification) on X get
# leakage-free training and holdout sets as well.
scaler = StandardScaler().fit(X_raw[train_indices])
X = scaler.transform(X_raw)
X_train, X_holdout = X[train_indices], X[holdout_indices]
y_train, y_holdout = y[train_indices], y[holdout_indices]

# Create a DMatrix for XGBoost
# NOTE(review): dtrain/dholdout are not used anywhere below; the scikit-learn
# wrapper is fit on the numpy arrays directly.
dtrain = xgb.DMatrix(X_train, label=y_train)
dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

# Define a simple parameter grid for hyperparameter tuning.
# scale_pos_weight is the negative/positive ratio measured on the training
# labels (instead of a hard-coded 1/0.02), so it follows the data.
# assumes labels are encoded as 0 (negative) and 1 (positive) — TODO confirm
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'scale_pos_weight': [np.sum(y_train == 0) / np.sum(y_train == 1)]  # Adjust for class imbalance
}

# Initialize a XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False)

# Perform Grid Search with Cross-Validation (stratified 5-fold, shuffled, fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.5f}')

# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate the final model on the holdout set
y_holdout_pred = best_model.predict(X_holdout)
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')


In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix

# Split row indices into a stratified 500-row holdout set and the training rows.
# X and y are the notebook-level arrays prepared by an earlier cell.
train_indices, holdout_indices = train_test_split(np.arange(len(X)), test_size=500, random_state=42, stratify=y)
X_train, X_holdout = X[train_indices], X[holdout_indices]
y_train, y_holdout = y[train_indices], y[holdout_indices]

# Check class distribution (counts per integer label value)
print("Class distribution in training set:", np.bincount(y_train.astype(int)))
print("Class distribution in holdout set:", np.bincount(y_holdout.astype(int)))

# Create a DMatrix for XGBoost
# NOTE(review): dtrain/dholdout are not used anywhere below; the scikit-learn
# wrapper is fit on the numpy arrays directly.
dtrain = xgb.DMatrix(X_train, label=y_train)
dholdout = xgb.DMatrix(X_holdout, label=y_holdout)

# Define a simple parameter grid for hyperparameter tuning.
# scale_pos_weight is the ratio of 0-labels to 1-labels in the training set.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'scale_pos_weight': [np.sum(y_train == 0) / np.sum(y_train == 1)]  # Adjust for class imbalance
}

# Initialize a XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

# Perform Grid Search with Cross-Validation (stratified 5-fold, shuffled, fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=cv, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters found
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.5f}')

# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so best_estimator_ is already trained; fitting it a
# second time would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate the final model on the holdout set
y_holdout_pred = best_model.predict(X_holdout)
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, roc_auc_score


# Split row indices into a stratified 500-row holdout set and the training rows.
# X and y are the notebook-level arrays prepared by an earlier cell.
train_indices, holdout_indices = train_test_split(np.arange(len(X)), test_size=500, random_state=42, stratify=y)
X_train, X_holdout = X[train_indices], X[holdout_indices]
y_train, y_holdout = y[train_indices], y[holdout_indices]

# Check class distribution (counts per integer label value)
print("Class distribution in training set:", np.bincount(y_train.astype(int)))
print("Class distribution in holdout set:", np.bincount(y_holdout.astype(int)))

# Initialize a simple XGBoost model; scale_pos_weight is the ratio of 0-labels
# to 1-labels in the training set.
xgb_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss', scale_pos_weight=np.sum(y_train == 0) / np.sum(y_train == 1))

# Perform Cross-Validation (stratified 5-fold, shuffled, fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring='accuracy')
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation accuracy: {cv_scores.mean():.5f}')

# Train the final model on the entire training set
xgb_model.fit(X_train, y_train)

# Evaluate the final model on the holdout set
y_holdout_pred = xgb_model.predict(X_holdout)
# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class (second column of predict_proba) rather than thresholded 0/1
# labels, which would collapse the curve to a single operating point.
y_holdout_proba = xgb_model.predict_proba(X_holdout)[:, 1]
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)
roc_auc = roc_auc_score(y_holdout, y_holdout_proba)

print(f'Final Model Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Holdout Set Precision: {precision * 100:.2f}%')
print(f'Holdout Set Recall: {recall * 100:.2f}%')
print(f'Holdout Set ROC AUC: {roc_auc * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

# Fit a fresh classifier on the training split so its feature importances can
# be inspected. The positive-class weight is the ratio of 0-labels to 1-labels.
pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=pos_weight,
)
xgb_model.fit(X_train, y_train)

# Plot importance for at most 20 features
xgb.plot_importance(xgb_model, max_num_features=20)
plt.show()


In [None]:
# Inspect random samples from the training split.
# A seeded generator keeps the displayed rows the same on every run; the
# module-level np.random.choice would pick different rows each time.
rng = np.random.default_rng(42)
sample_indices = rng.choice(train_indices, size=5, replace=False)
# train_indices are row positions, so positional .iloc lookups select the
# matching feature rows and label rows.
print(data.iloc[sample_indices])
print(y_pred.iloc[sample_indices])


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix

# Train logistic regression; class_weight='balanced' reweights the classes and
# max_iter=1000 raises the solver's iteration limit.
logreg = LogisticRegression(class_weight='balanced', max_iter=1000)
logreg.fit(X_train, y_train)

# Evaluate on holdout set
y_holdout_pred = logreg.predict(X_holdout)
# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class (second column of predict_proba) rather than thresholded 0/1
# labels, which would collapse the curve to a single operating point.
y_holdout_proba = logreg.predict_proba(X_holdout)[:, 1]
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)
roc_auc = roc_auc_score(y_holdout, y_holdout_proba)

print(f'Logistic Regression Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Logistic Regression Precision: {precision * 100:.2f}%')
print(f'Logistic Regression Recall: {recall * 100:.2f}%')
print(f'Logistic Regression ROC AUC: {roc_auc * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest: 100 trees, balanced class weights, fixed seed
rf = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on holdout set
y_holdout_pred = rf.predict(X_holdout)
# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class (second column of predict_proba) rather than thresholded 0/1
# labels, which would collapse the curve to a single operating point.
y_holdout_proba = rf.predict_proba(X_holdout)[:, 1]
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)
roc_auc = roc_auc_score(y_holdout, y_holdout_proba)

print(f'Random Forest Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'Random Forest Precision: {precision * 100:.2f}%')
print(f'Random Forest Recall: {recall * 100:.2f}%')
print(f'Random Forest ROC AUC: {roc_auc * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data only; the holdout set is left untouched so
# the evaluation reflects the original class distribution.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Train XGBoost on augmented data.
# A separate classifier is built WITHOUT scale_pos_weight: the resampled
# training set has already been rebalanced by SMOTE, so reusing the earlier
# xgb_model (which carries the original negative/positive ratio as
# scale_pos_weight) would correct for the imbalance twice. Using a new object
# also leaves the previously fitted xgb_model intact.
xgb_smote_model = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')
xgb_smote_model.fit(X_train_res, y_train_res)

# Evaluate on holdout set
y_holdout_pred = xgb_smote_model.predict(X_holdout)
# ROC AUC ranks samples by score, so it needs the predicted probability of the
# positive class (second column of predict_proba) rather than thresholded 0/1
# labels, which would collapse the curve to a single operating point.
y_holdout_proba = xgb_smote_model.predict_proba(X_holdout)[:, 1]
accuracy = accuracy_score(y_holdout, y_holdout_pred)
precision = precision_score(y_holdout, y_holdout_pred)
recall = recall_score(y_holdout, y_holdout_pred)
roc_auc = roc_auc_score(y_holdout, y_holdout_proba)

print(f'SMOTE XGBoost Accuracy on Holdout Set: {accuracy * 100:.2f}%')
print(f'SMOTE XGBoost Precision: {precision * 100:.2f}%')
print(f'SMOTE XGBoost Recall: {recall * 100:.2f}%')
print(f'SMOTE XGBoost ROC AUC: {roc_auc * 100:.2f}%')

# Confusion Matrix
cm = confusion_matrix(y_holdout, y_holdout_pred)
print("Confusion Matrix:")
print(cm)
