<a href="https://colab.research.google.com/github/vibhuverma17/COACH/blob/main/COACH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used below (xgboost for the model, hyperopt for tuning).
!pip install xgboost hyperopt
# Upgrade scipy — presumably so that `scipy.stats.binomtest`, imported in the next cell, is available.
!pip install --upgrade scipy

In [None]:
import pandas as pd
import numpy as np
import random
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, confusion_matrix, classification_report
from scipy.stats import binomtest

import ipywidgets as widgets
from IPython.display import display

import itertools

In [None]:
# Seed BOTH random number generators for reproducibility.
# The categorical columns below are drawn with the standard-library `random`
# module, which np.random.seed() does not affect, so it needs its own seed.
np.random.seed(42)
random.seed(42)

# Define constants
num_rows = 15000
# 'MS' (month start) yields the same '%Y-%m' labels as month-end stamps and is
# accepted by every pandas version, unlike the deprecated 'M' offset alias.
months = pd.date_range(start="2024-01-01", periods=12, freq='MS').strftime('%Y-%m').tolist()
age_groups = ['18-24', '25-34', '35-44', '45-54', '55+']
genders = ['Male', 'Female', 'Non-binary']
education_levels = ['High School', 'Associate\'s', 'Bachelor\'s', 'Master\'s', 'Doctorate']
locations = ['Urban', 'Suburban', 'Rural']
income_ranges = ['$20,000 - $30,000', '$30,000 - $50,000', '$50,000 - $70,000',
                '$70,000 - $90,000', '$90,000 - $110,000']

# Generate data: every column is drawn independently and uniformly at random,
# so the target ("Happiness_Score") carries no real signal from the features.
data = {
    "Survey_ID": range(1, num_rows + 1),
    "Survey_Month": [random.choice(months) for _ in range(num_rows)],
    "Age_Group": [random.choice(age_groups) for _ in range(num_rows)],
    "Gender": [random.choice(genders) for _ in range(num_rows)],
    "Education_Level": [random.choice(education_levels) for _ in range(num_rows)],
    "Location": [random.choice(locations) for _ in range(num_rows)],
    "Income_Range": [random.choice(income_ranges) for _ in range(num_rows)],
    "Happiness_Score": np.random.choice([0, 1], size=num_rows),  # Binary 0 or 1
    "Work_Satisfaction": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Social_Interactions": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Physical_Health": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Mental_Health": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Major_Stressors": [random.choice(['Job', 'Finances', 'Health', 'Relationships', 'Family', 'Workload']) for _ in range(num_rows)],
}

# Create DataFrame
survey_df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
survey_df.head()

# Save to CSV if needed
# survey_df.to_csv("happiness_survey_data.csv", index=False)


In [None]:
# Define the preprocess function as a pipeline
def create_preprocessing_pipeline():
    """Build the (unfitted) feature-encoding pipeline.

    The six categorical survey columns are one-hot encoded with the first
    level of each column dropped; all remaining input columns are passed
    through unchanged.

    Returns:
        A scikit-learn ``Pipeline`` with a single step named ``'preprocessor'``
        that wraps the ``ColumnTransformer``.
    """
    categorical_columns = [
        'Age_Group', 'Gender', 'Education_Level',
        'Location', 'Income_Range', 'Major_Stressors',
    ]
    encoding_step = ('cat', OneHotEncoder(drop='first'), categorical_columns)
    column_encoder = ColumnTransformer(
        transformers=[encoding_step],
        remainder='passthrough',  # non-categorical columns are kept as they are
    )
    return Pipeline(steps=[('preprocessor', column_encoder)])

def preprocess_data(data):
    """Return a chronologically sorted copy of ``data`` with date parts added.

    ``Survey_Month`` is parsed to datetime and two derived columns, ``Year``
    and ``Month``, are added. The input frame is NOT modified: the previous
    implementation assigned the new columns onto the caller's DataFrame as a
    side effect, which silently changed ``survey_df`` for every later cell.

    Args:
        data: DataFrame with a ``Survey_Month`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame sorted by ``Year`` then ``Month``; the original index
        labels are preserved (not reset).
    """
    survey_dates = pd.to_datetime(data['Survey_Month'])

    # assign() works on a copy, so the caller's frame stays untouched.
    enriched = data.assign(
        Survey_Month=survey_dates,
        Year=survey_dates.dt.year,
        Month=survey_dates.dt.month,
    )

    # Sort by year and month to ensure correct chronological order
    return enriched.sort_values(by=['Year', 'Month'])

In [None]:
# Chronologically sorted survey data with Year/Month columns added.
preprocessed_df = preprocess_data(survey_df)

# Target, and the model inputs (identifier, target and raw date are not features).
y_full = preprocessed_df['Happiness_Score']
X_full = preprocessed_df.drop(columns=['Survey_ID', 'Happiness_Score', 'Survey_Month'])

# Encode the features; the encoder is fitted on ALL rows (train and test months alike).
pipeline = create_preprocessing_pipeline()
X_transformed = pipeline.fit_transform(X_full)

# The most recent survey month is held out as the test period.
latest_month = preprocessed_df['Survey_Month'].max()

# Boolean row masks in the same row order as preprocessed_df / X_transformed.
train_indices = preprocessed_df['Survey_Month'] < latest_month
test_indices = preprocessed_df['Survey_Month'] == latest_month

# Everything before the latest month trains the model ...
X_train, y_train = X_transformed[train_indices], y_full[train_indices]

# ... and the latest month itself is the evaluation set.
X_test, y_test = X_transformed[test_indices], y_full[test_indices]

In [None]:
class XGBoostHyperparameterTuner:
    """Tune, train and apply a binary-logistic XGBoost booster.

    Hyperparameters are searched with hyperopt's TPE algorithm; each candidate
    is scored by its mean 5-fold cross-validated log loss on the training data.
    """

    def __init__(self, X, y):
        """Store the training data.

        Args:
            X: feature matrix; must support positional row selection with an
                integer array (``X[idx]``), as a NumPy array does.
            y: labels; must support ``.iloc`` (e.g. a pandas Series). Expected
                to hold 0/1 values for the ``binary:logistic`` objective.
        """
        self.X_train = X
        self.y_train = y
        self.model = None        # fitted booster, set by train_final_model()
        self.best_params = None  # result of fmin(), set by tune_hyperparameters()

    @staticmethod
    def _binary_logloss(y_true, y_prob, eps=1e-15):
        """Return the mean binary cross-entropy of ``y_prob`` against ``y_true``.

        Probabilities are clipped to ``[eps, 1 - eps]`` so that a confident
        wrong prediction yields a large finite loss instead of ``inf``.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_prob = np.clip(np.asarray(y_prob, dtype=float), eps, 1.0 - eps)
        return float(-np.mean(y_true * np.log(y_prob) + (1.0 - y_true) * np.log(1.0 - y_prob)))

    @staticmethod
    def _split_params(params, default_rounds=100):
        """Split a sampled parameter dict into (booster params, boosting rounds).

        ``n_estimators`` is not a booster parameter of ``xgb.train`` (it would
        be ignored there), so it is removed and returned as the number of
        boosting rounds. ``max_depth`` is coerced to a plain ``int``. The input
        dict is not modified.
        """
        booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}
        if 'max_depth' in booster_params:
            booster_params['max_depth'] = int(booster_params['max_depth'])
        num_rounds = int(params.get('n_estimators', default_rounds))
        return booster_params, num_rounds

    def objective(self, params):
        """Hyperopt objective: mean 5-fold validation log loss for ``params``.

        Args:
            params: one sample from the search space built in
                ``tune_hyperparameters``.

        Returns:
            dict with ``'loss'`` (lower is better) and ``'status'``.
        """
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        booster_params, num_rounds = self._split_params(params)
        loglosses = []

        for train_index, val_index in kf.split(self.X_train):
            # Features are indexed positionally as an array; labels go through
            # .iloc so a non-default pandas index cannot misalign them.
            X_kf_train, X_kf_val = self.X_train[train_index], self.X_train[val_index]
            y_kf_train, y_kf_val = self.y_train.iloc[train_index], self.y_train.iloc[val_index]

            dtrain = xgb.DMatrix(X_kf_train, label=y_kf_train)
            dval = xgb.DMatrix(X_kf_val, label=y_kf_val)

            # Train with early stopping; the sampled n_estimators caps the rounds.
            model = xgb.train(booster_params, dtrain, num_boost_round=num_rounds,
                              evals=[(dval, 'eval')],
                              early_stopping_rounds=10,
                              verbose_eval=False)

            # Score the fold with a real log loss. The previous expression,
            # -mean(log((pred - y)**2)), gets SMALLER as predictions get worse,
            # so minimising it selected the worst hyperparameters.
            preds = model.predict(dval)
            loglosses.append(self._binary_logloss(y_kf_val, preds))

        # Return mean logloss across all folds
        return {'loss': float(np.mean(loglosses)), 'status': STATUS_OK}

    def tune_hyperparameters(self, max_evals=50):
        """Run the TPE search and remember the best parameters.

        Args:
            max_evals: number of hyperopt evaluations.

        Returns:
            The dictionary returned by ``hyperopt.fmin`` (also stored in
            ``self.best_params``).
        """
        space = {
            'max_depth': hp.randint('max_depth', 3, 10),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'n_estimators': hp.randint('n_estimators', 50, 200),
            'gamma': hp.uniform('gamma', 0, 5),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss'
        }

        trials = Trials()
        best = fmin(fn=self.objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
        self.best_params = best
        return best

    def train_final_model(self):
        """Fit the final booster on all training data with the tuned parameters.

        Raises:
            RuntimeError: if ``tune_hyperparameters`` has not been run yet.
        """
        if self.best_params is None:
            raise RuntimeError("Hyperparameters have not been tuned yet!")
        booster_params, num_rounds = self._split_params(
            {**self.best_params, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
        )
        dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        # The tuned n_estimators is the number of boosting rounds (100 if absent).
        self.model = xgb.train(booster_params, dtrain, num_boost_round=num_rounds)

    def predict(self, X):
        """Return the fitted booster's predictions for the rows of ``X``.

        Raises:
            RuntimeError: if ``train_final_model`` has not been run yet.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained yet!")
        dval = xgb.DMatrix(X)
        return self.model.predict(dval)

In [None]:
# Initialize the tuner on the training months and run 50 hyperopt evaluations
tuner = XGBoostHyperparameterTuner(X_train, y_train)
best_params = tuner.tune_hyperparameters(max_evals=50)
print("Best hyperparameters:", best_params)

# Train the final model with the best hyperparameters
tuner.train_final_model()

# Access the trained model
model = tuner.model

# Make predictions on the TRAINING set (in-sample scores, not a validation set)
predictions = tuner.predict(X_train)

In [None]:
def find_best_cutoff(y_true, y_prob):
    """Pick the ROC threshold that maximises Youden's J statistic.

    Args:
        y_true: true binary labels.
        y_prob: predicted scores/probabilities for the positive class.

    Returns:
        The entry of ``roc_curve``'s thresholds at which ``tpr - fpr`` is
        largest (the first such entry on ties).
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y_true, y_prob)

    # Youden's J = sensitivity + specificity - 1 = TPR - FPR
    youden_j = true_pos_rate - false_pos_rate

    return cutoffs[np.argmax(youden_j)]

# Get the predicted probabilities on the training set
train_predictions_prob = tuner.predict(X_train)

# Find the best threshold using Youden's J statistic.
# NOTE: the threshold is chosen and evaluated on the same (training) data, so
# the report below is an in-sample, optimistic view of the classifier.
best_threshold = find_best_cutoff(y_train, train_predictions_prob)

# Convert probabilities to class labels using the best threshold.
# roc_curve defines each threshold's TPR/FPR with "score >= threshold", so the
# comparison must be inclusive to reproduce the operating point that was selected.
train_predictions = (train_predictions_prob >= best_threshold).astype(int)

# Print the best threshold
print(f"Best Threshold (using Youden's J statistic): {best_threshold}")

# Get the confusion matrix
conf_matrix = confusion_matrix(y_train, train_predictions)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Get the classification report (includes precision, recall, f1-score)
class_report = classification_report(y_train, train_predictions)

# Print the classification report
print("\nClassification Report:")
print(class_report)

In [None]:
# Global variable to store the filtered indices
filtered_indices = []

# Step 1: Identify the non-numeric columns of the latest month usable as filters.
# The ID column is named 'Survey_ID' (the old check looked for 'Survey ID').
# 'Survey_Month' is a datetime column, so select_dtypes reports it as non-numeric,
# but it has a single value inside the latest month and is useless as a filter.
_latest_rows = preprocessed_df[preprocessed_df['Survey_Month'] == latest_month]
non_numeric_cols = [
    col for col in _latest_rows.select_dtypes(exclude=['number']).columns
    if col not in ('Survey_ID', 'Survey_Month')
]

# Step 2: Create one dropdown per filter column; 'All' means "do not filter on this column"
dropdowns = {}
for col in non_numeric_cols:
    unique_values = _latest_rows[col].unique().tolist()
    dropdowns[col] = widgets.Dropdown(
        options=['All'] + unique_values,
        description=col,
        style={'description_width': 'initial'}
    )

def apply_filters(*args):
    """Button callback: store the index labels of latest-month rows matching the dropdowns.

    The result is written to the module-level ``filtered_indices`` list, which
    the following cells read. ``*args`` receives the button instance passed by
    ipywidgets and is ignored.
    """
    global filtered_indices
    filtered_df = preprocessed_df[preprocessed_df['Survey_Month'] == latest_month]

    # Apply every dropdown that is not left on 'All'
    for col, dropdown in dropdowns.items():
        if dropdown.value != 'All':
            filtered_df = filtered_df[filtered_df[col] == dropdown.value]

    # Step 3: Get index values of filtered rows
    filtered_indices = filtered_df.index.tolist()

    print("Filtered row indices Done")

# Step 4: Create an "Apply Filters" button
apply_button = widgets.Button(description="Apply Filters")
apply_button.on_click(apply_filters)

# Display dropdowns and button
display(widgets.VBox(list(dropdowns.values()) + [apply_button]))


In [None]:
# Latest-month rows that match the widget selection (mask over all of preprocessed_df).
selection_mask = (
    (preprocessed_df['Survey_Month'] == latest_month) &
    (preprocessed_df.index.isin(filtered_indices))
)
selected_outcomes = y_test[selection_mask]

# Number of trials: how many respondents are in the selected segment
trials = selected_outcomes.shape[0]

if trials == 0:
    # binomtest rejects n = 0, and the mean prediction of an empty segment is NaN;
    # this happens when the cell runs before "Apply Filters" has been clicked.
    print("No rows selected: click 'Apply Filters' above before running this test.")
else:
    # Number of successes: respondents in the segment with Happiness_Score == 1
    successes = int(np.sum(selected_outcomes))

    # Hypothesized probability of success: the model's mean predicted
    # probability for exactly the same rows
    p = float(np.mean(tuner.predict(X_transformed[selection_mask.to_numpy()])))

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    print(f'P-value of the test: {p_value}')

    # Check significance at 0.05 level
    if p_value < 0.05:
        print("The result is statistically significant at the 0.05 level and this is an outlier.")
    else:
        print("The result is not statistically significant at the 0.05 level and this is not an outlier.")

In [None]:
# Latest-month rows that match the widget selection (mask over all of preprocessed_df).
selection_mask = (
    (preprocessed_df['Survey_Month'] == latest_month) &
    (preprocessed_df.index.isin(filtered_indices))
)
selected_outcomes = y_test[selection_mask]

# Number of trials: how many respondents are in the selected segment
trials = selected_outcomes.shape[0]

if trials == 0:
    # binomtest rejects n = 0; this happens when the cell runs before
    # "Apply Filters" has been clicked.
    print("No rows selected: click 'Apply Filters' above before running this test.")
else:
    # Number of successes: respondents in the segment with Happiness_Score == 1
    successes = int(np.sum(selected_outcomes))

    # Hypothesized probability of success: the overall happiness rate of the
    # whole latest month (the naive baseline, no model involved)
    p = np.mean(y_test)

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    print(f'P-value of the test: {p_value}')

    # Check significance at 0.05 level
    if p_value < 0.05:
        print("The result is statistically significant at the 0.05 level and this is an outlier.")
    else:
        print("The result is not statistically significant at the 0.05 level and this is not an outlier.")

In [None]:
# Step 1: Identify the non-numeric columns to build segment combinations from.
# The ID column is named 'Survey_ID' (the old check looked for 'Survey ID').
# 'Survey_Month' is a datetime column, so select_dtypes reports it as non-numeric;
# within the latest month it has one value, so keeping it would emit every
# combination twice ('All' and that single month select the same rows).
non_numeric_cols = [
    col
    for col in preprocessed_df[preprocessed_df['Survey_Month'] == latest_month]
    .select_dtypes(exclude=['number']).columns
    if col not in ('Survey_ID', 'Survey_Month')
]

# Step 2: Function to get all combinations of selected values for non-numeric columns
def get_combinations():
    """Enumerate every filter combination over the non-numeric columns.

    For each column in the module-level ``non_numeric_cols`` the candidate
    values are ``'All'`` followed by the values seen in the latest month, in
    order of first appearance.

    Returns:
        list of tuples, one value per column, in ``itertools.product`` order.
    """
    latest_rows = preprocessed_df[preprocessed_df['Survey_Month'] == latest_month]
    option_lists = [
        ['All'] + latest_rows[col].unique().tolist()
        for col in non_numeric_cols
    ]
    return list(itertools.product(*option_lists))

# Step 3: Apply a specific combination of filters to the dataset
def apply_filter_combination(combo):
    """Return the latest-month rows that match one filter combination.

    Args:
        combo: sequence with one entry per column of the module-level
            ``non_numeric_cols``; the value ``'All'`` leaves that column
            unfiltered.

    Returns:
        DataFrame of the matching rows, keeping their original index labels.
    """
    subset = preprocessed_df[preprocessed_df['Survey_Month'] == latest_month].copy()

    for position, col in enumerate(non_numeric_cols):
        wanted = combo[position]
        if wanted == 'All':
            continue
        subset = subset[subset[col] == wanted]

    return subset

# Step 4: Find valid combinations where filtered rows > 50 and store the results in a DataFrame
def find_valid_combinations(min_rows=50):
    """Collect every filter combination that selects more than ``min_rows`` rows.

    Args:
        min_rows: a combination is kept only when it matches strictly more
            rows than this (default 50, the previously hard-coded value).

    Returns:
        DataFrame with one row per kept combination: one column per entry of
        the module-level ``non_numeric_cols`` holding the chosen value, plus
        ``"Number of Rows"`` and ``"Filtered_Indices"`` (list of index labels).
        The columns are present even when no combination qualifies.
    """
    records = []

    for combo in get_combinations():
        filtered_df = apply_filter_combination(combo)
        n_rows = len(filtered_df)
        if n_rows <= min_rows:
            continue  # segment too small for a meaningful test

        record = dict(zip(non_numeric_cols, combo))
        record["Number of Rows"] = n_rows
        record["Filtered_Indices"] = filtered_df.index.tolist()
        records.append(record)

    # Naming the columns keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=[*non_numeric_cols, "Number of Rows", "Filtered_Indices"])

# Step 5: Run the process — builds the table of latest-month segments that are
# large enough to test (one row per filter combination, with its row indices).
valid_combinations_df = find_valid_combinations()

In [None]:
def check_outlier(row, y_test, X_transformed, preprocessed_df, tuner, latest_month, alpha=0.05):
    """Flag a segment whose happiness count deviates from the model's expectation.

    A two-sided binomial test compares the observed number of 1s in the
    segment with the model's mean predicted probability for the same rows.
    No multiple-comparison correction is applied here.

    Args:
        row: mapping with a ``'Filtered_Indices'`` list of index labels.
        y_test: labels of the latest month, indexed by those labels.
        X_transformed: feature array in the same row order as ``preprocessed_df``.
        preprocessed_df: frame providing the index and ``'Survey_Month'``.
        tuner: object whose ``predict(X)`` returns probabilities.
        latest_month: value of ``'Survey_Month'`` identifying the test month.
        alpha: significance level (default 0.05).

    Returns:
        1 if the p-value is below ``alpha`` (outlier), otherwise 0.
    """
    # Build the segment mask once and reuse it for both labels and features.
    in_segment = (
        (preprocessed_df['Survey_Month'] == latest_month) &
        (preprocessed_df.index.isin(row['Filtered_Indices']))
    )

    # Number of successes: sum of the labels inside the segment
    successes = int(np.sum(y_test[in_segment]))

    # Number of trials is the count of indices for the current row
    trials = len(row['Filtered_Indices'])

    # Hypothesized probability of success (from the model's predictions)
    p = float(np.mean(tuner.predict(X_transformed[in_segment.to_numpy()])))

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    return 1 if p_value < alpha else 0

def apply_outlier_check(valid_combinations_df, y_test, X_transformed, preprocessed_df, tuner, latest_month):
    """Add an ``'Outlier_Status'`` column by running ``check_outlier`` on each row.

    The column is written onto ``valid_combinations_df`` itself (in place) and
    that same object is returned.
    """
    def _flag(segment_row):
        return check_outlier(segment_row, y_test, X_transformed, preprocessed_df, tuner, latest_month)

    flags = valid_combinations_df.apply(_flag, axis=1)
    valid_combinations_df['Outlier_Status'] = flags
    return valid_combinations_df

In [None]:
def check_outlier_std(row, y_test, X_transformed, preprocessed_df, tuner, latest_month, alpha=0.05):
    """Flag a segment whose happiness count deviates from the overall base rate.

    Same test as ``check_outlier`` but the hypothesised probability is the
    mean of ``y_test`` (the whole latest month), not a model prediction.
    ``X_transformed`` and ``tuner`` are accepted only to keep the signature
    parallel to ``check_outlier``; they are not used.

    Args:
        row: mapping with a ``'Filtered_Indices'`` list of index labels.
        y_test: labels of the latest month, indexed by those labels.
        X_transformed: unused.
        preprocessed_df: frame providing the index and ``'Survey_Month'``.
        tuner: unused.
        latest_month: value of ``'Survey_Month'`` identifying the test month.
        alpha: significance level (default 0.05).

    Returns:
        1 if the p-value is below ``alpha`` (outlier), otherwise 0.
    """
    in_segment = (
        (preprocessed_df['Survey_Month'] == latest_month) &
        (preprocessed_df.index.isin(row['Filtered_Indices']))
    )

    # Number of successes: sum of the labels inside the segment
    successes = int(np.sum(y_test[in_segment]))

    # Number of trials is the count of indices for the current row
    trials = len(row['Filtered_Indices'])

    # Hypothesized probability of success: base rate of the whole test month
    p = np.mean(y_test)

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    return 1 if p_value < alpha else 0

def apply_outlier_check_std(valid_combinations_df, y_test, X_transformed, preprocessed_df, tuner, latest_month):
    """Add an ``'Outlier_Status_std'`` column by running ``check_outlier_std`` on each row.

    The column is written onto ``valid_combinations_df`` itself (in place) and
    that same object is returned.
    """
    def _flag(segment_row):
        return check_outlier_std(segment_row, y_test, X_transformed, preprocessed_df, tuner, latest_month)

    flags = valid_combinations_df.apply(_flag, axis=1)
    valid_combinations_df['Outlier_Status_std'] = flags
    return valid_combinations_df

In [None]:
# Both helpers write their flag column onto valid_combinations_df and return
# that same object, so after the second call updated_df carries both
# 'Outlier_Status' (model expectation) and 'Outlier_Status_std' (base rate).
updated_df = apply_outlier_check(valid_combinations_df, y_test, X_transformed, preprocessed_df, tuner, latest_month)
updated_df = apply_outlier_check_std(valid_combinations_df, y_test, X_transformed, preprocessed_df, tuner, latest_month)

In [None]:
# Preview the first segments together with their two outlier flags
updated_df.head()

In [None]:
# The two outlier flags being compared: model-based vs. base-rate-based.
model_flag = updated_df['Outlier_Status']
base_rate_flag = updated_df['Outlier_Status_std']

# Split the segments into the four possible agreement/disagreement groups.
only_in_first = updated_df[(model_flag == 1) & (base_rate_flag == 0)]
only_in_second = updated_df[(model_flag == 0) & (base_rate_flag == 1)]
present_in_both = updated_df[(model_flag == 1) & (base_rate_flag == 1)]
absent_in_both = updated_df[(model_flag == 0) & (base_rate_flag == 0)]

# Report how many segments fall in each group.
group_report = [
    ("Rows where Outlier_Status is 1 and Outlier_Status_std is 0:", only_in_first),
    ("\nRows where Outlier_Status is 0 and Outlier_Status_std is 1:", only_in_second),
    ("\nRows where both are 1:", present_in_both),
    ("\nRows where both are 0:", absent_in_both),
]
for heading, group in group_report:
    print(heading)
    print(group.shape[0])

----------------------------------------------------------------------------------------------------------

In [None]:
# Seed BOTH random number generators for reproducibility.
# The categorical columns below are drawn with the standard-library `random`
# module, which np.random.seed() does not affect, so it needs its own seed.
np.random.seed(42)
random.seed(42)

# Define constants
num_rows = 15000
age_groups = ['18-24', '25-34', '35-44', '45-54', '55+']
genders = ['Male', 'Female', 'Non-binary']
education_levels = ['High School', 'Associate\'s', 'Bachelor\'s', 'Master\'s', 'Doctorate']
locations = ['Urban', 'Suburban', 'Rural']
income_ranges = ['$20,000 - $30,000', '$30,000 - $50,000', '$50,000 - $70,000',
                '$70,000 - $90,000', '$90,000 - $110,000']

# Generate data: same synthetic survey as before but without a Survey_Month
# column; every column is drawn independently and uniformly at random.
data = {
    "Survey_ID": range(1, num_rows + 1),
    "Age_Group": [random.choice(age_groups) for _ in range(num_rows)],
    "Gender": [random.choice(genders) for _ in range(num_rows)],
    "Education_Level": [random.choice(education_levels) for _ in range(num_rows)],
    "Location": [random.choice(locations) for _ in range(num_rows)],
    "Income_Range": [random.choice(income_ranges) for _ in range(num_rows)],
    "Happiness_Score": np.random.choice([0, 1], size=num_rows),  # Binary 0 or 1
    "Work_Satisfaction": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Social_Interactions": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Physical_Health": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Mental_Health": np.random.randint(1, 11, size=num_rows),  # 1 to 10
    "Major_Stressors": [random.choice(['Job', 'Finances', 'Health', 'Relationships', 'Family', 'Workload']) for _ in range(num_rows)],
}

# Create DataFrame
survey_df = pd.DataFrame(data)

# Display the first few rows of the DataFrame
survey_df.head()

# Save to CSV if needed
# survey_df.to_csv("happiness_survey_data.csv", index=False)

In [None]:
def custom_combiner(feature, category):
    """Name a one-hot column as ``"<feature>:<category>"``.

    Passed to ``OneHotEncoder`` as ``feature_name_combiner`` so that encoded
    column names can later be split on ':' to recover the source feature.
    """
    return f"{feature!s}:{category!s}"


# Define the preprocess function as a pipeline
def create_preprocessing_pipeline():
    """Build the (unfitted) feature-encoding pipeline with readable column names.

    The six categorical survey columns are one-hot encoded with the first
    level of each column dropped and output names produced by
    ``custom_combiner``; all remaining input columns are passed through
    unchanged.

    Returns:
        A scikit-learn ``Pipeline`` with a single step named ``'preprocessor'``
        that wraps the ``ColumnTransformer``.
    """
    categorical_columns = [
        'Age_Group', 'Gender', 'Education_Level',
        'Location', 'Income_Range', 'Major_Stressors',
    ]
    one_hot = OneHotEncoder(drop='first', feature_name_combiner=custom_combiner)
    column_encoder = ColumnTransformer(
        transformers=[('cat', one_hot, categorical_columns)],
        remainder='passthrough',  # non-categorical columns are kept as they are
    )
    return Pipeline(steps=[('preprocessor', column_encoder)])


In [None]:
# No date handling is needed for this dataset, so the survey frame is used
# directly (this is the same object as survey_df, not a copy).
preprocessed_df = survey_df

# Target, and the model inputs (identifier and target are not features).
y_full = preprocessed_df['Happiness_Score']
X_full = preprocessed_df.drop(columns=['Survey_ID', 'Happiness_Score'])

# Build the encoding pipeline and fit it on every row.
pipeline = create_preprocessing_pipeline()
X_transformed = pipeline.fit_transform(X_full)

In [None]:
# Names of the one-hot columns, taken from the fitted encoder (first transformer).
fitted_encoder = pipeline.named_steps['preprocessor'].transformers_[0][1]
onehot_columns = fitted_encoder.get_feature_names_out()

# Columns that bypassed the encoder, in their original order.
passthrough_columns = []
for col in X_full.columns:
    if col not in ['Age_Group', 'Gender', 'Education_Level', 'Location', 'Income_Range', 'Major_Stressors']:
        passthrough_columns.append(col)

# Encoded columns come first in the transformed array, then the passthrough ones.
final_columns = [*onehot_columns, *passthrough_columns]

# Wrap the transformed array in a DataFrame carrying those names.
X_transformed_df = pd.DataFrame(X_transformed, columns=final_columns)

# Display the first few rows of the transformed DataFrame
X_transformed_df.head()

In [None]:
# Global variable to store the filtered indices
filtered_indices = []

# Step 1: Identify non-numeric columns. The ID column is named 'Survey_ID'
# (the old check looked for 'Survey ID', which never matches).
non_numeric_cols = [
    col for col in preprocessed_df.select_dtypes(exclude=['number']).columns
    if col != 'Survey_ID'
]

# Step 2: Create one dropdown per filter column; 'All' means "do not filter on this column"
dropdowns = {}
for col in non_numeric_cols:
    unique_values = preprocessed_df[col].unique().tolist()
    dropdowns[col] = widgets.Dropdown(
        options=['All'] + unique_values,
        description=col,
        style={'description_width': 'initial'}
    )


# "<column>:<value>" labels of the filters applied by the most recent click
all_cols = []

def apply_filters(*args):
    """Button callback: record the selected segment.

    Writes the index labels of the matching rows to the module-level
    ``filtered_indices`` and the active filters (as ``"<column>:<value>"``) to
    ``all_cols``. ``*args`` receives the button instance and is ignored.
    """
    global filtered_indices
    filtered_df = preprocessed_df

    # Start from a clean slate: without this, every click appended to the
    # filters of earlier clicks and stale columns were masked later on.
    all_cols.clear()

    # Apply every dropdown that is not left on 'All'
    for col, dropdown in dropdowns.items():
        if dropdown.value != 'All':
            filtered_df = filtered_df[filtered_df[col] == dropdown.value]
            all_cols.append(f'{col}:{dropdown.value}')

    # Step 3: Get index values of filtered rows
    filtered_indices = filtered_df.index.tolist()

    print("Filtered row indices Done")

# Step 4: Create an "Apply Filters" button
apply_button = widgets.Button(description="Apply Filters")
apply_button.on_click(apply_filters)

# Display dropdowns and button
display(widgets.VBox(list(dropdowns.values()) + [apply_button]))

In [None]:
# Show the filters recorded by the last "Apply Filters" click, one per line.
for active_filter in all_cols:
    print(str(active_filter))


In [None]:
# Reduce each "<column>:<value>" label to the name of the filtered column.
all_cols = [label.split(':')[0] for label in all_cols]

# Every transformed column whose name contains one of those column names ...
filtered_columns = []
for col in X_transformed_df.columns:
    if any(keyword in col for keyword in all_cols):
        filtered_columns.append(col)

# ... is blanked out, so the features that define the segment carry no information.
# NOTE: this overwrites X_transformed_df in place; re-create it before choosing other filters.
X_transformed_df[filtered_columns] = np.nan

In [None]:
# Rows outside the selected segment train the model; the segment itself is the test set.
train_indices = preprocessed_df.index.difference(filtered_indices)

# Split into train and test sets.
# Features are taken from X_transformed_df, whose segment-defining columns were
# set to NaN above. The previous code sliced the raw X_transformed array, so
# that masking never reached the model.
X_train = X_transformed_df.loc[train_indices].to_numpy()
y_train = preprocessed_df.loc[train_indices, 'Happiness_Score']
X_test = X_transformed_df.loc[filtered_indices].to_numpy()
y_test = preprocessed_df.loc[filtered_indices, 'Happiness_Score']

In [None]:
class XGBoostHyperparameterTuner:
    """Tune, train and apply a binary-logistic XGBoost booster.

    Hyperparameters are searched with hyperopt's TPE algorithm; each candidate
    is scored by its mean 5-fold cross-validated log loss on the training data.
    """

    def __init__(self, X, y):
        """Store the training data.

        Args:
            X: feature matrix; must support positional row selection with an
                integer array (``X[idx]``), as a NumPy array does.
            y: labels; must support ``.iloc`` (e.g. a pandas Series). Expected
                to hold 0/1 values for the ``binary:logistic`` objective.
        """
        self.X_train = X
        self.y_train = y
        self.model = None        # fitted booster, set by train_final_model()
        self.best_params = None  # result of fmin(), set by tune_hyperparameters()

    @staticmethod
    def _binary_logloss(y_true, y_prob, eps=1e-15):
        """Return the mean binary cross-entropy of ``y_prob`` against ``y_true``.

        Probabilities are clipped to ``[eps, 1 - eps]`` so that a confident
        wrong prediction yields a large finite loss instead of ``inf``.
        """
        y_true = np.asarray(y_true, dtype=float)
        y_prob = np.clip(np.asarray(y_prob, dtype=float), eps, 1.0 - eps)
        return float(-np.mean(y_true * np.log(y_prob) + (1.0 - y_true) * np.log(1.0 - y_prob)))

    @staticmethod
    def _split_params(params, default_rounds=100):
        """Split a sampled parameter dict into (booster params, boosting rounds).

        ``n_estimators`` is not a booster parameter of ``xgb.train`` (it would
        be ignored there), so it is removed and returned as the number of
        boosting rounds. ``max_depth`` is coerced to a plain ``int``. The input
        dict is not modified.
        """
        booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}
        if 'max_depth' in booster_params:
            booster_params['max_depth'] = int(booster_params['max_depth'])
        num_rounds = int(params.get('n_estimators', default_rounds))
        return booster_params, num_rounds

    def objective(self, params):
        """Hyperopt objective: mean 5-fold validation log loss for ``params``.

        Args:
            params: one sample from the search space built in
                ``tune_hyperparameters``.

        Returns:
            dict with ``'loss'`` (lower is better) and ``'status'``.
        """
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        booster_params, num_rounds = self._split_params(params)
        loglosses = []

        for train_index, val_index in kf.split(self.X_train):
            # Features are indexed positionally as an array; labels go through
            # .iloc so a non-default pandas index cannot misalign them.
            X_kf_train, X_kf_val = self.X_train[train_index], self.X_train[val_index]
            y_kf_train, y_kf_val = self.y_train.iloc[train_index], self.y_train.iloc[val_index]

            dtrain = xgb.DMatrix(X_kf_train, label=y_kf_train)
            dval = xgb.DMatrix(X_kf_val, label=y_kf_val)

            # Train with early stopping; the sampled n_estimators caps the rounds.
            model = xgb.train(booster_params, dtrain, num_boost_round=num_rounds,
                              evals=[(dval, 'eval')],
                              early_stopping_rounds=10,
                              verbose_eval=False)

            # Score the fold with a real log loss. The previous expression,
            # -mean(log((pred - y)**2)), gets SMALLER as predictions get worse,
            # so minimising it selected the worst hyperparameters.
            preds = model.predict(dval)
            loglosses.append(self._binary_logloss(y_kf_val, preds))

        # Return mean logloss across all folds
        return {'loss': float(np.mean(loglosses)), 'status': STATUS_OK}

    def tune_hyperparameters(self, max_evals=50):
        """Run the TPE search and remember the best parameters.

        Args:
            max_evals: number of hyperopt evaluations.

        Returns:
            The dictionary returned by ``hyperopt.fmin`` (also stored in
            ``self.best_params``).
        """
        space = {
            'max_depth': hp.randint('max_depth', 3, 10),
            'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
            'n_estimators': hp.randint('n_estimators', 50, 200),
            'gamma': hp.uniform('gamma', 0, 5),
            'subsample': hp.uniform('subsample', 0.5, 1.0),
            'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss'
        }

        trials = Trials()
        best = fmin(fn=self.objective, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
        self.best_params = best
        return best

    def train_final_model(self):
        """Fit the final booster on all training data with the tuned parameters.

        Raises:
            RuntimeError: if ``tune_hyperparameters`` has not been run yet.
        """
        if self.best_params is None:
            raise RuntimeError("Hyperparameters have not been tuned yet!")
        booster_params, num_rounds = self._split_params(
            {**self.best_params, 'objective': 'binary:logistic', 'eval_metric': 'logloss'}
        )
        dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        # The tuned n_estimators is the number of boosting rounds (100 if absent).
        self.model = xgb.train(booster_params, dtrain, num_boost_round=num_rounds)

    def predict(self, X):
        """Return the fitted booster's predictions for the rows of ``X``.

        Raises:
            RuntimeError: if ``train_final_model`` has not been run yet.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained yet!")
        dval = xgb.DMatrix(X)
        return self.model.predict(dval)

In [None]:
# Train only when the training set is at least as large as the segment under
# test AND has at least 20 rows per feature column.
sample_too_small = (
    len(train_indices) < len(filtered_indices)
    or len(train_indices) < 20 * len(X_transformed_df.columns)
)

if sample_too_small:
    print("sample too small to train")
else:
    # Tune on the rows outside the selected segment (50 hyperopt evaluations)
    tuner = XGBoostHyperparameterTuner(X_train, y_train)
    best_params = tuner.tune_hyperparameters(max_evals=50)
    print("Best hyperparameters:", best_params)

    # Refit on the whole training set with the best hyperparameters
    tuner.train_final_model()

    # Keep a handle on the fitted booster
    model = tuner.model

    # In-sample predictions on the training rows
    predictions = tuner.predict(X_train)

In [None]:
if (len(train_indices) < len(filtered_indices)) or (len(train_indices) < 20*len(X_transformed_df.columns)):
    print("sample too small to train")
elif y_test.shape[0] == 0:
    # binomtest rejects n = 0, and the mean prediction of an empty segment is NaN;
    # this happens when no filter was applied before running the cell.
    print("No rows selected: click 'Apply Filters' above before running this test.")
else:
    # Number of successes: respondents in the segment with Happiness_Score == 1
    successes = int(np.sum(y_test))

    # Number of trials: size of the selected segment
    trials = y_test.shape[0]

    # Hypothesized probability of success: the model's mean predicted
    # probability for the segment (the model never saw these rows)
    p = float(np.mean(tuner.predict(X_test)))

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    print(f'P-value of the test: {p_value}')

    # Check significance at 0.05 level
    if p_value < 0.05:
        print("The result is statistically significant at the 0.05 level and this is an outlier.")
    else:
        print("The result is not statistically significant at the 0.05 level and this is not an outlier.")

In [None]:
if (len(train_indices) < len(filtered_indices)) or (len(train_indices) < 20*len(X_transformed_df.columns)):
    print("sample too small to train")
elif y_test.shape[0] == 0:
    # binomtest rejects n = 0; this happens when no filter was applied before
    # running the cell.
    print("No rows selected: click 'Apply Filters' above before running this test.")
else:
    # Number of successes: respondents in the segment with Happiness_Score == 1
    successes = int(np.sum(y_test))

    # Number of trials: size of the selected segment
    trials = y_test.shape[0]

    # Hypothesized probability of success: the happiness rate of all rows
    # OUTSIDE the segment (naive baseline, no model involved)
    p = np.mean(y_train)

    # Perform the binomial test
    p_value = binomtest(successes, n=trials, p=p, alternative='two-sided').pvalue

    print(f'P-value of the test: {p_value}')

    # Check significance at 0.05 level
    if p_value < 0.05:
        print("The result is statistically significant at the 0.05 level and this is an outlier.")
    else:
        print("The result is not statistically significant at the 0.05 level and this is not an outlier.")