In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)


## Model training
### Tree based models
* XGB, LGBM, RF, ETC, DT

### Neural Networks
* NN, autoencoders(?), GNN


## Model Evaluation

### Get SHAP values for model explainability, both local and global
* Tune model on best model selected (either on precision/f1/recall and feature blend)

### Test model on out of sample set
* Get metrics (accuracy, precision, F1, etc.) from the out-of-sample set

## Selection of Best Model

### Dashboard(?) --enhancement
* Transfer results to a dashboard

# Import file here

In [2]:
# Read the Twitter human/bot dataset from the working directory into `df`
df = pd.read_csv(filepath_or_buffer="twitter_human_bots_dataset.csv")


In [3]:

# Parse 'created_at'; values that cannot be parsed become NaT instead of raising
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Calculate Account Age
# The age is measured against a fixed snapshot (the newest account in the data)
# instead of datetime.now(), so re-running the notebook on a later day yields
# the same feature values. The snapshot also carries the column's own timezone
# (if any), which keeps the subtraction valid.
current_time = df['created_at'].max()
df['account_age_days'] = (current_time - df['created_at']).dt.days

# Time-Based Features (all derived from the account creation timestamp)
created_dt = df['created_at'].dt
df['creation_hour'] = created_dt.hour
df['creation_day_of_week'] = created_dt.dayofweek  # Monday=0 ... Sunday=6
df['creation_month'] = created_dt.month
df['creation_year'] = created_dt.year
df['creation_quarter'] = created_dt.quarter
df['is_weekend'] = df['creation_day_of_week'] >= 5  # Saturday or Sunday
df['creation_week_of_year'] = created_dt.isocalendar().week
df['is_beginning_of_month'] = created_dt.day <= 5
df['is_end_of_month'] = created_dt.day >= 26
# Define part of day based on hour
def part_of_day(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Returns 'Morning' for 5 <= hour < 12, 'Afternoon' for 12 <= hour < 17,
    'Evening' for 17 <= hour < 21 and 'Night' for every other value,
    including values for which all comparisons are false (e.g. NaN).
    """
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return 'Night'

df['part_of_day'] = df['creation_hour'].apply(part_of_day)

# Additional Features
# z-score of each account's tweet rate relative to the labelled human accounts.
# The human subset is selected once and reused for both statistics.
# NOTE(review): the statistics use the labels of every row, including rows that
# later land in the test/OOT folds -- confirm whether this leakage is acceptable.
human_tweets_per_day = df.loc[df['account_type'] == 'human', 'average_tweets_per_day']
humans_mean = human_tweets_per_day.mean()
humans_std = human_tweets_per_day.std()
df['deviation_from_humans'] = (df['average_tweets_per_day'] - humans_mean) / humans_std

# Description Length Feature (a missing description counts as length 0)
df['description_length'] = df['description'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Followers/Friends Ratios
# The filled Series is assigned back explicitly: `df[col].fillna(0, inplace=True)`
# is chained assignment and does not update `df` under pandas copy-on-write.
# Division by zero yields inf here; infinities are replaced later in this cell.
df['followers_to_friends_ratio'] = (df['followers_count'] / df['friends_count']).fillna(0)

# Followers to Tweets Per Day Ratio
df['followers_to_tweets_per_day_ratio'] = (
    df['followers_count'] / df['average_tweets_per_day']
).fillna(0)

# Mentions Count in Description
import re

def extract_mentions(description):
    """Return every '@handle' token found in the description, in order.

    The argument is converted with str() first, so non-string values
    (for example NaN) are searched as their string representation.
    """
    text = str(description)
    return [match.group(0) for match in re.finditer(r'@\w+', text)]

df['mentions'] = df['description'].apply(extract_mentions)
df['mention_count'] = df['mentions'].apply(len)

# Ensure any remaining NaN values are filled if necessary
########################################################################
# TO DISCUSS METHOD OF IMPUTATION
########################################################################
# Both +inf and -inf are replaced: the ratio features can produce +inf through
# division by zero, and deviation_from_humans can produce either sign when
# humans_std is 0.
df = df.fillna(0).replace([np.inf, -np.inf], 0)

# ENCODING METHOD FOR LANG AND LOCATION TO BE DISCUSSED

In [None]:
# Encoding Categorical Features
# Target: human -> 0, bot -> 1 (any other label becomes NaN)
df['account_type'] = df['account_type'].map(dict(human=0, bot=1))

encode_cols = [
    'default_profile', 'default_profile_image', 'geo_enabled', 'lang', 'location', 'verified',
    'creation_year', 'is_weekend', 'is_beginning_of_month', 'is_end_of_month', 'part_of_day',
]

# A single encoder instance is re-fitted for each column in turn; values are
# cast to str first so mixed types and placeholders are encoded uniformly.
label_encoder = LabelEncoder()
for column_name in encode_cols:
    column_as_text = df[column_name].astype(str)
    df[column_name] = label_encoder.fit_transform(column_as_text)


# Define Feature Columns and Target
id_col = ['id']
labels = ['account_type']
# NOTE: the last entry is the target ('account_type'); the column filter in the
# next cell relies on it being present in this list.
predictive_cols = [
    'default_profile', 'default_profile_image', 'favourites_count', 'followers_count', 'friends_count',
    'geo_enabled', 'lang', 'location', 'statuses_count', 'verified', 'average_tweets_per_day',
    'account_age_days', 'creation_hour', 'creation_day_of_week', 'creation_month', 'creation_year',
    'creation_quarter', 'is_weekend', 'creation_week_of_year', 'is_beginning_of_month',
    'is_end_of_month', 'part_of_day', 'deviation_from_humans', 'description_length',
    'followers_to_friends_ratio', 'followers_to_tweets_per_day_ratio', 'mention_count',
    'account_type',
]



In [None]:
# Set cutoff date for training/validation split
cutoff_date = pd.to_datetime('2017-01-01')
df['date'] = pd.to_datetime(df['created_at'])

# Define columns to keep
id_cols = ['id']
target_cols = ['account_type']
# dict.fromkeys de-duplicates while preserving order, so the target is kept
# exactly once whether or not predictive_cols already lists it.
columns_to_keep = list(dict.fromkeys(id_cols + predictive_cols + target_cols))

# Filter columns before splitting
df_filtered = df[columns_to_keep + ['date']]

# Split data based on cutoff date: accounts created on/after the cutoff form the
# out-of-time (OOT) set, earlier accounts are used for modelling. The helper
# 'date' column is dropped once the split is made.
oot = df_filtered[df_filtered['date'] >= cutoff_date].set_index('id').drop(columns='date')
df_model = df_filtered[df_filtered['date'] < cutoff_date].set_index('id').drop(columns='date')

# Train, Test, Validation Splits (80/20, then 80/20 of the remainder)
RANDOM_SEED = 2024
train, test = train_test_split(df_model, test_size=0.2, random_state=RANDOM_SEED)
train, valid = train_test_split(train, test_size=0.2, random_state=RANDOM_SEED)

# Tag each split with its fold name. assign() returns new frames, which avoids
# writing into the slices returned by train_test_split.
train = train.assign(X_fold='train')
test = test.assign(X_fold='test')
valid = valid.assign(X_fold='valid')
oot = oot.assign(X_fold='oot')

# Combine all datasets for modeling; reset_index() turns 'id' back into a column
mds = pd.concat([train, test, valid, oot]).reset_index()

# Plot correlation heatmap for predictive columns
corr = mds[predictive_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap of Predictive Features')
plt.show()

# Check target distribution
print("\nTarget Distribution:")
print(mds['account_type'].value_counts())
print("\nTarget Distribution (%):")
print(mds['account_type'].value_counts(normalize=True) * 100)

# Final Data Overview
print("\nData Sample:")
print(mds.head())
print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
mds.info()

# Verify final columns
print("\nFinal columns in dataset:")
print(mds.columns.tolist())

# Verify data splits
print("\nData split sizes:")
print(mds['X_fold'].value_counts())

In [16]:
import itertools
import math
from hyperopt import hp
import xgboost as xgb
import lightgbm as lgb
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import re
from sklearn.model_selection import train_test_split
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Import custom functions
# from utils import mpr_report, final_fitting

# from lmf2 import LoadModelFunction#, final_fitting

import os

# Might shift this into another file

In [None]:
import lightgbm as lgb
import xgboost as xgb
from xgboost import DMatrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import KFold, TimeSeriesSplit
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import json
from functools import partial
import warnings
warnings.filterwarnings("ignore")

class UnifiedModelTrainer:
    """Hyperopt-driven tuning, evaluation and persistence for LightGBM / XGBoost.

    The input DataFrame must carry an 'X_fold' column with the values 'train',
    'valid', 'test' and 'oot' (see prepare_data).

    NOTE(review): in both objectives the final model is fitted with the test
    fold as its evaluation set (for LightGBM this also drives early stopping),
    and feature selection / scaling are fitted on the full training data before
    cross-validation. Reported test and CV metrics may therefore be optimistic;
    confirm whether a dedicated validation fold should be used instead.
    """

    # Probability cut-offs at which thresholded metrics are reported.
    THRESHOLDS = (0.5, 0.8, 0.85, 0.9, 0.95)
    # Model families accepted by create_search_space / optimize_model.
    SUPPORTED_MODELS = ('lgbm', 'xgb')

    def __init__(self, random_state=2024):
        """Store the seed used for CV shuffling, feature selection and the models."""
        self.random_state = random_state
        # Populated by optimize_model with one row per successful trial.
        self.trials_history = None

    def prepare_data(self, df, features, target):
        """Prepare train+validation, test and out-of-time datasets.

        Rows are routed by their 'X_fold' value; 'train' and 'valid' rows are
        merged into a single training set.

        Returns:
            ((X_train_valid, y_train_valid), (X_test, y_test), (X_oot, y_oot))
        """
        train_valid = df[df['X_fold'].isin(['train', 'valid'])]
        test = df[df['X_fold'] == 'test']
        oot = df[df['X_fold'] == 'oot']

        return (
            (train_valid[features], train_valid[target]),
            (test[features], test[target]),
            (oot[features], oot[target]),
        )

    @staticmethod
    def _threshold_report(y_true, scores, threshold):
        """Binarise scores at `threshold` (strictly greater) and score the labels."""
        predicted = (scores > threshold).astype(int)
        return {
            'precision': precision_score(y_true, predicted),
            'recall': recall_score(y_true, predicted),
            'f1': f1_score(y_true, predicted),
            # AUC of the hard 0/1 predictions, not of the raw scores.
            'auc': roc_auc_score(y_true, predicted),
        }

    def evaluate_model(self, model, X_test, y_test, X_oot, y_oot):
        """Evaluate model performance on test and OOT datasets.

        Returns a dict with 'test_auc', 'oot_auc' (computed on raw scores) and
        'threshold_metrics', keyed 'threshold_<t>' -> 'test'/'oot' ->
        precision / recall / f1 / auc for every cut-off in THRESHOLDS.
        """
        # A LightGBM Booster returns scores from predict(); other models are
        # expected to expose predict_proba().
        if isinstance(model, lgb.Booster):
            pred_test = model.predict(X_test)
            pred_oot = model.predict(X_oot)
        else:
            pred_test = model.predict_proba(X_test)[:, 1]
            pred_oot = model.predict_proba(X_oot)[:, 1]

        metrics = {
            'test_auc': roc_auc_score(y_test, pred_test),
            'oot_auc': roc_auc_score(y_oot, pred_oot)
        }

        scored_sets = {'test': (y_test, pred_test), 'oot': (y_oot, pred_oot)}
        metrics['threshold_metrics'] = {
            f'threshold_{threshold}': {
                set_name: self._threshold_report(y_true, scores, threshold)
                for set_name, (y_true, scores) in scored_sets.items()
            }
            for threshold in self.THRESHOLDS
        }
        return metrics

    def create_search_space(self, model_type):
        """Define complete search space including data preparation and model parameters.

        Raises:
            ValueError: if model_type is neither 'lgbm' nor 'xgb'.
        """
        # Common parameters
        common_params = {
            'k_folds': hp.choice('k_folds', [3, 5, 7, 10]),
            'k_split': hp.choice('k_split', ['non_ts', 'ts']),

            # Feature selection parameters
            'f_method': hp.choice('f_method', [
                'all',  # Use all features
                'kbest_f',  # SelectKBest with f_classif
                'kbest_mi',  # SelectKBest with mutual_info_classif
                'l1',  # L1-based feature selection
                'tree_importance'  # Tree-based feature importance
            ]),
            'num_feats': hp.choice('num_feats', [20, 30, 40, 50, 'all']),

            # Scaling parameters
            'scaler': hp.choice('scaler', [
                'noscaler',
                'standard',
                'minmax',
                'robust'
            ]),

            # Fixed parameters
            'SEED': self.random_state
        }

        if model_type == 'lgbm':
            # LightGBM specific parameters
            lgbm_params = {
                'num_leaves': hp.quniform('num_leaves', 15, 127, 1),
                'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
                'feature_fraction': hp.uniform('feature_fraction', 0.6, 0.9),
                'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 0.9),
                'bagging_freq': hp.quniform('bagging_freq', 2, 10, 1),
                'min_child_samples': hp.quniform('min_child_samples', 10, 150, 1),
                'max_depth': hp.quniform('max_depth', 3, 12, 1),
                'n_estimators': hp.quniform('n_estimators', 100, 1000, 50)
            }
            return {**common_params, **lgbm_params}

        elif model_type == 'xgb':
            # XGBoost specific parameters
            xgb_params = {
                'max_depth': hp.quniform('max_depth', 3, 12, 1),
                'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
                'subsample': hp.uniform('subsample', 0.6, 1.0),
                'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
                'min_child_weight': hp.quniform('min_child_weight', 1, 7, 1),
                'n_estimators': hp.quniform('n_estimators', 100, 1000, 50)
            }
            return {**common_params, **xgb_params}

        else:
            raise ValueError("Invalid model type. Please choose 'lgbm' or 'xgb'.")

    def apply_feature_selection(self, X, y, method, num_feats):
        """Apply feature selection based on specified method.

        Returns X restricted to the selected columns. X is returned unchanged
        when method or num_feats is 'all', when the method is not recognised,
        or when L1 selection would otherwise discard every column.
        """
        if method == 'all' or num_feats == 'all':
            return X

        n_features = min(num_feats, X.shape[1])

        if method in ('kbest_f', 'kbest_mi'):
            score_func = f_classif if method == 'kbest_f' else mutual_info_classif
            selector = SelectKBest(score_func=score_func, k=n_features)
            selector.fit(X, y)
            return X.iloc[:, selector.get_support()]

        if method == 'l1':
            from sklearn.linear_model import LogisticRegression
            # Keeps every column with a non-zero L1 coefficient; num_feats is
            # not applied by this method.
            selector = LogisticRegression(penalty='l1', solver='saga', random_state=self.random_state)
            selector.fit(X, y)
            mask = np.abs(selector.coef_[0]) > 0
            if not mask.any():
                # All coefficients were shrunk to zero: fall back to the full
                # feature set rather than hand an empty frame to the model.
                return X
            return X.iloc[:, mask]

        if method == 'tree_importance':
            model = lgb.LGBMClassifier(random_state=self.random_state)
            model.fit(X, y)
            importance = pd.Series(model.feature_importances_, index=X.columns)
            selected_features = importance.nlargest(n_features).index
            return X[selected_features]

        return X

    @staticmethod
    def _ensure_dir(output_dir):
        """Create output_dir (and parents) if it does not exist yet."""
        import os  # local import so the class does not depend on another cell's import
        os.makedirs(output_dir, exist_ok=True)

    def plot_results(self, metrics, output_dir):
        """Plot test/OOT precision against the decision threshold and save it.

        The figure is written to '<output_dir>/threshold_performance.png';
        the directory is created when missing.
        """
        threshold_keys = sorted(metrics['threshold_metrics'].keys(),
                                key=lambda x: float(x.split('_')[1]))
        # Plot against the numeric cut-offs so the x axis is to scale.
        threshold_values = [float(key.split('_')[1]) for key in threshold_keys]

        test_precision = [metrics['threshold_metrics'][t]['test']['precision']
                          for t in threshold_keys]
        oot_precision = [metrics['threshold_metrics'][t]['oot']['precision']
                         for t in threshold_keys]

        self._ensure_dir(output_dir)
        plt.figure(figsize=(12, 6))
        plt.plot(threshold_values, test_precision, marker='o', label='Test Precision')
        plt.plot(threshold_values, oot_precision, marker='o', label='OOT Precision')
        plt.xlabel('Threshold')
        plt.ylabel('Precision')
        plt.title('Model Performance Across Thresholds')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{output_dir}/threshold_performance.png")
        plt.close()

    def apply_scaling(self, X_train, X_test, X_oot, scaler_type):
        """Apply scaling transformation to the data.

        The scaler is fitted on X_train only and then applied to all three
        frames; column names and indices are preserved. 'noscaler' returns the
        inputs untouched.

        Raises:
            ValueError: if scaler_type is not one of 'noscaler', 'standard',
                'minmax' or 'robust'.
        """
        if scaler_type == 'noscaler':
            return X_train, X_test, X_oot

        if scaler_type not in ('standard', 'minmax', 'robust'):
            raise ValueError(
                f"Invalid scaler type {scaler_type!r}. "
                "Please choose 'noscaler', 'standard', 'minmax' or 'robust'."
            )

        if scaler_type == 'standard':
            scaler = StandardScaler()
        elif scaler_type == 'minmax':
            scaler = MinMaxScaler()
        else:
            scaler = RobustScaler()

        def as_frame(values, like):
            # Scalers return plain arrays; restore the original labels.
            return pd.DataFrame(values, columns=like.columns, index=like.index)

        X_train_scaled = as_frame(scaler.fit_transform(X_train), X_train)
        X_test_scaled = as_frame(scaler.transform(X_test), X_test)
        X_oot_scaled = as_frame(scaler.transform(X_oot), X_oot)

        return X_train_scaled, X_test_scaled, X_oot_scaled

    def get_cv_splitter(self, k_split, k_folds):
        """Get cross-validation splitter based on specified method.

        'non_ts' gives a shuffled KFold seeded with random_state; any other
        value gives a TimeSeriesSplit.
        NOTE(review): TimeSeriesSplit splits by row order -- confirm the
        training rows are time-ordered before relying on the 'ts' option.
        """
        if k_split == 'non_ts':
            return KFold(n_splits=k_folds, shuffle=True, random_state=self.random_state)
        else:
            return TimeSeriesSplit(n_splits=k_folds)

    def _preprocess(self, space, X_train, y_train, X_test, X_oot):
        """Run feature selection then scaling, as configured in `space`."""
        X_train_selected = self.apply_feature_selection(
            X_train, y_train, space['f_method'], space['num_feats']
        )
        kept = X_train_selected.columns
        return self.apply_scaling(
            X_train_selected, X_test[kept], X_oot[kept], space['scaler']
        )

    @staticmethod
    def _failed_trial(exc):
        """Report a failed trial to hyperopt without inventing a loss value.

        Returning STATUS_FAIL (instead of a fake 'loss': 0 with STATUS_OK)
        lets hyperopt disregard the trial and keeps it out of the trial
        history, which expects 'metrics' on every recorded result.
        """
        from hyperopt import STATUS_FAIL  # only STATUS_OK is imported at notebook level
        print(f"Error in objective function: {str(exc)}")
        return {'status': STATUS_FAIL, 'error': str(exc), 'model': None}

    def objective_lgb(self, space, X_train, y_train, X_test, y_test, X_oot, y_oot):
        """Enhanced objective function for LightGBM optimization.

        Preprocesses the data as sampled in `space`, cross-validates on the
        training data (AUC), fits a final Booster on all training data and
        evaluates it on test and OOT.

        Returns a hyperopt result dict: on success 'loss' (negative mean CV
        AUC), 'status', 'model', 'metrics', 'params' and 'feature_columns';
        on any exception a STATUS_FAIL result carrying the error message.
        """
        try:
            X_train_processed, X_test_processed, X_oot_processed = self._preprocess(
                space, X_train, y_train, X_test, X_oot
            )

            # Create CV splitter
            cv_splitter = self.get_cv_splitter(space['k_split'], int(space['k_folds']))

            # Prepare LightGBM parameters (quniform samples arrive as floats)
            params = {
                'objective': 'binary',
                'metric': 'auc',
                'boosting_type': 'gbdt',
                'num_leaves': int(space['num_leaves']),
                'learning_rate': space['learning_rate'],
                'feature_fraction': space['feature_fraction'],
                'bagging_fraction': space['bagging_fraction'],
                'bagging_freq': int(space['bagging_freq']),
                'min_child_samples': int(space['min_child_samples']),
                'max_depth': int(space['max_depth']),
                'n_estimators': int(space['n_estimators']),
                'verbose': -1,
                'random_state': space['SEED']
            }

            # Cross-validation scores
            cv_scores = []
            for train_idx, valid_idx in cv_splitter.split(X_train_processed):
                X_fold_train = X_train_processed.iloc[train_idx]
                y_fold_train = y_train.iloc[train_idx]
                X_fold_valid = X_train_processed.iloc[valid_idx]
                y_fold_valid = y_train.iloc[valid_idx]

                train_data = lgb.Dataset(X_fold_train, label=y_fold_train)
                valid_data = lgb.Dataset(X_fold_valid, label=y_fold_valid, reference=train_data)

                model = lgb.train(
                    params,
                    train_data,
                    valid_sets=[valid_data],
                    callbacks=[lgb.early_stopping(stopping_rounds=20)]
                )

                pred_valid = model.predict(X_fold_valid)
                cv_scores.append(roc_auc_score(y_fold_valid, pred_valid))  # modify metric here for tuning for other metrics

            # Train final model on full training data.
            # NOTE(review): early stopping here monitors the test fold.
            train_data = lgb.Dataset(X_train_processed, label=y_train)
            valid_data = lgb.Dataset(X_test_processed, label=y_test, reference=train_data)

            final_model = lgb.train(
                params,
                train_data,
                valid_sets=[valid_data],
                callbacks=[lgb.early_stopping(stopping_rounds=20)]
            )

            # Calculate metrics
            metrics = self.evaluate_model(
                final_model,
                X_test_processed, y_test,
                X_oot_processed, y_oot
            )

            metrics['cv_score_mean'] = np.mean(cv_scores)
            metrics['cv_score_std'] = np.std(cv_scores)

            return {
                'loss': -metrics['cv_score_mean'],  # Optimize for CV performance
                'status': STATUS_OK,
                'model': final_model,
                'metrics': metrics,
                'params': params,
                'feature_columns': list(X_train_processed.columns)
            }

        except Exception as e:
            # Best effort: one bad configuration must not abort the whole search.
            return self._failed_trial(e)

    def objective_xgb(self, space, X_train, y_train, X_test, y_test, X_oot, y_oot):
        """Enhanced objective function for XGBoost optimization.

        Same contract as objective_lgb, but fits xgb.XGBClassifier models.
        """
        try:
            X_train_processed, X_test_processed, X_oot_processed = self._preprocess(
                space, X_train, y_train, X_test, X_oot
            )

            # Create CV splitter
            cv_splitter = self.get_cv_splitter(space['k_split'], int(space['k_folds']))

            # Prepare XGBoost parameters (quniform samples arrive as floats)
            params = {
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'max_depth': int(space['max_depth']),
                'learning_rate': space['learning_rate'],
                'subsample': space['subsample'],
                'colsample_bytree': space['colsample_bytree'],
                'min_child_weight': int(space['min_child_weight']),
                'n_estimators': int(space['n_estimators']),
                'random_state': space['SEED']
            }

            # Cross-validation scores
            cv_scores = []
            for train_idx, valid_idx in cv_splitter.split(X_train_processed):
                X_fold_train = X_train_processed.iloc[train_idx]
                y_fold_train = y_train.iloc[train_idx]
                X_fold_valid = X_train_processed.iloc[valid_idx]
                y_fold_valid = y_train.iloc[valid_idx]

                model = xgb.XGBClassifier(**params)
                model.fit(X_fold_train, y_fold_train, eval_set=[(X_fold_valid, y_fold_valid)],
                          verbose=False)

                pred_valid = model.predict_proba(X_fold_valid)[:, 1]
                cv_scores.append(roc_auc_score(y_fold_valid, pred_valid))

            # Train final model on full training data
            model = xgb.XGBClassifier(**params)
            model.fit(X_train_processed, y_train, eval_set=[(X_test_processed, y_test)],
                      verbose=False)

            # Calculate metrics
            metrics = self.evaluate_model(
                model,
                X_test_processed, y_test,
                X_oot_processed, y_oot
            )

            metrics['cv_score_mean'] = np.mean(cv_scores)
            metrics['cv_score_std'] = np.std(cv_scores)

            return {
                'loss': -metrics['cv_score_mean'],  # Optimize for CV performance
                'status': STATUS_OK,
                'model': model,
                'metrics': metrics,
                'params': params,
                'feature_columns': list(X_train_processed.columns)
            }

        except Exception as e:
            # Best effort: one bad configuration must not abort the whole search.
            return self._failed_trial(e)

    def _trial_to_row(self, trial):
        """Flatten one successful hyperopt trial into a trials_history row."""
        result = trial['result']
        threshold_metrics = result['metrics']['threshold_metrics']

        row = dict(trial['misc']['vals'])
        row['cv_score'] = -result['loss']
        row['test_auc'] = result['metrics']['test_auc']
        row['oot_auc'] = result['metrics']['oot_auc']
        for threshold in self.THRESHOLDS:
            for set_type in ('test', 'oot'):
                for metric in ('precision', 'recall', 'f1', 'auc'):
                    column = f"threshold_{threshold}_{metric}_{set_type}"
                    row[column] = threshold_metrics[f'threshold_{threshold}'][set_type][metric]
        return row

    def optimize_model(self, df, features, target, max_evals=20, model_type=""):
        """Run complete model optimization pipeline with enhanced search space.

        Args:
            df: DataFrame containing `features`, `target` and an 'X_fold' column.
            features: feature column names.
            target: target column name.
            max_evals: number of hyperopt evaluations.
            model_type: 'lgbm' or 'xgb'.

        Returns:
            (model, metrics, params, feature_columns) of the trial with the
            lowest loss. self.trials_history is set to a DataFrame with one
            row per successful trial.

        Raises:
            ValueError: if model_type is not supported (checked before any work).
            RuntimeError: if no trial produced a model. hyperopt may raise its
                own error first when every trial fails.
        """
        if model_type not in self.SUPPORTED_MODELS:
            raise ValueError("Invalid model type. Please choose 'lgbm' or 'xgb'.")

        # Prepare data
        (X_train_valid, y_train_valid), (X_test, y_test), (X_oot, y_oot) = self.prepare_data(
            df, features, target
        )

        # Bind the prepared data so hyperopt only has to supply `space`.
        objective_fn = self.objective_lgb if model_type == 'lgbm' else self.objective_xgb
        objective = partial(
            objective_fn,
            X_train=X_train_valid,
            y_train=y_train_valid,
            X_test=X_test,
            y_test=y_test,
            X_oot=X_oot,
            y_oot=y_oot
        )

        # Run optimization; results are read back from `trials`.
        trials = Trials()
        fmin(
            fn=objective,
            space=self.create_search_space(model_type),
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials
        )

        # Only trials that finished and produced a model carry metrics.
        successful = [
            trial for trial in trials.trials
            if trial['result'].get('status') == STATUS_OK
            and trial['result'].get('model') is not None
        ]
        if not successful:
            raise RuntimeError("No successful trial: every objective evaluation failed.")

        # Get best trial (lowest loss; the earliest trial wins ties)
        best_trial = min(successful, key=lambda x: x['result']['loss'])

        # Store trials history
        self.trials_history = pd.DataFrame([self._trial_to_row(trial) for trial in successful])

        return (
            best_trial['result']['model'],
            best_trial['result']['metrics'],
            best_trial['result']['params'],
            best_trial['result']['feature_columns']
        )

    @staticmethod
    def _json_default(value):
        """Fallback encoder for values json cannot serialise natively."""
        if hasattr(value, 'tolist'):  # NumPy scalars and arrays
            return value.tolist()
        return str(value)

    def save_results(self, model, metrics, params, output_dir):
        """Save model, parameters and metrics.

        Writes 'best_model.txt' via the model's own save_model(), plus
        'best_params.json' and 'metrics.json', creating output_dir if needed.
        NOTE(review): XGBoost picks its serialisation format from the file
        extension -- confirm '.txt' is acceptable for XGBoost models, or use
        save_model() (pickle) for those.
        """
        self._ensure_dir(output_dir)

        # Save model
        model.save_model(f"{output_dir}/best_model.txt")

        # Save parameters
        with open(f"{output_dir}/best_params.json", 'w') as f:
            json.dump(params, f, indent=4, default=self._json_default)

        # Save metrics
        with open(f"{output_dir}/metrics.json", 'w') as f:
            json.dump(metrics, f, indent=4, default=self._json_default)

    def save_model(self, model, output_dir, model_name='best_model.pkl'):
        """Pickle the trained model to '<output_dir>/<model_name>'.

        The directory is created when it does not exist.
        """
        self._ensure_dir(output_dir)
        with open(f"{output_dir}/{model_name}", 'wb') as f:
            pickle.dump(model, f)

    def load_model(self, model_path):
        """Load a trained model from a pickle file.

        SECURITY: pickle.load can execute arbitrary code; only load files that
        were written by this notebook or another trusted source.
        """
        with open(model_path, 'rb') as f:
            return pickle.load(f)
        


In [None]:
# Initialize trainer
trainer = UnifiedModelTrainer(random_state=2024)

# Load data
df = mds
target = 'account_type'
# 'id' is a column of mds again after reset_index(); it is an identifier, not a
# predictor, so it is excluded together with the target and the fold label.
non_feature_cols = ['id', target, 'X_fold']
features = [col for col in df.columns if col not in non_feature_cols]

# Run optimization with extended search space
model, metrics, params, selected_features = trainer.optimize_model(
    df, features, target, max_evals=10, model_type='xgb'
)


# SHAP EXPLAINABILITY

In [None]:
import os

import shap

# Persist the tuned model; the target directory must exist before pickling.
os.makedirs("output_directory", exist_ok=True)
trainer.save_model(model, "output_directory")

# Load the model from the pickle file
loaded_model = trainer.load_model('output_directory/best_model.pkl')

# Explain the model on exactly the columns it was fitted on: optimize_model
# returns them as `selected_features`, which can be a subset of `features`
# when the best trial used feature selection.
# NOTE(review): if the best trial also applied a scaler, X_test should be scaled
# the same way before it is explained -- the fitted scaler is not returned by
# optimize_model, so confirm how this should be handled.
explainer = shap.Explainer(loaded_model)
test = df[df['X_fold'] == 'test']
X_test = test[selected_features]
y_test = test[target]
shap_values = explainer(X_test)

# Visualize the SHAP values
shap.summary_plot(shap_values, X_test)


# To get Features and Metrics of best MODEL

In [None]:

import json
import os

# Save results
# save_model's signature is (model, output_dir, model_name); the earlier call
# passed `metrics` and `params` in the directory / file-name positions. The
# model is pickled here and the metrics and parameters are written next to it.
output_dir = "output_directory"
os.makedirs(output_dir, exist_ok=True)
trainer.save_model(model, output_dir)
with open(f"{output_dir}/best_params.json", 'w') as f:
    json.dump(params, f, indent=4, default=float)
with open(f"{output_dir}/metrics.json", 'w') as f:
    json.dump(metrics, f, indent=4, default=float)
trainer.plot_results(metrics, output_dir)

# Access trials history
print(trainer.trials_history)


# Sort trials history by test AUC to get the highest AUC model
best_auc_trial = trainer.trials_history.sort_values(by='test_auc', ascending=False).iloc[0]

# Show test and OOT AUC side by side for every threshold
best_auc_trial[['test_auc', 'oot_auc',
       'threshold_0.5_auc_test',
       'threshold_0.5_auc_oot',
       'threshold_0.8_auc_test',
       'threshold_0.8_auc_oot',
       'threshold_0.85_auc_test',
       'threshold_0.85_auc_oot',
       'threshold_0.9_auc_test',
       'threshold_0.9_auc_oot',
       'threshold_0.95_auc_test',
       'threshold_0.95_auc_oot']]
