<a href="https://colab.research.google.com/github/xrolly28/Data-Science-Project/blob/main/automl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score, calinski_harabasz_score, make_scorer,roc_auc_score,mean_squared_error

In [106]:
def Search_space(model_name):
  if model_name==Ridge:
      search_space={
        'alpha': (0.01, 10.0),
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
      }
  elif model_name==Lasso:
     search_space={
        'alpha':  (0.01, 10.0),
        'selection': ['cyclic', 'random'],
       }
  elif model_name==RandomForestClassifier:
    search_space={
    'n_estimators':(10, 100),
    'max_depth': (5, 50),
    'min_samples_split':(2, 11),
    'min_samples_leaf': (1, 11)
    }
  elif model_name ==SVC:
    search_space={
    'C':  (1e-6, 1e+2),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':  (1, 6),
    'gamma':(1e-6, 1e+1)
    }
  elif model_name==KNeighborsClassifier:
    search_space={
    'n_neighbors':(1, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'chebyshev'],
    }
  elif model_name==LogisticRegression:
     search_space={
      'C': (0.001, 100),
      'penalty': ['l1', 'l2', 'none'],
      'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
    }
  elif model_name==XGBRegressor:
    search_space= {
    'learning_rate': (0.01, 0.3),
    'n_estimators': (50, 200),
    #'max_depth': (1, 10),
    #'min_child_weight': (1, 10),
    #'subsample':  (0.5, 1.0),
    #'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda':  (0.0, 1.0),
    #'gamma':  (0.0, 10.0),
    'booster': ['gbtree', 'gblinear', 'dart'],
    # 'objective': ['reg:squarederror'],
    }
  elif model_name==AdaBoostClassifier:
    search_space={
    'n_estimators':  (50, 200),
    'learning_rate': (0.01, 1)
    }
  elif model_name==KMeans:
   search_space={
    'n_clusters': (2, 10),
    'init': ['k-means++', 'random']
    # 'n_init'=10
    }
  elif model_name==DBSCAN:
    search_space={
    'eps': (1e-3, 5.0),
    'min_samples': (2, 20),
    'metric': ['euclidean', 'manhattan', 'cosine']
    }
  else:
        raise ValueError("Unsupported model")
  return search_space

In [107]:
# model=AdaBoostClassifier
# search_space=Search_space(model)
# print(search_space)

In [108]:
def sample_hyperparameters(model_name,search_space):
  sampled_params={}
  if model_name==RandomForestClassifier:
     return {
         param: np.random.randint(low, high + 1) for param, (low, high) in search_space.items()
       }
  elif model_name==SVC:
     return {
        'C': np.exp(np.random.uniform(np.log(search_space['C'][0]), np.log(search_space['C'][1]))),
        'kernel': np.random.choice(search_space['kernel']),
        'degree': np.random.randint(*search_space['degree']) if 'degree' in search_space else None,
        'gamma': np.exp(np.random.uniform(np.log(search_space['gamma'][0]), np.log(search_space['gamma'][1])))
     }
  elif model_name==AdaBoostClassifier:
    return{
        'n_estimators': np.random.randint(search_space['n_estimators'][0], search_space['n_estimators'][1]),
         'learning_rate': np.random.uniform(search_space['learning_rate'][0], search_space['learning_rate'][1]),
       }
  else:
    for param, space in search_space.items():
         if isinstance(space, tuple) and len(space) == 2:  # Numeric range
            if isinstance(space[0], int):
                sampled_params[param] = np.random.randint(space[0], space[1])
            elif isinstance(space[0], float):
                sampled_params[param] = np.random.uniform(space[0], space[1])
         elif isinstance(space, list):  # Categorical list
            sampled_params[param] = np.random.choice(space)
    return sampled_params

In [109]:
# search_space=Search_space(AdaBoostClassifier)
# sp=sample_hyperparameters(AdaBoostClassifier,search_space)
# print(sp)

In [110]:
def is_compatible(params):
       solver = params.get('solver')
       penalty = params.get('penalty')
       compatible_combinations = {
        'liblinear': ['l1', 'l2'],
        'newton-cg': ['l2', 'none'],
        'lbfgs': ['l2', 'none'],
        'sag': ['l2', 'none'],
        'saga': ['l1', 'l2', 'elasticnet', 'none']
       }
       return penalty in compatible_combinations.get(solver, [])

In [111]:
def evaluate_model(model,params,X,y=None):
  if model==SVC:
    # model=SVC(params)
    if params['kernel'] != 'poly':
            del params['degree']
    if params['kernel'] == 'linear':
            del params['gamma']
  elif model=='LogisticRegression()':
    if not is_compatible(params):
          return -np.inf
  elif model==XGBRegressor:
       model_is=model(**params)
       return 1-cross_val_score(model_is, X, y, cv=5).mean()
  elif model==KMeans or model==DBSCAN:
       model_is=model(**params)
       labels = model_is.fit_predict(X)
       if len(set(labels)) <= 1:
         return -np.inf
       return silhouette_score(X, labels)
  model_is=model(**params)
  score = cross_val_score(model_is, X, y, cv=5, scoring= 'roc_auc_ovr').mean()
  return score

In [112]:
def encode_categorical_params(params, search_space):
    encoded_params = {}
    for param, value in params.items():
        if param in search_space and isinstance(search_space[param], list):
            for option in search_space[param]:
                encoded_params[f'{param}_{option}'] = 1 if value == option else 0
        else:
            encoded_params[param] = value
    return encoded_params

In [113]:
def get_density(trials, param, value):
            values = [t[param] for t in trials]
            # if len(values) == 0:
            #     return 1
            mean = np.mean(values)
            std = np.std(values)
            if std == 0:
                return 1 if value == mean else 0
            if isinstance(value, str):
               return np.mean(np.array(values) == value)
            return norm(mean,std).pdf(value)

In [114]:
def tpe_optimization(model, X, y=None, max_evals=50, gamma=0.25):
    trials = []
    scores = []
    actual_hp=[]
    is_supervised = y is not None
    search_space=Search_space(model)

    for _ in range(10):
        params = sample_hyperparameters(model,search_space)
        score = evaluate_model(model, params, X,y)
        encoded_params = encode_categorical_params (params, search_space)
        trials.append(encoded_params)
        actual_hp.append(params)
        scores.append(score)

    for _ in range(max_evals - 10):
        threshold = np.percentile(scores, (1-gamma)* 100)
        good_trials = [t for t, s in zip(trials, scores) if s >= threshold]
        bad_trials = [t for t, s in zip(trials, scores) if s < threshold]

        best_score=np.inf
        best_params=None
        for _ in range(100):
            candidate_params = sample_hyperparameters(model,search_space)
            encoded_params = encode_categorical_params(candidate_params, search_space)
            l = np.prod([get_density(bad_trials, k, v) for k, v in encoded_params.items() if v is not None])
            g = np.prod([get_density(good_trials, k, v) for k, v in encoded_params.items() if v is not None])
            score = l / (g + 1e-8)
            if score < best_score:
                best_score = score
                best_params =candidate_params

        score=evaluate_model(model,best_params,X,y)
        actual_hp.append(best_params)
        best_params = encode_categorical_params(best_params, search_space)
        trials.append(best_params)
        scores.append(score)
    best_idx = np.argmax(scores)
    return actual_hp[best_idx], scores[best_idx]

In [115]:
#BO-tpe on load_iris dataset
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.preprocessing import StandardScaler
data = load_iris()
X = data.data
y = data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [116]:
model=AdaBoostClassifier
best_params, best_score_logreg=tpe_optimization(model, X, y, max_evals=50, gamma=0.25)


In [118]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Load the dataset
data = load_diabetes()
X, y = data.data, data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBRegressor
model = XGBRegressor
best_params, best_score=tpe_optimization(model, X, y, max_evals=50, gamma=0.25)

In [119]:
print(best_params)

{'learning_rate': 0.27246813932418107, 'n_estimators': 59, 'reg_alpha': 0.8381637332305075, 'reg_lambda': 0.12484710589669257, 'booster': 'gbtree'}


In [120]:
print(best_score)

0.7503282313401525


#  Emplementation Of BO-TPE Using HYPEROPT

In [92]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.preprocessing import StandardScaler

In [101]:
#XGBRegressor on load_diabetes dataset
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
import warnings

warnings.filterwarnings('ignore')

# Load the diabetes dataset
data = load_diabetes()
X, y = data.data, data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define the search space for hyperparameters
search_space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    # 'max_depth': hp.quniform('max_depth', 3, 10, 1),
    # 'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    # 'subsample': hp.uniform('subsample', 0.5, 1.0),
    # 'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(1e-8), np.log(1.0)),
    'reg_lambda': hp.loguniform('reg_lambda', np.log(1e-8), np.log(1.0)),
    # 'gamma': hp.loguniform('gamma', np.log(1e-8), np.log(1.0)),
    'booster': hp.choice('booster', ['gbtree', 'gblinear', 'dart'])
}

# Define the objective function
def objective(params):
    params['n_estimators'] = int(params['n_estimators'])  # Convert to integer
    # params['max_depth'] = int(params['max_depth'])  # Convert to integer
    # params['min_child_weight'] = int(params['min_child_weight'])  # Convert to integer

    model = XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
        **params
    )

    score =1-cross_val_score(model, X, y, cv=5).mean()
    return {'loss': score, 'status': STATUS_OK}

# Run hyperparameter optimization
trials = Trials()
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

# Extract the best booster type
booster_options = ['gbtree', 'gblinear', 'dart']
best_params['booster'] = booster_options[best_params['booster']]

# Print the best hyperparameters
print(f"Best parameters: {best_params}")

# Initialize and train the model with the best parameters
best_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=-1,
    **best_params
)
# best_model.fit(X, y)

# Evaluate the model performance using cross-validation
final_score =1- cross_val_score(best_model, X, y, cv=5).mean()
print(f"Cross-validated with best parameters: {final_score}")


100%|██████████| 50/50 [01:14<00:00,  1.49s/trial, best loss: 0.5182993910424878]
Best parameters: {'booster': 'gblinear', 'learning_rate': 0.26044439107276474, 'n_estimators': 186.0, 'reg_alpha': 0.27185361522449863, 'reg_lambda': 2.5510196359470345e-05}


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/sklearn.py", line 1090, in fit
    self._Booster = train(
  File "/usr/local/lib/python3.10/dist-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/usr/local/lib/python3.10/dist-packages/xgboost/training.py", line 178, in train
    for i in range(start_iteration, num_boost_round):
TypeError: 'numpy.float64' object cannot be interpreted as an integer


In [45]:
#Adaboost on iris dataset
data = load_iris()
X = data.data
y = data.target

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

def objective(params):
    n_estimators = int(params['n_estimators'])
    learning_rate = params['learning_rate']

    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    score = cross_val_score(model, X, y, cv=5, scoring='roc_auc_ovr').mean()

    return {'loss': -score, 'status': STATUS_OK}


search_space = {
    'n_estimators': hp.quniform('n_estimators', 50, 200, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1))
}
trials = Trials()

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

best['n_estimators'] = int(best['n_estimators'])
print(f"Best parameters: {best}")

100%|██████████| 50/50 [00:57<00:00,  1.15s/trial, best loss: -0.9883333333333333]
Best parameters: {'learning_rate': 0.19086168265247253, 'n_estimators': 115}


In [46]:
best_model = AdaBoostClassifier(n_estimators=best['n_estimators'], learning_rate=best['learning_rate'])
best_model.fit(X, y)

# Evaluate the final model
final_score = cross_val_score(best_model, X, y, cv=5, scoring='roc_auc_ovr').mean()
print(f"Best ROC AUC score: {final_score}")

Best ROC AUC score: 0.9883333333333333
