In [37]:
#Projet 7 

#undersampling, and oversampling smooth or class weights 
#Area Under Curve pour estimer l'erreur
#Flask ou fastAPI pour faire le API, faire interface avec Stremlit
#Shap line values
#trouver 10 clients plus proches avec KNN du client et voir
# L’application de dashboard interactif répondant aux spécifications ci-dessus et l’API de prédiction du score, déployées chacunes sur le cloud.
# Un dossier, géré via un outil de versioning de code contenant :
# Le notebook ou code de la modélisation (du prétraitement à la prédiction), intégrant via MLFlow le tracking d’expérimentations et le stockage centralisé des modèles
# Le code générant le dashboard
# Le code permettant de déployer le modèle sous forme d'API
# Pour les applications dashboard et API, un fichier introductif permettant de comprendre l'objectif du projet et le découpage des dossiers, et un fichier listant les packages utilisés seront présents dans les dossiers
# Le tableau HTML d’analyse de data drift réalisé à partir d’evidently
# Une note méthodologique décrivant :
# La méthodologie d'entraînement du modèle (2 pages maximum)
# Le traitement du déséquilibre des classes (1 page maximum)
# La fonction coût métier, l'algorithme d'optimisation et la métrique d'évaluation (1 page maximum)
# Un tableau de synthèse des résultats (1 page maximum)
# L’interprétabilité globale et locale du modèle (1 page maximum)
# Les limites et les améliorations possibles (1 page maximum)
# L’analyse du Data Drift (1 page maximum)
# Un support de présentation pour la soutenance, détaillant le travail réalisé (Powerpoint ou équivalent, 30slides maximum).
# Des copies écran des commits, du dossier Github (+ lien vers ce dossier) et de l’exécution des tests unitaires, qui sont les preuves qu’un pipeline de déploiement continu a permis de déployer l’API, doivent être formalisés dans ce support de présentation.
#Outils Open Source pour élaborer une plateforme MLOps
#● MLFlow pour la gestion “d’expériences” et leur tracking lors de la phase d’entraînement des modèles, ainsi que la visualisation desrésultats avec MLFlow UI, pour le partager avec Chris
#● MLFlow pour le stockage centralisé des modèles dans un “model registry” et le serving
#● Git, logiciel de version de code, pour suivre les modifications ducode final de l’API de prédiction de tags à déployer
#● Github pour stocker et partager sur le cloud le code de l’API, alimenté par un “push” Git et ainsi assurer une intégration continue
#● Github Actions pour le déploiement continu et automatisé du code de l’API sur le cloud
#● Pytest (ou Unittest) pour concevoir les tests unitaires et les exécuter de manière automatisée lors du build réalisé par GithubActions

In [1]:
# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, roc_curve, roc_auc_score, make_scorer, fbeta_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from xgboost import XGBClassifier
from xgboost import plot_importance
from hashlib import sha256
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline


pd.options.display.max_columns = None

# Plot settings
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
sns.set()

# Load applications data
path = ''#'dataset/'
train = pd.read_csv(path + 'application_train.csv')
test = pd.read_csv(path + 'application_test.csv')
train_ids=train['SK_ID_CURR']
test_ids=test['SK_ID_CURR']


# Merge test and train into all application data
train_o = train.copy()
train['Test'] = False
test['Test'] = True
test['TARGET'] = np.nan
app = train.append(test, ignore_index=True)

# Remove entries with gender = XNA
app = app[app['CODE_GENDER'] != 'XNA']
# Remove entries with income type = maternity leave
app = app[app['NAME_INCOME_TYPE'] != 'Maternity leave']
# Remove entries with unknown family status
app = app[app['NAME_FAMILY_STATUS'] != 'Unknown']
app['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
app['PROPORTION_LIFE_EMPLOYED'] = app['DAYS_EMPLOYED'] / app['DAYS_BIRTH']
app['INCOME_TO_CREDIT_RATIO'] = app['AMT_INCOME_TOTAL'] / app['AMT_CREDIT'] 
app['INCOME_TO_ANNUITY_RATIO'] = app['AMT_INCOME_TOTAL'] / app['AMT_ANNUITY']
app['INCOME_TO_ANNUITY_RATIO_BY_AGE'] = app['INCOME_TO_ANNUITY_RATIO'] * app['DAYS_BIRTH']
app['CREDIT_TO_ANNUITY_RATIO'] = app['AMT_CREDIT'] / app['AMT_ANNUITY']
app['CREDIT_TO_ANNUITY_RATIO_BY_AGE'] = app['CREDIT_TO_ANNUITY_RATIO'] * app['DAYS_BIRTH']
app['INCOME_TO_FAMILYSIZE_RATIO'] = app['AMT_INCOME_TOTAL'] / app['CNT_FAM_MEMBERS']

# Add indicator columns for empty values
for col in app:
    if col!='Test' and col!='TARGET':
        app_null = app[col].isnull()
        if app_null.sum()>0:
            app[col+'_ISNULL'] = app_null

# Label encoder
le = LabelEncoder()

# Label encode binary fearures in training set
for col in app: 
    if col!='Test' and col!='TARGET' and app[col].dtype==object and app[col].nunique()==2:
        if col+'_ISNULL' in app.columns: #missing values here?
            app.loc[app[col+'_ISNULL'], col] = 'NaN'
        app[col] = le.fit_transform(app[col])
        if col+'_ISNULL' in app.columns: #re-remove missing vals
            app.loc[app[col+'_ISNULL'], col] = np.nan            

# Get categorical features to encode
cat_features = []
for col in app: 
    if col!='Test' and col!='TARGET' and app[col].dtype==object and app[col].nunique()>2:
        cat_features.append(col)

# One-hot encode categorical features in train set
app = pd.get_dummies(app, columns=cat_features)

# Hash columns
hashes = dict()
for col in app:
    hashes[col] = sha256(app[col].values).hexdigest()
    
# Get list of duplicate column lists
Ncol = app.shape[1] #number of columns
dup_list = []
dup_labels = -np.ones(Ncol)
for i1 in range(Ncol):
    if dup_labels[i1]<0: #if not already merged,
        col1 = app.columns[i1]
        t_dup = [] #list of duplicates matching col1
        for i2 in range(i1+1, Ncol):
            col2 = app.columns[i2]
            if ( dup_labels[i2]<0 #not already merged
                 and hashes[col1]==hashes[col2] #hashes match
                 and app[col1].equals(app[col2])): #cols are equal
                #then this is actually a duplicate
                t_dup.append(col2)
                dup_labels[i2] = i1
        if len(t_dup)>0: #duplicates of col1 were found!
            t_dup.append(col1)
            dup_list.append(t_dup)
        
# Merge duplicate columns
for iM in range(len(dup_list)):
    new_name = 'Merged'+str(iM)
    app[new_name] = app[dup_list[iM][0]].copy()
    app.drop(columns=dup_list[iM], inplace=True)
    #print('Merged', dup_list[iM], 'into', new_name)

# Split data back into test + train
train = app.loc[~app['Test'], :]
test = app.loc[app['Test'], :]

# Ensure all data is stored as floats
train = train.astype(np.float32)
test = test.astype(np.float32)

# Target labels
train_y = train['TARGET']

# Remove test/train indicator column and target column
train.drop(columns=['Test', 'TARGET'], inplace=True)
test.drop(columns=['Test', 'TARGET'], inplace=True)

test.reset_index(inplace=True)
train.reset_index(inplace=True)

test_orig = pd.read_csv(path + 'application_test.csv')
train['ID']=train_ids
#test.drop(columns=['SK_ID_CURR'])
test['ID']=test_ids

# Make SK_ID_CURR the index
train.set_index('ID', inplace=True)
test.set_index('ID', inplace=True)

train.drop(columns=['index'], inplace=True)
test.drop(columns=['index'], inplace=True)

train.to_csv(path +'train_encoded.csv', index=False)
test_encoded=test[['SK_ID_CURR','EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1','DAYS_EMPLOYED','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY']]
test_encoded.to_csv(path +'test_encoded_clean.csv', index=False)
test.to_csv(path +'test_encoded.csv', index=False)

  app = train.append(test, ignore_index=True)


In [None]:
import mlflow
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, fbeta_score
from mlflow.tracking import MlflowClient

def train_xgboost(train_data, test_data, max_depth, learning_rate, n_estimators, gamma, subsample, colsample_bytree):
    with mlflow.start_run():
        # Train the XGBoost model
        model = xgb.XGBClassifier(max_depth=int(max_depth),
                                  learning_rate=learning_rate,
                                  n_estimators=int(n_estimators),
                                  gamma=gamma,
                                  subsample=subsample,
                                  colsample_bytree=colsample_bytree,
                                  objective="binary:logistic",
                                  n_jobs=-1)
        model.fit(train_data.drop("target", axis=1), train_data["target"])

        # Make predictions on the test data
        predictions = model.predict(test_data.drop("target", axis=1))

        # Log the model to the tracking server
        mlflow.xgboost.log_model(model, "model")

        # Calculate and log the AUC score
        auc_score = roc_auc_score(test_data["target"], predictions)
        mlflow.log_metric("auc_score", auc_score)

        # Calculate and log the F1 Beta score
        fbeta_score = fbeta_score(test_data["target"], predictions, beta=0.5)
        mlflow.log_metric("fbeta_score", fbeta_score)

        return fbeta_score

if __name__ == "__main__":
    train_data = pd.read_csv("train_encoded.csv")
    test_data = pd.read_csv("test_encoded.csv")
    
    mlflow.run(".",
               parameters={
                   "max_depth": (3, 5),
                   "learning_rate": (0.01, 0.2),
                   "n_estimators": (50, 200),
                   "gamma": (0, 1),
                   "subsample": (0.5, 1),
                   "colsample_bytree": (0.5, 1)
               },
               experiment_name="xgboost_hyperparameter_tuning",
               version=None,
               entry_point="train_xgboost",
               backend="local")
    
    # Find the best run based on the F1 Beta score
    client = MlflowClient()
    experiment = client.get_experiment_by_name("xgboost_hyperparameter_tuning")
   

In [None]:
train = pd.read_csv(path + 'train_encoded.csv')
test = pd.read_csv(path + 'test_encoded.csv')
# Target labels
#train_y = train['TARGET']

In [None]:
# Classification pipeline w/ isotonic calibration
calib_pipeline = Pipeline([
    ('scaler', RobustScaler()),
    ('imputer', SimpleImputer(strategy='median')),
    ('classifier', CalibratedClassifierCV(
                        base_estimator=XGBClassifier(),
                        method='isotonic'))
])

# Predict probabilities w/ isotonic calibration
calib_pred = cross_val_predict(calib_pipeline, 
                               train, 
                               y=train_y,
                               method='predict_proba')
calib_pred = calib_pred[:,1] #only want p(default)



In [None]:
# Cross-validated AUROC for isotonic
print('Mean AUROC with isotonic calibration:', 
      roc_auc_score(train_y, calib_pred))

# FBetaScore for isotonic
b = 1 # or any value of beta you want to use
print('Mean F-beta score with isotonic calibration:', 
      
      fbeta_score(train_y, calib_pred.round(0), beta=b))



In [None]:
calib_pred

In [None]:
train_y

In [None]:
# Fit to the training data
calib_fit = calib_pipeline.fit(train, train_y)

joblib.dump(calib_fit, 'calib_pipeline.joblib')

In [None]:
# loaded_pipeline = joblib.load('calib_pipeline.joblib')

# index=70

# # Predict default probabilities of the test data
# test_pred = calib_fit.predict_proba(test.iloc[index].values.reshape(1, -1))

# #Adding the index back
# df_out = pd.DataFrame(columns=['SK_ID_CURR','TARGET'])
# df_out = df_out.append({'SK_ID_CURR':index,'TARGET':test_pred[:,1][0]},ignore_index=True)
# df_out

In [None]:
loaded_pipeline = joblib.load('calib_pipeline.joblib')

id = 400000# some value from SK_ID_CURR

#Select the row from the test data with the specified id
test_row = test[test['SK_ID_CURR'] == id]

# Predict default probabilities of the test data
test_pred = calib_fit.predict_proba(test_row.values.reshape(1, -1))

#Adding the id back
df_out = pd.DataFrame(columns=['SK_ID_CURR','TARGET'])
df_out = df_out.append({'SK_ID_CURR':id,'TARGET':test_pred[:,1][0]},ignore_index=True)
df_out

In [None]:
train_y

In [None]:
len(calib_pred)

In [None]:
test.to_csv(path +'test_encoded.csv', index=False)

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

path = "" # "/dataset/"

X_train = pd.read_csv(path + 'train_encoded.csv')
X_test = pd.read_csv(path + 'test_encoded.csv')
train_old = pd.read_csv(path + 'application_train.csv')
X_train = X_train.merge(train_old[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')
y_train = X_train['TARGET']
X_train = X_train.dropna(subset=['TARGET'])
X_train = X_train.drop(columns=["TARGET"], axis=1)
y_train.dropna(inplace=True)
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)
indices=X_train['SK_ID_CURR']
X_test = X_test.set_index('SK_ID_CURR')
X_train = X_train.set_index('SK_ID_CURR')

id_value = 100005
if id_value in X_test.index.values:
    # Find the features of the customer id
    customer_features = X_test.loc[X_test.index == id_value].values.reshape(1, -1)
    # Initialize the KNeighborsClassifier or KNeighborsRegressor with n_neighbors=10
    knn = KNeighborsClassifier(n_neighbors=10)
    # Fit the classifier or regressor on the training dataset
    knn.fit(X_train, y_train)
    # Find the 10 closest neighbors of the customer id in the test dataset
    neighbors = knn.kneighbors(customer_features, return_distance=False)
    # Print the indices of the 10 closest neighbors
    #print(neighbors)
else:
    print(f"{id_value} does not exist in X_test dataframe")

neighbor_ids = indices.iloc[neighbors[0]].values
print(neighbor_ids)

In [39]:
import pandas as pd
import numpy as np
from math import exp
from sklearn import datasets
from pandas_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import mlflow
import mlflow.xgboost
from mlflow.models.signature import infer_signature
from hyperopt import (
    fmin, 
    hp, 
    tpe, 
    rand, 
    SparkTrials, 
    Trials, 
    STATUS_OK
)
from hyperopt.pyll.base import scope

RANDOM_SEED = 0

mlflow.set_tracking_uri('http://127.0.0.1:5000')

# data = datasets.load_breast_cancer(as_frame=True)
# data_df = data.data
# data_df['target'] = data.target
# data_df
X_train = pd.read_csv(path + 'train_encoded.csv')
X_test = pd.read_csv(path + 'test_encoded.csv')
train_old = pd.read_csv(path + 'application_train.csv')
X_train = X_train.merge(train_old[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')
y_train_val = X_train['TARGET']
X_train = X_train.dropna(subset=['TARGET'])
X_train = X_train.drop(columns=["TARGET"], axis=1)
y_train.dropna(inplace=True)
X_train.fillna(X_train.median(), inplace=True)
X_test.fillna(X_test.median(), inplace=True)
indices=X_train['SK_ID_CURR']
X_test = X_test.set_index('SK_ID_CURR')
X_train_val = X_train.set_index('SK_ID_CURR')

# data_profile = ProfileReport(data_df)
# data_profile.to_file(f'data_profile.html')

In [40]:
# # Splitting the dataset into training/validation and holdout sets
# train_val, test = train_test_split(
#     data_df, 
#     test_size=0.1,
#     shuffle=True, 
#     random_state=RANDOM_SEED
# )


# # Creating X, y for training/validation set
# X_train_val = train_val.drop(columns='TARGET')
# y_train_val = train_val.TARGET

# # Creating X, y for test set
# X_test = test.drop(columns='TARGET')
# y_test = test.TARGET

# Splitting training/testing set to create training set and validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, 
    y_train_val,
    stratify=y_train_val,
    shuffle=True, 
    random_state=RANDOM_SEED
)

# Preprocessing data
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_train = power.fit_transform(X_train)
X_val =power.transform(X_val)
X_test = power.transform(X_test)

  loglike = -n_samples / 2 * np.log(x_trans.var())
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [41]:
# Setting search space for xgboost model
search_space = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': scope.int(hp.quniform('max_depth', 4, 15, 1)),
    'subsample': hp.uniform('subsample', .5, 1.0),
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 7),
    'reg_alpha': hp.loguniform('reg_alpha', -10, 10),
    'reg_lambda': hp.loguniform('reg_lambda', -10, 10),
    'gamma': hp.loguniform('gamma', -10, 10),
    'use_label_encoder': False,
    'verbosity': 0,
    'random_state': RANDOM_SEED
}
 
try:
    EXPERIMENT_ID = mlflow.create_experiment('xgboost-hyperopt')
except:
    EXPERIMENT_ID = dict(mlflow.get_experiment_by_name('xgboost-hyperopt'))['experiment_id']

def train_model(params):
    """
    Creates a hyperopt training model funciton that sweeps through params in a nested run
    Args:
        params: hyperparameters selected from the search space
    Returns:
        hyperopt status and the loss metric value
    """
    # With MLflow autologging, hyperparameters and the trained model are automatically logged to MLflow.
    # This sometimes doesn't log everything you may want so I usually log my own metrics and params just in case
    mlflow.xgboost.autolog()

    # 
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, nested=True):
        # Training xgboost classifier
        model = xgb.XGBClassifier(**params)
        model = model.fit(X_train, y_train)

        # Predicting values for training and validation data, and getting prediction probabilities
        y_train_pred = model.predict(X_train)
        y_train_pred_proba = model.predict_proba(X_train)[:, 1]
        y_val_pred = model.predict(X_val)
        y_val_pred_proba = model.predict_proba(X_val)[:, 1]

        # Evaluating model metrics for training set predictions and validation set predictions
        # Creating training and validation metrics dictionaries to make logging in mlflow easier
        metric_names = ['accuracy', 'precision', 'recall', 'f1', 'aucroc']
        # Training evaluation metrics
        train_accuracy = accuracy_score(y_train, y_train_pred).round(3)
        train_precision = precision_score(y_train, y_train_pred).round(3)
        train_recall = recall_score(y_train, y_train_pred).round(3)
        train_f1 = f1_score(y_train, y_train_pred).round(3)
        train_aucroc = roc_auc_score(y_train, y_train_pred_proba).round(3)
        training_metrics = {
            'Accuracy': train_accuracy, 
            'Precision': train_precision, 
            'Recall': train_recall, 
            'F1': train_f1, 
            'AUCROC': train_aucroc
        }
        training_metrics_values = list(training_metrics.values())

        # Validation evaluation metrics
        val_accuracy = accuracy_score(y_val, y_val_pred).round(3)
        val_precision = precision_score(y_val, y_val_pred).round(3)
        val_recall = recall_score(y_val, y_val_pred).round(3)
        val_f1 = f1_score(y_val, y_val_pred).round(3)
        val_aucroc = roc_auc_score(y_val, y_val_pred_proba).round(3)
        validation_metrics = {
            'Accuracy': val_accuracy, 
            'Precision': val_precision, 
            'Recall': val_recall, 
            'F1': val_f1, 
            'AUCROC': val_aucroc
        }
        validation_metrics_values = list(validation_metrics.values())
        
        # Logging model signature, class, and name
        signature = infer_signature(X_train, y_val_pred)
        mlflow.xgboost.log_model(model, 'model', signature=signature)
        mlflow.set_tag('estimator_name', model.__class__.__name__)
        mlflow.set_tag('estimator_class', model.__class__)

        # Logging each metric
        for name, metric in list(zip(metric_names, training_metrics_values)):
            mlflow.log_metric(f'training_{name}', metric)
        for name, metric in list(zip(metric_names, validation_metrics_values)):
            mlflow.log_metric(f'validation_{name}', metric)

        # Set the loss to -1*validation auc roc so fmin maximizes the it
        return {'status': STATUS_OK, 'loss': -1*validation_metrics['AUCROC']}

# Greater parallelism will lead to speedups, but a less optimal hyperparameter sweep.
# A reasonable value for parallelism is the square root of max_evals.
# spark_trials = SparkTrials()
# Will need spark configured and installed to run. Add this to fmin function below like so:
# trials = spark_trials
trials = Trials()

# Run fmin within an MLflow run context so that each hyperparameter configuration is logged as a child run of a parent
# run called "xgboost_models" .
with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name='xgboost_models'):
    xgboost_best_params = fmin(
        fn=train_model, 
        space=search_space, 
        algo=tpe.suggest,
        trials=trials,
        max_evals=50
    )

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]





  2%|▏         | 1/50 [00:49<40:36, 49.72s/trial, best loss: -0.745]





  4%|▍         | 2/50 [01:18<29:43, 37.16s/trial, best loss: -0.75] 





  6%|▌         | 3/50 [01:54<28:45, 36.71s/trial, best loss: -0.75]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



  8%|▊         | 4/50 [02:10<21:55, 28.59s/trial, best loss: -0.75]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 10%|█         | 5/50 [02:26<17:59, 23.99s/trial, best loss: -0.75]





 12%|█▏        | 6/50 [02:52<18:06, 24.70s/trial, best loss: -0.759]





 14%|█▍        | 7/50 [03:11<16:29, 23.02s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 16%|█▌        | 8/50 [03:41<17:34, 25.10s/trial, best loss: -0.759]



2023/01/31 17:21:59 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 18%|█▊        | 9/50 [03:55<14:49, 21.69s/trial, best loss: -0.759]



2023/01/31 17:22:11 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 20%|██        | 10/50 [04:07<12:29, 18.74s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 22%|██▏       | 11/50 [04:32<13:25, 20.66s/trial, best loss: -0.759]





 24%|██▍       | 12/50 [05:14<17:07, 27.04s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 26%|██▌       | 13/50 [05:25<13:46, 22.34s/trial, best loss: -0.759]



2023/01/31 17:23:47 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 28%|██▊       | 14/50 [05:43<12:35, 20.98s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 30%|███       | 15/50 [06:35<17:39, 30.28s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 32%|███▏      | 16/50 [06:52<14:48, 26.12s/trial, best loss: -0.759]





 34%|███▍      | 17/50 [07:08<12:45, 23.19s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 36%|███▌      | 18/50 [07:39<13:34, 25.44s/trial, best loss: -0.759]



2023/01/31 17:26:08 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 38%|███▊      | 19/50 [08:04<13:09, 25.46s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 40%|████      | 20/50 [08:18<11:03, 22.13s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 42%|████▏     | 21/50 [08:28<08:51, 18.32s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 44%|████▍     | 22/50 [08:36<07:07, 15.28s/trial, best loss: -0.759]





 46%|████▌     | 23/50 [08:52<07:00, 15.56s/trial, best loss: -0.759]





 48%|████▊     | 24/50 [09:28<09:19, 21.53s/trial, best loss: -0.759]





 50%|█████     | 25/50 [10:04<10:44, 25.79s/trial, best loss: -0.759]





 52%|█████▏    | 26/50 [10:42<11:46, 29.45s/trial, best loss: -0.759]





 54%|█████▍    | 27/50 [10:57<09:39, 25.20s/trial, best loss: -0.759]





 56%|█████▌    | 28/50 [11:27<09:47, 26.72s/trial, best loss: -0.759]





 58%|█████▊    | 29/50 [11:56<09:38, 27.53s/trial, best loss: -0.759]



2023/01/31 17:30:34 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 60%|██████    | 30/50 [12:30<09:48, 29.44s/trial, best loss: -0.759]





 62%|██████▏   | 31/50 [12:54<08:46, 27.71s/trial, best loss: -0.759]





 64%|██████▍   | 32/50 [13:41<10:03, 33.53s/trial, best loss: -0.759]





 66%|██████▌   | 33/50 [14:32<10:56, 38.63s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 68%|██████▊   | 34/50 [14:55<09:02, 33.92s/trial, best loss: -0.759]





 70%|███████   | 35/50 [15:21<07:54, 31.65s/trial, best loss: -0.759]





 72%|███████▏  | 36/50 [16:02<08:01, 34.37s/trial, best loss: -0.759]





 74%|███████▍  | 37/50 [16:33<07:14, 33.41s/trial, best loss: -0.759]





 76%|███████▌  | 38/50 [17:00<06:18, 31.51s/trial, best loss: -0.759]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 78%|███████▊  | 39/50 [17:13<04:46, 26.05s/trial, best loss: -0.759]





 80%|████████  | 40/50 [17:35<04:08, 24.82s/trial, best loss: -0.761]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 82%|████████▏ | 41/50 [17:59<03:39, 24.38s/trial, best loss: -0.761]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 84%|████████▍ | 42/50 [18:17<03:01, 22.74s/trial, best loss: -0.761]





 86%|████████▌ | 43/50 [18:40<02:38, 22.66s/trial, best loss: -0.761]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 88%|████████▊ | 44/50 [18:57<02:06, 21.09s/trial, best loss: -0.761]





 90%|█████████ | 45/50 [19:20<01:47, 21.48s/trial, best loss: -0.761]





 92%|█████████▏| 46/50 [19:45<01:31, 22.76s/trial, best loss: -0.761]



2023/01/31 17:38:04 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 94%|█████████▍| 47/50 [20:00<01:01, 20.38s/trial, best loss: -0.761]



  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 96%|█████████▌| 48/50 [20:21<00:40, 20.37s/trial, best loss: -0.761]



2023/01/31 17:38:50 ERROR mlflow.xgboost: Failed to log feature importance plot. XGBoost autologging will ignore the failure and continue. Exception: 
Traceback (most recent call last):
  File "/home/marco/anaconda3/lib/python3.9/site-packages/mlflow/xgboost/__init__.py", line 657, in train_impl
    features, importance = zip(*imp.items())
ValueError: not enough values to unpack (expected 2, got 0)

  _warn_prf(average, modifier, msg_start, len(result))

  _warn_prf(average, modifier, msg_start, len(result))



 98%|█████████▊| 49/50 [20:46<00:21, 21.78s/trial, best loss: -0.761]





100%|██████████| 50/50 [21:08<00:00, 25.36s/trial, best loss: -0.761]


In [44]:
y_test_pred

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
# Querying mlflow api instead of using web UI. Sorting by validation aucroc and then getting top run for best run.
runs_df = mlflow.search_runs(experiment_ids=EXPERIMENT_ID, order_by=['metrics.validation_aucroc DESC'])
best_run = runs_df.iloc[0]
best_run_id = best_run['run_id']
best_artifact_uri = best_run['artifact_uri']
# Loading model from best run
best_model = mlflow.xgboost.load_model('runs:/' + best_run_id + '/model')

# Predicting and evaluating best model on holdout set
y_test_pred = best_model.predict(X_test)
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred).round(3)
test_precision = precision_score(y_test, y_test_pred).round(3)
test_recall = recall_score(y_test, y_test_pred).round(3)
test_f1 = f1_score(y_test, y_test_pred).round(3)
test_aucroc = roc_auc_score(y_test, y_test_pred_proba).round(3)

print(f'Testing Accuracy: {test_accuracy}')
print(f'Testing Precision: {test_precision}')
print(f'Testing Recall: {test_recall}')
print(f'Testing F1: {test_f1}')
print(f'Testing AUCROC: {test_aucroc}')

ValueError: Found input variables with inconsistent numbers of samples: [57, 48744]

In [None]:
model_details = mlflow.register_model(f'runs:/{best_run_id}/artifacts/model', 'XGBoostLoanPrediction')


In [None]:

client = MlflowClient()
client.update_registered_model(
  name=model_details.name,
  description=""" 
  a) radius (mean of distances from center to points on the perimeter)
  b) texture (standard deviation of gray-scale values)
  c) perimeter
  d) area
  e) smoothness (local variation in radius lengths)
  f) compactness (perimeter^2 / area - 1.0)
  g) concavity (severity of concave portions of the contour)
  h) concave points (number of concave portions of the contour)
  i) symmetry
  j) fractal dimension ("coastline approximation" - 1)."""
)

In [None]:
client.update_model_version(
  name=model_details.name,
  version=model_details.version,
  description='This model version is the first XGBoost model trained with HyperOpt for bayesian hyperparameter tuning.'
)

In [None]:
client.transition_model_version_stage(
  name=model_details.name,
  version=model_details.version,
  stage='Production'
)