In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import wandb
import wandb.sklearn
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import pickle

In [2]:
df = pd.read_csv('bankloan.csv')

df.describe()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,35.3872,1.7306,8.7038,8.154,47.67698,10.075984,1.628071,3.218582,0.2512
std,7.948403,0.960525,7.194583,6.760786,51.793192,6.659336,3.053369,6.944523,0.433747
min,20.0,1.0,0.0,0.0,12.1,0.08,0.005292,0.009373,0.0
25%,29.0,1.0,3.0,3.0,24.5,5.05,0.396181,0.990071,0.0
50%,35.0,1.0,7.0,7.0,34.5,8.635,0.906189,1.960223,0.0
75%,41.0,2.0,13.0,12.0,54.725,13.6825,1.859147,3.789633,1.0
max,58.0,5.0,38.0,37.0,2461.7,44.62,139.580606,416.517424,1.0


In [3]:
df.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,35.9,11.9,0.504108,3.767992,0
1,30,1,13,8,46.7,17.88,1.352694,6.997266,0
2,40,1,15,14,61.8,10.64,3.438997,3.136523,0
3,41,1,15,14,72.0,29.67,4.165668,17.196732,0
4,57,1,7,37,25.6,15.86,1.498199,2.561961,0


In [4]:
#Factorize the categorical variables  factorize_cols: ['ed']

df['ed'] = pd.factorize(df['ed'])[0]

#standardize the numerical variables  
numerical_cols =['age', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt']

scaler = StandardScaler()

df[numerical_cols] = scaler.fit_transform(df[numerical_cols])



In [5]:
X=df[df.columns.drop('default')].values

y=df["default"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=44)

In [6]:
RUS = RandomUnderSampler(random_state=44)

#Fit the RUS
X_train_rus, y_train_rus= RUS.fit_resample(X_train, y_train)

# Check the number of records after under sampling
print(sorted(Counter(y_train_rus).items()))

[(0, 1013), (1, 1013)]


In [7]:
def train_logistic_regression(X_train, y_train, X_test, hyperparameters):
    model = LogisticRegression(**hyperparameters)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return model, y_pred

def calculate_performance_metrics(y_test, y_pred, model, X_test):
    metrics = {}
    metrics["accuracy"] = accuracy_score(y_test, y_pred)
    metrics["precision"] = precision_score(y_test, y_pred)
    metrics["recall"] = recall_score(y_test, y_pred)
    metrics["f1"] = f1_score(y_test, y_pred)
    metrics["roc_auc"] = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    
    conf_matrix = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = conf_matrix.ravel()
    
    metrics["ppv"] = TP / (TP + FP)
    metrics["npv"] = TN / (TN + FN)
    metrics["specificity"] = TN / (TN + FP)
    
    return metrics

def log_to_wandb(metrics, model, X_train, X_test, y_train, y_test):
    # Log metrics
    wandb.log(metrics)
    
    # Save and log the model
    os.makedirs('models', exist_ok=True)
    model_path = "models/log_model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(model, f)
    
    artifact = wandb.Artifact("log_model", type="model")
    artifact.add_file(model_path)
    wandb.log_artifact(artifact)
    
    # Save and log the data
    os.makedirs('data', exist_ok=True)
    datasets = {"training_data": pd.DataFrame(X_train), "training_labels": pd.DataFrame(y_train), 
                "test_data": pd.DataFrame(X_test), "test_labels": pd.DataFrame(y_test)}
    
    for name, df in datasets.items():
        df.to_csv(f'data/{name}.csv', index=False)
    
    artifact = wandb.Artifact('train_val_sets', type='dataset', metadata={"Source": "CaseStudy_training_data.xlsx"})
    artifact.add_dir('data')
    wandb.log_artifact(artifact)
    
    
    # Plot plots to Weights & Biases
    label_names = ["Not-Defaulted", "Defaulted"]
    y_pred_proba = model.predict_proba(X_test)
    
    wandb.sklearn.plot_class_proportions(y_train, y_test, label_names)
    wandb.sklearn.plot_summary_metrics(model, X_train, y_train, X_test, y_test)
    wandb.sklearn.plot_roc(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_precision_recall(y_test, y_pred_proba, labels=label_names)
    wandb.sklearn.plot_confusion_matrix(y_test, y_pred_proba.argmax(axis=1), labels=label_names)
    

In [8]:
# Random Search Hyperparameters
sweep_config = {
    'method': 'random',
    'metric': {
      'name': 'recall', # 
      'goal': 'maximize'
    },
    'parameters': {
        'C': {
            'values': [0.1, 1, 10]
        },
        'max_iter': {
            'values': [100, 200, 300]
        },
        'penalty': {
            'values': ['l1', 'l2']
        },
        'solver': {
            'values': ['liblinear', 'saga']
        },
        'class_weight': {
            'values': ['balanced', None]
        }
    }
}

In [9]:


# Grid Search Hyperparameters
sweep_config = {
    'method': 'grid',
    'metric': {
      'name': 'recall',
      'goal': 'maximize'
    },
    'parameters': {
        'C': {
            'values': [0.1, 1, 10]
        },
        'max_iter': {
            'values': [100, 200, 300]
        },
        'penalty': {
            'values': ['l1', 'l2']
        },
        'solver': {
            'values': ['liblinear', 'saga']
        },
        'class_weight': {
            'values': ['balanced', None]
        }
    }
}



In [10]:
# Bayesian Search Hyperparameters
sweep_config = {
    'method': 'bayes',
    'metric': {
      'name': 'recall',
      'goal': 'maximize'
    },
    'parameters': {
        'C': {
            'distribution': 'uniform',
            'min': 0.1,
            'max': 10
        },
        'max_iter': {
            'distribution': 'int_uniform',
            'min': 100,
            'max': 300
        },
        'penalty': {
            'values': ['l1', 'l2']
        },
        'solver': {
            'values': ['liblinear', 'saga']
        },
        'class_weight': {
            'values': ['balanced', None]
        }
    }
}

In [11]:
def sweep():
    # Initialize wandb
    run = wandb.init()
    
    # Get hyperparameters from wandb
    hyperparameters = run.config
    
    # Train the logistic regression model
    model, y_pred = train_logistic_regression(X_train_rus, y_train_rus, X_test, hyperparameters)
    
    # Calculate performance metrics
    metrics = calculate_performance_metrics(y_test, y_pred, model, X_test)
    
    # Log to wandb
    log_to_wandb(metrics, model, X_train, X_test, y_train, y_test)
    
    # Finish the wandb run
    run.finish()

    

In [12]:
sweep_id = wandb.sweep(sweep=sweep_config, project='bankloan_sweep')
wandb.agent(sweep_id, function=sweep, count=15)  

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: mj1wox45
Sweep URL: https://wandb.ai/mozeart/bankloan_sweep/sweeps/mj1wox45


[34m[1mwandb[0m: Agent Starting Run: cjjn9ek5 with config:
[34m[1mwandb[0m: 	C: 7.408540810372477
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 146
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmozeart[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.740 MB of 0.740 MB uploaded\r'), FloatProgress(value=0.9997397698638043, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.8426
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: cdj4w899 with config:
[34m[1mwandb[0m: 	C: 1.642045716782945
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 245
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.749
f1,0.60348
npv,0.91475
ppv,0.48974
precision,0.48974
recall,0.78601
roc_auc,0.84276
specificity,0.73712


[34m[1mwandb[0m: Agent Starting Run: 504gosv0 with config:
[34m[1mwandb[0m: 	C: 1.3053251892877589
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 274
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.039 MB of 0.039 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84268
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: t57pmd1r with config:
[34m[1mwandb[0m: 	C: 7.045512080592222
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 133
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=0.9948400940022479, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84255
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: 48n7fxnc with config:
[34m[1mwandb[0m: 	C: 9.52097881013477
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 137
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=0.99496798943776, max=1.0))…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84255
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: ps6ofh9l with config:
[34m[1mwandb[0m: 	C: 3.4944927014457847
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 260
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=0.9952049754314335, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.8426
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: mx4ot9pw with config:
[34m[1mwandb[0m: 	C: 6.217686838342156
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 165
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.038 MB of 0.039 MB uploaded\r'), FloatProgress(value=0.9949983905712234, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84256
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: gubiscih with config:
[34m[1mwandb[0m: 	C: 4.191529263758742
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 202
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.039 MB of 0.039 MB uploaded\r'), FloatProgress(value=0.9950565317409819, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84259
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: vyv932a2 with config:
[34m[1mwandb[0m: 	C: 8.665506610674932
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 104
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84254
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: np5ukbaw with config:
[34m[1mwandb[0m: 	C: 5.530831603685222
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 123
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=0.9949677387210084, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84256
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: zxw2pmas with config:
[34m[1mwandb[0m: 	C: 0.5988500458241841
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 171
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.037 MB of 0.037 MB uploaded\r'), FloatProgress(value=0.9948142633430032, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.749
f1,0.60472
npv,0.91612
ppv,0.4898
precision,0.4898
recall,0.79012
roc_auc,0.84268
specificity,0.7358


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ds5c28jl with config:
[34m[1mwandb[0m: 	C: 0.33119651343335854
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 175
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.036 MB of 0.036 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84253
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: twaukrk0 with config:
[34m[1mwandb[0m: 	C: 0.2645192458795328
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 168
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.035 MB of 0.035 MB uploaded\r'), FloatProgress(value=0.9945389169753169, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.749
f1,0.60348
npv,0.91475
ppv,0.48974
precision,0.48974
recall,0.78601
roc_auc,0.84299
specificity,0.73712


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: j6skqpnx with config:
[34m[1mwandb[0m: 	C: 6.292073299753262
[34m[1mwandb[0m: 	class_weight: balanced
[34m[1mwandb[0m: 	max_iter: 289
[34m[1mwandb[0m: 	penalty: l1
[34m[1mwandb[0m: 	solver: liblinear
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=0.994935058422346, max=1.0)…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84253
specificity,0.73844


[34m[1mwandb[0m: Agent Starting Run: g73st5vd with config:
[34m[1mwandb[0m: 	C: 8.712774120323722
[34m[1mwandb[0m: 	class_weight: None
[34m[1mwandb[0m: 	max_iter: 174
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	solver: saga
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Adding directory to artifact (.\data)... Done. 0.0s


VBox(children=(Label(value='0.040 MB of 0.040 MB uploaded\r'), FloatProgress(value=0.9951477300024021, max=1.0…

0,1
accuracy,▁
f1,▁
npv,▁
ppv,▁
precision,▁
recall,▁
roc_auc,▁
specificity,▁

0,1
accuracy,0.75
f1,0.60443
npv,0.91489
ppv,0.491
precision,0.491
recall,0.78601
roc_auc,0.84258
specificity,0.73844
