In [None]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import mlflow
import mlflow.sklearn

In [None]:
# Load the credit-card approval records from a local CSV file.
data = pd.read_csv('credit_card_approval_dataset.csv')

# NOTE: any cleaning / encoding the dataset needs belongs here, before the split.

# Separate the label from the predictors ('approved' is assumed to be the target column).
y = data['approved']
X = data.drop(columns='approved')

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training rows and the hold-out rows.
sc = StandardScaler()
X_train = sc.fit(X_train).transform(X_train)
X_test = sc.transform(X_test)

# Select the MLflow experiment that this run is recorded under.
mlflow.set_experiment("CreditCardApprovalExperiment")

with mlflow.start_run():
    # Record basic facts about the dataset alongside the run.
    mlflow.log_param("data_shape", data.shape)
    mlflow.log_param("target_variable", "approved")

    # Base estimator; the fixed seed keeps the forest reproducible.
    model = RandomForestClassifier(random_state=42)

    # Candidate hyperparameter values explored by the grid search.
    hyperparameters = {
        'n_estimators': [100, 200, 300],
        'max_depth': [2, 4, 6],
    }

    # 5-fold cross-validated grid search, ranking candidates by ROC AUC.
    grid_search = GridSearchCV(model, hyperparameters, cv=5, scoring='roc_auc')

    # Fit every candidate on the (already standardized) training data.
    grid_search.fit(X_train, y_train)

    # Keep the combined "best_params" entry for existing consumers, and also
    # log each tuned value as its own param so runs can be filtered and
    # compared per hyperparameter instead of via one stringified dict.
    mlflow.log_param("best_params", grid_search.best_params_)
    mlflow.log_params(
        {f"best_{name}": value for name, value in grid_search.best_params_.items()}
    )

    # Record the cross-validated score that drove model selection so it can be
    # compared against the hold-out score below.
    mlflow.log_metric("cv_auc_roc", grid_search.best_score_)

    # Score the refit best model on the hold-out set. Column 1 of predict_proba
    # is taken as the positive-class probability -- assumes a binary target;
    # TODO confirm against the dataset.
    y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)

    # Log the hold-out performance metric.
    mlflow.log_metric("auc_roc", auc_roc)

    # The estimator was trained on standardized features, so it is logged
    # together with the fitted scaler as one pipeline. Logging the bare
    # estimator would leave the preprocessing unrecoverable at inference time.
    # make_pipeline does not refit: it wraps the already-fitted objects.
    inference_pipeline = make_pipeline(sc, grid_search.best_estimator_)
    mlflow.sklearn.log_model(inference_pipeline, "model")