In [2]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [6]:
import pandas as pd

# Both paths are relative, so the notebook has to be run from the directory
# that contains the "data" folder.
data, record = (
    pd.read_csv('data/application_record.csv'),
    pd.read_csv('data/credit_record.csv'),
)

In [7]:
# Preview the first five rows of the application records.
data.head(n=5)

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [None]:
# Build a profiling report for the application records and show it through
# to_notebook_iframe().
# NOTE(review): ProfileReport is not imported in any visible cell, so this
# raises NameError on a fresh kernel. Presumably it comes from ydata_profiling
# (formerly pandas_profiling) — confirm and add the import to the imports cell.
profile = ProfileReport(data, title="Profiling Report")
profile.to_notebook_iframe()

In [8]:
# Preview the first five rows of the monthly credit records.
record.head(n=5)

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [None]:
# Build a profiling report for the monthly credit records and show it through
# to_notebook_iframe().
# NOTE(review): ProfileReport is not imported in any visible cell, so this
# raises NameError on a fresh kernel. Presumably it comes from ydata_profiling
# (formerly pandas_profiling) — confirm and add the import to the imports cell.
profile_record = ProfileReport(record, title="Profiling Report")
profile_record.to_notebook_iframe()

STATUS Column

0: 1-29 days past due

1: 30-59 days past due

2: 60-89 days overdue

3: 90-119 days overdue

4: 120-149 days overdue

5: Overdue or bad debts, write-offs for more than 150 days

C: paid off that month

X: No loan for the month

In [None]:
# Flag every monthly record whose STATUS is '2', '3', '4' or '5' (60 or more
# days overdue). Unflagged rows keep None, so a later per-customer count()
# only counts the flagged months.
record['dep_value'] = None

# A single .loc assignment writes into `record` itself. The chained form
# record['dep_value'][mask] = ... may write to a temporary copy and is a
# silent no-op when pandas Copy-on-Write is enabled.
record.loc[record['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'Yes'

In [None]:
# Collapse the monthly records to one row per customer ID: 'Yes' when the
# customer has at least one flagged month, otherwise 'No'.
# count() skips missing values, so it returns the number of flagged months.
overdue_months = record.groupby('ID')['dep_value'].count()

# Map the counts to labels in one step. This avoids writing strings into an
# integer column through chained indexing, which may not modify the frame.
cpunt = overdue_months.gt(0).map({True: 'Yes', False: 'No'}).to_frame('dep_value')
cpunt.head()

In [None]:
# Attach the per-customer 'Yes'/'No' flag to the application records and
# derive a numeric target from it.
# NOTE(review): the previous version used `new_data` as the left frame, but
# `new_data` is not defined in any earlier visible cell (NameError on a fresh
# kernel). `data` is presumably the intended source — confirm. Starting from
# `data` also makes the cell safe to re-run.
new_data = pd.merge(data, cpunt, how='inner', on='ID')

# 1 where dep_value is 'Yes', 0 otherwise; stored as an integer column.
new_data['target'] = new_data['dep_value'].eq('Yes').astype(int)
new_data.head()

In [None]:
# Treat the literal string 'NULL' as missing, then drop every row that has at
# least one missing value. A bare `new_data.dropna()` returns a new frame
# without modifying `new_data`, so only the assigned form is kept.
new_data = new_data.mask(new_data == 'NULL').dropna()

In [None]:
# Class balance of the per-customer flag: absolute counts are printed, the
# relative shares are shown as the cell output.
dep_flags = cpunt['dep_value']
print(dep_flags.value_counts())
dep_flags.value_counts(normalize=True)

In [None]:
# Replace the verbose source column names with short aliases (in place).
short_names = {
    'CODE_GENDER': 'Gender',
    'FLAG_OWN_CAR': 'Car',
    'FLAG_OWN_REALTY': 'Reality',
    'CNT_CHILDREN': 'ChldNo',
    'AMT_INCOME_TOTAL': 'inc',
    'NAME_EDUCATION_TYPE': 'edutp',
    'NAME_FAMILY_STATUS': 'famtp',
    'NAME_HOUSING_TYPE': 'houtp',
    'FLAG_EMAIL': 'email',
    'NAME_INCOME_TYPE': 'inctp',
    'FLAG_WORK_PHONE': 'wkphone',
    'FLAG_PHONE': 'phone',
    'CNT_FAM_MEMBERS': 'famsize',
    'OCCUPATION_TYPE': 'occyp',
}
new_data.rename(columns=short_names, inplace=True)

In [None]:
# Preview the first five rows of the merged, renamed frame.
new_data.head(n=5)

In [None]:
# Train, tune and track a random-forest classifier with MLflow.
#
# NOTE(review): this cell reads a separate CSV and rebinds `data`, replacing
# the application records loaded at the top of the notebook; it does not use
# `new_data` / `target` built in the cells above. Confirm whether that is
# intended.

# load the modelling data set
data = pd.read_csv('credit_card_approval_dataset.csv')

# perform any necessary preprocessing, e.g. cleaning, encoding, etc.
# NOTE(review): StandardScaler only accepts numeric input — presumably any
# categorical columns still need encoding here; verify against the CSV schema.

# split the data into features and target
X = data.drop(columns='approved')  # assuming 'approved' is your target column
y = data['approved']

# Hold out 20% for the final evaluation. stratify=y keeps the class
# proportions the same in both parts, so the test AUC is not computed on a
# split that is short of one class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# set up MLflow
mlflow.set_experiment("CreditCardApprovalExperiment")

with mlflow.start_run():
    # log some basic information
    mlflow.log_param("data_shape", data.shape)
    mlflow.log_param("target_variable", "approved")

    # Scaling is a pipeline step rather than a separate fit on the whole
    # training set: the scaler is then re-fitted inside every CV fold (no
    # leakage from validation folds) and is saved together with the model,
    # so the logged artifact accepts unscaled inputs.
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    # grid of hyperparameters; the 'classifier__' prefix routes each entry to
    # the pipeline step of that name
    hyperparameters = {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [2, 4, 6],
    }

    # 5-fold cross-validated grid search, scored by ROC AUC
    grid_search = GridSearchCV(model, hyperparameters, cv=5, scoring='roc_auc')
    grid_search.fit(X_train, y_train)

    # log every tuned hyperparameter as its own MLflow param instead of one
    # stringified dict
    mlflow.log_params(grid_search.best_params_)

    # evaluate the refitted best pipeline on the hold-out set
    y_pred_proba = grid_search.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)

    # log the performance metric
    mlflow.log_metric("auc_roc", auc_roc)

    # log the full pipeline (scaler + classifier)
    mlflow.sklearn.log_model(grid_search.best_estimator_, "model")