## Credit Card Approval Experiment Tracking

In [1]:
!python -V

Python 3.10.11


In [2]:
import requests
import pickle

import pandas as pd

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.svm import LinearSVR

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from tqdm import tqdm

In [3]:
# Data download.
data = pd.read_csv('data/application_record.csv')
record = pd.read_csv('data/credit_record.csv')

**Data Exploration**

In [4]:
# find all users' account open month.
begin_month=pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
begin_month=begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 

new_data=pd.merge(data,begin_month,how="left",on="ID") #merge to record datatrain_raw_data.head()

In [5]:
# Assuming 'record' is your DataFrame containing the 'STATUS' and 'dep_value' columns.
record['dep_value'] = None
record.loc[record['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'Yes'

cpunt=record.groupby('ID').count()
cpunt['dep_value'][cpunt['dep_value'] > 0]='Yes' 
cpunt['dep_value'][cpunt['dep_value'] == 0]='No' 
cpunt = cpunt[['dep_value']]

merge_data=pd.merge(new_data,cpunt,how='inner',on='ID')
merge_data.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,dep_value
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-15.0,No
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-14.0,No
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,-29.0,No
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-4.0,No
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,No


In [6]:
# Assuming 'new_data' is your DataFrame.
# Check for missing values (NaN) in the DataFrame
print(merge_data.isna().sum())

# Drop rows with any NaN values
merge_data.dropna(inplace=True)

# Display the first few rows of the cleaned DataFrame
merge_data.head()

ID                         0
CODE_GENDER                0
FLAG_OWN_CAR               0
FLAG_OWN_REALTY            0
CNT_CHILDREN               0
AMT_INCOME_TOTAL           0
NAME_INCOME_TYPE           0
NAME_EDUCATION_TYPE        0
NAME_FAMILY_STATUS         0
NAME_HOUSING_TYPE          0
DAYS_BIRTH                 0
DAYS_EMPLOYED              0
FLAG_MOBIL                 0
FLAG_WORK_PHONE            0
FLAG_PHONE                 0
FLAG_EMAIL                 0
OCCUPATION_TYPE        11323
CNT_FAM_MEMBERS            0
begin_month                0
dep_value                  0
dtype: int64


Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,begin_month,dep_value
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,-29.0,No
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-4.0,No
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,No
5,5008810,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-26.0,No
6,5008811,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-38.0,No


In [7]:
# Now, we remove OCCUPATION_TYPE. As we said, it have too many missed data
# In addition, we will get rid of FLAG_MOBIL, FLAG_WORK_PHONE, FLAG_PHONE and FLAG_EMAIL. 
# These parameters do not affect in any way whether a person is creditworthy or not.

merge_data.drop(['ID','FLAG_WORK_PHONE','FLAG_PHONE','FLAG_EMAIL'], axis=1, inplace=True) 

In [8]:
# Now, let's convert all non-numeric data to numeric data using LabelEncoder().
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for x in merge_data:
    if merge_data[x].dtypes == 'object':
        merge_data[x] = le.fit_transform(merge_data[x])

In [9]:
data = merge_data.copy()

In [10]:
X = data.iloc[:,1:-1] # X value contains all the variables except labels
y = data.iloc[:,-1] # these are the labels
y

2        0
3        0
4        0
5        0
6        0
        ..
36452    1
36453    1
36454    1
36455    1
36456    1
Name: dep_value, Length: 25134, dtype: int32

In [11]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
oversample = SMOTE()
X_balanced, y_balanced = oversample.fit_resample(X_train, y_train)
X_test_balanced, y_test_balanced = oversample.fit_resample(X_test, y_test)

In [13]:
def process_dataframe(data):
#     data.lpep_dropoff_datetime = pd.to_datetime(data.lpep_dropoff_datetime)
#     data.lpep_pickup_datetime = pd.to_datetime(data.lpep_pickup_datetime)

#     data['duration'] = data.lpep_dropoff_datetime - data.lpep_pickup_datetime
#     data.duration = data.duration.apply(lambda td: td.total_seconds() / 60)
#     data = data[(data.duration >= 1) & (data.duration <= 60)]
    
#     data['PULocationID'].astype(str, copy=False)
#     data['DOLocationID'].astype(str, copy=False)
    
    return data

In [14]:
# X_train = process_dataframe(train_raw_data)[num_features + cat_features]
# X_val = process_dataframe(val_raw_data)[num_features + cat_features] 

# y_train = process_dataframe(train_raw_data)['duration']
# y_val = process_dataframe(val_raw_data)['duration'] 

In [15]:
# X_val.isnull().sum()

## Simple Experimnet 

**Data Preprocessing**

In [16]:
lr = Ridge()
lr.fit(X_balanced, y_balanced)

y_pred = lr.predict(X_test_balanced)

mean_squared_error(y_test_balanced, y_pred, squared=False)

0.42610902640031584

## MLflow tracking

In [17]:
import mlflow

In [20]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.create_experiment(name="test_new", artifact_location="file:///C:/Users/Mint/Credit Card Project/creditcard_approval_prediction/artifacts")

'4'

In [21]:
with mlflow.start_run():
    mlflow.set_tag("workspace", "in_class")
    mlflow.log_param("train_data_name", "train_data.parquet")
    mlflow.log_param("validation_data_name", "validation_data.parquet")
    
    alpha = 0.99
    mlflow.log_param("alpha", alpha)
    
    lr = Lasso(alpha)
    lr.fit(X_balanced, y_balanced) 
    y_pred = lr.predict(X_test_balanced)

    rmse = mean_squared_error(y_test_balanced, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

## Hyperparameters Optimization

In [22]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [24]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("test_new")

<Experiment: artifact_location=('file:///C:/Users/Mint/Credit Card '
 'Project/creditcard_approval_prediction/artifacts'), creation_time=1690107752989, experiment_id='4', last_update_time=1690107752989, lifecycle_stage='active', name='test_new', tags={}>

In [25]:
# X_balanced, y_balanced = oversample.fit_resample(X_train, y_train)
# X_test_balanced, y_test_balanced = oversample.fit_resample(X_test, y_test)

In [26]:
train = xgb.DMatrix(X_balanced, label=y_balanced)
validation = xgb.DMatrix(X_test_balanced, label=y_test_balanced)

In [27]:
def objective(params):
    with mlflow.start_run():
        num_boost_round = 500
        early_stopping_rounds = 50
        
        mlflow.log_params(params)
        mlflow.log_param('num_boost_round', num_boost_round)
        mlflow.log_param('early_stopping_rounds', early_stopping_rounds)
        mlflow.log_param('train_data_name', 'green_tripdata_2021-01.parquet')
        mlflow.log_param('validation_data_name', 'green_tripdata_2021-02.parquet')
        mlflow.set_tag('model', 'xgboost')

        booster = xgb.train(
            params = params,
            dtrain = train,
            evals = [(validation, "validation")],
            num_boost_round = num_boost_round,
            early_stopping_rounds = early_stopping_rounds
        )
        
        y_pred = booster.predict(validation)
        rmse = mean_squared_error(y_test_balanced, y_pred, squared=False)
        mlflow.log_metric('rmse', rmse)
        return {'loss': rmse, 'status': STATUS_OK}

In [28]:
grid_search = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child': hp.loguniform('min_child', -1, 3),
    'seed': 111,
    'objective': 'reg:linear'
}

In [29]:
best_model = fmin(
    fn=objective,
    space=grid_search,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials()
)

Parameters: { "min_child" } are not used.

[0]	validation-rmse:0.42262                           
[1]	validation-rmse:0.38920                           
[2]	validation-rmse:0.37875                           
[3]	validation-rmse:0.37768                           
[4]	validation-rmse:0.37927                           
[5]	validation-rmse:0.38115                           
[6]	validation-rmse:0.38034                           
[7]	validation-rmse:0.38037                           
[8]	validation-rmse:0.38001                           
[9]	validation-rmse:0.38047                           
[10]	validation-rmse:0.38129                          
[11]	validation-rmse:0.38138                          
[12]	validation-rmse:0.38199                          
[13]	validation-rmse:0.38206                          
[14]	validation-rmse:0.38231                          
[15]	validation-rmse:0.38242                          
[16]	validation-rmse:0.38262                          
[17]	validation-rmse:0

## Train the Best Model

In [30]:
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [32]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("test_new")

<Experiment: artifact_location=('file:///C:/Users/Mint/Credit Card '
 'Project/creditcard_approval_prediction/artifacts'), creation_time=1690107752989, experiment_id='4', last_update_time=1690107752989, lifecycle_stage='active', name='test_new', tags={}>

In [33]:
# We took the best params from the MLflow interface and copied them here
best_params = best_model

# Convert 'max_depth' to int
if 'max_depth' in best_params:
    best_params['max_depth'] = int(best_params['max_depth'])

mlflow.xgboost.autolog()

booster = xgb.train(
    params = best_params,
    dtrain = train,
    evals = [(validation, "validation")],
    num_boost_round = 500,
    early_stopping_rounds = 50,
)

2023/07/23 12:28:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '24793963be824614970e7af318dbe2c7', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


Parameters: { "min_child" } are not used.

[0]	validation-rmse:0.46969
[1]	validation-rmse:0.44728
[2]	validation-rmse:0.43790
[3]	validation-rmse:0.42049
[4]	validation-rmse:0.38662
[5]	validation-rmse:0.37427
[6]	validation-rmse:0.36961
[7]	validation-rmse:0.36105
[8]	validation-rmse:0.35568
[9]	validation-rmse:0.34662
[10]	validation-rmse:0.34036
[11]	validation-rmse:0.33893
[12]	validation-rmse:0.33568
[13]	validation-rmse:0.33175
[14]	validation-rmse:0.33085
[15]	validation-rmse:0.32988
[16]	validation-rmse:0.32703
[17]	validation-rmse:0.32544
[18]	validation-rmse:0.32190
[19]	validation-rmse:0.32031
[20]	validation-rmse:0.31783
[21]	validation-rmse:0.31749
[22]	validation-rmse:0.31646
[23]	validation-rmse:0.31681
[24]	validation-rmse:0.31652
[25]	validation-rmse:0.31488
[26]	validation-rmse:0.31466
[27]	validation-rmse:0.31190
[28]	validation-rmse:0.31054
[29]	validation-rmse:0.30936
[30]	validation-rmse:0.30843
[31]	validation-rmse:0.30791
[32]	validation-rmse:0.30700
[33]	valid



In [34]:
key="???"
if best_params.get(key):
    print(best_params.get(key))
else:
    print("no value")

no value


In [35]:
pd.DataFrame.from_dict([best_params])

Unnamed: 0,max_depth,min_child,reg_alpha,reg_lambda
0,4,6.240926,0.006922,0.304953


In [36]:
y_pred = booster.predict(validation)

rmse = mean_squared_error(y_test_balanced, y_pred, squared=False)
rmse

0.21818374871153104

## Model Logging 

In [37]:
with open('models/model.bin', 'wb') as f_out:
    pickle.dump(booster, f_out)

In [38]:
with open('preprocessing/process_dataframe.bin', 'wb') as f_out:
    pickle.dump(process_dataframe, f_out)

In [40]:
mlflow.set_experiment("test_new")
with mlflow.start_run():
    best_params = {

    }
    
    mlflow.log_params(best_params)
    mlflow.log_param("train_data_name", "train_data.parquet")
    mlflow.log_param("validation_data_name", "validation_data.parquet")
    mlflow.set_tag('model', 'xgboost')
    
    booster = xgb.train(
    params = best_params,
    dtrain = train,
    evals = [(validation, "validation")],
    num_boost_round = 500,
    early_stopping_rounds = 50,
    )
    
    mlflow.xgboost.log_model(booster, artifact_path='mlflow_models')
    mlflow.log_artifact('preprocessing/process_dataframe.bin', artifact_path='preprocessing')
    

[0]	validation-rmse:0.45140
[1]	validation-rmse:0.41633
[2]	validation-rmse:0.39410
[3]	validation-rmse:0.37153
[4]	validation-rmse:0.34977
[5]	validation-rmse:0.34015
[6]	validation-rmse:0.32577
[7]	validation-rmse:0.31875
[8]	validation-rmse:0.30893
[9]	validation-rmse:0.30596
[10]	validation-rmse:0.30184
[11]	validation-rmse:0.29984
[12]	validation-rmse:0.29605
[13]	validation-rmse:0.29466
[14]	validation-rmse:0.29390
[15]	validation-rmse:0.29361
[16]	validation-rmse:0.29146
[17]	validation-rmse:0.28874
[18]	validation-rmse:0.28646
[19]	validation-rmse:0.28609
[20]	validation-rmse:0.28537
[21]	validation-rmse:0.28572
[22]	validation-rmse:0.28487
[23]	validation-rmse:0.28363
[24]	validation-rmse:0.28263
[25]	validation-rmse:0.27990
[26]	validation-rmse:0.27927
[27]	validation-rmse:0.27674
[28]	validation-rmse:0.27683
[29]	validation-rmse:0.27474
[30]	validation-rmse:0.27391
[31]	validation-rmse:0.27342
[32]	validation-rmse:0.27241
[33]	validation-rmse:0.27248
[34]	validation-rmse:0.2



## Load Model

In [42]:
# Get the last run_id
run_info = mlflow.search_runs(experiment_ids="test_new", max_results=1)
run_id = run_info.iloc[0]["run_id"]

# The model uri should be in the format 'runs:/<run_id>/model'
model_uri = f"runs:/{run_id}/model"

# Load the model
loaded_model = mlflow.pyfunc.load_model(logged_model)

IndexError: single positional indexer is out-of-bounds

In [44]:
run_info = mlflow.search_runs(max_results=1)

In [None]:
type(loaded_model)

In [None]:
y_preds = loaded_model.predict(X_test_balanced)

In [None]:
mean_squared_error(y_preds, y_test_balanced, squared=False)

In [None]:
y_preds

In [None]:
print(loaded_model.metadata.get_model_info())

## Sklearn Models

In [None]:
mlflow.sklearn.autolog()

for algorithm in (LinearSVR, RandomForestRegressor, GradientBoostingRegressor):
    with mlflow.start_run():
        mlflow.log_param('train_data_name', 'green_tripdata_2022-01.parquet')
        mlflow.log_param('validation_data_name', 'green_tripdata_2022-02.parquet')
        mlflow.log_artifact('preprocessing/process_dataframe.bin', artifact_path='preprocessing')
        model = algorithm()
        model.fit(X_train, y_train)
        
        preds = model.predict(X_test)
        rmse = mean_squared_error(preds, y_test, squared=False)
        mlflow.log_metric("rmse", rmse)
        

## MLflow Client

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

In [None]:
MLFLOW_URI = "sqlite:///mlflow.db"

In [None]:
client = MlflowClient(MLFLOW_URI)

In [None]:
client.list_experiments()

In [None]:
client.create_experiment(name = 'new-experimet')

In [None]:
client.list_experiments()

In [None]:
runs = client.search_runs(
    experiment_ids='1',
    run_view_type=ViewType.ACTIVE_ONLY,
    filter_string='metrics.rmse < 7',
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

In [None]:
for run in runs:
    print(f"run_id:{run.info.run_id}, metrics:{run.data.metrics['rmse']}")