In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics

import mlflow

In [2]:
data = pd.read_csv("loan_data_set 3.csv")

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

#### Load the datasets and capture numerical and categorical column

In [5]:
num_col = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_col = data.select_dtypes(include=['object']).columns.tolist()

In [6]:
cat_col.remove('Loan_Status')
cat_col.remove('Loan_ID')

#### Treat Missing Values for Categorical and Numerical Columns

In [7]:
# creating a list of categorical and numerical variables
for col in cat_col:
    data[col].fillna(data[col].mode()[0], inplace=True)

for col in num_col:
    data[col].fillna(data[col].median(), inplace=True)

#### Cap Extreme values to the 5th and 95th percentile for Numerical Data

In [8]:
data[num_col] = data[num_col].apply(lambda x: x.clip(*x.quantile([0.05, 0.95])))

#### Create a new feature TotalIncome
Sum of the applicant's income and the co-applicant's income

In [9]:
data['LoanAmount'] = np.log(data['LoanAmount']).copy()
data['TotalIncome'] = data['ApplicantIncome'] + data['CoapplicantIncome']
data['TotalIncome'] = np.log(data['TotalIncome']).copy()

#### Drop applicant and coapplicant income column

In [10]:
data = data.drop(['ApplicantIncome', 'CoapplicantIncome'], axis=1)

In [12]:
data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [13]:
data.isnull().sum()

Loan_ID             0
Gender              0
Married             0
Dependents          0
Education           0
Self_Employed       0
LoanAmount          0
Loan_Amount_Term    0
Credit_History      0
Property_Area       0
Loan_Status         0
TotalIncome         0
dtype: int64

In [13]:
data.shape

(614, 12)

In [24]:
mlflow.get_artifact_uri()

'mlflow-artifacts:/0/43557bb5dd8745798f0fb7fff32927b7/artifacts'

In [14]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,TotalIncome
0,LP001002,Male,No,0,Graduate,No,4.85203,360.0,1.0,Urban,Y,8.674026
1,LP001003,Male,Yes,1,Graduate,No,4.85203,360.0,1.0,Rural,N,8.714568
2,LP001005,Male,Yes,0,Graduate,Yes,4.189655,360.0,1.0,Urban,Y,8.006368
3,LP001006,Male,Yes,0,Not Graduate,No,4.787492,360.0,1.0,Urban,Y,8.505323
4,LP001008,Male,No,0,Graduate,No,4.94876,360.0,1.0,Urban,Y,8.699515


#### Convert Categorical Columns to Numerical Columns using Label Encoding Technique

In [15]:
for col in cat_col:
    le = preprocessing.LabelEncoder()
    data[col] = le.fit_transform(data[col])

data['Loan_Status'] = le.fit_transform(data['Loan_Status'])

#### Split Data into Train and Test (70:30)

In [16]:
# Train Test Split
X = data.drop(['Loan_Status', 'Loan_ID'], axis=1)
y = data.Loan_Status

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.30, random_state=SEED)

In [17]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome
0,1,0,0,0,0,4.85203,360.0,1.0,2,8.674026
1,1,1,1,0,0,4.85203,360.0,1.0,0,8.714568
2,1,1,0,0,1,4.189655,360.0,1.0,2,8.006368
3,1,1,0,1,0,4.787492,360.0,1.0,2,8.505323
4,1,0,0,0,0,4.94876,360.0,1.0,2,8.699515


#### Logistic Regression Using Grid Search Cross-Validation Approach

In [18]:
#_______________Logistic Regression______________#

lr = LogisticRegression(SEED)
lr_param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']
}
lr_gs = GridSearchCV(
    estimator=lr,
    param_grid = lr_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
lr_model = lr_gs.fit(X_train, y_train)

#### Decision Tree Model using Grid Search Cross-Validation Approach

In [19]:
#___________________Decision Tree________________#

dt = DecisionTreeClassifier(
    random_state=SEED
)
dt_param_grid = {
    "max_depth": [3, 5, 7, 9, 11, 13],
    "criterion": ['gini', 'entropy']
}

dt_gs = GridSearchCV(
    estimator=dt,
    param_grid = dt_param_grid,
    cv = 5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
dt_model = dt_gs.fit(X_train, y_train)

#### Random Forest Model using Grid Search Cross-Validation Approach

In [20]:
#____________________Random Forest_________________#

rf = RandomForestClassifier(random_state=SEED)
rf_param_grid = {
    'n_estimators': [400, 700],
    'max_depth': [15, 20, 25],
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [50, 100]
}

rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
    verbose=0
)
rf_model = rf_gs.fit(X_train, y_train)

#### Model Evaluation Metrics

In [21]:
def model_metrics(actual, pred):
    accuracy = metrics.accuracy_score(y_test, pred)
    f1 = metrics.f1_score(actual, pred, pos_label=1)
    fpr, tpr, threshold1 = metrics.roc_curve(y_test, pred)
    auc = metrics.auc(fpr, tpr)
    plt.figure(figsize=(8,8))
    # plot auc
    plt.plot(fpr, tpr, color='blue', label='ROC curve area = %0.2f'%auc)
    plt.plot([0,1], [0,1], 'r--')
    plt.xlim([-0.1, 1.1])
    plt.ylim([-0.1, 1.1])
    plt.xlabel('False Positive Rate', size=14)
    plt.ylabel('True Positive Rate', size=14)
    plt.legend(loc='lower right')

    # save plot
    plt.savefig("plots/ROC_curve.png")

    # Close plot
    plt.close()

    return (accuracy, f1, auc)

#### MLflow's Logging Functions

In [22]:
def mlflow_logs(model, X, y, name):
    with mlflow.start_run(run_name = name) as run:

        # Run id
        run_id = run.info.run_id
        mlflow.set_tag("run_id", run_id)

        # Make Predictions
        pred = model.predict(X)

        # Generate performance metrics
        (accuracy, f1, auc) = model_metrics(y, pred)

        # Logging best parameters
        mlflow.log_params(model.best_params_)

        # Logging model metric
        mlflow.log_metric("Mean  CV Score", model.best_score_)
        mlflow.log_metric("Accuracy", accuracy)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("AUC", auc)

        # Logging artifactd and model
        mlflow.log_artifact("plots/ROC_curve.png")
        mlflow.sklearn.log_model(model, name)

        mlflow.end_run()

#### Make Predictions Using ML Models

In [25]:
mlflow_logs(dt_model, X_test, y_test, "DecisionTreeClassifier")
mlflow_logs(lr_model, X_test, y_test, "LogisticRegression")
mlflow_logs(rf_model, X_test, y_test, "RandomForestClassifier")

Exception: Run with UUID 43557bb5dd8745798f0fb7fff32927b7 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True