# Application Scorecard Demo

## Introduction

TBC.

## Notebook Setup

In [None]:
import pandas as pd
import numpy as np
import os
import tempfile
import shutil

In [None]:
def load_dataframe(source_csv, column_types=None):
    """
    Load a DataFrame from a cached pickle file if available, or from a CSV file otherwise.

    The pickle cache is stored in a stable sub-directory of the system temporary
    directory and is keyed on the CSV location and the requested column types, so
    repeated calls with the same arguments reuse the cache instead of re-reading
    (or re-downloading) the CSV. Delete the cache file to force a fresh load.

    :param source_csv: Path or URL to the CSV file.
    :param column_types: Dictionary specifying columns and their types to prevent DtypeWarning.
    :return: Loaded DataFrame.
    """
    import hashlib

    # A fixed location is required for the cache to survive between calls; a fresh
    # mkdtemp() directory per call would always be empty and would never be removed.
    cache_dir = os.path.join(tempfile.gettempdir(), "load_dataframe_cache")
    os.makedirs(cache_dir, exist_ok=True)

    # Build a deterministic key from the arguments so different sources or dtype
    # requests never share a cache entry. repr() keeps mixed key types sortable.
    dtype_key = sorted((repr(k), repr(v)) for k, v in (column_types or {}).items())
    cache_key = hashlib.sha256(
        repr((str(source_csv), dtype_key)).encode("utf-8")
    ).hexdigest()[:16]
    pkl_file = os.path.join(cache_dir, f"dataframe_{cache_key}.pkl")

    try:
        # NOTE: unpickling executes arbitrary code; only use this helper where the
        # temporary directory is not writable by untrusted users.
        df = pd.read_pickle(pkl_file)
    except (FileNotFoundError, IOError):
        print("Pickle file not found. Loading CSV and creating pickle file...")
        # Load from CSV if pickle doesn't exist
        df = pd.read_csv(source_csv, dtype=column_types)
        # Save to pickle for future use
        df.to_pickle(pkl_file)
        print("DataFrame loaded from CSV and saved to pickle.")
    else:
        print("Loaded DataFrame from pickle.")

    return df

## Initialize the client library

Every documentation project in the Platform UI comes with a _code snippet_ that lets the client library associate your documentation and tests with the right project on the Platform UI when you run this notebook. As you will see later, documentation projects are useful because they act as containers for model documentation and validation reports, and they enable you to organize all of your documentation work in one place.

Get your code snippet by creating a documentation project:

1. In a browser, log into the [Platform UI](https://app.prod.validmind.ai).

2. Go to **Documentation Projects** and click **Create new project**.

3. Select **`[Demo] Customer Churn Model`** and **`Initial Validation`** for the model name and type, give the project a unique name to make it yours, and then click **Create project**.

4. Go to **Documentation Projects** > **YOUR_UNIQUE_PROJECT_NAME** > **Getting Started** and click **Copy snippet to clipboard**.

Next, replace this placeholder with your own code snippet:

In [None]:
import validmind as vm

# Placeholder connection settings: substitute the values from the code snippet
# copied from your own documentation project.
vm.init(
    api_host="https://api.prod.validmind.ai/api/v1/tracking",
    api_key="...",
    api_secret="...",
    project="...",
)

In [None]:
# NOTE(review): presumably displays the documentation template of the project
# configured by vm.init() above — confirm against the client library docs.
vm.preview_template()

## Data Collection

In [None]:
# Define the URL to the Lending Club loan data set (2007-2014) hosted on AWS S3 for easy access.
# Location of the Lending Club loan data set (2007-2014), hosted on AWS S3.
source = "https://vmai.s3.us-west-1.amazonaws.com/datasets/lending_club_loan_data_2007_2014.csv"

# Column 21 (index 20) contains mixed types; reading it as a string avoids a DtypeWarning.
df = load_dataframe(source_csv=source, column_types={20: str})

In [None]:
# Register the freshly loaded DataFrame under the input id 'raw_dataset'.
vm_raw_ds = vm.init_dataset(dataset=df, input_id="raw_dataset")

In [None]:
# Run the TabularDescriptionTables test on the raw dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.TabularDescriptionTables",
    inputs={"dataset": vm_raw_ds},
)
test.log()

In [None]:
# Run the MissingValuesBarPlot test on the raw dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.MissingValuesBarPlot",
    inputs={"dataset": vm_raw_ds},
)
test.log()

## Data Preparation

In [None]:
# Drop non relevant columns for building an application scorecard model
# Columns that are not relevant for building an application scorecard model.
COLS_TO_DROP = [
    "Unnamed: 0",
    "id",
    "member_id",
    "funded_amnt",
    "emp_title",
    "url",
    "desc",
    "application_type",
    "title",
    "zip_code",
    "delinq_2yrs",
    "mths_since_last_delinq",
    "mths_since_last_record",
    "mths_since_last_major_derog",
    "revol_bal",
    "total_rec_prncp",
    "total_rec_late_fee",
    "recoveries",
    "out_prncp_inv",
    "out_prncp",
    "collection_recovery_fee",
    "next_pymnt_d",
    "initial_list_status",
    "pub_rec",
    "collections_12_mths_ex_med",
    "policy_code",
    "acc_now_delinq",
    "pymnt_plan",
    "tot_coll_amt",
    "tot_cur_bal",
    "total_rev_hi_lim",
    "last_pymnt_d",
    "last_credit_pull_d",
    "earliest_cr_line",
    "issue_d",
]

# Remove them from df in place; `columns=` already selects the column axis.
df.drop(columns=COLS_TO_DROP, inplace=True)

In [None]:
# Calculate the fraction of missing values for each feature in the dataset.
# Share of null entries in each column of the dataset.
missing_fractions = df.isnull().mean()

# Columns whose missing share is strictly above this cut-off get dropped.
min_missing_fraction = 0.8

# Collect the names of the columns that exceed the cut-off.
to_drop = missing_fractions.loc[
    missing_fractions.gt(min_missing_fraction)
].index.tolist()
print(to_drop)

# Drop those sparsely populated columns from df in place.
df.drop(columns=to_drop, inplace=True)

In [None]:
# Define the target variable for the model, representing loan default status.
# Map 'loan_status' to a binary variable where 'Fully Paid' loans are 0 (no default)
# and 'Charged Off' loans are 1 (default). Other statuses are treated as missing (NaN) and then removed.
# Name of the binary target column derived from 'loan_status'.
target_column = "default"

# 'Fully Paid' -> 0 (no default), 'Charged Off' -> 1 (default);
# every other status falls through to NaN and is removed just below.
df[target_column] = df["loan_status"].apply(
    lambda status: {"Fully Paid": 0, "Charged Off": 1}.get(status, np.nan)
)

# Keep only the rows that received a target value.
df.dropna(subset=[target_column], inplace=True)

# With the NaN rows gone, the target can be stored as an integer.
df[target_column] = df[target_column].astype(int)

# 'loan_status' is now fully represented by the target column.
df.drop(columns=["loan_status"], inplace=True)

In [None]:
# Register the cleaned DataFrame, with its target column, as 'preprocessed_dataset'.
vm_preprocessed_ds = vm.init_dataset(
    dataset=df,
    input_id="preprocessed_dataset",
    target_column=target_column,
)

In [None]:
# Run the TabularDescriptionTables test on the preprocessed dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.TabularDescriptionTables",
    inputs={"dataset": vm_preprocessed_ds},
)
test.log()

In [None]:
# Run the MissingValuesBarPlot test on the preprocessed dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.MissingValuesBarPlot",
    inputs={"dataset": vm_preprocessed_ds},
)
test.log()

In [None]:
# Run the IQROutliersTable test on the preprocessed dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.IQROutliersTable",
    inputs={"dataset": vm_preprocessed_ds},
)
test.log()

In [None]:
# Run the ClassImbalance test on the preprocessed dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.ClassImbalance",
    inputs={"dataset": vm_preprocessed_ds},
)
test.log()

## Data Split

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the rows held out for testing.
test_size = 0.2

# Separate the features from the target.
X = df.drop(columns=[target_column])
y = df[target_column]

# Stratified split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=42,
    stratify=y,
)

# Re-attach the target to each partition, side by side with its features.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [None]:
# Register the training partition under the input id 'train_dataset'.
vm_train_ds = vm.init_dataset(
    dataset=df_train,
    input_id="train_dataset",
    target_column=target_column,
)

# Register the test partition under the input id 'test_dataset'.
vm_test_ds = vm.init_dataset(
    dataset=df_test,
    input_id="test_dataset",
    target_column=target_column,
)

In [None]:
# Run the TabularNumericalHistograms test on the training dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.TabularNumericalHistograms",
    inputs={"dataset": vm_train_ds},
)
test.log()

In [None]:
# Run the TabularCategoricalBarPlots test on the training dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.TabularCategoricalBarPlots",
    inputs={"dataset": vm_train_ds},
)
test.log()

In [None]:
# Run the TargetRateBarPlots test on the training dataset, passing the target
# column as `default_column` and leaving `columns` unset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.TargetRateBarPlots",
    inputs={"dataset": vm_train_ds},
    params={"default_column": target_column, "columns": None},
)
test.log()

In [None]:
# Run the PearsonCorrelationMatrix test on the training dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.PearsonCorrelationMatrix",
    inputs={"dataset": vm_train_ds},
)
test.log()

In [None]:
# Run the WOEBinTable test on the training dataset, then log its result.
test = vm.tests.run_test(
    "validmind.data_validation.WOEBinTable",
    inputs={"dataset": vm_train_ds},
)
test.log()

## Feature Selection

In [None]:
# Features excluded from the scorecard model.
FEATURES_TO_DROP = [
    "addr_state",
    "total_rec_int",
    "loan_amnt",
    "funded_amnt_inv",
    "dti",
    "revol_util",
    "total_pymnt",
    "total_pymnt_inv",
    "last_pymnt_amnt",
    "inq_last_6mths",
]

# NOTE(review): df_train / df_test were built from df in the Data Split section,
# so the filtering below only changes df — confirm the split is redone afterwards.

# Row filter, combined into a single boolean mask:
#   - keep rows where purpose is 'debt_consolidation' or 'credit_card'
#   - remove rows where grade is 'F' or 'G'
#   - remove rows where sub_grade starts with 'F' or 'G'
#     (na=False keeps the mask boolean if a sub_grade value is missing)
#   - remove rows where home_ownership is 'OTHER', 'NONE', or 'ANY'
keep_rows = (
    df["purpose"].isin(["debt_consolidation", "credit_card"])
    & ~df["grade"].isin(["F", "G"])
    & ~df["sub_grade"].str.startswith(("F", "G"), na=False)
    & ~df["home_ownership"].isin(["OTHER", "NONE", "ANY"])
)

# Build a new frame rather than dropping in place: an in-place drop on a
# boolean-filtered frame triggers pandas' SettingWithCopyWarning.
df = df.loc[keep_rows].drop(columns=FEATURES_TO_DROP)