In [1]:
import itertools
import os
import re
import warnings
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sympy
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from risk_helpers import *

In [2]:
%matplotlib inline
pd.set_option('display.precision',2)
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.set_option('display.float_format',lambda x: '%.4f' % x)

In [3]:
# Location of the raw credit-risk CSV.
# The original author's absolute path stays as the fallback so the notebook
# still runs unchanged on that machine; anyone else can point the
# CREDIT_RISK_DATA environment variable at their own copy instead of
# editing this cell.
DATA_PATH = os.environ.get(
    'CREDIT_RISK_DATA',
    r'C:\Users\Iulia\Desktop\CreditRisk\credit_risk_dataset.csv',
)

# Raw dataset; later cells modify `df` through the project helpers.
df = pd.read_csv(DATA_PATH)

In [4]:
# Per-column share of missing values, computed by the project helper
# (imported via `from risk_helpers import *`).
missing_percentages = compute_missing_percentage(df)

# Emit one "<column>: <pct>%" line per column, rounded to two decimals.
for column, percentage in missing_percentages.items():
    report_line = f"{column}: {percentage:.2f}%"
    print(report_line)

person_age: 0.00%
person_income: 0.00%
person_home_ownership: 0.00%
person_emp_length: 2.75%
loan_intent: 0.00%
loan_grade: 0.00%
loan_amnt: 0.00%
loan_int_rate: 9.56%
loan_status: 0.00%
loan_percent_income: 0.00%
cb_person_default_on_file: 0.00%
cb_person_cred_hist_length: 0.00%


In [5]:
# Cutoff (in percent) above which a feature is considered too sparse to keep.
# With the percentages printed earlier (maximum 9.56%), no column exceeds it.
MISSING_PCT_THRESHOLD = 15

# The return value is ignored, so the helper presumably edits `df` in place
# -- TODO confirm against risk_helpers.
delete_features_above_threshold(df, MISSING_PCT_THRESHOLD)

# Cap outliers at the whiskers, relying on the helper's default multiplier.
# NOTE(review): this runs on the full dataset before the train/validation
# split, so the caps presumably use validation rows as well -- confirm this
# is intended.
replace_outliers_with_whiskers(df)

In [6]:
# Name of the label column; every other column is treated as a predictor.
target = 'loan_status'
independent_vars = [column_name for column_name in df.columns if column_name != target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_data, valid_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Impute each partition: median for numeric columns, mode for categorical
# ones (per the helper's stated behaviour). The return value is ignored, so
# the helper presumably modifies the frame in place -- TODO confirm.
# NOTE(review): the validation frame is passed on its own, so it is
# presumably imputed with its own statistics rather than the training ones.
for partition in (train_data, valid_data):
    fill_missing_values(partition)

# Preview the imputed training partition.
train_data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32377,40.5,46000.0,RENT,2.0,PERSONAL,C,4800.0,11.09,0,0.1,Y,15.5
1338,26.0,26000.0,OWN,0.0,DEBTCONSOLIDATION,E,8500.0,16.45,1,0.33,N,3.0
7047,23.0,51000.0,MORTGAGE,3.0,PERSONAL,C,16000.0,13.11,0,0.31,Y,3.0
8225,22.0,56004.0,MORTGAGE,6.0,MEDICAL,A,6000.0,7.88,0,0.11,N,4.0
7178,24.0,79000.0,RENT,3.0,PERSONAL,C,7000.0,12.54,0,0.09,N,3.0


In [8]:
# Fit the Weight-of-Evidence mapping on the training partition only.
# `event=1` marks loan_status == 1 as the event class and `bins=10` is the
# bin count handed to the helper. The mapping is later applied to both
# partitions via `transform_to_woe`.
woe_dict = woe_transform_dataframe(train_data, target_col=target, event=1, bins=10)

# The original cell ran the identical call a second time just to bind `data`.
# Keep the name as an alias (in case other cells still read it) without
# paying for the computation twice.
data = woe_dict

In [11]:
# Encode both partitions with the WoE mapping obtained from the training
# data: `transformed_df` comes from `train_data`, `transformed_valid_df`
# from `valid_data`. The validation set reuses the training mapping.
transformed_df, transformed_valid_df = (
    transform_to_woe(frame, woe_dict) for frame in (train_data, valid_data)
)

In [12]:
# Preview the first five WoE-encoded training rows.
transformed_df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
32377,-0.0149,-0.1397,0.505,0.2013,-0.1035,-0.0624,-0.082,-0.2266,0,-0.7268,-0.2102,-0.0487
1338,-0.0201,0.8252,-1.2059,0.3127,0.3635,1.8485,-0.1095,1.5258,1,1.5375,-0.2102,0.0543
7047,-0.0193,-0.1397,-0.6718,0.0201,-0.1035,-0.0624,0.2733,-0.0897,0,1.5375,-0.2102,0.0543
8225,0.2144,-0.1397,-0.6718,-0.1514,0.2504,-0.9188,-0.4323,-0.8405,0,-0.7268,-0.2102,0.0321
7178,-0.0193,-0.5027,0.505,0.0201,-0.1035,-0.0624,-0.4323,-0.0897,0,-0.7268,-0.2102,0.0543


In [18]:
# Re-run the imputation helper on the WoE-encoded frames.
# NOTE(review): presumably this patches values that `transform_to_woe`
# could not map (e.g. levels/bins absent from `woe_dict`) -- confirm; each
# frame is passed separately, so each is presumably filled from its own
# statistics.
for encoded_frame in (transformed_df, transformed_valid_df):
    fill_missing_values(encoded_frame)

In [20]:
# Frequency of each WoE value in the validation set; `dropna=False` makes
# any remaining NaN show up as its own row.
transformed_valid_df['loan_percent_income'].value_counts(dropna=False)

-0.7268    1431
-0.4309    1119
-0.7420    1075
-0.1607     916
1.5375      620
-0.0762     507
-0.8716     438
0.1735      411
Name: loan_percent_income, dtype: int64