In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sys

# Add the parent directory to the import search path so the project-local
# `modules` package imported below can be resolved.
sys.path.append('../')


from modules.dataframeinfo import DataFrameInfo
from modules.data_transform import DataTransform, apply_powertransformations
from modules.dataframe_transform import DataFrameTransform, impute_all_null_columns, drop_outliers, drop_highly_correlated_columns

In [2]:
# Load Data
# Read the raw loan payments CSV; the path is relative to the notebook's
# working directory.
# NOTE(review): the frame displayed at the end of this notebook carries an
# "Unnamed: 0" column that mirrors the row index, so the CSV presumably
# includes a saved index column. Confirm before switching to index_col=0,
# because the positional numerical_columns[3:-2] slice used later depends on
# the current column order.
df = pd.read_csv("../csv_files/loan_payments.csv")

In [3]:
# Helper object whose transformation methods are applied to df in the next cell.
data_transform = DataTransform()
# Info wrapper built from the frame as loaded, before any transformation.
df_info_original = DataFrameInfo(df)

In [4]:
# Cast Datatypes
# Run the three DataTransform steps in sequence, feeding each step's result
# into the next: encode, convert digit strings, then cast column dtypes.
for _cast_step in (
    data_transform.encode_transform,
    data_transform.transform_digit_string,
    data_transform.cast_column_dtypes,
):
    df = _cast_step(df)

id --> int32
member_id --> int32
loan_amount --> int32
funded_amount --> float32
funded_amount_inv --> float32
term --> int32
int_rate --> float32
instalment --> float32
grade --> category
sub_grade --> category
employment_length --> int32
home_ownership --> category
annual_inc --> float32
verification_status --> category
issue_date --> date
loan_status --> category
payment_plan --> category
purpose --> category
dti --> float32
delinq_2yrs --> int32
earliest_credit_line --> date
inq_last_6mths --> int32
mths_since_last_delinq --> int32
mths_since_last_record --> int32
open_accounts --> int32
total_accounts --> int32
out_prncp --> float32
out_prncp_inv --> float32
total_payment --> float32
total_payment_inv --> float32
total_rec_prncp --> float32
total_rec_int --> float32
total_rec_late_fee --> float32
recoveries --> float32
collection_recovery_fee --> float32
last_payment_date --> date
last_payment_amount --> float32
next_payment_date --> date
last_credit_pull_date --> date
collections

In [5]:
# Impute columns that have null values
# The recorded output below lists null counts and percentages per affected
# column; df is replaced with the frame returned by the project helper.
df = impute_all_null_columns(df)

                             number of nulls  percentage of nulls  \
funded_amount                           3007                 5.54   
term                                    4772                 8.80   
int_rate                                5169                 9.53   
employment_length                       2118                 3.91   
mths_since_last_delinq                 31002                57.17   
mths_since_last_record                 48050                88.60   
collections_12_mths_ex_med                51                 0.09   
mths_since_last_major_derog            46732                86.17   

                             percentage of non-nulls  
funded_amount                                  94.46  
term                                           91.20  
int_rate                                       90.47  
employment_length                              96.09  
mths_since_last_delinq                         42.83  
mths_since_last_record                         1

In [6]:
# Apply the transformations based on the best power transform method for
# normalisation.
# NOTE(review): the recorded output shows numpy reduction source lines
# (umr_sum / um.multiply), which look like warning fragments raised during
# this call — presumably numeric overflow; confirm and handle explicitly.
df = apply_powertransformations(df)

  arrmean = umr_sum(arr, axis, dtype, keepdims=True, where=where)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


In [7]:
# Drop outliers
# Replaces df with the frame returned by the project helper; the removal
# criteria live in modules.dataframe_transform, not in this notebook.
df = drop_outliers(df)

In [8]:
# Drop highly correlated columns
# Rebuild the info wrapper so it reflects the frame after the earlier
# transformation cells.
df_info_transformed = DataFrameInfo(df)
# Positional slice: skip the first three and the last two numerical columns
# before the correlation check.
# NOTE(review): presumably this excludes index/identifier columns at the front
# and constant or flag-like columns at the end — confirm against
# numerical_columns and prefer selecting columns by name, since this breaks
# silently if the column order changes.
num_cols = df_info_transformed.numerical_columns[3:-2]
df = drop_highly_correlated_columns(df, num_cols)

Over correlated columns: ['loan_amount', 'funded_amount', 'funded_amount_inv', 'instalment', 'out_prncp', 'out_prncp_inv', 'total_payment', 'total_payment_inv', 'total_rec_prncp', 'recoveries', 'collection_recovery_fee']


In [9]:
# Display the transformed frame as the cell's output.
df

Unnamed: 0.1,Unnamed: 0,id,member_id,term,int_rate,grade,sub_grade,employment_length,home_ownership,annual_inc,...,open_accounts,total_accounts,total_rec_int,total_rec_late_fee,last_payment_date,last_payment_amount,last_credit_pull_date,collections_12_mths_ex_med,policy_code,application_type
0,0,38676116,41461848,36.0,3.489637,A,A4,5.0,MORTGAGE,46000.0,...,12,27,512.210022,0.0,2022-01-01,248.820007,2022-01-01,0.0,1,INDIVIDUAL
1,1,38656203,41440010,36.0,3.340711,A,A3,9.0,RENT,50000.0,...,15,31,787.940002,0.0,2022-01-01,407.519989,2022-01-01,0.0,1,INDIVIDUAL
2,2,38656154,41439961,36.0,3.489637,A,A4,8.0,MORTGAGE,73913.0,...,7,18,824.539978,0.0,2021-10-01,12850.160156,2021-10-01,0.0,1,INDIVIDUAL
3,3,38656128,41439934,36.0,5.155342,C,C4,0.0,RENT,42000.0,...,6,13,947.469971,0.0,2021-06-01,13899.669922,2021-06-01,0.0,1,INDIVIDUAL
4,4,38656121,41439927,36.0,3.038983,A,A1,1.0,MORTGAGE,145000.0,...,23,50,770.929993,0.0,2022-01-01,456.540009,2022-01-01,0.0,1,INDIVIDUAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54225,54225,115606,87310,36.0,5.299521,F,F1,0.0,RENT,18000.0,...,2,2,1558.739990,0.0,2014-07-01,9921.870117,2013-06-01,0.0,1,INDIVIDUAL
54226,54226,76597,76583,36.0,3.913655,B,B2,0.0,MORTGAGE,250000.0,...,5,7,724.820007,0.0,2016-07-01,160.610001,2016-07-01,0.0,1,INDIVIDUAL
54228,54228,117045,70978,36.0,3.472037,A,A2,1.0,MORTGAGE,300000.0,...,8,18,415.369995,0.0,2016-09-01,110.580002,2013-05-01,0.0,1,INDIVIDUAL
54229,54229,88854,70699,36.0,3.472037,A,A2,4.0,RENT,200000.0,...,2,2,174.179993,0.0,2014-03-01,0.000000,2013-05-01,0.0,1,INDIVIDUAL


In [10]:
# df.to_csv('transformed_loans.csv')