In [1]:
import pandas as pd
from data_transform import DataTransform
from dataframeinfo import DataFrameInfo
from sklearn.preprocessing import OrdinalEncoder


In [2]:
# Load Data
df = pd.read_csv("loan_payments.csv")
# Cast Datatypes
transformer = DataTransform()
df_info = DataFrameInfo(df)
df = transformer.encode_transform(df)
df = transformer.transform_digit_string(df)
df = transformer.cast_column_dtypes(df)


id --> int32
member_id --> int32
loan_amount --> int32
funded_amount --> float32
funded_amount_inv --> float32
term --> int32
int_rate --> float32
instalment --> float32
grade --> category
sub_grade --> category
employment_length --> int32
home_ownership --> category
annual_inc --> float32
verification_status --> category
issue_date --> date
loan_status --> category
payment_plan --> category
purpose --> category
dti --> float32
delinq_2yrs --> int32
earliest_credit_line --> date
inq_last_6mths --> int32
mths_since_last_delinq --> int32
mths_since_last_record --> int32
open_accounts --> int32
total_accounts --> int32
out_prncp --> float32
out_prncp_inv --> float32
total_payment --> float32
total_payment_inv --> float32
total_rec_prncp --> float32
total_rec_int --> float32
total_rec_late_fee --> float32
recoveries --> float32
collection_recovery_fee --> float32
last_payment_date --> date
last_payment_amount --> float32
next_payment_date --> date
last_credit_pull_date --> date
collections

In [3]:
df['term'].to_csv('term.csv')

In [5]:
df_info.get_counts()

Unnamed: 0,count
Unnamed: 0,54231
id,54231
member_id,54231
loan_amount,54231
funded_amount,51224
funded_amount_inv,54231
int_rate,49062
instalment,54231
annual_inc,54231
dti,54231


In [6]:
df_info.get_null_counts()

Unnamed: 0,number of nulls,percentage of nulls,percentage of non-nulls
Unnamed: 0,0,0.0,100.0
id,0,0.0,100.0
member_id,0,0.0,100.0
loan_amount,0,0.0,100.0
funded_amount,3007,5.54,94.46
funded_amount_inv,0,0.0,100.0
int_rate,5169,9.53,90.47
instalment,0,0.0,100.0
annual_inc,0,0.0,100.0
dti,0,0.0,100.0


In [7]:
df_info.get_only_columns_with_nulls()

Unnamed: 0,number of nulls,percentage of nulls,percentage of non-nulls
funded_amount,3007,5.54,94.46
int_rate,5169,9.53,90.47
mths_since_last_delinq,31002,57.17,42.83
mths_since_last_record,48050,88.6,11.4
collections_12_mths_ex_med,51,0.09,99.91
mths_since_last_major_derog,46732,86.17,13.83


In [8]:
df_info.show_categorical_columns()



Categorical Columns and their Unique Values: 
       categorical column  number of unique values
0       employment_length                       11
1          home_ownership                        5
2     verification_status                        3
3              issue_date                       61
4             loan_status                        9
5            payment_plan                        2
6                 purpose                       14
7    earliest_credit_line                      587
8       last_payment_date                       99
9       next_payment_date                       97
10  last_credit_pull_date                      102
11       application_type                        1


In [9]:
df_info.show_numerical_columns()



Numerical Columns and their Unique Values: 
             categorical column  number of unique values
0                   loan_amount                     1083
1                 funded_amount                     1123
2             funded_amount_inv                     5247
3                      int_rate                      458
4                    instalment                    19940
5                    annual_inc                     6132
6                           dti                     3611
7                   delinq_2yrs                       16
8                inq_last_6mths                       24
9        mths_since_last_delinq                      101
10       mths_since_last_record                      121
11                open_accounts                       48
12               total_accounts                       88
13                    out_prncp                    17673
14                out_prncp_inv                    17872
15                total_payment           

In [10]:
df_info.get_median()

Unnamed: 0,50%
Unnamed: 0,27115.0
id,7084590.0
member_id,8709873.0
loan_amount,12000.0
funded_amount,12000.0
funded_amount_inv,11300.0
int_rate,13.16
instalment,347.15
annual_inc,61000.0
dti,15.6


In [11]:
df_info.get_percentiles()

Unnamed: 0,min,25%,50%,75%,max
Unnamed: 0,0.0,13557.5,27115.0,40672.5,54230.0
id,55521.0,759433.0,7084590.0,8860616.0,38676120.0
member_id,70694.0,958772.0,8709873.0,10527140.0,41461850.0
loan_amount,500.0,7000.0,12000.0,18000.0,35000.0
funded_amount,500.0,7000.0,12000.0,18000.0,35000.0
funded_amount_inv,0.0,6700.0,11300.0,18000.0,35000.0
int_rate,5.42,10.37,13.16,16.2,26.06
instalment,15.67,224.205,347.15,527.55,1407.01
annual_inc,3300.0,45000.0,61000.0,86000.0,2039784.0
dti,0.0,10.2,15.6,21.26,39.91


In [12]:
df_info.get_mean()

Unnamed: 0,mean
Unnamed: 0,27115.0
id,7621797.0
member_id,8655350.0
loan_amount,13333.08
funded_amount,13229.51
funded_amount_inv,12952.62
int_rate,13.50733
instalment,400.014
annual_inc,72220.85
dti,15.86709


In [13]:
df_info.get_mean_std()

Unnamed: 0,mean,std
Unnamed: 0,27115.0,15655.29
id,7621797.0,9571362.0
member_id,8655350.0,10312810.0
loan_amount,13333.08,8082.197
funded_amount,13229.51,8019.018
funded_amount_inv,12952.62,8099.474
int_rate,13.50733,4.392893
instalment,400.014,238.92
annual_inc,72220.85,51589.34
dti,15.86709,7.623124


In [29]:
# Encoding employment_length
enc = OrdinalEncoder()
X = df[['employment_length']]
df[['employment_length']] = enc.fit_transform(X)  

In [19]:
enc.categories_

[array(['1 year', '10+ years', '2 years', '3 years', '4 years', '5 years',
        '6 years', '7 years', '8 years', '9 years', '< 1 year', nan],
       dtype=object)]

In [20]:
df[['employment_length']]

Unnamed: 0,employment_length
0,5.0
1,9.0
2,8.0
3,0.0
4,1.0
...,...
54226,0.0
54227,10.0
54228,1.0
54229,4.0
