In [1]:
# Load the Kedro IPython extension into this kernel.
# NOTE(review): `catalog` is used by later cells but never defined in this notebook,
# so it is presumably injected by this extension — confirm against the Kedro docs.
%load_ext kedro.ipython

In [2]:
import logging

import matplotlib.pyplot as plt
import pandas as pd

# Display settings: let pandas auto-detect the output width and never elide columns,
# so wide frames are shown in full when previewed.
pd.options.display.width = None
pd.options.display.max_columns = None

# Notebook-scoped logger; DEBUG is the threshold set on this logger itself.
_logger = logging.getLogger(__name__)
_logger.setLevel(logging.DEBUG)

In [3]:
# Display the dataset names known to the catalog (the names passed to catalog.load later on).
catalog.list()


[1m[[0m
    [32m'train_dataset'[0m,
    [32m'test_dataset'[0m,
    [32m'submission_dataset'[0m,
    [32m'preprocessed_train_dataset'[0m,
    [32m'preprocessed_test_dataset'[0m,
    [32m'parameters'[0m
[1m][0m

In [4]:
# Load the two preprocessed splits from the catalog; every later cell works on these frames.
# Kept as two separate statements so train_df stays bound even if the second load fails.
train_df = catalog.load("preprocessed_train_dataset")
test_df = catalog.load("preprocessed_test_dataset")

In [5]:
# Size of each split.
_logger.info(f"Train dataset shape: {train_df.shape}")
_logger.info(f"Test dataset shape: {test_df.shape}")

# Columns present in exactly one of the two frames (set XOR == symmetric difference).
_logger.info(
    f"Difference in columns between both datasets: {set(train_df.columns) ^ set(test_df.columns)}"
)

# ID values that occur in both frames (set AND == intersection).
_logger.info(
    f"Customer IDs which are in both datasets: {set(train_df['ID']) & set(test_df['ID'])}"
)

In [6]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,ID,LOAN_AMOUNT,FUNDED_AMOUNT,FUNDED_AMOUNT_INVESTOR,TERM,BATCH_ENROLLED,INTEREST_RATE,GRADE,SUB_GRADE,EMPLOYMENT_DURATION,HOME_OWNERSHIP,VERIFICATION_STATUS,PAYMENT_PLAN,LOAN_TITLE,DEBIT_TO_INCOME,DELINQUENCY_TWO_YEARS,INQUIRES_SIX_MONTHS,OPEN_ACCOUNT,PUBLIC_RECORD,REVOLVING_BALANCE,REVOLVING_UTILITIES,TOTAL_ACCOUNTS,INITIAL_LIST_STATUS,TOTAL_RECEIVED_INTEREST,TOTAL_RECEIVED_LATE_FEE,RECOVERIES,COLLECTION_RECOVERY_FEE,COLLECTION_12_MONTHS_MEDICAL,APPLICATION_TYPE,LAST_WEEK_PAY,ACCOUNTS_DELINQUENT,TOTAL_COLLECTION_AMOUNT,TOTAL_CURRENT_BALANCE,TOTAL_REVOLVING_CREDIT_LIMIT,LOAN_STATUS,__HOME_OWNERSHIP__
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,MORTGAGE,Not Verified,n,Debt Consolidation,16.284758,1,0,13,0,24246,74.932551,7,WAITING,2929.646315,0.102055,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0,176346.6267
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,RENT,Source Verified,n,Debt consolidation,15.412409,0,0,12,0,812,78.297186,13,FORWARDED,772.769385,0.036181,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0,39833.921
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,MORTGAGE,Source Verified,n,Debt Consolidation,28.137619,0,0,14,0,1843,2.07304,20,WAITING,863.324396,18.77866,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0,91506.69105
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,MORTGAGE,Source Verified,n,Debt consolidation,18.04373,1,0,7,0,13819,67.467951,12,WAITING,288.173196,0.044131,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0,108286.5759
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,MORTGAGE,Source Verified,n,Credit card refinancing,17.209886,1,3,13,1,1544,85.250761,22,WAITING,129.239553,19.306646,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0,44234.82545


In [9]:
# Preview the first rows of the test frame.
# In the rows shown by the recorded output, LOAN_STATUS is empty for this frame.
test_df.head()

Unnamed: 0,ID,LOAN_AMOUNT,FUNDED_AMOUNT,FUNDED_AMOUNT_INVESTOR,TERM,BATCH_ENROLLED,INTEREST_RATE,GRADE,SUB_GRADE,EMPLOYMENT_DURATION,HOME_OWNERSHIP,VERIFICATION_STATUS,PAYMENT_PLAN,LOAN_TITLE,DEBIT_TO_INCOME,DELINQUENCY_TWO_YEARS,INQUIRES_SIX_MONTHS,OPEN_ACCOUNT,PUBLIC_RECORD,REVOLVING_BALANCE,REVOLVING_UTILITIES,TOTAL_ACCOUNTS,INITIAL_LIST_STATUS,TOTAL_RECEIVED_INTEREST,TOTAL_RECEIVED_LATE_FEE,RECOVERIES,COLLECTION_RECOVERY_FEE,COLLECTION_12_MONTHS_MEDICAL,APPLICATION_TYPE,LAST_WEEK_PAY,ACCOUNTS_DELINQUENT,TOTAL_COLLECTION_AMOUNT,TOTAL_CURRENT_BALANCE,TOTAL_REVOLVING_CREDIT_LIMIT,LOAN_STATUS,__HOME_OWNERSHIP__
0,56492997,17120,10365,16025.08269,59,BAT2575549,12.163926,A,D1,RENT,RENT,Source Verified,n,Home improvement,16.749219,1,0,12,1,3576,67.278287,5,FORWARDED,4469.449851,0.088031,8.425776,0.731797,0,INDIVIDUAL,135,0,24,475442,4364,,76468.8219
1,22540813,7133,11650,12615.7956,59,BAT2833642,6.564296,B,E3,MORTGAGE,MORTGAGE,Source Verified,n,Credit card refinancing,18.157975,0,0,11,0,1932,71.313157,21,WAITING,993.90753,0.041237,6.157008,0.992918,0,INDIVIDUAL,56,0,1,72412,2573,,38079.01344
2,9862181,25291,25825,11621.28083,59,BAT1761981,14.7299,A,C3,MORTGAGE,MORTGAGE,Source Verified,n,Debt consolidation,15.190011,0,0,22,0,598,50.883065,23,FORWARDED,729.113379,0.021745,5.705077,0.28158,0,INDIVIDUAL,3,0,26,284825,19676,,51275.93268
3,10097822,30781,9664,15375.82351,59,BAT5341619,10.523767,A,A2,RENT,RENT,Verified,n,Debt Consolidation,21.29255,0,0,11,0,5222,82.449083,28,WAITING,715.867091,0.092398,2.469688,0.959162,0,INDIVIDUAL,21,0,32,40842,7226,,68867.98965
4,47771809,8878,9419,7176.647582,58,BAT4694572,9.997013,C,B3,OWN,OWN,Verified,n,Debt consolidation,4.812117,0,0,11,0,553,49.075855,9,WAITING,248.572854,0.010354,2.127835,0.402315,0,INDIVIDUAL,104,0,33,90825,26145,,91556.85423


In [10]:
# Count missing values per column in the training frame.
train_df.isna().sum()


ID                              0
LOAN_AMOUNT                     0
FUNDED_AMOUNT                   0
FUNDED_AMOUNT_INVESTOR          0
TERM                            0
BATCH_ENROLLED                  0
INTEREST_RATE                   0
GRADE                           0
SUB_GRADE                       0
EMPLOYMENT_DURATION             0
HOME_OWNERSHIP                  0
VERIFICATION_STATUS             0
PAYMENT_PLAN                    0
LOAN_TITLE                      0
DEBIT_TO_INCOME                 0
DELINQUENCY_TWO_YEARS           0
INQUIRES_SIX_MONTHS             0
OPEN_ACCOUNT                    0
PUBLIC_RECORD                   0
REVOLVING_BALANCE               0
REVOLVING_UTILITIES             0
TOTAL_ACCOUNTS                  0
INITIAL_L

In [11]:
# Count missing values per column in the test frame.
test_df.isna().sum()


ID                                  0
LOAN_AMOUNT                         0
FUNDED_AMOUNT                       0
FUNDED_AMOUNT_INVESTOR              0
TERM                                0
BATCH_ENROLLED                      0
INTEREST_RATE                       0
GRADE                               0
SUB_GRADE                           0
EMPLOYMENT_DURATION                 0
HOME_OWNERSHIP                      0
VERIFICATION_STATUS                 0
PAYMENT_PLAN                        0
LOAN_TITLE                          0
DEBIT_TO_INCOME                     0
DELINQUENCY_TWO_YEARS               0
INQUIRES_SIX_MONTHS                 0
OPEN_ACCOUNT                        0
PUBLIC_RECORD                       0
REVOLVING_BALANCE                   0
REVOLVING_UTILITIES