In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Set to False to keep the full data set instead of a 10% sample.
sample_data = True

# Location of the raw declined-loan CSV, relative to the notebook.
data_directory = os.path.join('.', 'decline_data')
loan_data_path = os.path.join(data_directory, 'Declined_LoanData.csv')

# The file is decoded as latin-1; low_memory=False makes pandas process the
# file in a single pass instead of in internal chunks.
loan = pd.read_csv(loan_data_path, low_memory=False, encoding='latin-1')

# Preview the first rows with every column visible.  A bare `loan.head(3)` in
# the middle of a cell is evaluated and thrown away, so print it explicitly;
# option_context restores the display option automatically afterwards.
with pd.option_context('display.max_columns', len(loan.columns)):
    print(loan.head(3))

# See all loan table column variables datatypes.
print(loan.dtypes)

# See summary statistics for each numeric column.
print(loan.describe())

print(loan.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line.
loan.info()

# If we're sampling, reduce the data set down to a reproducible 10% of rows
# (sampling without replacement, fixed seed).
if sample_data:
    loan = loan.sample(frac=0.1, replace=False, random_state=1234, axis=0)

print(loan.shape)
loan.info()

Amount Requested        float64
Application Date         object
Loan Title               object
Risk_Score              float64
Debt-To-Income Ratio     object
Zip Code                 object
State                    object
Employment Length        object
Policy Code               int64
timestamp                 int64
dtype: object
       Amount Requested    Risk_Score   Policy Code     timestamp
count      1.107939e+07  4.676607e+06  1.107939e+07  1.107939e+07
mean       1.339154e+04  6.233829e+02  5.542545e-03  1.477929e+09
std        1.619671e+04  1.084081e+02  1.051398e-01  2.904828e+06
min        0.000000e+00  0.000000e+00  0.000000e+00  1.476812e+09
25%        4.500000e+03  5.910000e+02  0.000000e+00  1.476817e+09
50%        1.000000e+04  6.400000e+02  0.000000e+00  1.476821e+09
75%        2.000000e+04  6.780000e+02  0.000000e+00  1.476829e+09
max        1.400000e+06  9.900000e+02  2.000000e+00  1.485553e+09
(11079386, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1107938

In [2]:
# Rename the columns to match those in the accepted loan data.
# The assignment is positional, so it relies on the CSV column order.
loan.columns = ['loan_amnt', 'date', 'title', 'mean_fico', 'dti',
                'zip_code', 'addr_state', 'emp_length', 'policy_code', 'timestamp']

# These are dropped for the same reasons they were dropped in the accepted loan data.
# Reassign instead of dropping in place so the step reads as a plain transformation.
loan = loan.drop(['date', 'title', 'zip_code', 'addr_state', 'policy_code', 'timestamp'], axis=1)

# Convert dti from string, but only if it isn't already numeric.
# Checking "is numeric" (rather than for exactly float64 / object) means any
# string-like dtype is converted instead of being silently left as text.
if pd.api.types.is_numeric_dtype(loan['dti']):
    print("dti is already numeric, no need to convert it")
else:
    # Strip leading spaces and the trailing percent sign / spaces, e.g. ' 28.77%' -> '28.77'.
    loan['dti'] = loan['dti'].map(lambda x: str(x).lstrip(' ').rstrip('% '))
    print(loan['dti'][0:5])
    loan['dti'] = loan['dti'].astype('float64')

# We will clean the emp_length column to use it in data exploration.
# Every pattern is a literal string, so regex=False is passed explicitly:
# '+' is a regex metacharacter and the default of `regex` differs between
# pandas versions.  Order matters: 'years' must be removed before 'year'.
# NOTE: '< 1 year' ends up as ' 1 ' and therefore parses to 1.0, same as '1 year'.
for literal, replacement in (('+', ''), ('<', ''), ('years', ''), ('year', ''), ('n/a', '0')):
    loan['emp_length'] = loan['emp_length'].str.replace(literal, replacement, regex=False)

# Convert emp_length to float.
print("Unique values for emp_length:")
print(loan.emp_length.unique())

loan['emp_length'] = loan['emp_length'].astype('float64')

9065590     28.77
8389960     14.27
3245734      1260
10030653      100
8515327      1.05
Name: dti, dtype: object
Unique values for emp_length:
[' 1 ' '5 ' '6 ' '4 ' '0' '10 ' '2 ' '3 ' '1 ' '8 ' '9 ' '7 ']


In [3]:
# Print a concise summary of `loan` (index, columns, non-null counts, dtypes
# and memory usage) to check which columns remain and how complete they are.
loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1107939 entries, 9065590 to 4958500
Data columns (total 4 columns):
loan_amnt     1107939 non-null float64
mean_fico     467802 non-null float64
dti           1107939 non-null float64
emp_length    1107939 non-null float64
dtypes: float64(4)
memory usage: 42.3 MB


In [4]:
# Look for strongly correlated column pairs.
pairwise_corr = loan.corr()

# Keep only the strictly lower triangle: the diagonal and the mirrored upper
# half are zeroed so each pair of columns is reported at most once.
pairwise_corr.loc[:, :] = np.tril(pairwise_corr, k=-1)

# Flatten to a Series indexed by (row column, other column).
cor = pairwise_corr.stack()

# Show the pairs whose correlation exceeds the threshold in either direction.
threshold = 0.55
cor[(cor < -threshold) | (cor > threshold)]

Series([], dtype: float64)

In [5]:
# Write the cleaned rejected-loan frame to a UTF-8 CSV inside the data directory.
# The index is written too, since `index` is left at its default.
destination_filepath = os.path.join(data_directory, "Cleaned_RejectedLoanData.csv")
loan.to_csv(path_or_buf=destination_filepath, encoding="UTF-8")
