In [1]:
import re
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# "magic" command to make plots show up in the notebook
%matplotlib inline 

In [2]:
# Load the raw loan data from lending_club.csv.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed parsed types.
# NOTE(review): the truncated warning printed under this cell looks like the
# tail of a mixed-type DtypeWarning — confirm it disappears on re-run.
df = pd.read_csv("/home/work/toy-project/data/lending_club.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Exclude every loan whose status is 'Current'; all other rows are kept
df = df.loc[df['loan_status'].ne('Current')]

## Drop columns with more than 90% NaN values

In [9]:
# Columns whose fraction of missing values exceeds 0.9 are selected and removed
drop_cols = df.columns[df.isna().mean().gt(0.9)]
df = df.drop(columns=drop_cols)

In [10]:
# Show which columns were selected for removal by the missing-value filter
drop_cols

Index(['member_id', 'desc', 'next_pymnt_d', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'revol_bal_joint',
       'sec_app_fico_range_low', 'sec_app_fico_range_high',
       'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths',
       'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',
       'sec_app_open_act_il', 'sec_app_num_rev_accts',
       'sec_app_chargeoff_within_12_mths',
       'sec_app_collections_12_mths_ex_med',
       'sec_app_mths_since_last_major_derog', 'hardship_type',
       'hardship_reason', 'hardship_status', 'deferral_term',
       'hardship_amount', 'hardship_start_date', 'hardship_end_date',
       'payment_plan_start_date', 'hardship_length', 'hardship_dpd',
       'hardship_loan_status', 'orig_projected_additional_accrued_interest',
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_perce

## Drop useless columns
- Date columns (since no feature engineering is done in the following project), titles, zip code (encrypted), and policy_code (single unique value)

In [7]:
# Remove the date columns listed here.
# `columns=` replaces the positional axis argument (`1`), which newer pandas
# releases no longer accept; reassigning instead of `inplace=True` leaves `df`
# in the same final state.
df = df.drop(columns=['last_pymnt_d', 'last_credit_pull_d', 'earliest_cr_line'])

In [8]:
# Remove the URL, the title columns, the zip code and policy_code.
# policy_code was added to the list because it has a single unique value and
# therefore carries no information.
# `columns=` replaces the positional axis argument (`1`), which newer pandas
# releases no longer accept.
df = df.drop(columns=['url', 'emp_title', 'title', 'zip_code', 'policy_code'])

In [9]:
# Number of distinct values in every remaining object-dtype column, largest first
obj_col = df.select_dtypes(include='object').columns.to_numpy()
df[obj_col].nunique().sort_values(ascending=False)

id                      1382384
issue_d                     139
addr_state                   51
sub_grade                    35
purpose                      14
emp_length                   11
loan_status                   8
grade                         7
home_ownership                6
verification_status           3
term                          2
pymnt_plan                    2
initial_list_status           2
application_type              2
hardship_flag                 2
disbursement_method           2
debt_settlement_flag          2
dtype: int64

## Drop rows filled with NaN values

In [8]:
# Inspect the rows that have no grade
df.loc[df['grade'].isnull()]

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
421095,Total amount funded in policy code 1: 6417608175,,,,,,,,,,...,,,,,,,,,,
421096,Total amount funded in policy code 2: 1944088810,,,,,,,,,,...,,,,,,,,,,
528961,Total amount funded in policy code 1: 1741781700,,,,,,,,,,...,,,,,,,,,,
528962,Total amount funded in policy code 2: 564202131,,,,,,,,,,...,,,,,,,,,,
651664,Total amount funded in policy code 1: 1791201400,,,,,,,,,,...,,,,,,,,,,
651665,Total amount funded in policy code 2: 651669342,,,,,,,,,,...,,,,,,,,,,
749520,Total amount funded in policy code 1: 1443412975,,,,,,,,,,...,,,,,,,,,,
749521,Total amount funded in policy code 2: 511988838,,,,,,,,,,...,,,,,,,,,,
877716,Total amount funded in policy code 1: 2063142975,,,,,,,,,,...,,,,,,,,,,
877717,Total amount funded in policy code 2: 823319310,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Keep only the rows that have a grade.
# .copy() gives `df` its own data, so the column assignments in the following
# cells do not raise SettingWithCopyWarning on a filtered frame.
df = df.loc[df['grade'].notna()].copy()

## Label Encoding

In [11]:
# Fixed integer codes for the low-cardinality categorical columns.
# Series.map yields NaN for any value that is not a key of its mapping.
category_codes = {
    'verification_status': {'Verified': 0, 'Source Verified': 1, 'Not Verified': 2},
    'debt_settlement_flag': {'N': 0, 'Y': 1},
    'initial_list_status': {'w': 0, 'f': 1},
    'application_type': {'Individual': 0, 'Joint App': 1},
    'hardship_flag': {'N': 0, 'Y': 1},
    'pymnt_plan': {'n': 0, 'y': 1},
    'disbursement_method': {'Cash': 0, 'DirectPay': 1},
}

for column, codes in category_codes.items():
    df[column] = df[column].map(codes)

In [12]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder is refit for every column, so after this cell `le` only
# holds the classes of the last column encoded ('term').
le = LabelEncoder()

# astype(str) turns any missing value into the literal string 'nan', which
# then receives its own integer code instead of staying NaN.
for column in ['addr_state', 'sub_grade', 'purpose', 'emp_length', 'home_ownership', 'term']:
    df[column] = le.fit_transform(df[column].astype(str))

In [12]:
# Row count per raw loan status, most frequent first
df.loan_status.value_counts()

Fully Paid                                             1076751
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: loan_status, dtype: int64

In [13]:
### Targets

# Letter grades become integer codes in alphabetical order: 'A' -> 0 ... 'G' -> 6
grade_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
grade_codes = {letter: rank for rank, letter in enumerate(grade_letters)}
df['grade'] = df['grade'].map(grade_codes)

# Binary target: statuses in `normal_statuses` map to 0, those in
# `default_statuses` map to 1. Note that 'In Grace Period' is labelled 0
# while both 'Late' statuses are labelled 1.
normal_statuses = [
    'Fully Paid',
    'In Grace Period',
    'Does not meet the credit policy. Status:Fully Paid',
]
default_statuses = [
    'Charged Off',
    'Late (31-120 days)',
    'Late (16-30 days)',
    'Does not meet the credit policy. Status:Charged Off',
    'Default',
]
status_codes = {
    **dict.fromkeys(normal_statuses, 0),
    **dict.fromkeys(default_statuses, 1),
}
df['loan_status'] = df['loan_status'].map(status_codes)

In [14]:
# df.fillna(-1,inplace = True)

In [15]:
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(df,test_size = 0.2, random_state = 2021)

In [16]:
# train.reset_index(drop=True, inplace=True)
# test.reset_index(drop=True, inplace=True)

In [17]:
# answer = test[['id','loan_status']]
# test.drop('loan_status',1, inplace = True)

In [18]:
# # Kernel Density Plot
# fig = plt.figure(figsize=(15,8),)
# ax=sns.kdeplot(df.loc[(df['loan_status'] == 0),'last_fico_range_high'] , color='gray',shade=True,label='Normal')
# ax=sns.kdeplot(df.loc[(df['loan_status'] == 1),'last_fico_range_high'] , color='g',shade=True, label='Default')
# plt.title('last_fico_range_high Distribution Normal vs Default', fontsize = 25, pad = 40)
# plt.ylabel("Frequency of Default", fontsize = 15, labelpad = 20)
# plt.xlabel("last_fico_range_high", fontsize = 15, labelpad = 20);

In [19]:
# # Kernel Density Plot
# fig = plt.figure(figsize=(15,8),)
# ax=sns.kdeplot(df.loc[(df['loan_status'] == 0),'last_fico_range_low'] , color='gray',shade=True,label='Normal')
# ax=sns.kdeplot(df.loc[(df['loan_status'] == 1),'last_fico_range_low'] , color='g',shade=True, label='Default')
# plt.title('last_fico_range_low Distribution Normal vs Default', fontsize = 25, pad = 40)
# plt.ylabel("Frequency of Default", fontsize = 15, labelpad = 20)
# plt.xlabel("last_fico_range_low", fontsize = 15, labelpad = 20);

In [14]:
# Persist the preprocessed frame as a pickle file.
# NOTE(review): the file name is spelled 'df_preprocessesd' — kept unchanged
# because other code may load it under this exact name; confirm before renaming.
output_path = '/home/work/toy-project/data/df_preprocessesd.pkl'
df.to_pickle(output_path)
# train.to_pickle("train.pkl")
# test.to_pickle("test.pkl")
# answer.to_pickle("answer.pkl")