# 2-Wrangling

This sections aims to deal with outliers and missing data.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import pickle
import re

In [2]:
loan_data=pd.read_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\raw\\loan.csv", low_memory=False)

In [3]:
feature_desc=pd.read_excel("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\raw\\LCDataDictionary.xlsx")

In [6]:
loan_data.isnull().sum()

id                                            2260668
member_id                                     2260668
loan_amnt                                           0
funded_amnt                                         0
funded_amnt_inv                                     0
term                                                0
int_rate                                            0
installment                                         0
grade                                               0
sub_grade                                           0
emp_title                                      166969
emp_length                                     146907
home_ownership                                      0
annual_inc                                          4
verification_status                                 0
issue_d                                             0
loan_status                                         0
pymnt_plan                                          0
url                         

In [7]:
loan_data.columns.values

array(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       'emp_title', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'pymnt_plan',
       'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti',
       'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d',
       'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint

There are features in this dataset that are not required, so let's remove examine which ones we need and not, one by one.

But before we do that, we will split the set into train and test splits.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(loan_data.drop('loan_status', axis=1).values, 
                                                    loan_data.loan_status.values,
                                                    test_size=0.25,
                                                    stratify=loan_data.loan_status.values)

It took a while to split the data, so we will save the notebook using pickle.

In [12]:
with open('train.pickle', 'wb') as f:
    pickle.dump([X_train, y_train], f)

In [13]:
with open('test.pickle', 'wb') as f:
    pickle.dump([X_test, y_test], f)

In [4]:
with open('train.pickle', 'rb') as f:
    X_train, y_train = pickle.load(f)

In [5]:
with open('test.pickle', 'rb') as f:
    X_test, y_test = pickle.load(f)

Let's check if the target variable has any null values in both the test and training set.

In [9]:
np.sum(loan_data.loan_status.isnull())

0

The target variable does not have any null values.

Let's creat 2 dataframes using the training and testing set and renaming the columns so they are easily identifiable.

In [11]:
 len(X_test), len(X_train)

(565167, 1695501)

In [12]:
loan_train=pd.DataFrame(X_train, columns=loan_data.columns.drop("loan_status"))

In [13]:
loan_test=pd.DataFrame(X_test, columns=loan_data.columns.drop("loan_status"))

We will start by removing columns that have all null values.

In [14]:
null_coumns=loan_train.columns[loan_train.isnull().sum()==len(loan_train)]

In [15]:
null_coumns

Index(['id', 'member_id', 'url'], dtype='object')

We see that following columns can be dropped as they are all null values and are insignificant:

1. 'id'
2. 'member_id'
3. 'url'



In [16]:
loan_train.drop(null_coumns, axis=1,inplace=True)

In [17]:
loan_test.drop(null_coumns, axis=1,inplace=True)

We see that there are a few columns with a significant number of null values. We will set a threshold of 75%. If column values are more than 75% null, they will be dropped as they will not provide much significance when predicting.

In [18]:
null_columns_75=loan_train.columns[(loan_train.isnull().sum()/len(loan_train))>=0.75]

In [19]:
null_columns_75

Index(['desc', 'mths_since_last_record', 'annual_inc_joint', 'dti_joint',
       'verification_status_joint', 'mths_since_recent_bc_dlq',
       'revol_bal_joint', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths',
       'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util',
       'sec_app_open_act_il', 'sec_app_num_rev_accts',
       'sec_app_chargeoff_within_12_mths',
       'sec_app_collections_12_mths_ex_med',
       'sec_app_mths_since_last_major_derog', 'hardship_type',
       'hardship_reason', 'hardship_status', 'deferral_term',
       'hardship_amount', 'hardship_start_date', 'hardship_end_date',
       'payment_plan_start_date', 'hardship_length', 'hardship_dpd',
       'hardship_loan_status', 'orig_projected_additional_accrued_interest',
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      d

Just by looking at the columns, none of them would be significant in predicting lateness.

In [20]:
loan_train.drop(null_columns_75, axis=1,inplace=True)

In [21]:
loan_test.drop(null_columns_75, axis=1,inplace=True)

In [23]:
loan_train.head()

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
0,8000,8000,8000,36 months,10.99,261.88,B,B2,Legal Assistant,10+ years,...,50,0,0,11100,8116,7800,0,N,Cash,N
1,3900,3900,3900,36 months,12.99,131.39,C,C1,radiographer,6 years,...,0,1,0,25881,10129,7750,13531,N,Cash,N
2,18000,18000,18000,60 months,7.97,364.72,A,A5,Senior Losn Examiner,10+ years,...,0,0,0,305986,82784,35600,117986,N,Cash,N
3,3500,3500,3500,36 months,15.02,121.37,C,C3,detective,10+ years,...,75,1,0,71470,51208,3500,67970,N,Cash,N
4,22000,22000,21950,36 months,7.62,685.55,A,A3,Aptean,9 years,...,25,0,0,345978,44605,17700,50928,N,Cash,N


Our features are now 104, down from 145, a 27.5% reduction.

We now need to deal with all the missing values. Let's find the columsn with missing values.

In [24]:
columns_nan=loan_train.columns[loan_train.isnull().sum()!=0].values

In [25]:
len(columns_nan)

73

In [35]:
columns_nan

array(['emp_title', 'emp_length', 'annual_inc', 'title', 'zip_code',
       'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_util',
       'total_acc', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
       'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_e

There are 73 columns with nan values, we need to deal with them.

1. 'emp_title', Employment title. The entries are strings. If there are Nan's, they will be classified them as "Not Given"
2. 'emp_length',Employment Length. The entries are strings. If there are Nan's, they we will classified them as "Not Given"
3. 'annual_inc', Annual Income. The entries are  floats. If there are Nan's, they will be classified as 0
4. 'title', Purpose Title is just another column of purpose but human readable, so this column will be removed.
5. 'zip_code', Zip code is that of loanee addresses.  This column will not be useful in our analysis and can be dropped. 
6. 'dti' is debt-to-income ratio. This is a significant ratio. Entries are floats. Nan's will be replaced with the mean. 

In [26]:
loan_train['emp_title']=loan_train['emp_title'].fillna("Not Given")
loan_test['emp_title']=loan_test['emp_title'].fillna("Not Given")

In [27]:
loan_train['emp_length']=loan_train['emp_length'].fillna("Not Given")
loan_test['emp_length']=loan_test['emp_length'].fillna("Not Given")

In [28]:
loan_train['annual_inc']=loan_train['annual_inc'].fillna(0)
loan_test['annual_inc']=loan_test['annual_inc'].fillna(0)

In [29]:
loan_train.drop('title', inplace=True, axis=1)
loan_test.drop('title', inplace=True, axis=1)

In [30]:
loan_train.drop('zip_code', inplace=True, axis=1)
loan_test.drop('zip_code', inplace=True, axis=1)

In [31]:
loan_train['dti']=loan_train['dti'].fillna(np.mean(loan_train['dti']))
loan_test['dti']=loan_test['dti'].fillna(np.mean(loan_train['dti']))

Let's refresh so we have a less crowded list. As the other columns will require us to see the description what they are precisely.

In [42]:
columns_nan_1=loan_train.columns[loan_train.isnull().sum()!=0].values

In [43]:
len(columns_nan_1)

67

In [44]:
columns_nan_1

array(['delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths',
       'mths_since_last_delinq', 'open_acc', 'pub_rec', 'revol_util',
       'total_acc', 'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
       'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num

There are 67 remaining columns, let's start!!

1. 'delinq_2yrs is a counter of how many time a loanee was 30+ days late over the past 2 years. Entries are floats. Nan's will be replaced with 0.
2. 'earliest_cr_line'.This when the borrowers first credit line was opened. Entries are strings with dates(Month-Year). If it's Nan, it is most probable thay they never had a credit line. They will be filled using back fills.
3. 'inq_last_6mths'. This is  a count of the number of inquiries over the last 6 months. Entries are integers. Nan's will be replaced with 0.
4. 'open_acc' is the number of current open credit lines for the borrower. These are integers and are rarely zero. Nan's will be replaced with the mean.
5. 'pub_rec'is the number of deragotory public record which is a sever case(Bankruptcy, civil court etc). These values are most commonly 0. Entries will be raplced with 0.
6. 'revol_util'. This is revolving account utilization rate in %. Entries are floats. (Example (Credit Card Balance/Credit Limit)). We will replace them the mean.
7. 'total_acc' is the number of credit lines in borrowers file. This has a large range of integers(0-100). But for ease of analysis, we will use the mean.
8. 'last_pymnt_d'. This is a string of the date of the latest payment. We will replace Nan's with backfill.
9. 'next_pymnt_d'. This is a string with the next schedule payment date. Nan's will be replaced with backfill.
10. 'last_credit_pull_d'. This is the most recent month Lending Club pulled a credit check for the borrower. Nan's will be replaced with backfill.

In [45]:
feature_desc[feature_desc['LoanStatNew']=='delinq_2yrs']['Description'].values

array(["The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years"],
      dtype=object)

In [32]:
loan_train['delinq_2yrs']=loan_train['delinq_2yrs'].fillna(0)
loan_test['delinq_2yrs']=loan_test['delinq_2yrs'].fillna(0)

In [47]:
feature_desc[feature_desc['LoanStatNew']=='earliest_cr_line']['Description'].values

array(["The month the borrower's earliest reported credit line was opened"],
      dtype=object)

In [33]:
loan_train['earliest_cr_line']=loan_train['earliest_cr_line'].fillna(method='backfill')
loan_test['earliest_cr_line']=loan_test['earliest_cr_line'].fillna(method='backfill')

In [49]:
feature_desc[feature_desc['LoanStatNew']=='inq_last_6mths']['Description'].values

array(['The number of inquiries in past 6 months (excluding auto and mortgage inquiries)'],
      dtype=object)

In [34]:
loan_train['inq_last_6mths']=loan_train['inq_last_6mths'].fillna(0)
loan_test['inq_last_6mths']=loan_test['inq_last_6mths'].fillna(0)

In [51]:
feature_desc[feature_desc['LoanStatNew']=='open_acc']['Description'].values

array(["The number of open credit lines in the borrower's credit file."],
      dtype=object)

In [35]:
loan_train['open_acc']=loan_train['open_acc'].fillna(np.mean(loan_train['open_acc']))
loan_test['open_acc']=loan_test['open_acc'].fillna(np.mean(loan_train['open_acc']))

In [53]:
feature_desc[feature_desc['LoanStatNew']=='pub_rec']['Description'].values

array(['Number of derogatory public records'], dtype=object)

In [36]:
loan_train['pub_rec']=loan_train['pub_rec'].fillna(0)
loan_test['pub_rec']=loan_test['pub_rec'].fillna(0)

In [55]:
feature_desc[feature_desc['LoanStatNew']=='revol_util']['Description'].values

array(['Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.'],
      dtype=object)

In [37]:
loan_train['revol_util']=loan_train['revol_util'].fillna(np.mean(loan_train['revol_util']))
loan_test['revol_util']=loan_test['revol_util'].fillna(np.mean(loan_train['revol_util']))

In [57]:
feature_desc[feature_desc['LoanStatNew']=='total_acc']['Description'].values

array(["The total number of credit lines currently in the borrower's credit file"],
      dtype=object)

In [38]:
loan_train['total_acc']=loan_train['total_acc'].fillna(np.mean(loan_train['total_acc']))
loan_test['total_acc']=loan_test['total_acc'].fillna(np.mean(loan_train['total_acc']))

In [59]:
feature_desc[feature_desc['LoanStatNew']=='last_pymnt_d']['Description'].values

array(['Last month payment was received'], dtype=object)

In [39]:
loan_train['last_pymnt_d']=loan_train['last_pymnt_d'].fillna(method='backfill')
loan_test['last_pymnt_d']=loan_test['last_pymnt_d'].fillna(method='backfill')

In [61]:
feature_desc[feature_desc['LoanStatNew']=='next_pymnt_d']['Description'].values

array(['Next scheduled payment date'], dtype=object)

In [40]:
loan_train['next_pymnt_d']=loan_train['next_pymnt_d'].fillna(method='ffill')
loan_test['next_pymnt_d']=loan_test['next_pymnt_d'].fillna(method='ffill')

In [63]:
feature_desc[feature_desc['LoanStatNew']=='last_credit_pull_d']['Description'].values

array(['The most recent month LC pulled credit for this loan'],
      dtype=object)

In [41]:
loan_train['last_credit_pull_d']=loan_train['last_credit_pull_d'].fillna(method='backfill')
loan_test['last_credit_pull_d']=loan_test['last_credit_pull_d'].fillna(method='backfill')

Again Let's refresh so we have a new set to work with.

In [65]:
columns_nan_2=loan_train.columns[loan_train.isnull().sum()!=0].values

In [66]:
len(columns_nan_2)

58

In [67]:
columns_nan_2

array(['mths_since_last_delinq', 'next_pymnt_d',
       'collections_12_mths_ex_med', 'mths_since_last_major_derog',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m',
       'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il',
       'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m',
       'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths',
       'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 'mths_since_recent_bc', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl',
       'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
       'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl

1. 'mths_since_last_delinq'. This is the number of months since the borrower was last 30+ days late. Nan's are probably for borrowers who were never 30+ days late and will be replaced with zeros.
2. 'collections_12_mths_ex_med'. This is the number of collections made for payments that were late in the last 12 months. Nan's will be replaced with 0.
3. 'mths_since_last_major_derog'. This is the number of months since there was 90-day or worse rating. Nan's will be replaced with 0.
4. 'acc_now_delinq'. This is the number of accounts that the borrower is delinquent(30+ days late). Most of the results are 0. Nans will be replaced with 0.
5. 'tot_coll_amt'. This is the total collected amount ever owed. Entries are floats represnting dollar amounts. Nans will be replaced with zeros.
6. 'tot_cur_bal' is the total current balance owed on all accounts.Nans will be replaced with 0s.
7. 'open_acc_6m' is the number of open trades in the last 6 months.Nans will be replaced with 0s.
8. 'open_act_il' is the number of currently active installment trades. Nans will be replaced with the mean round to the nearest integer.
9. 'open_il_12m' is the number of installment accounts opened in the last 12 months.Nans will be replaced with the mean round to the nearest integer.
10. 'open_il_24m' is the number of installment accounts opened in the last 12 months.Nans will be replaced with the mean round to the nearest integer.
11. 'mths_since_rcnt_il' number of months since most recent installment account opened.Nans will be replaced with the mean round to the nearest integer.
12. 'total_bal_il'. Current balance of only installment accounts. Nans will be replaced with the mean.
13. 'il_util'. Ratio of current balance to credit limit on installment acounts.Nans will be replaced with the mean.
14. 'open_rv_12m'. Number of revoliving accounts opened in the last 12 months. Nans will be replaced with the mean round to the nearest integer..
15. 'open_rv_24m'. Number of revoliving accounts opened in the last 24 months. Nans will be replaced with the mean round to the nearest integer.
16. 'max_bal_bc'. Maximum current balanced owed on all revolving accounts.Nans will be replaced with the mean.
17. 'all_util'. This is the balance to credit limit ratio on all trades.Nans will be replaced with the mean.

In [68]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_last_delinq']['Description'].values

array(["The number of months since the borrower's last delinquency."],
      dtype=object)

In [42]:
loan_train['mths_since_last_delinq']=loan_train['mths_since_last_delinq'].fillna(0)
loan_test['mths_since_last_delinq']=loan_test['mths_since_last_delinq'].fillna(0)

In [70]:
feature_desc[feature_desc['LoanStatNew']=='collections_12_mths_ex_med']['Description'].values

array(['Number of collections in 12 months excluding medical collections'],
      dtype=object)

In [43]:
loan_train['collections_12_mths_ex_med']=loan_train['collections_12_mths_ex_med'].fillna(0)
loan_test['collections_12_mths_ex_med']=loan_test['collections_12_mths_ex_med'].fillna(0)

In [72]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_last_major_derog']['Description'].values

array(['Months since most recent 90-day or worse rating'], dtype=object)

In [44]:
loan_train['mths_since_last_major_derog']=loan_train['mths_since_last_major_derog'].fillna(0)
loan_test['mths_since_last_major_derog']=loan_test['mths_since_last_major_derog'].fillna(0)

In [74]:
feature_desc[feature_desc['LoanStatNew']=='acc_now_delinq']['Description'].values

array(['The number of accounts on which the borrower is now delinquent.'],
      dtype=object)

In [45]:
loan_train['acc_now_delinq']=loan_train['acc_now_delinq'].fillna(0)
loan_test['acc_now_delinq']=loan_test['acc_now_delinq'].fillna(0)

In [76]:
feature_desc[feature_desc['LoanStatNew']=='tot_coll_amt']['Description'].values

array(['Total collection amounts ever owed'], dtype=object)

In [None]:
loan_train['tot_coll_amt']=loan_train['tot_coll_amt'].fillna(0)
loan_test['tot_coll_amt']=loan_test['tot_coll_amt'].fillna(0)

In [78]:
feature_desc[feature_desc['LoanStatNew']=='tot_cur_bal']['Description'].values

array(['Total current balance of all accounts'], dtype=object)

In [None]:
loan_train['tot_cur_bal']=loan_train['tot_cur_bal'].fillna(0)
loan_test['tot_cur_bal']=loan_test['tot_cur_bal'].fillna(0)

In [80]:
feature_desc[feature_desc['LoanStatNew']=='open_acc_6m']['Description'].values

array(['Number of open trades in last 6 months'], dtype=object)

In [None]:
loan_train['open_acc_6m']=loan_train['open_acc_6m'].fillna(0)
loan_test['open_acc_6m']=loan_test['open_acc_6m'].fillna(0)

In [82]:
feature_desc[feature_desc['LoanStatNew']=='open_act_il']['Description'].values

array(['Number of currently active installment trades'], dtype=object)

In [None]:
loan_train['open_act_il']=loan_train['open_act_il'].fillna(np.round(np.mean(loan_train['open_act_il']),0))
loan_test['open_act_il']=loan_test['open_act_il'].fillna(np.round(np.mean(loan_train['open_act_il']),0))

In [84]:
feature_desc[feature_desc['LoanStatNew']=='open_il_12m']['Description'].values

array(['Number of installment accounts opened in past 12 months'],
      dtype=object)

In [None]:
loan_train['open_il_12m']=loan_train['open_il_12m'].fillna(np.round(np.mean(loan_train['open_il_12m']),0))
loan_test['open_il_12m']=loan_test['open_il_12m'].fillna(np.round(np.mean(loan_train['open_il_12m']),0))

In [86]:
feature_desc[feature_desc['LoanStatNew']=='open_il_24m']['Description'].values

array(['Number of installment accounts opened in past 24 months'],
      dtype=object)

In [None]:
loan_train['open_il_24m']=loan_train['open_il_24m'].fillna(np.round(np.mean(loan_train['open_il_24m']),0))
loan_test['open_il_24m']=loan_test['open_il_24m'].fillna(np.round(np.mean(loan_train['open_il_24m']),0))

In [88]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_rcnt_il']['Description'].values

array(['Months since most recent installment accounts opened'],
      dtype=object)

In [None]:
loan_train['mths_since_rcnt_il']=loan_train['mths_since_rcnt_il'].fillna(np.round(np.mean(loan_train['mths_since_rcnt_il']),0))
loan_test['mths_since_rcnt_il']=loan_test['mths_since_rcnt_il'].fillna(np.round(np.mean(loan_train['mths_since_rcnt_il']),0))

In [90]:
feature_desc[feature_desc['LoanStatNew']=='total_bal_il']['Description'].values

array(['Total current balance of all installment accounts'], dtype=object)

In [None]:
loan_train['total_bal_il']=loan_train['total_bal_il'].fillna(np.mean(loan_train['total_bal_il']))
loan_test['total_bal_il']=loan_test['total_bal_il'].fillna(np.mean(loan_train['total_bal_il']))

In [92]:
feature_desc[feature_desc['LoanStatNew']=='il_util']['Description'].values

array(['Ratio of total current balance to high credit/credit limit on all install acct'],
      dtype=object)

In [None]:
loan_train['il_util']=loan_train['il_util'].fillna(np.mean(loan_train['il_util']))
loan_test['il_util']=loan_test['il_util'].fillna(np.mean(loan_train['il_util']))

In [94]:
feature_desc[feature_desc['LoanStatNew']=='open_rv_12m']['Description'].values

array(['Number of revolving trades opened in past 12 months'],
      dtype=object)

In [None]:
loan_train['open_rv_12m']=loan_train['open_rv_12m'].fillna(np.round(np.mean(loan_train['open_rv_12m']),0))
loan_test['open_rv_12m']=loan_test['open_rv_12m'].fillna(np.round(np.mean(loan_train['open_rv_12m']),0))

In [None]:
loan_train['open_rv_24m']=loan_train['open_rv_24m'].fillna(np.round(np.mean(loan_train['open_rv_24m']),0))
loan_test['open_rv_24m']=loan_test['open_rv_24m'].fillna(np.round(np.mean(loan_train['open_rv_24m']),0))

In [97]:
feature_desc[feature_desc['LoanStatNew']=='max_bal_bc']['Description'].values

array(['Maximum current balance owed on all revolving accounts'],
      dtype=object)

In [None]:
loan_train['max_bal_bc']=loan_train['max_bal_bc'].fillna(np.mean(loan_train['max_bal_bc']))
loan_test['max_bal_bc']=loan_test['max_bal_bc'].fillna(np.mean(loan_train['max_bal_bc']))

In [99]:
feature_desc[feature_desc['LoanStatNew']=='all_util']['Description'].values

array(['Balance to credit limit on all trades'], dtype=object)

In [None]:
loan_train['all_util']=loan_train['all_util'].fillna(np.mean(loan_train['all_util']))
loan_test['all_util']=loan_test['all_util'].fillna(np.mean(loan_train['all_util']))

Once again, let's refresh to clear the full columns.

In [101]:
columns_nan_3=loan_train.columns[loan_train.isnull().sum()!=0].values

In [102]:
len(columns_nan_3)

41

In [103]:
columns_nan_3

array(['next_pymnt_d', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl',
       'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths',
       'delinq_amnt', 'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op',
       'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl', 'mort_acc',
       'mths_since_recent_bc', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl',
       'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
       'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
       'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
       'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort',
       'total_bc_limit', 'total_il_high_credit_limit'], dtype=object)

1. 'total_rev_hi_lim'. We do not have a desciption but we can assume this is the high limit of all revolinvg accounts. Nans can be replaced with means.
2. 'inq_fi'. Number of personal finance credit inquiries. Nans can be raplaced with the mean rounded to the nearest integer
3. 'total_cu_tl'. Number of finance trades in total.Nans can be raplaced with the mean rounded to the nearest integer.
4. 'inq_last_12m'. This is the number of credit inquireis in the last 12 months. Nans can be raplaced with the mean rounded to the nearest integer.
5. 'acc_open_past_24mths'. Number of trades opened in the last 24 months.Nans can be raplaced with the mean rounded to the nearest integer.
6. 'avg_cur_bal'. This is the average current balance of all accounts. Nans can be replaced with the mean.
7. 'bc_open_to_buy'. This is the total dollar value to buy on revolving bankcards. Nans can be replaced with the mean 
8. 'bc_util'. This is the ratio of total current balance to high credit/credit limit for all bankcard accounts. This can be replaced with the mean.
9. 'chargeoff_within_12_mths' Number of charges off in the last 12 months. Nans can be replced with 0.
10. 'delinq_amnt'. The past-du amount for the accounts that the borrower is now delinquent. Nans can be replaced with 0s.
11. 'mo_sin_old_il_acct'. This the number of months since oldest bank installment account opened.Nans can be raplaced with the mean rounded to the nearest integer.
12. 'mo_sin_old_rev_tl_op'. This is the number of months since oldest revolving account opened. Nans can be raplaced with the mean rounded to the nearest integer.
13. 'mo_sin_rcnt_rev_tl_op'. This is the number of months since the most recent revolving account opened.Nans can be raplaced with the mean rounded to the nearest integer.
14. 'mo_sin_rcnt_tl'. This is the number of months since the most recent account opened. Nans can be raplaced with the mean rounded to the nearest integer.  
15. 'mort_acc'. This is the number of mortgage accounts. Nans can be raplaced with the mean rounded to the nearest integer.

In [104]:
feature_desc[feature_desc['LoanStatNew']=='total_rev_hi_lim']['Description'].values

array([], dtype=object)

In [None]:
loan_train['total_rev_hi_lim']=loan_train['total_rev_hi_lim'].fillna(np.mean(loan_train['total_rev_hi_lim']))
loan_test['total_rev_hi_lim']=loan_test['total_rev_hi_lim'].fillna(np.mean(loan_train['total_rev_hi_lim']))

In [106]:
feature_desc[feature_desc['LoanStatNew']=='inq_fi']['Description'].values

array(['Number of personal finance inquiries'], dtype=object)

In [None]:
loan_train['inq_fi']=loan_train['inq_fi'].fillna(np.round(np.mean(loan_train['inq_fi']),0))
loan_test['inq_fi']=loan_test['inq_fi'].fillna(np.round(np.mean(loan_train['inq_fi']),0))

In [108]:
feature_desc[feature_desc['LoanStatNew']=='total_cu_tl']['Description'].values

array(['Number of finance trades'], dtype=object)

In [None]:
loan_train['total_cu_tl']=loan_train['total_cu_tl'].fillna(np.round(np.mean(loan_train['total_cu_tl']),0))
loan_test['total_cu_tl']=loan_test['total_cu_tl'].fillna(np.round(np.mean(loan_train['total_cu_tl']),0))

In [110]:
feature_desc[feature_desc['LoanStatNew']=='inq_last_12m']['Description'].values

array(['Number of credit inquiries in past 12 months'], dtype=object)

In [None]:
loan_train['inq_last_12m']=loan_train['inq_last_12m'].fillna(np.round(np.mean(loan_train['inq_last_12m']),0))
loan_test['inq_last_12m']=loan_test['inq_last_12m'].fillna(np.round(np.mean(loan_train['inq_last_12m']),0))

In [112]:
feature_desc[feature_desc['LoanStatNew']=='acc_open_past_24mths']['Description'].values

array(['Number of trades opened in past 24 months.'], dtype=object)

In [None]:
loan_train['acc_open_past_24mths']=loan_train['acc_open_past_24mths'].fillna(np.round(np.mean(loan_train['acc_open_past_24mths']),0))
loan_test['acc_open_past_24mths']=loan_test['acc_open_past_24mths'].fillna(np.round(np.mean(loan_train['acc_open_past_24mths']),0))

In [114]:
feature_desc[feature_desc['LoanStatNew']=='avg_cur_bal']['Description'].values

array(['Average current balance of all accounts'], dtype=object)

In [None]:
loan_train['avg_cur_bal']=loan_train['avg_cur_bal'].fillna(np.mean(loan_train['avg_cur_bal']))
loan_test['avg_cur_bal']=loan_test['avg_cur_bal'].fillna(np.mean(loan_train['avg_cur_bal']))

In [116]:
feature_desc[feature_desc['LoanStatNew']=='bc_open_to_buy']['Description'].values

array(['Total open to buy on revolving bankcards.'], dtype=object)

In [None]:
loan_train['bc_open_to_buy']=loan_train['bc_open_to_buy'].fillna(np.mean(loan_train['bc_open_to_buy']))
loan_test['bc_open_to_buy']=loan_test['bc_open_to_buy'].fillna(np.mean(loan_train['bc_open_to_buy']))

In [118]:
feature_desc[feature_desc['LoanStatNew']=='bc_util']['Description'].values

array(['Ratio of total current balance to high credit/credit limit for all bankcard accounts.'],
      dtype=object)

In [None]:
loan_train['bc_util']=loan_train['bc_util'].fillna(np.mean(loan_train['bc_util']))
loan_test['bc_util']=loan_test['bc_util'].fillna(np.mean(loan_train['bc_util']))

In [120]:
feature_desc[feature_desc['LoanStatNew']=='chargeoff_within_12_mths']['Description'].values

array(['Number of charge-offs within 12 months'], dtype=object)

In [None]:
loan_train['chargeoff_within_12_mths']=loan_train['chargeoff_within_12_mths'].fillna(0)
loan_test['chargeoff_within_12_mths']=loan_test['chargeoff_within_12_mths'].fillna(0)

In [122]:
feature_desc[feature_desc['LoanStatNew']=='delinq_amnt']['Description'].values

array(['The past-due amount owed for the accounts on which the borrower is now delinquent.'],
      dtype=object)

In [None]:
loan_train['delinq_amnt']=loan_train['delinq_amnt'].fillna(0)
loan_test['delinq_amnt']=loan_test['delinq_amnt'].fillna(0)

In [124]:
feature_desc[feature_desc['LoanStatNew']=='mo_sin_old_il_acct']['Description'].values

array(['Months since oldest bank installment account opened'],
      dtype=object)

In [None]:
loan_train['mo_sin_old_il_acct']=loan_train['mo_sin_old_il_acct'].fillna(np.round(np.mean(loan_train['mo_sin_old_il_acct']),0))
loan_test['mo_sin_old_il_acct']=loan_test['mo_sin_old_il_acct'].fillna(np.round(np.mean(loan_train['mo_sin_old_il_acct']),0))

In [126]:
feature_desc[feature_desc['LoanStatNew']=='mo_sin_old_rev_tl_op']['Description'].values

array(['Months since oldest revolving account opened'], dtype=object)

In [None]:
loan_train['mo_sin_old_rev_tl_op']=loan_train['mo_sin_old_rev_tl_op'].fillna(np.round(np.mean(loan_train['mo_sin_old_rev_tl_op']),0))
loan_test['mo_sin_old_rev_tl_op']=loan_test['mo_sin_old_rev_tl_op'].fillna(np.round(np.mean(loan_train['mo_sin_old_rev_tl_op']),0))

In [128]:
feature_desc[feature_desc['LoanStatNew']=='mo_sin_rcnt_rev_tl_op']['Description'].values

array(['Months since most recent revolving account opened'], dtype=object)

In [None]:
loan_train['mo_sin_rcnt_rev_tl_op']=loan_train['mo_sin_rcnt_rev_tl_op'].fillna(np.round(np.mean(loan_train['mo_sin_rcnt_rev_tl_op']),0))
loan_test['mo_sin_rcnt_rev_tl_op']=loan_test['mo_sin_rcnt_rev_tl_op'].fillna(np.round(np.mean(loan_train['mo_sin_rcnt_rev_tl_op']),0))

In [130]:
feature_desc[feature_desc['LoanStatNew']=='mo_sin_rcnt_tl']['Description'].values

array(['Months since most recent account opened'], dtype=object)

In [None]:
loan_train['mo_sin_rcnt_tl']=loan_train['mo_sin_rcnt_tl'].fillna(np.round(np.mean(loan_train['mo_sin_rcnt_tl']),0))
loan_test['mo_sin_rcnt_tl']=loan_test['mo_sin_rcnt_tl'].fillna(np.round(np.mean(loan_train['mo_sin_rcnt_tl']),0))

In [132]:
feature_desc[feature_desc['LoanStatNew']=='mort_acc']['Description'].values

array(['Number of mortgage accounts.'], dtype=object)

In [None]:
loan_train['mort_acc']=loan_train['mort_acc'].fillna(np.round(np.mean(loan_train['mort_acc']),0))
loan_test['mort_acc']=loan_test['mort_acc'].fillna(np.round(np.mean(loan_train['mort_acc']),0))

AND once again....refresh

In [134]:
columns_nan_4=loan_train.columns[loan_train.isnull().sum()!=0].values

In [135]:
len(columns_nan_4)

26

In [136]:
columns_nan_4

array(['next_pymnt_d', 'mths_since_recent_bc', 'mths_since_recent_inq',
       'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd',
       'num_actv_bc_tl', 'num_actv_rev_tl', 'num_bc_sats', 'num_bc_tl',
       'num_il_tl', 'num_op_rev_tl', 'num_rev_accts',
       'num_rev_tl_bal_gt_0', 'num_sats', 'num_tl_120dpd_2m',
       'num_tl_30dpd', 'num_tl_90g_dpd_24m', 'num_tl_op_past_12m',
       'pct_tl_nvr_dlq', 'percent_bc_gt_75', 'pub_rec_bankruptcies',
       'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort',
       'total_bc_limit', 'total_il_high_credit_limit'], dtype=object)

1. 'mths_since_recent_bc'. Number of months since most recent bankcard account opened. Nans replaced with mean round to nearest integer.
2. 'mths_since_recent_inq' Number of months since most recent inquiry. Nans replaced with mean round to nearest integer.
3. 'mths_since_recent_revol_delinq'. Number of months since most recent revolving delinquency.Nans replaced with 0
4. 'num_accts_ever_120_pd'. Number of accounts ever 120 or more days past due. Nans replaced with 0.
5. 'num_actv_bc_tl'. Number of currently active bankcard accounts. Nans replaced with mean round to nearest integer.
6. 'num_actv_rev_tl' Number of currently active revolving trades.Nans replaced with mean round to nearest integer.
7. 'num_bc_sats'.Number of satisfactory bankcard accounts. Nans replaced with mean round to nearest integer.
8. 'num_bc_tl'.Number of bankcard accounts.  Nans replaced with mean round to nearest integer.
9. 'num_il_tl'. Number of installment accounts. Nans replaced with mean round to nearest integer.
10. 'num_op_rev_tl'. Number of open revolving accounts

In [137]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_recent_bc']['Description'].values

array(['Months since most recent bankcard account opened.'], dtype=object)

In [None]:
loan_train['mths_since_recent_bc']=loan_train['mths_since_recent_bc'].fillna(np.round(np.mean(loan_train['mths_since_recent_bc']),0))
loan_test['mths_since_recent_bc']=loan_test['mths_since_recent_bc'].fillna(np.round(np.mean(loan_train['mths_since_recent_bc']),0))

In [139]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_recent_inq']['Description'].values

array(['Months since most recent inquiry.'], dtype=object)

In [None]:
loan_train['mths_since_recent_inq']=loan_train['mths_since_recent_inq'].fillna(np.round(np.mean(loan_train['mths_since_recent_inq']),0))
loan_test['mths_since_recent_inq']=loan_test['mths_since_recent_inq'].fillna(np.round(np.mean(loan_train['mths_since_recent_inq']),0))

In [141]:
feature_desc[feature_desc['LoanStatNew']=='mths_since_recent_revol_delinq']['Description'].values

array(['Months since most recent revolving delinquency.'], dtype=object)

In [None]:
loan_train['mths_since_recent_revol_delinq']=loan_train['mths_since_recent_revol_delinq'].fillna(0)
loan_test['mths_since_recent_revol_delinq']=loan_test['mths_since_recent_revol_delinq'].fillna(0)

In [143]:
feature_desc[feature_desc['LoanStatNew']=='num_accts_ever_120_pd']['Description'].values

array(['Number of accounts ever 120 or more days past due'], dtype=object)

In [None]:
loan_train['num_accts_ever_120_pd']=loan_train['num_accts_ever_120_pd'].fillna(0)
loan_test['num_accts_ever_120_pd']=loan_test['num_accts_ever_120_pd'].fillna(0)

In [145]:
feature_desc[feature_desc['LoanStatNew']=='num_actv_bc_tl']['Description'].values

array(['Number of currently active bankcard accounts'], dtype=object)

In [None]:
loan_train['num_actv_bc_tl']=loan_train['num_actv_bc_tl'].fillna(np.round(np.mean(loan_train['mths_since_recent_inq']),0))
loan_test['num_actv_bc_tl']=loan_test['num_actv_bc_tl'].fillna(np.round(np.mean(loan_train['mths_since_recent_inq']),0))

In [147]:
feature_desc[feature_desc['LoanStatNew']=='num_actv_rev_tl']['Description'].values

array(['Number of currently active revolving trades'], dtype=object)

In [None]:
loan_train['num_actv_rev_tl']=loan_train['num_actv_rev_tl'].fillna(np.round(np.mean(loan_train['num_actv_rev_tl']),0))
loan_test['num_actv_rev_tl']=loan_test['num_actv_rev_tl'].fillna(np.round(np.mean(loan_train['num_actv_rev_tl']),0))

In [149]:
feature_desc[feature_desc['LoanStatNew']=='num_bc_sats']['Description'].values

array(['Number of satisfactory bankcard accounts'], dtype=object)

In [None]:
loan_train['num_bc_sats']=loan_train['num_bc_sats'].fillna(np.round(np.mean(loan_train['num_bc_sats']),0))
loan_test['num_bc_sats']=loan_test['num_bc_sats'].fillna(np.round(np.mean(loan_train['num_bc_sats']),0))

In [151]:
feature_desc[feature_desc['LoanStatNew']=='num_bc_tl']['Description'].values

array(['Number of bankcard accounts'], dtype=object)

In [None]:
loan_train['num_bc_tl']=loan_train['num_bc_tl'].fillna(np.round(np.mean(loan_train['num_bc_tl']),0))
loan_test['num_bc_tl']=loan_test['num_bc_tl'].fillna(np.round(np.mean(loan_train['num_bc_tl']),0))

In [153]:
feature_desc[feature_desc['LoanStatNew']=='num_il_tl']['Description'].values

array(['Number of installment accounts'], dtype=object)

In [None]:
loan_train['num_il_tl']=loan_train['num_il_tl'].fillna(np.round(np.mean(loan_train['num_il_tl']),0))
loan_test['num_il_tl']=loan_test['num_il_tl'].fillna(np.round(np.mean(loan_train['num_il_tl']),0))

In [155]:
feature_desc[feature_desc['LoanStatNew']=='num_op_rev_tl']['Description'].values

array(['Number of open revolving accounts'], dtype=object)

In [None]:
loan_train['num_op_rev_tl']=loan_train['num_op_rev_tl'].fillna(np.round(np.mean(loan_train['num_op_rev_tl']),0))
loan_test['num_op_rev_tl']=loan_test['num_op_rev_tl'].fillna(np.round(np.mean(loan_train['num_op_rev_tl']),0))

This will be the last refresh of them all.........

In [157]:
columns_nan_5=loan_train.columns[loan_train.isnull().sum()!=0].values

In [158]:
len(columns_nan_5)

16

In [159]:
columns_nan_5

array(['next_pymnt_d', 'num_rev_accts', 'num_rev_tl_bal_gt_0', 'num_sats',
       'num_tl_120dpd_2m', 'num_tl_30dpd', 'num_tl_90g_dpd_24m',
       'num_tl_op_past_12m', 'pct_tl_nvr_dlq', 'percent_bc_gt_75',
       'pub_rec_bankruptcies', 'tax_liens', 'tot_hi_cred_lim',
       'total_bal_ex_mort', 'total_bc_limit',
       'total_il_high_credit_limit'], dtype=object)

1. 'num_rev_accts'. This is the number of revolving accounts per borrower. Nans replaced with mean rounded to nearest integer.
2. 'num_rev_tl_bal_gt_0'. Number of revolving trades with balance >0. Nans replaced with mean rounded to nearest integer.
3. 'num_sats'. Number of satisfactoy accounts.  Nans replaced with mean rounded to nearest integer. 
4. 'num_tl_120dpd_2m'. Number of accounts currently 120 days past due (updated in past 2 months). Nans replaced with 0.
5. 'num_tl_30dpd'.Number of accounts currently 30 days past due (updated in past 2 months).Nans replaced with 0.
6. 'num_tl_90g_dpd_24m'. Number of accounts 90 or more days past due in last 24 months. Nans replaced with 0.
7. 'num_tl_op_past_12m'. Number of accounts opened in past 12 months.Nans replaced with mean rounded to nearest integer.
8. 'pct_tl_nvr_dlq'. Percent of trades never delinquent. Nans replaced with mean.
9. 'percent_bc_gt_75'.Percentage of all bankcard accounts > 75% of limit.Nans replaced with mean.
10. 'pub_rec_bankruptcies'.Number of public record bankruptcies. Nans replaced with 0.
11. 'tax_liens'. Number of tax liens. Nans replaced with 0.
12. 'tot_hi_cred_lim'. Total high credit/credit limit. Nans replaced with mean.
13. 'total_bal_ex_mort'. Total credit balance excluding mortgage.Nans replaced with mean.
14. 'total_bc_limit'. Total bankcard high credit/credit limit. Nans replaced with mean.
15. 'total_il_high_credit_limit'. Total installment high credit/credit limit. Nans replaced with mean.

In [160]:
feature_desc[feature_desc['LoanStatNew']=='num_rev_accts']['Description'].values

array(['Number of revolving accounts'], dtype=object)

In [None]:
loan_train['num_rev_accts']=loan_train['num_rev_accts'].fillna(np.round(np.mean(loan_train['num_rev_accts']),0))
loan_test['num_rev_accts']=loan_test['num_rev_accts'].fillna(np.round(np.mean(loan_train['num_rev_accts']),0))

In [162]:
feature_desc[feature_desc['LoanStatNew']=='num_rev_tl_bal_gt_0']['Description'].values

array(['Number of revolving trades with balance >0'], dtype=object)

In [None]:
loan_train['num_rev_tl_bal_gt_0']=loan_train['num_rev_tl_bal_gt_0'].fillna(np.round(np.mean(loan_train['num_rev_tl_bal_gt_0']),0))
loan_test['num_rev_tl_bal_gt_0']=loan_test['num_rev_tl_bal_gt_0'].fillna(np.round(np.mean(loan_train['num_rev_tl_bal_gt_0']),0))

In [164]:
feature_desc[feature_desc['LoanStatNew']=='num_sats']['Description'].values

array(['Number of satisfactory accounts'], dtype=object)

In [None]:
loan_train['num_sats']=loan_train['num_sats'].fillna(np.round(np.mean(loan_train['num_sats']),0))
loan_test['num_sats']=loan_test['num_sats'].fillna(np.round(np.mean(loan_train['num_sats']),0))

In [166]:
feature_desc[feature_desc['LoanStatNew']=='num_tl_120dpd_2m']['Description'].values

array(['Number of accounts currently 120 days past due (updated in past 2 months)'],
      dtype=object)

In [None]:
loan_train['num_tl_120dpd_2m']=loan_train['num_tl_120dpd_2m'].fillna(0)
loan_test['num_tl_120dpd_2m']=loan_test['num_tl_120dpd_2m'].fillna(0)

In [168]:
feature_desc[feature_desc['LoanStatNew']=='num_tl_30dpd']['Description'].values

array(['Number of accounts currently 30 days past due (updated in past 2 months)'],
      dtype=object)

In [None]:
loan_train['num_tl_30dpd']=loan_train['num_tl_30dpd'].fillna(0)
loan_test['num_tl_30dpd']=loan_test['num_tl_30dpd'].fillna(0)

In [170]:
feature_desc[feature_desc['LoanStatNew']=='num_tl_90g_dpd_24m']['Description'].values

array(['Number of accounts 90 or more days past due in last 24 months'],
      dtype=object)

In [None]:
loan_train['num_tl_90g_dpd_24m']=loan_train['num_tl_90g_dpd_24m'].fillna(0)
loan_test['num_tl_90g_dpd_24m']=loan_test['num_tl_90g_dpd_24m'].fillna(0)

In [172]:
feature_desc[feature_desc['LoanStatNew']=='num_tl_op_past_12m']['Description'].values

array(['Number of accounts opened in past 12 months'], dtype=object)

In [None]:
loan_train['num_tl_op_past_12m']=loan_train['num_tl_op_past_12m'].fillna(np.round(np.mean(loan_train['num_tl_op_past_12m']),0))
loan_test['num_tl_op_past_12m']=loan_test['num_tl_op_past_12m'].fillna(np.round(np.mean(loan_train['num_tl_op_past_12m']),0))

In [174]:
feature_desc[feature_desc['LoanStatNew']=='pct_tl_nvr_dlq']['Description'].values

array(['Percent of trades never delinquent'], dtype=object)

In [175]:
loan_train['pct_tl_nvr_dlq']=loan_train['pct_tl_nvr_dlq'].fillna(np.mean(loan_train['pct_tl_nvr_dlq']))
loan_test['pct_tl_nvr_dlq']=loan_test['pct_tl_nvr_dlq'].fillna(np.mean(loan_train['pct_tl_nvr_dlq']))

In [176]:
feature_desc[feature_desc['LoanStatNew']=='percent_bc_gt_75']['Description'].values

array(['Percentage of all bankcard accounts > 75% of limit.'],
      dtype=object)

In [177]:
loan_train['percent_bc_gt_75']=loan_train['percent_bc_gt_75'].fillna(np.mean(loan_train['percent_bc_gt_75']))
loan_test['percent_bc_gt_75']=loan_test['percent_bc_gt_75'].fillna(np.mean(loan_train['percent_bc_gt_75']))

In [178]:
feature_desc[feature_desc['LoanStatNew']=='pub_rec_bankruptcies']['Description'].values

array(['Number of public record bankruptcies'], dtype=object)

In [179]:
loan_train['pub_rec_bankruptcies']=loan_train['pub_rec_bankruptcies'].fillna(0)
loan_test['pub_rec_bankruptcies']=loan_test['pub_rec_bankruptcies'].fillna(0)

In [180]:
feature_desc[feature_desc['LoanStatNew']=='tax_liens']['Description'].values

array(['Number of tax liens'], dtype=object)

In [181]:
loan_train['tax_liens']=loan_train['tax_liens'].fillna(0)
loan_test['tax_liens']=loan_test['tax_liens'].fillna(0)

In [182]:
feature_desc[feature_desc['LoanStatNew']=='tot_hi_cred_lim']['Description'].values

array(['Total high credit/credit limit'], dtype=object)

In [183]:
loan_train['tot_hi_cred_lim']=loan_train['tot_hi_cred_lim'].fillna(np.mean(loan_train['tot_hi_cred_lim']))
loan_test['tot_hi_cred_lim']=loan_test['tot_hi_cred_lim'].fillna(np.mean(loan_train['tot_hi_cred_lim']))

In [184]:
feature_desc[feature_desc['LoanStatNew']=='total_bal_ex_mort']['Description'].values

array(['Total credit balance excluding mortgage'], dtype=object)

In [185]:
loan_train['total_bal_ex_mort']=loan_train['total_bal_ex_mort'].fillna(np.mean(loan_train['total_bal_ex_mort']))
loan_test['total_bal_ex_mort']=loan_test['total_bal_ex_mort'].fillna(np.mean(loan_train['total_bal_ex_mort']))

In [186]:
feature_desc[feature_desc['LoanStatNew']=='total_bc_limit']['Description'].values

array(['Total bankcard high credit/credit limit'], dtype=object)

In [187]:
loan_train['total_bc_limit']=loan_train['total_bc_limit'].fillna(np.mean(loan_train['total_bc_limit']))
loan_test['total_bc_limit']=loan_test['total_bc_limit'].fillna(np.mean(loan_train['total_bc_limit']))

In [188]:
feature_desc[feature_desc['LoanStatNew']=='total_il_high_credit_limit']['Description'].values

array(['Total installment high credit/credit limit'], dtype=object)

In [189]:
loan_train['total_il_high_credit_limit']=loan_train['total_il_high_credit_limit'].fillna(np.mean(loan_train['total_il_high_credit_limit']))
loan_test['total_il_high_credit_limit']=loan_test['total_il_high_credit_limit'].fillna(np.mean(loan_train['total_il_high_credit_limit']))

Let's do a final test to ensure that there are no Nan values in our data.

In [198]:
loan_train.columns[loan_train.isnull().sum()!=0]

Index([], dtype='object')

In [199]:
loan_test.columns[loan_test.isnull().sum()!=0]

Index([], dtype='object')

All our columns contating values and there were no evident outliers detected that were signicant enough to remove.

Now....Let's convert all the columns with dates to timestamps.

In [201]:
loan_train.earliest_cr_line=pd.to_datetime(loan_train.earliest_cr_line)
loan_test.earliest_cr_line=pd.to_datetime(loan_test.earliest_cr_line)

In [202]:
loan_train.next_pymnt_d=pd.to_datetime(loan_train.next_pymnt_d)
loan_test.next_pymnt_d=pd.to_datetime(loan_test.next_pymnt_d)

In [203]:
loan_train.last_pymnt_d=pd.to_datetime(loan_train.last_pymnt_d)
loan_test.last_pymnt_d=pd.to_datetime(loan_test.last_pymnt_d)

In [204]:
loan_train.last_credit_pull_d=pd.to_datetime(loan_train.last_credit_pull_d)
loan_test.last_credit_pull_d=pd.to_datetime(loan_test.last_credit_pull_d)

In [330]:
loan_train.issue_d=pd.to_datetime(loan_train.issue_d)
loan_test.issue_d=pd.to_datetime(loan_test.issue_d)

# Dealing with Datatypes

We need to prepare our data as much as we can for modelling, so let's try and conver the categorical data to numerical data.

In [231]:
loan_train.select_dtypes(include=['object']).nunique()

loan_amnt                     1570
funded_amnt                   1570
funded_amnt_inv               7962
term                             2
int_rate                       669
installment                  87592
grade                            7
sub_grade                       35
emp_title                   411359
emp_length                      12
home_ownership                   6
verification_status              3
issue_d                        139
pymnt_plan                       2
purpose                         14
addr_state                      51
revol_bal                    93152
initial_list_status              2
out_prncp                   294142
out_prncp_inv               304517
total_pymnt                1251216
total_pymnt_inv            1056684
total_rec_prncp             400645
total_rec_int               557498
total_rec_late_fee           14957
recoveries                  101136
collection_recovery_fee     108356
last_pymnt_amnt             563533
policy_code         

It seems that some of our columns that should be integers are actuall object, we need to fix this.

Let's loop through the first entry entry of every column and check the datatype. If it is an integer or a float, we will convert the data type.

In [253]:
for col in loan_train.columns:
    if type(loan_train[col][0])==int:
        loan_train[col]=loan_train[col].astype('int64')
    elif type(loan_train[col][0])==float:
        loan_train[col]=loan_train[col].astype('float')

Let's check if that worked....

In [254]:
loan_train.select_dtypes(include=['object']).nunique()

term                         2
grade                        7
sub_grade                   35
emp_title               411359
emp_length                  12
home_ownership               6
verification_status          3
issue_d                    139
pymnt_plan                   2
purpose                     14
addr_state                  51
initial_list_status          2
application_type             2
hardship_flag                2
disbursement_method          2
debt_settlement_flag         2
dtype: int64

Much better now.
But another problem has risen. Employment title has 411,359 entries. This will be disastrous for modelling if not taken care of. So let's see if we can group emplyoment titles as much as we can. But first, let's convert the datatypes of the testing set.

In [271]:
type(loan_test['funded_amnt_inv'][0])

float

In [272]:
for col in loan_test.columns:
    if type(loan_test[col][0])==np.int64:
        loan_test[col]=loan_test[col].astype('int')
    elif type(loan_test[col][0])==float:
        loan_test[col]=loan_test[col].astype('float')

In [273]:
loan_test.select_dtypes(include=['object']).nunique()

term                         2
grade                        7
sub_grade                   35
emp_title               172515
emp_length                  12
home_ownership               6
verification_status          3
issue_d                    139
pymnt_plan                   2
purpose                     14
addr_state                  51
initial_list_status          2
application_type             2
hardship_flag                2
disbursement_method          2
debt_settlement_flag         2
dtype: int64

# Grouping Employment Titles

We had 411,581 unique entries for emp_title in the training set.
We had 172,515 unique entries for emp_title in the testing set.

First let's convert all entries to lower case and remove empty spaces.

In [293]:
loan_train.emp_title=loan_train.emp_title.str.lower()
loan_train.emp_title=loan_train.emp_title.str.strip()
emp_title_train= loan_train.emp_title

In [275]:
#do the same for the testing set
loan_test.emp_title=loan_test.emp_title.str.lower()
loan_test.emp_title=loan_test.emp_title.str.strip()
emp_title_test= loan_test.emp_title

In [276]:
emp_title_lower.nunique()

331732

In [278]:
emp_title_lower.value_counts()[:20]

not given             125153
teacher                36213
manager                34448
owner                  25167
registered nurse       17589
supervisor             16681
driver                 16668
sales                  14253
rn                     12935
office manager         10612
project manager        10340
general manager         9997
truck driver            9631
director                7923
president               7356
engineer                6729
sales manager           6364
operations manager      6155
police officer          5746
vice president          5636
Name: emp_title, dtype: int64

In [279]:
low_occurence_title=emp_title_lower[emp_title_lower.isin(emp_title_lower.value_counts()[(emp_title_lower.value_counts()<2000)].index)]
len(low_occurence_title.value_counts())

331664

- Just by converting the entries to be all lowercase and stripping empty space, the unique values went down to 331,569. This is a reduction of aorund 80,000 unique values.

- Just Looking at the unique value counts, some of them can be grouped level of seniority or type of industry. 

331,664 titles of the 331,732 unique titles occur less than 2000 times. That is 99.9% of the titles. Is there a way to  group the titles? 

Let's find the most common words and attempt to group titles. But let's clarify our situation:

1. There are 1,695,501 entries of titles in the dataframe.
2. 1,154,192 entries occur less than 2000 times.
3. We will attempt in grouping this data by looking at the keywords in the titles.

After looking at the key words manually in excel, a list of words that can be grouped will be placed in a 12 lists.
1. Executive (CEO, CFO, OWner, president etc)
2. Assistant (Assistants)
3. Senior (Manager, Supervisor, 'leader)
4. Skilled (Engineer, Programmer, Lawyer)
5. Technical (Mechanic, Builder etc)
6. Health(Nurse, doctor, pharmacy)
7. Business(Finance, Accounting, legal)
8. Adminstrative(Clerk, office, banker)
9. Education(Teacher, professor)
10. Low skill(Waiter, bartender)
11. Federal (Officer, army, airforce)
12. Other titles(


In [280]:
executive=[
    'director','director,',
    'vp', 'vice', 
    'executive', 'owner',
    'partner','chief',
    'ceo','pres',
    'md','shareholder',
    'president','principal',
    'cfo','coo',
    'co-owner','cto','sole',
    'svp','cio','dean'
]

In [281]:
assistant=[ 
    'assistant','asst',
    'asst.','support',
    'asm','pa',
    'agm','avp',
    'pta'
]

In [282]:
senior=[
    'supervisor','supervisior',
    'lead', 'operations',
    'management', 'project',
    'manager,', 'leader',
    'mgr', 'ops',
    'manger', 'senior',
    'sr.', 'sr','manager',
    'self','employed',
    'managment','head',
    'mamager','rsa',
    'boss','mananger',
    'gm','captain',
    'superviser'
]

In [283]:
skilled= [
    'analyst','engineer',
    'data','engineering',
    'design','network',
     'software','designer',
    'digital','media',
    'computer','architect',
    'developer','lab','energy',
    'mechanical','law','attorney',
    'geologist','estimator','it',
    'draftsman','technologist',
    'chef','system',
    'research','accenture',
    'capital', 'e-7','solar',
    'turbines','boeing',
    'pm','environmental',
    'actuary','sfc',
    'quality','qc',
    'controller',
    'expeditor','paralegal',
    'litigation','meteorologist',
    'css','researcher',
    'jeweler','booz',
    'copywriter','publisher',
    'pilot','jewelry','professor',
    'programmer','logistics',
    'planner','engineeer',
    'cpa','chemist',
    'lawyer','barista',
    'underwriter','judge',
    'at&t','trainer',
    'coder','ibm',
    'verizon','csa',
    'dsp','cma',
    'counsel'
]

In [284]:
technical=[
    'tech','technician',
    'support', 'operator',
    'systems', 'technical',
    'maintenance', 'field',
    'production', 'technology',
    'solutions','mechanic',
    'construction',
    'information', 'inspector',
    'equipment', 'auto',
    'plant', 'machine',
    'foreman', 'truck', 
    'warehouse','electric',
    'scientist','factory',
    'miner','welder','toolmaker',
    'specialist','electrician',
    'formen','forman',
    'builder','yard',
    'superintendent',
    'assembly', 'worker',
    'handler','sanitation',
    'operater','maker',
    'technition','metal',
    'plumber','plumbing',
    'installer','machines',
    'repair','install',
    'installation','assembler',
    'roofer','parts',
    'maintainer','union',
    'landscaper','rigger',
    'steamfitter','cable',
    'insulator','journeyman',
    'tankerman','fitter',
    'unloader','manufacturing',
    'laborer','landman',
    'surveyor','facilities',
    'boilermaker','contractor',
    'pumper','technitian',
    'pipefitter','technican',
    'grinder','machanic',
    'framer','machinist',
    'carpenter','millwright',
    'fabricator','loader',
    'driller','machinest',
    'ironworker','leadman',
    'trader','finisher',
    'service'
]

In [285]:
health=[ 
    'health', 'medical',
    'nurse', 'clinical',
    'hospital', 'patient',
    'nursing', 'healthcare',
    'dental', 'pharmacy',
    'surgical' ,'anesthesiologist',
    'pediatrician','optician',
    'paramedic','dietician',
    'medic','dietary',
    'biologist','pharmacist',
    'nutritionist','therpost',
    'massage','therapist',
    'pharmaceutical','dietitian',
    'audiologist','ma',
    'cpht','cardiac',
    'hygienist', 'cst',
    'mlt','microbiologist',
    'resident','psychologist',
    'np','rdh',
    'arnp','stna','rn',
    'lpn', 'radiographer',
    'physician','phlebotomist',
    'dentist','pathologist',
    'lvn','sonographer',
    'emt','crna',
    'veterinarian','doctor',
    'clinician','optometrist',
    'esthetician','psychotherapist',
    'kaiser','chiropractor',
    'pct'
]

In [286]:
business=[
    'sales', 'associate',
    'business', 'consultant',
    'development', 'marketing',
    'rep', 'account',
    'financial', 'representative',
    'admin', 'insurance',
    'finance', 'accounting',
    'bank', 'associates',
    'accounts', 'legal','banker'
    'consulting','hr',
    'human','resources','salesman',
    'accountant','accounting',
    'investments','recruiter',
    'recruitment','investment',
    'lender','internal',
    'sale','rsr',
    'banker','merrill',
    'citigroup','chase',
    'citibank','sprint',
    'economic','economist',
    'wealth','msr',
    'asset','supply',
    'logistician','broker',
    'fargo','chase',
    'stanley','salesperson',
    'loan'
]

In [288]:
education=[
    'school', 'education',
    'teacher', 'university',
    'fellow', 'college',
    'counselor', 'schools',
    'teachers','educational',
    'educator','tutor',
    'para','paraprofessional',
    'elementary','faculty',
    'lecturer'
]

In [289]:
admin=[
    'administrative', 'administration',
    'aministrator','retail',
    'security','office',
    'shipping','desk',
    'book','keeping', 
    'clerk','controller',
    'purchaser','buyer',
    'teller','administrator',
    'scheduler','customer',
    'client','processor',
    'coordinator','bookeeper',
    'filler','communications',
    'auditor','examiner',
    'tax','dba',
    'orderfiller','psr',
    'procurement','merchandiser',
    'investigator','clerical',
    'member','admissions',
    'key','safety',
    'secretary','bookkeeper',
    'librarian','csr',
    'advisor','adjuster',
    'purchasing','registrar',
    'court','payroll',
    'inventory','control',
    'appraiser','billing',
    'biller','receiver',
    'collector','collections',
    'claims'
]

In [290]:
low_skill=[
    'costco', 'buy',
    'best','resort',
    'delivery','derivery',
    'groomer','mail',
    'post','bartender',
    'helper','maintence',
    'deli', 'stylist',
    'depot','stylist',
    'house','courier',
    'casino','driver',
    'farmer','decorator',
    'cosmetologist','cook',
    'cabin','crew',
    'houseman','valet',
    'host','mailman',
    'realtor','carrier',
    'airlines','attendant',
    'attendent','letter',
    'estate','realtor',
    'hostess','housekeeper',
    'stylist','musician',
    'reader','painter',
    'manicurist','checker',
    'provider','aide',
    'groundskeeper','guest',
    'fedex','work',
    'social','packer',
    'packing','mixer',
    'janitorial','chauffeur',
    'bellman','hair',
    'dresser','receiving',
    'server','bar',
    'cleaning','dishwasher',
    'lowes','dealer',
    'forklift','storekeeper',
    'stock','transcriptionist',
    'transporter','lineman',
    'cca','yardmaster',
    'care','loss',
    'tailor','psa',
    'nordstrom','video',
    'producer','editor',
    'concierge','waitress',
    'caregiver','instructor',
    'cna','cashier',
    'receptionist','custodian',
    'housekeeping','caregiver',
    'waiter','nanny',
    'stocker','janitor',
    'walmart','conductor',
    'labor','hairstylist',
    'baker','porter',
    'mailhandler','meat',
    'food','cleaner',
    'doorman','truckdriver',
    'hha','pressman',
    'ups','barber',
    'warehouseman','photographer',
    'pastor','superintendant',
    'maintance','minister',
    'chaplain','longshoreman',
    'walgreens','coach',
    'order','selector'
]

In [291]:
federal=[
    'police','officer',
    'fire', 'transportation',
    'deputy','federal',
    'affairs','commander',
    'custodial','public',
    'train','government',
    'sherrif','parole'
    'city','emergency'
    'sergeant','guard',
    'crime','city',
    'agent','duty',
    'platoon','army',
    'master','sergeant',
    'ssg','ssgt',
    'mate','navy',
    'fireman','nypd',
    'lapd','corporal',
    'corrections',"raytheon",
    'grumman','sheriff',
    'lausd','dispatch',
    'emergency','detective',
    'firefighter','dispatcher',
    'lieutenant','soldier',
    'trooper','usaf',
    'usps','military',
    'postmaster','pca',
    'state','postal',
    'states','defense',
    'us','department',
    'major','patrolman',
    'caseworker','e6',
    'e-6','lockheed',
    'e7'
]

Let's loop through each of the titles and assign them to their respective tags.

We will great one big list of titles that can replace the one in the dataframe.

In [296]:
new_titles=[]
executive_titles=[]; assistant_titles=[]; senior_titles=[]; skilled_titles=[]; technical_titles=[]; health_titles=[]
business_titles=[]; education_titles=[]; admin_titles=[]; federal_titles=[]; other_titles=[]; low_skill_titles=[];

for title in emp_title_train.values:
    title_stripped=title.strip()
    title_split_strip=re.split(r"/+| ",title_stripped)
    count=0
    if count==0:
        for word in executive:
            if (word in title_split_strip) and (count==0):
                executive_titles.append(title)
                new_titles.append("executive")
                count=+1
    if count==0:
        for word in assistant:
             if (word in title_split_strip) and (count==0):
                assistant_titles.append(title)
                new_titles.append("assistant")
                count=+1
    if count==0:
        for word in senior:
             if (word in title_split_strip) and (count==0):
                senior_titles.append(title)
                new_titles.append("senior")
                count=+1
    if count==0:
        for word in skilled:
            if (word in title_split_strip) and (count==0):
                skilled_titles.append(title)
                new_titles.append("skilled")
                count=+1
    if count==0:
        for word in technical:
             if (word in title_split_strip) and (count==0):
                technical_titles.append(title)
                new_titles.append("technical")
                count=+1
    if count==0:
        for word in health:
             if (word in title_split_strip) and (count==0):
                health_titles.append(title)
                new_titles.append("health")
                count=+1
    if count==0:
        for word in education:
             if (word in title_split_strip) and (count==0):
                education_titles.append(title)
                new_titles.append("education")
                count=+1
    if count==0:
        for word in business:
             if (word in title_split_strip) and (count==0):
                business_titles.append(title)
                new_titles.append("business")
                count=+1     
    if count==0:        
        for word in admin:
             if (word in title_split_strip) and (count==0):
                admin_titles.append(title)
                new_titles.append("admin")
                count=+1
    if count==0:
        for word in federal: 
             if (word in title_split_strip) and (count==0) :
                federal_titles.append(title)
                new_titles.append("federal")
                count=+1
    if count==0:
        for word in low_skill: 
             if (word in title_split_strip) and (count==0):
                low_skill_titles.append(title)
                new_titles.append("low_skill")
                count=+1
    if count==0:
        other_titles.append(title)
        new_titles.append("other")
        count=+1

Let's ensure that the length of sum of all titles is equal to the length of the dataframe.

In [297]:
#Sum length of all titles
sum_of_all_titles=len(low_skill_titles)+ len(executive_titles)+len(assistant_titles)+len(senior_titles)+len(skilled_titles)+len(technical_titles)+len(health_titles)+len(business_titles)+len(education_titles)+len(admin_titles)+len(federal_titles)+len(other_titles)

In [298]:
#assert dataframe column length is equal to the sum of all titles
assert len(emp_title_lower.values)==sum_of_all_titles

We have included a list that shows other titles which are unidentified entries. Let's examine this.

In [299]:
print("Other titles comprises of {} % of all titles in the training set".format(np.round(len(other_titles)*100/len(emp_title_train))))
print("Other titles comprises of {} entries".format(len(other_titles)))

Other titles comprises of 16.0 % of all titles in the training set
Other titles comprises of 272454 entries


Going through all of these manually entered entries will be a daunting task. 84% of entries are correctly labelled, so we will keep it as is. How much has our dataframe improved?

In [300]:
print('The original title list had {} unique values, the new list has {} unique values, this will siginifcantly aid our modelling'.format(len(emp_title_lower.value_counts()),len(pd.Series(new_titles).value_counts())))

The original title list had 331732 unique values, the new list has 12 unique values, this will siginifcantly aid our modelling


Let's put it into back into the training set.

In [302]:
loan_train_new=loan_train.copy()
loan_train_new.emp_title=new_titles

Now lets do the same for the test set.

In [303]:
new_titles_test=[]
executive_titles=[]; assistant_titles=[]; senior_titles=[]; skilled_titles=[]; technical_titles=[]; health_titles=[]
business_titles=[]; education_titles=[]; admin_titles=[]; federal_titles=[]; other_titles=[]; low_skill_titles=[];

for title in emp_title_test.values:
    title_stripped=title.strip()
    title_split_strip=re.split(r"/+| ",title_stripped)
    count=0
    if count==0:
        for word in executive:
            if (word in title_split_strip) and (count==0):
                executive_titles.append(title)
                new_titles_test.append("executive")
                count=+1
    if count==0:
        for word in assistant:
             if (word in title_split_strip) and (count==0):
                assistant_titles.append(title)
                new_titles_test.append("assistant")
                count=+1
    if count==0:
        for word in senior:
             if (word in title_split_strip) and (count==0):
                senior_titles.append(title)
                new_titles_test.append("senior")
                count=+1
    if count==0:
        for word in skilled:
            if (word in title_split_strip) and (count==0):
                skilled_titles.append(title)
                new_titles_test.append("skilled")
                count=+1
    if count==0:
        for word in technical:
             if (word in title_split_strip) and (count==0):
                technical_titles.append(title)
                new_titles_test.append("technical")
                count=+1
    if count==0:
        for word in health:
             if (word in title_split_strip) and (count==0):
                health_titles.append(title)
                new_titles_test.append("health")
                count=+1
    if count==0:
        for word in education:
             if (word in title_split_strip) and (count==0):
                education_titles.append(title)
                new_titles_test.append("education")
                count=+1
    if count==0:
        for word in business:
             if (word in title_split_strip) and (count==0):
                business_titles.append(title)
                new_titles_test.append("business")
                count=+1     
    if count==0:        
        for word in admin:
             if (word in title_split_strip) and (count==0):
                admin_titles.append(title)
                new_titles_test.append("admin")
                count=+1
    if count==0:
        for word in federal: 
             if (word in title_split_strip) and (count==0) :
                federal_titles.append(title)
                new_titles_test.append("federal")
                count=+1
    if count==0:
        for word in low_skill: 
             if (word in title_split_strip) and (count==0):
                low_skill_titles.append(title)
                new_titles_test.append("low_skill")
                count=+1
    if count==0:
        other_titles.append(title)
        new_titles_test.append("other")
        count=+1

In [304]:
#Sum length of all titles
sum_of_all_test_titles=len(low_skill_titles)+ len(executive_titles)+len(assistant_titles)+len(senior_titles)+len(skilled_titles)+len(technical_titles)+len(health_titles)+len(business_titles)+len(education_titles)+len(admin_titles)+len(federal_titles)+len(other_titles)

In [305]:
#assert dataframe column length is equal to the sum of all titles
assert len(emp_title_test.values)==sum_of_all_test_titles

In [307]:
print("Other titles comprises of {} % of all titles".format(np.round(len(other_titles)*100/len(emp_title_test))))
print("Other titles comprises of {} entries".format(len(other_titles)))

Other titles comprises of 16.0 % of all titles
Other titles comprises of 91107 entries


In [308]:
loan_test_new=loan_test.copy()
loan_test_new.emp_title=new_titles_test

In [311]:
loan_train_new.emp_title.head()

0        other
1       senior
2        admin
3    technical
4      skilled
Name: emp_title, dtype: object

# Target Variable Should be Binary

We want to classify a loan whether it is bad(late) or good(on time). But that is not the labels we have as seen below.

In [314]:
y_train=pd.DataFrame(y_train).rename(columns={0:"loan_status"})
y_test=pd.DataFrame(y_test).rename(columns={0:"loan_status"})

In [315]:
y_train.loan_status.value_counts()

Fully Paid                                             781464
Current                                                689771
Charged Off                                            196241
Late (31-120 days)                                      16423
In Grace Period                                          6714
Late (16-30 days)                                        2803
Does not meet the credit policy. Status:Fully Paid       1491
Does not meet the credit policy. Status:Charged Off       571
Default                                                    23
Name: loan_status, dtype: int64

Let's create list of loan statuses.

In [316]:
good_loans=[
    'Fully Paid',
    'Current',
    'Does not meet the credit policy. Status:Fully Paid'
]
bad_loans=[
    'Default',
    'Late (31-120 days)',
    'In Grace Period',
    'Late (16-30 days)',
    'Does not meet the credit policy. Status:Charged Off',
    'Charged Off'
]

Let's loop through all the statuses and label them as good or bad.

In [320]:
#Lopping thorugh the training set
list_good_bad=[]
unidentified=[]
for status in y_train.loan_status.values:
    if status in good_loans:
        list_good_bad.append("good")
    elif status in bad_loans:
        list_good_bad.append("bad")
    else:
        unidentified.append(status)
assert len(unidentified)==0

In [321]:
#Lopping thorugh the test set
list_good_bad=[]
unidentified=[]
for status in y_train.loan_status.values:
    if status in good_loans:
        list_good_bad.append("good")
    elif status in bad_loans:
        list_good_bad.append("bad")
    else:
        unidentified.append(status)
assert len(unidentified)==0

In [331]:
loan_train_new[:1000].select_dtypes(include=['object']).nunique()

term                     2
grade                    7
sub_grade               32
emp_title               12
emp_length              12
home_ownership           3
verification_status      3
issue_d                 93
pymnt_plan               1
purpose                 13
addr_state              49
initial_list_status      2
application_type         2
hardship_flag            1
disbursement_method      2
debt_settlement_flag     2
dtype: int64

In [313]:
y_train

array(['Fully Paid', 'Current', 'Charged Off', ..., 'Fully Paid',
       'Fully Paid', 'Fully Paid'], dtype=object)

In [197]:
df_X_train=loan_train
df_X_test=loan_test

In [190]:
df_X_train.to_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_X_train.csv")
df_X_test.to_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_X_test.csv")

In [213]:
df_y_train=pd.DataFrame(y_train).rename(columns={0:"loan_status"})
df_y_test=pd.DataFrame(y_test).rename(columns={0:"loan_status"})

In [214]:
df_y_train.to_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_y_train.csv")
df_y_test.to_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_y_test.csv")

In [1]:
import dill

In [None]:
dill.dump_session('wrangling_env.db')

In [2]:
dill.load_session('wrangling_env.db')

EOFError: Ran out of input

In [2]:
df_X_train=pd.read_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_X_train.csv", low_memory=False, index_col=0)

  mask |= (ar1 == a)


In [3]:
df_X_test=pd.read_csv("C:\\Users\\yfawz\\OneDrive\\Desktop\\load_default_prediction\\data\\processed\\df_X_test.csv", low_memory=False, index_col=0)

In [4]:
loan_train=df_X_train

In [5]:
loan_test=df_X_test