In [1]:
import pandas as pd
import numpy as np
import re

from nltk.corpus import stopwords

!python3 --version

Python 3.7.12


## Meta Data

In [2]:
meta_data = pd.read_csv(
    "/kaggle/input/loan-interest-rate-prediction-data/meta_data.csv"
)
meta_data

Unnamed: 0,Variable,Definition
0,X1,Interest Rate on the loan
1,X2,A unique id for the loan.
2,X3,A unique id assigned for the borrower.
3,X4,Loan amount requested
4,X5,Loan amount funded
5,X6,Investor-funded portion of loan
6,X7,Number of payments (36 or 60)
7,X8,Loan grade
8,X9,Loan subgrade
9,X10,Employer or job title (self-filled)


## Data

The training and testing data will be sanitized (not processed) before EDA, pipelining, and modeling for:

* sensible column names

* choosing the appropriate data type and reprsentation for each feature

* cleaning the text data in categorical features

    - white spaces
    
    - case inconsistency
    
    - potential entry errors
    
* date columns formating

* removing training examples with missing values in the target

If we were building a live machine learning system and we expect that the raw data will continue to come in the forms similar to these csv files, we would include all of these steps in our pipelines. 

For this project, however, we we only have two periods--- train time and test time. To save some preprocessing compute time, we can do some ad-hoc processing to both the train and test data at the outset before any EDA, pipelining, and modeling.

**Note**: We are not applying any transformers on the test and data sets, and so there would be no data leakage. Nothing is learned from the data set yet. All the sanitization is so that we can spend more time on eda, pipelining, and modeling. 

In [3]:
train = pd.read_csv(
    "/kaggle/input/loan-interest-rate-prediction-data/train.csv", low_memory=False
)
test = pd.read_csv("/kaggle/input/loan-interest-rate-prediction-data/test.csv")
train.shape, test.shape

((400000, 32), (80000, 32))

## Column Rename

In [4]:
{key: val for key, val in zip(meta_data.Variable, meta_data.Definition)}

{'X1': 'Interest Rate on the loan',
 'X2': 'A unique id for the loan.',
 'X3': 'A unique id assigned for the borrower.',
 'X4': 'Loan amount requested',
 'X5': 'Loan amount funded',
 'X6': 'Investor-funded portion of loan',
 'X7': 'Number of payments (36 or 60)',
 'X8': 'Loan grade',
 'X9': 'Loan subgrade',
 'X10': 'Employer or job title (self-filled)',
 'X11': 'Number of years employed (0 to 10; 10 = 10 or more)',
 'X12': 'Home ownership status: RENT, OWN, MORTGAGE, OTHER.',
 'X13': 'Annual income of borrower',
 'X14': 'Income verified, not verified, or income source was verified',
 'X15': 'Date loan was issued',
 'X16': 'Reason for loan provided by borrower',
 'X17': 'Loan category, as provided by borrower',
 'X18': 'Loan title, as provided by borrower',
 'X19': 'First 3 numbers of zip code',
 'X20': 'State of borrower',
 'X21': "A ratio calculated using the borrower's total monthly debt payments on the total debt obligations, excluding mortgage and the requested loan, divided by the

In [5]:
rename_dict = {
    "X1": "interest_rate",
    "X2": "id_loan",
    "X3": "id_borrower",
    "X4": "loan_amt_requested",
    "X5": "loan_amt_funded",
    "X6": "loan_amt_investor_funded_portion",
    "X7": "num_of_payment_months",
    "X8": "loan_grade",
    "X9": "loan_subgrade",
    "X10": "self_filled_employer_job_title",
    "X11": "num_of_years_employed",
    "X12": "home_ownership_status",
    "X13": "borrower_annual_income",
    "X14": "verify_income_or_source",
    "X15": "loan_issued_date",
    "X16": "borrower_provided_reason_for_loan",
    "X17": "borrower_provided_loan_category",
    "X18": "borrower_provided_loan_title",
    "X19": "zip_first_three",
    "X20": "borrower_state",
    "X21": "monthly_debt_to_income_ratio",
    "X22": "num_of_past_dues",
    "X23": "borrower_earliest_credit_open_date",
    "X24": "num_of_creditor_inquiries",
    "X25": "num_of_months_since_delinquency",
    "X26": "num_of_months_since_public_rec",
    "X27": "num_of_open_credit_line",
    "X28": "num_of_derog_publib_rec",
    "X29": "total_credit_rev_balance",
    "X30": "rev_line_util_rate",
    "X31": "total_credit_line",
    "X32": "init_loan_status",
}

In [6]:
train.rename(columns=rename_dict, inplace=True)
test.rename(columns=rename_dict, inplace=True)

## Remove Entirely Missing Rows and Rows with Missing Target 

Remove all rows with missing target value in the *training* set:

In [7]:
train.interest_rate.isna().sum()

61010

In [8]:
train.shape

(400000, 32)

In [9]:
train.dropna(axis=0, subset=["interest_rate"], inplace=True)
train.interest_rate.isna().sum()

0

In [10]:
train.shape

(338990, 32)

Remove all rows that are complete empty (missing values in all features):

In [11]:
features = train.columns[1:].to_list()
features

['id_loan',
 'id_borrower',
 'loan_amt_requested',
 'loan_amt_funded',
 'loan_amt_investor_funded_portion',
 'num_of_payment_months',
 'loan_grade',
 'loan_subgrade',
 'self_filled_employer_job_title',
 'num_of_years_employed',
 'home_ownership_status',
 'borrower_annual_income',
 'verify_income_or_source',
 'loan_issued_date',
 'borrower_provided_reason_for_loan',
 'borrower_provided_loan_category',
 'borrower_provided_loan_title',
 'zip_first_three',
 'borrower_state',
 'monthly_debt_to_income_ratio',
 'num_of_past_dues',
 'borrower_earliest_credit_open_date',
 'num_of_creditor_inquiries',
 'num_of_months_since_delinquency',
 'num_of_months_since_public_rec',
 'num_of_open_credit_line',
 'num_of_derog_publib_rec',
 'total_credit_rev_balance',
 'rev_line_util_rate',
 'total_credit_line',
 'init_loan_status']

In [12]:
train.shape, test.shape

((338990, 32), (80000, 32))

In [13]:
train.dropna(axis=0, how="all", subset=features, inplace=True)
test.dropna(axis=0, how="all", subset=features, inplace=True)
train.shape, test.shape

((338989, 32), (80000, 32))

In [14]:
# Row bind train and test with indicator variable
train["is_train"] = 1
test["is_train"] = 0
df = pd.concat([train, test], axis=0)
df.shape

(418989, 33)

## Clean String Columns

In [15]:
# Convert all column strings to lower case and remove leading and trailing white space charaters
df[df.select_dtypes(include="object").columns] = df[
    df.select_dtypes(include="object").columns
].apply(lambda col: col.str.lower().str.strip())

In [16]:
for col in df.select_dtypes(include="object").columns:
    print(col, ":", df[col].unique()[:3], "\n\n")

interest_rate : ['11.89%' '10.71%' '16.99%'] 


loan_amt_requested : ['$25,000' '$7,000' '$1,200'] 


loan_amt_funded : ['$25,000' '$7,000' '$1,200'] 


loan_amt_investor_funded_portion : ['$19,080' '$673' '$24,725'] 


num_of_payment_months : ['36 months' '60 months'] 


loan_grade : ['b' 'd' 'c'] 


loan_subgrade : ['b4' 'b5' 'd3'] 


self_filled_employer_job_title : [nan 'cnn' 'web programmer'] 


num_of_years_employed : ['< 1 year' '1 year' '10+ years'] 


home_ownership_status : ['rent' 'own' 'mortgage'] 


verify_income_or_source : ['verified - income' 'not verified' 'verified - income source'] 


loan_issued_date : ['aug-09' 'may-08' 'aug-14'] 


borrower_provided_reason_for_loan : ['due to a lack of personal finance education and exposure to poor financing skills growing up, i was easy prey for credit predators. i am devoted to becoming debt-free and can assure my lenders that i will pay on-time every time. i have never missed a payment during the last 16 years that i have had 

### Dollar Amounts

The dollar amount columns should be numeric and so we remove the dollar signs.

In [17]:
dollar_cols = [col for col in df.columns if col.startswith("loan_amt")]
# Check missing before processing
df[dollar_cols].isna().sum()

loan_amt_requested                  0
loan_amt_funded                     0
loan_amt_investor_funded_portion    0
dtype: int64

In [18]:
df[dollar_cols] = df[dollar_cols].replace(to_replace="\$|,", value="", regex=True)
df[dollar_cols] = df[dollar_cols].astype(np.int32)
df[dollar_cols]

Unnamed: 0,loan_amt_requested,loan_amt_funded,loan_amt_investor_funded_portion
0,25000,25000,19080
1,7000,7000,673
2,25000,25000,24725
3,1200,1200,1200
4,10800,10800,10692
...,...,...,...
79995,6400,6400,6400
79996,30000,30000,30000
79997,17600,17600,17600
79998,2500,2500,2500


In [19]:
# Check missing after for potential unintended errors
df[dollar_cols].isna().sum()

loan_amt_requested                  0
loan_amt_funded                     0
loan_amt_investor_funded_portion    0
dtype: int64

In [20]:
df[dollar_cols].dtypes

loan_amt_requested                  int32
loan_amt_funded                     int32
loan_amt_investor_funded_portion    int32
dtype: object

### Interest Rate and Rate Feature

Remove the percentage signs from the rate features:

In [21]:
rate_cols = [col for col in df.columns if col.endswith("rate")]
# Check missing before processing
df[rate_cols].isna().sum()

interest_rate         80000
rev_line_util_rate      254
dtype: int64

In [22]:
df[rate_cols] = df[rate_cols].replace(to_replace="\%", value="", regex=True)
df[rate_cols] = df[rate_cols].apply(pd.to_numeric)
df[rate_cols]

Unnamed: 0,interest_rate,rev_line_util_rate
0,11.89,52.1
1,10.71,76.7
2,16.99,66.3
3,13.11,40.4
4,13.57,25.6
...,...,...
79995,,47.9
79996,,51.3
79997,,37.1
79998,,76.7


In [23]:
# Check missing after for potential unintended errors
df[rate_cols].isna().sum()

interest_rate         80000
rev_line_util_rate      254
dtype: int64

In [24]:
df[rate_cols].dtypes

interest_rate         float64
rev_line_util_rate    float64
dtype: object

### Date Columns

Some date values have 'month-year' format while others have 'year-month'. We must account for both formats:

In [25]:
date_cols = [col for col in df.columns if col.endswith("date")]
# Check missingness
df[date_cols].isna().sum()

loan_issued_date                      0
borrower_earliest_credit_open_date    0
dtype: int64

In [26]:
# Append zero to single digit years since excel removes the first zero, e.g. 1-sep or 3-sep should really be 01-sep and 03-sep
df[date_cols] = df[date_cols].apply(
    lambda col: col.apply(lambda val: "0" + val if re.match("\d-\w{3}", val) else val)
)

In [27]:
df[date_cols].isna().sum()

loan_issued_date                      0
borrower_earliest_credit_open_date    0
dtype: int64

Another issue we must handle is that the years are only reported as two digits. Therefore, there may be confusion when we try to convert those values to the full four-digit format. To use good judgement, we will append 20 to values between 00 and 20; we will append 19 to values between 20 and 99. This is so that we do not end up with year beyond 2020, which does not make sense.

In [28]:
# Convert all dates to 'month-year' format
df[date_cols] = df[date_cols].apply(
    lambda col: col.apply(
        lambda val: val[-3:] + "-" + val[:2] if re.match("\d{2}-\w{3}", val) else val
    )
)

In [29]:
df[date_cols].isna().sum()

loan_issued_date                      0
borrower_earliest_credit_open_date    0
dtype: int64

In [30]:
# Append 20 and 19 accordingly
df[date_cols] = df[date_cols].apply(
    lambda col: col.apply(
        lambda val: (val[:3] + "-" + "20" + val[-2:])
        if int(val[-2:][0]) >= 0 and int(val[-2:][0]) < 2
        else (val[:3] + "-" + "19" + val[-2:])
    )
)

Finally, we can now convert to date columns to datetimes:

In [31]:
df[date_cols] = df[date_cols].apply(
    lambda col: pd.to_datetime(col, format="%b-%Y", errors="coerce").dt.to_period("M")
)
df[date_cols]

Unnamed: 0,loan_issued_date,borrower_earliest_credit_open_date
0,2009-08,1994-02
1,2008-05,2000-10
2,2014-08,2000-06
3,2010-03,1985-01
4,2009-11,1996-12
...,...,...
79995,2015-01,2006-04
79996,2015-01,1996-08
79997,2015-01,2004-12
79998,2015-01,1999-06


In [32]:
df[date_cols].isna().sum()

loan_issued_date                      0
borrower_earliest_credit_open_date    0
dtype: int64

In [33]:
df[date_cols].dtypes

loan_issued_date                      period[M]
borrower_earliest_credit_open_date    period[M]
dtype: object

The date columns are granular to monthly frequency.

### Inspect Remaining Columns

In [34]:
remainining_cols = list(
    set(df.select_dtypes(include="object").columns.to_list())
    - set(date_cols + rate_cols + dollar_cols)
)
remainining_cols.sort()
remainining_cols

['borrower_provided_loan_category',
 'borrower_provided_loan_title',
 'borrower_provided_reason_for_loan',
 'borrower_state',
 'home_ownership_status',
 'init_loan_status',
 'loan_grade',
 'loan_subgrade',
 'num_of_payment_months',
 'num_of_years_employed',
 'self_filled_employer_job_title',
 'verify_income_or_source',
 'zip_first_three']

### Home Ownership

In [35]:
df.groupby(remainining_cols[4], dropna=False).size()

home_ownership_status
any              1
mortgage    184952
none            30
other          107
own          33204
rent        148736
NaN          51959
dtype: int64

The value `any` only appears once, which is very rare and may be a typo since the meta data does not contain this category. On a real job, I would investigate the source of this data and find out if this category will appear in the test data or even in future data. For this project, we will remove this single training example:

In [36]:
df = df.loc[df[remainining_cols[4]] != "any"]

In [37]:
df.groupby(remainining_cols[4], dropna=False).size()

home_ownership_status
mortgage    184952
none            30
other          107
own          33204
rent        148736
NaN          51959
dtype: int64

Note another important thing to keep in mind is that the None category is not the same as NaN; it means that the borrower is not a home owner.

### Verify Income or Income Source

In [38]:
df.groupby(remainining_cols[-2], dropna=False).size()

verify_income_or_source
not verified                129846
verified - income           149355
verified - income source    139787
dtype: int64

### Borrower Provided Loan Category

In [39]:
df.groupby(remainining_cols[0], dropna=False).size()

borrower_provided_loan_category
car                     4688
credit_card            94340
debt_consolidation    248109
educational              279
home_improvement       23545
house                   1989
major_purchase          8544
medical                 3948
moving                  2531
other                  20537
renewable_energy         309
small_business          6027
vacation                2207
wedding                 1935
dtype: int64

### Number of Years Employed

In [40]:
df.groupby(remainining_cols[9], dropna=False).size()

num_of_years_employed
1 year        26526
10+ years    135214
2 years       37337
3 years       33111
4 years       24862
5 years       27654
6 years       22986
7 years       23423
8 years       20182
9 years       16055
< 1 year      32462
NaN           19176
dtype: int64

We can treat this as a categorical variable rather than continous.

### Number of Payment Months

In [41]:
df.groupby(remainining_cols[8], dropna=False).size()

num_of_payment_months
36 months    301420
60 months    117568
dtype: int64

### Initial Loan Status

In [42]:
df.groupby(remainining_cols[5], dropna=False).size()

init_loan_status
f    273039
w    145949
dtype: int64

### Borrower Provided Reason For Loan

In [43]:
df[remainining_cols[2]].loc[df[remainining_cols[2]].notna()].sample(1).values

array(['borrower added on 12/05/10 > we currently have $2300 in our wedding fund, after paying multiple vendor deposits, and are depositing between $660 and $750 per month exclusively for wedding use.  we expect to receive approximately $6,000 from parents to go toward the wedding, but we would not receive that amount until right before the wedding, which is in august.  this loan is more for security than an inability to fund the wedding ourselves.<br/> borrower added on 12/09/10 > the monthly income shown above is only mine.  my fiance has been employed as a data analyst at a financial software company for the past 3+ years.  our combined annual gross income is approximately $96k.  because we are aggressively saving for our wedding and expect to receive contributions from parents, we anticipate paying off this loan earlier than the 60 months terms.<br/>'],
      dtype=object)

This feature has a lot of text, and should probably be handled with more sophisticated NLP techniques. What we will do here first is to remove non-alphanumeric characters and stop words, effectively trimming the text bodies.

In [44]:
# Check the number of missing values before processing
df[remainining_cols[2]].isna().sum()

313917

First, create a dataframe containing the original count of words in the text bodies:

In [45]:
word_count = pd.DataFrame(
    {
        "original_counts": df[remainining_cols[2]].apply(
            lambda val: len(val) if isinstance(val, str) else np.NaN
        )
    }
)
word_count.original_counts.describe()

count    105071.000000
mean        235.828687
std         265.428946
min           0.000000
25%          92.000000
50%         163.000000
75%         289.000000
max        3986.000000
Name: original_counts, dtype: float64

Remove stop words and non-alphanumeric characters:

In [46]:
stop_words = set(stopwords.words("english"))

# Convert the text column to a numpy array
text_array_reason = df[remainining_cols[2]].to_numpy()

# Remove the stop words and non-alphanumeric characters using a numpy vectorized operation
filtered_text_reason = np.vectorize(
    lambda x: " ".join(
        [
            re.sub(r"\W+", "", word)
            for word in x.split()
            if word.lower() not in stop_words and re.sub(r"\W+", "", word).isalpha()
        ]
    )
    if x is not np.nan
    else "missing"
)(text_array_reason)

Check word counts after processing:

In [47]:
word_count["post_processed_counts"] = np.vectorize(
    lambda x: float(len(x)) if x != "missing" else np.nan
)(filtered_text_reason)
word_count["post_processed_counts"].describe()

count    105071.000000
mean        146.753053
std         162.301272
min           0.000000
25%          59.000000
50%         103.000000
75%         177.000000
max        2790.000000
Name: post_processed_counts, dtype: float64

In [48]:
del word_count

Before re-assignment, check that we have not unintentially created any missing values:

In [49]:
(filtered_text_reason == "missing").sum() == df[remainining_cols[2]].isna().sum()

True

In [50]:
# Restore the original indices
processed_text_reason = pd.DataFrame(
    {
        "original_indices": df[remainining_cols[2]].index,
        remainining_cols[2] + "_processed": pd.Series(filtered_text_reason).apply(
            lambda val: val if val != "missing" else np.NaN
        ),
    }
).set_index("original_indices")
processed_text_reason

Unnamed: 0_level_0,borrower_provided_reason_for_loan_processed
original_indices,Unnamed: 1_level_1
0,due lack personal finance education exposure p...
1,want pay last bit credit card debt better rate
2,trying pay friend back apartment brokers fee i...
3,funded would use loan consolidate two loans in...
4,currently personal loan citifinancial high int...
...,...
79995,
79996,
79997,
79998,


In [51]:
df[remainining_cols[2]] = (
    processed_text_reason.borrower_provided_reason_for_loan_processed
)
df[remainining_cols[2]].isna().sum()

313917

### Borrower State

In [52]:
df.groupby(remainining_cols[3], dropna=False).size()

borrower_state
ak     1149
al     5186
ar     3078
az     9677
ca    64288
co     8898
ct     6323
dc     1270
de     1139
fl    28099
ga    13462
hi     2234
ia        7
id        8
il    16585
in     5980
ks     3872
ky     3950
la     5018
ma     9730
md     9873
me        4
mi    10527
mn     7521
mo     6689
ms     1157
mt     1248
nc    11690
ne        6
nh     1994
nj    15964
nm     2322
nv     5941
ny    35416
oh    13706
ok     3788
or     5509
pa    14729
ri     1834
sc     4879
sd      896
tn     5515
tx    33082
ut     3091
va    12633
vt      770
wa     9626
wi     5424
wv     2140
wy     1061
dtype: int64

### Self Filled Employer or Job Title

This features is also contains text data. We will again remove stop words and non-alphanumeric charaters to sanitize it:

In [53]:
# Check missing values
df[remainining_cols[10]].isna().sum()

24650

In [54]:
# Convert the text column to a numpy array
text_array_job = df[remainining_cols[10]].to_numpy()

# Remove the stop words and non-alphanumeric characters using a numpy vectorized operation
filtered_text_job = np.vectorize(
    lambda x: " ".join(
        [
            re.sub(r"\W+", "", word)
            for word in x.split()
            if word.lower() not in stop_words and re.sub(r"\W+", "", word).isalpha()
        ]
    )
    if x is not np.nan
    else "missing"
)(text_array_job)

Check for unintential errors:

In [55]:
(filtered_text_job == "missing").sum() == df[remainining_cols[10]].isna().sum()

True

In [56]:
# Restore the original indices
processed_text_job = pd.DataFrame(
    {
        "original_indices": df[remainining_cols[10]].index,
        remainining_cols[10] + "_processed": pd.Series(filtered_text_job).apply(
            lambda val: val if val != "missing" else np.NaN
        ),
    }
).set_index("original_indices")
processed_text_job

Unnamed: 0_level_0,self_filled_employer_job_title_processed
original_indices,Unnamed: 1_level_1
0,
1,cnn
2,web programmer
3,city beaumont texas
4,state farm insurance
...,...
79995,supervisor
79996,manager
79997,field unit supervisor
79998,senior project manager


In [57]:
df[remainining_cols[10]] = processed_text_job.self_filled_employer_job_title_processed
df[remainining_cols[10]].isna().sum()

24650

### Loan Subgrade

In [58]:
df.groupby(remainining_cols[7], dropna=False).size()

loan_subgrade
a1      7995
a2      8686
a3      9706
a4     15154
a5     17771
b1     18010
b2     20691
b3     24698
b4     23952
b5     20078
c1     21330
c2     21313
c3     20059
c4     19236
c5     17377
d1     15126
d2     13183
d3     11532
d4     10952
d5      9181
e1      7289
e2      6830
e3      5589
e4      4706
e5      3918
f1      3104
f2      2295
f3      2037
f4      1599
f5      1205
g1       854
g2       641
g3       469
g4       306
g5       250
NaN    51866
dtype: int64

### Loan Grade

In [59]:
df.groupby(remainining_cols[6], dropna=False).size()

loan_grade
a       59312
b      107429
c       99315
d       59974
e       28332
f       10240
g        2520
NaN     51866
dtype: int64

### Three Digit Zip

In [60]:
len(df[remainining_cols[-1]].unique())

886

We can creat new features from these zip codes using catboost encoding or target encoding for other numerical columns.

### Loan Title

This is a text column; remove stop words and non-alphanumeric characters.

In [61]:
# Check missing values
df[remainining_cols[1]].isna().sum()

16

In [62]:
# Convert the text column to a numpy array
text_array_title = df[remainining_cols[1]].to_numpy()

# Remove the stop words and non-alphanumeric characters using a numpy vectorized operation
filtered_text_title = np.vectorize(
    lambda x: " ".join(
        [
            re.sub(r"\W+", "", word)
            for word in x.split()
            if word.lower() not in stop_words and re.sub(r"\W+", "", word).isalpha()
        ]
    )
    if x is not np.nan
    else "missing"
)(text_array_title)

Check errors:

In [63]:
(filtered_text_title == "missing").sum() == df[remainining_cols[1]].isna().sum()

True

In [64]:
# Restore the original indices
processed_text_title = pd.DataFrame(
    {
        "original_indices": df[remainining_cols[1]].index,
        remainining_cols[1] + "_processed": pd.Series(filtered_text_title).apply(
            lambda val: val if val != "missing" else np.NaN
        ),
    }
).set_index("original_indices")
processed_text_title

Unnamed: 0_level_0,borrower_provided_loan_title_processed
original_indices,Unnamed: 1_level_1
0,debt consolidation ontime payer
1,credit card payoff
2,mlue
3,zxcvb
4,
...,...
79995,credit card refinancing
79996,credit card refinancing
79997,credit card refinancing
79998,debt consolidation


In [65]:
df[remainining_cols[1]] = processed_text_title.borrower_provided_loan_title_processed
df[remainining_cols[1]].isna().sum()

16

## Final Check 

In [66]:
for col in df.select_dtypes(include="object").columns:
    print(col, ":", df[col].unique()[:3], "\n\n")

num_of_payment_months : ['36 months' '60 months'] 


loan_grade : ['b' 'd' 'c'] 


loan_subgrade : ['b4' 'b5' 'd3'] 


self_filled_employer_job_title : [nan 'cnn' 'web programmer'] 


num_of_years_employed : ['< 1 year' '1 year' '10+ years'] 


home_ownership_status : ['rent' 'own' 'mortgage'] 


verify_income_or_source : ['verified - income' 'not verified' 'verified - income source'] 


borrower_provided_reason_for_loan : ['due lack personal finance education exposure poor financing skills growing up easy prey credit predators devoted becoming debtfree assure lenders pay ontime every time never missed payment last years credit'
 'want pay last bit credit card debt better rate'
 'trying pay friend back apartment brokers fee incurred well credit card stuff'] 


borrower_provided_loan_category : ['debt_consolidation' 'credit_card' 'car'] 


borrower_provided_loan_title : ['debt consolidation ontime payer' 'credit card payoff' 'mlue'] 


zip_first_three : ['941xx' '112xx' '100xx'] 


borr

In [67]:
df.dtypes

interest_rate                           float64
id_loan                                 float64
id_borrower                             float64
loan_amt_requested                        int32
loan_amt_funded                           int32
loan_amt_investor_funded_portion          int32
num_of_payment_months                    object
loan_grade                               object
loan_subgrade                            object
self_filled_employer_job_title           object
num_of_years_employed                    object
home_ownership_status                    object
borrower_annual_income                  float64
verify_income_or_source                  object
loan_issued_date                      period[M]
borrower_provided_reason_for_loan        object
borrower_provided_loan_category          object
borrower_provided_loan_title             object
zip_first_three                          object
borrower_state                           object
monthly_debt_to_income_ratio            

Split the data back into train and test set based on indicator:

In [68]:
train_sanitized, test_sanitized = (
    df.loc[df.is_train == 1].drop(["is_train"], axis=1),
    df.loc[df.is_train != 1].drop(["is_train"], axis=1),
)
train_sanitized.shape, test_sanitized.shape

((338988, 32), (80000, 32))

## Write to Disk

In [69]:
train_sanitized.to_csv(
    "/kaggle/working/train_sanitized.csv", index=False, date_format="%y-%b"
)
test_sanitized.to_csv(
    "/kaggle/working/test_sanitized.csv", index=False, date_format="%y-%b"
)