# 1. Packages

In [1]:
import datetime

# PySpark packages
from pyspark.sql import SparkSession
from pyspark.sql.functions import (isnan,coalesce,col,when)
from pyspark.sql.functions import (UserDefinedFunction,StringType,lit,udf,datediff,date_add)
from pyspark.sql.types import DoubleType,IntegerType,FloatType,DateType

# PySpark ML
from pyspark.ml.feature import VectorAssembler,StringIndexer,StandardScaler
from pyspark.ml.classification import LogisticRegression

In [2]:
spark = SparkSession.builder.appName("String").getOrCreate()

In [3]:
spark

# 2. The Dataset

In [4]:
loan = spark.read.csv('loan.csv', inferSchema = True, header = True)

In [5]:
# 107864 records and 116 columns.
print(loan.count())
print(len(loan.columns))

107864
116


# 3. Data type transformation
- From previous steps, I have observed several columns had been incorrectly specified as wrong data types. Therefore, before doing a further step for data cleaning, I should transform the wrong data types into correct data types.

In [6]:
column_names = loan.columns
for col in column_names:
    print("{}'s data type is: {}.".format(col,loan.select(col).dtypes[0][1]))
    print("Pick some values to see: {}.".format(loan.select(col).toPandas()[col].unique()[:5]))
    print('-'*60)

loan_amnt's data type is: int.
Pick some values to see: [ 7000  5000 17000 20000 25000].
------------------------------------------------------------
funded_amnt's data type is: int.
Pick some values to see: [ 7000  5000 17000 20000 25000].
------------------------------------------------------------
funded_amnt_inv's data type is: int.
Pick some values to see: [ 7000  5000 17000 20000 25000].
------------------------------------------------------------
term's data type is: string.
Pick some values to see: [' 36 months' ' 60 months'].
------------------------------------------------------------
int_rate's data type is: double.
Pick some values to see: [0.0734 0.0796 0.109  0.0531 0.1261].
------------------------------------------------------------
installment's data type is: double.
Pick some values to see: [217.23 156.59 555.76 602.21 563.85].
------------------------------------------------------------
grade's data type is: string.
Pick some values to see: ['A' 'B' 'C' 'D' 'G'].
---

Pick some values to see: [0 1].
------------------------------------------------------------
tot_coll_amt's data type is: int.
Pick some values to see: [  696     0  1360   370 11213].
------------------------------------------------------------
tot_cur_bal's data type is: int.
Pick some values to see: [286553  92660  43112 126356  34969].
------------------------------------------------------------
open_acc_6m's data type is: int.
Pick some values to see: [1 0 4 3 2].
------------------------------------------------------------
open_act_il's data type is: int.
Pick some values to see: [4 6 1 8 2].
------------------------------------------------------------
open_il_12m's data type is: int.
Pick some values to see: [0 1 2 3 4].
------------------------------------------------------------
open_il_24m's data type is: int.
Pick some values to see: [1 0 2 6 3].
------------------------------------------------------------
mths_since_rcnt_il's data type is: int.
Pick some values to see: [24.

Pick some values to see: [1 0 2 3 5].
------------------------------------------------------------
sec_app_mort_acc's data type is: int.
Pick some values to see: [1 0 4 5 3].
------------------------------------------------------------
sec_app_open_acc's data type is: int.
Pick some values to see: [13 14  6  8 12].
------------------------------------------------------------
sec_app_revol_util's data type is: double.
Pick some values to see: [0.369 0.695 0.612 0.697 0.288].
------------------------------------------------------------
sec_app_open_act_il's data type is: int.
Pick some values to see: [4 6 1 8 2].
------------------------------------------------------------
sec_app_num_rev_accts's data type is: int.
Pick some values to see: [ 8 15 13 10 11].
------------------------------------------------------------
sec_app_chargeoff_within_12_mths's data type is: int.
Pick some values to see: [ 0  1  4 14  3].
------------------------------------------------------------
sec_app_collect

## 3.1 Remove columns

In [7]:
# 'pymnt_plan' contained only one record across all rows.
(loan.select('pymnt_plan').toPandas()['pymnt_plan'] == 'n').sum()

107864

In [8]:
# After reading the metadata description, 'sub_grade' reveals a similar information 
# as'grade'.
loan = loan.drop('pymnt_plan','sub_grade')

# Now, 114 columns left.
print(len(loan.columns))

114


In [9]:
# Observed the values under 'term' have some spaces.
remove_space = UserDefinedFunction(lambda x: x.strip(), StringType())
loan = loan.withColumn('term', remove_space('term'))

## 3.2 Datetime features

In [10]:
# 6 datetime features in the dataset.
datetime_features = ['issue_d','last_pymnt_d','next_pymnt_d','last_credit_pull_d',
                     'earliest_cr_line','sec_app_earliest_cr_line']
len(datetime_features)

6

In [11]:
# Check whether the data type is correct or not.
loan.select(datetime_features).dtypes

[('issue_d', 'timestamp'),
 ('last_pymnt_d', 'timestamp'),
 ('next_pymnt_d', 'timestamp'),
 ('last_credit_pull_d', 'timestamp'),
 ('earliest_cr_line', 'timestamp'),
 ('sec_app_earliest_cr_line', 'timestamp')]

In [12]:
# For 'issue_d' and 'next_pymnt_d', it's clearly not in 2019 but must be 2018.
# However, for 'last_pymnt_d', the records must be before '2018-03-31' since the dataset
# is in the first quarter in 2018.
loan.select('issue_d','last_pymnt_d','next_pymnt_d').show(5)

+-------------------+-------------------+-------------------+
|            issue_d|       last_pymnt_d|       next_pymnt_d|
+-------------------+-------------------+-------------------+
|2019-03-18 00:00:00|2019-07-18 00:00:00|2019-08-18 00:00:00|
|2019-03-18 00:00:00|2019-07-18 00:00:00|2019-08-18 00:00:00|
|2019-03-18 00:00:00|2019-07-18 00:00:00|2019-08-18 00:00:00|
|2019-03-18 00:00:00|2019-07-18 00:00:00|2019-08-18 00:00:00|
|2019-03-18 00:00:00|2019-07-18 00:00:00|2019-08-18 00:00:00|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [13]:
# Therefore, 'issue_d' must be subtracted by '1 year'.
# After reading the metadata description and looking at other datasets, 
# I decided to create a date difference between 'last_pymnt_d' and 'next_pymnt_d'.
loan = loan.withColumn("issue_d", date_add("issue_d", -365))

In [14]:
# Before generating more powerful datetime features, first impute the missing values.
# Observed that when 'next_pymnt_d' is null, then the corresponding 'loan_status' is 
# always 'Fully Paid' or 'Charged Off'.
loan.select('next_pymnt_d','loan_status').groupby('next_pymnt_d','loan_status').\
count().show()

+-------------------+------------------+-----+
|       next_pymnt_d|       loan_status|count|
+-------------------+------------------+-----+
|               null|        Fully Paid| 6252|
|               null|       Charged Off|  265|
|2019-09-18 00:00:00|           Current|   25|
|2019-08-18 00:00:00|Late (31-120 days)| 1103|
|2019-08-18 00:00:00|   In Grace Period|  761|
|2019-08-18 00:00:00| Late (16-30 days)|  390|
|2019-07-18 00:00:00|           Current|   10|
|2019-08-18 00:00:00|           Current|99058|
+-------------------+------------------+-----+



In [15]:
# I impute 'next_pymnt_d' missing values by replacing '9999-12-31' since 'Fully Paid' 
# and 'Charged Off' will not have 'next payment day' same as the date difference.
loan = loan.withColumn('next_pymnt_d', 
                       when(loan.next_pymnt_d.isNotNull(), 
                            loan.next_pymnt_d).otherwise(datetime.datetime(9999,12,31)))

In [16]:
# Make sure the correction is correct!
loan = loan.withColumn("diff_last_next_pymnt_d",
                       datediff(loan.next_pymnt_d, loan.last_pymnt_d)
                      )
loan.select('issue_d','last_pymnt_d','next_pymnt_d','diff_last_next_pymnt_d').show(5)

+----------+-------------------+-------------------+----------------------+
|   issue_d|       last_pymnt_d|       next_pymnt_d|diff_last_next_pymnt_d|
+----------+-------------------+-------------------+----------------------+
|2018-03-18|2019-07-18 00:00:00|2019-08-18 00:00:00|                    31|
|2018-03-18|2019-07-18 00:00:00|2019-08-18 00:00:00|                    31|
|2018-03-18|2019-07-18 00:00:00|2019-08-18 00:00:00|                    31|
|2018-03-18|2019-07-18 00:00:00|2019-08-18 00:00:00|                    31|
|2018-03-18|2019-07-18 00:00:00|2019-08-18 00:00:00|                    31|
+----------+-------------------+-------------------+----------------------+
only showing top 5 rows



In [17]:
# In order to have the same 'year unit' as 'last_pymnt_d' and 'next_pymnt_d',
# I chose '2019-03-31' as the end of March to compute the date difference.
loan = loan.withColumn("diff_next_pymnt_end_of_march_d",
                       datediff(loan.next_pymnt_d, lit(datetime.datetime(2019,3,31)))
                      )
loan = loan.withColumn("diff_last_pymnt_end_of_march_d",
                       datediff(loan.last_pymnt_d, lit(datetime.datetime(2019,3,31)))
                      )

In [18]:
# Make sure the date difference is computed correctly!
loan.select('last_pymnt_d','diff_last_pymnt_end_of_march_d',
            'next_pymnt_d','diff_next_pymnt_end_of_march_d').show(5)

+-------------------+------------------------------+-------------------+------------------------------+
|       last_pymnt_d|diff_last_pymnt_end_of_march_d|       next_pymnt_d|diff_next_pymnt_end_of_march_d|
+-------------------+------------------------------+-------------------+------------------------------+
|2019-07-18 00:00:00|                           109|2019-08-18 00:00:00|                           140|
|2019-07-18 00:00:00|                           109|2019-08-18 00:00:00|                           140|
|2019-07-18 00:00:00|                           109|2019-08-18 00:00:00|                           140|
|2019-07-18 00:00:00|                           109|2019-08-18 00:00:00|                           140|
|2019-07-18 00:00:00|                           109|2019-08-18 00:00:00|                           140|
+-------------------+------------------------------+-------------------+------------------------------+
only showing top 5 rows



In [19]:
def transform_datetime(x):
    if x == datetime.date(2018, 1, 18):
        return 'issue_d_level_1'
    elif x == datetime.date(2018, 2, 18):
        return 'issue_d_level_2'
    elif x == datetime.date(2018, 3, 18):
        return 'issue_d_level_3'
    else:
        return 'issue_d_level_na'

trans_issue_d = udf(transform_datetime)

In [20]:
loan = loan.withColumn('issue_d', 
                       trans_issue_d(loan.issue_d))

In [21]:
# Make sure the transformation is correct!
loan.select('issue_d').show(5)

+---------------+
|        issue_d|
+---------------+
|issue_d_level_3|
|issue_d_level_3|
|issue_d_level_3|
|issue_d_level_3|
|issue_d_level_3|
+---------------+
only showing top 5 rows



In [22]:
# Again, it's impossible for 'last_credit_pull_d', 'earliest_cr_line', and 
# 'sec_app_earliest_cr_line' recorded after '2018-03-31'.
loan.select('last_credit_pull_d','earliest_cr_line','sec_app_earliest_cr_line').show(5)

+-------------------+-------------------+------------------------+
| last_credit_pull_d|   earliest_cr_line|sec_app_earliest_cr_line|
+-------------------+-------------------+------------------------+
|2019-07-18 00:00:00|2019-06-04 00:00:00|     2019-06-04 00:00:00|
|2019-07-18 00:00:00|2019-10-01 00:00:00|     2019-10-01 00:00:00|
|2019-07-18 00:00:00|2019-08-07 00:00:00|     2019-08-07 00:00:00|
|2019-07-18 00:00:00|2000-08-01 00:00:00|     2000-08-01 00:00:00|
|2019-07-18 00:00:00|2019-08-05 00:00:00|     2019-08-05 00:00:00|
+-------------------+-------------------+------------------------+
only showing top 5 rows



In [23]:
# Therefore, 'last_credit_pull_d', 'earliest_cr_line', and 'sec_app_earliest_cr_line'
# must be subtracted by '2 years'.
loan = loan.withColumn("last_credit_pull_d", date_add("last_credit_pull_d", -365*2))
loan = loan.withColumn("earliest_cr_line", date_add("earliest_cr_line", -365))
loan = loan.withColumn("sec_app_earliest_cr_line", date_add("sec_app_earliest_cr_line", -365))

In [24]:
# Make sure the correction is correct!
# Notice: Still, records in 'earliest_cr_line' and 'sec_app_earliest_cr_line' are 
# not before the end of the March in 2018.
# Since I subtracted all the records by the same time window, then it's not an issue if 
# some records are still before the end of the March in 2018.
loan.select('last_credit_pull_d','earliest_cr_line','sec_app_earliest_cr_line').show(5)

+------------------+----------------+------------------------+
|last_credit_pull_d|earliest_cr_line|sec_app_earliest_cr_line|
+------------------+----------------+------------------------+
|        2017-07-18|      2018-06-04|              2018-06-04|
|        2017-07-18|      2018-10-01|              2018-10-01|
|        2017-07-18|      2018-08-07|              2018-08-07|
|        2017-07-18|      1999-08-02|              1999-08-02|
|        2017-07-18|      2018-08-05|              2018-08-05|
+------------------+----------------+------------------------+
only showing top 5 rows



In [25]:
# Create three new features, which may be powerful features to predict 'loan_status'.
# In my perspective, use '2018-03-31' as a point of time to compare the date difference
# with 'last_credit_pull_d','earliest_cr_line', and'sec_app_earliest_cr_line might
# be a good way since some records still 'outside' the time range.
loan = loan.withColumn("diff_earliest_cr_line_end_of_march_d",
                       datediff(loan.earliest_cr_line, lit(datetime.datetime(2018,3,31)))
                      )
loan = loan.withColumn("diff_sec_app_earliest_cr_line_end_of_march_d",
                       datediff(loan.sec_app_earliest_cr_line, lit(datetime.datetime(2018,3,31)))
                      )
loan = loan.withColumn("diff_last_credit_pull_end_of_march_d",
                       datediff(loan.last_credit_pull_d, lit(datetime.datetime(2018,3,31)))
                      )

In [26]:
loan.select('diff_earliest_cr_line_end_of_march_d',
            'diff_sec_app_earliest_cr_line_end_of_march_d',
            'diff_last_credit_pull_end_of_march_d').show(5)

+------------------------------------+--------------------------------------------+------------------------------------+
|diff_earliest_cr_line_end_of_march_d|diff_sec_app_earliest_cr_line_end_of_march_d|diff_last_credit_pull_end_of_march_d|
+------------------------------------+--------------------------------------------+------------------------------------+
|                                  65|                                          65|                                -256|
|                                 184|                                         184|                                -256|
|                                 129|                                         129|                                -256|
|                               -6816|                                       -6816|                                -256|
|                                 127|                                         127|                                -256|
+-------------------------------

In [27]:
# After generating new time-related features which are more meaningful for 
# a modeling perspective, the original timestamp features should be dropped off 
# from the dataset.
loan = loan.drop('last_pymnt_d','next_pymnt_d',
                 'last_credit_pull_d','earliest_cr_line','sec_app_earliest_cr_line'
                )

In [28]:
# Now, 115 columns left.
len(loan.columns)

115

# 4. Inspection - Missing data
- Find out what is reason to make such missing data.

In [31]:
column_names = loan.columns

In [32]:
# Find the rest of features which still have missing records.
numeric_feat = {}
string_feat = {}

i = 0
for col in column_names:
    try:
        na_count = loan.filter((loan[col] == "") | loan[col].isNull() | isnan(loan[col])).count()
        if na_count != 0:
            print('Column: {} contained {} missing values.'.format(col, na_count))
            i += 1
            if loan.select(col).dtypes[0][1] != 'string':
                numeric_feat[col] = na_count
            elif loan.select(col).dtypes[0][1] == 'string':
                string_feat[col] = na_count
    except:
        print("Error column: {}.".format(col))
print()
print(i, 'features still contain missing values.')

Column: emp_title contained 9613 missing values.
Column: dti contained 262 missing values.
Column: mths_since_last_delinq contained 60695 missing values.
Column: mths_since_last_record contained 92595 missing values.
Column: revol_util contained 149 missing values.
Column: mths_since_last_major_derog contained 83103 missing values.
Column: mths_since_rcnt_il contained 4288 missing values.
Column: il_util contained 17984 missing values.
Column: all_util contained 28 missing values.
Column: avg_cur_bal contained 10 missing values.
Column: bc_open_to_buy contained 1510 missing values.
Column: bc_util contained 1561 missing values.
Column: mo_sin_old_il_acct contained 4288 missing values.
Column: mths_since_recent_bc contained 1421 missing values.
Column: mths_since_recent_bc_dlq contained 86566 missing values.
Column: mths_since_recent_inq contained 13648 missing values.
Column: mths_since_recent_revol_delinq contained 77038 missing values.
Column: num_tl_120dpd_2m contained 3508 missing 

## 4.1 Missing at random or not random

In [33]:
sqlCtx.registerDataFrameAsTable(loan, "loan")

In [34]:
# Helper function for inspecting whether a missing column has some relationship with 
# an output variable, meaning the reason to cause such missing data is not random!
def inspect_na_at_random(missing_column, output_variable):
    """
    missing_column: the name of a missing column.
    output_variable: the name of an output column.
    Return: A table shows the count value of different levels of an output variable 
    group by two values, NA and NOT NA based on the missing column.
    """
    
    df_null_group = sqlCtx.sql("SELECT CASE WHEN "+missing_column+ " IS NULL THEN 'NA'\
    ELSE 'NOT NA' END AS NA_OR_NOT_NA,"+output_variable+" FROM loan")
    
    sqlCtx.registerDataFrameAsTable(df_null_group, "df_null_group")
    
    df_null_loan_status = sqlCtx.sql("SELECT NA_OR_NOT_NA,"+output_variable+",COUNT("+\
                                     output_variable+")\
                                     FROM df_null_group \
                                     GROUP BY NA_OR_NOT_NA,"+output_variable+" \
                                     ORDER BY NA_OR_NOT_NA,"+output_variable)
    df_null_loan_status.show()

In [35]:
output_variable = ['loan_status', 'application_type']
for each in numeric_feat:
    for output in output_variable:
        print('missing column: {} versus output column {}.'.format(each,output))
        inspect_na_at_random(missing_column=each,
                             output_variable=output)

missing column: dti versus output column loan_status.
+------------+------------------+------------------+
|NA_OR_NOT_NA|       loan_status|count(loan_status)|
+------------+------------------+------------------+
|          NA|           Current|               236|
|          NA|        Fully Paid|                15|
|          NA|   In Grace Period|                 3|
|          NA| Late (16-30 days)|                 5|
|          NA|Late (31-120 days)|                 3|
|      NOT NA|       Charged Off|               265|
|      NOT NA|           Current|             98857|
|      NOT NA|        Fully Paid|              6237|
|      NOT NA|   In Grace Period|               758|
|      NOT NA| Late (16-30 days)|               385|
|      NOT NA|Late (31-120 days)|              1100|
+------------+------------------+------------------+

missing column: dti versus output column application_type.
+------------+----------------+-----------------------+
|NA_OR_NOT_NA|application_type|coun

+------------+------------------+------------------+
|NA_OR_NOT_NA|       loan_status|count(loan_status)|
+------------+------------------+------------------+
|          NA|       Charged Off|                65|
|          NA|           Current|             16399|
|          NA|        Fully Paid|              1078|
|          NA|   In Grace Period|               127|
|          NA| Late (16-30 days)|                75|
|          NA|Late (31-120 days)|               240|
|      NOT NA|       Charged Off|               200|
|      NOT NA|           Current|             82694|
|      NOT NA|        Fully Paid|              5174|
|      NOT NA|   In Grace Period|               634|
|      NOT NA| Late (16-30 days)|               315|
|      NOT NA|Late (31-120 days)|               863|
+------------+------------------+------------------+

missing column: il_util versus output column application_type.
+------------+----------------+-----------------------+
|NA_OR_NOT_NA|application_type|c

+------------+------------------+------------------+
|NA_OR_NOT_NA|       loan_status|count(loan_status)|
+------------+------------------+------------------+
|          NA|       Charged Off|                 3|
|          NA|           Current|              1266|
|          NA|        Fully Paid|                93|
|          NA|   In Grace Period|                25|
|          NA| Late (16-30 days)|                12|
|          NA|Late (31-120 days)|                22|
|      NOT NA|       Charged Off|               262|
|      NOT NA|           Current|             97827|
|      NOT NA|        Fully Paid|              6159|
|      NOT NA|   In Grace Period|               736|
|      NOT NA| Late (16-30 days)|               378|
|      NOT NA|Late (31-120 days)|              1081|
+------------+------------------+------------------+

missing column: mths_since_recent_bc versus output column application_type.
+------------+----------------+-----------------------+
|NA_OR_NOT_NA|appli

+------------+------------------+------------------+
|NA_OR_NOT_NA|       loan_status|count(loan_status)|
+------------+------------------+------------------+
|          NA|       Charged Off|                 5|
|          NA|           Current|              1348|
|          NA|        Fully Paid|                99|
|          NA|   In Grace Period|                25|
|          NA| Late (16-30 days)|                12|
|          NA|Late (31-120 days)|                25|
|      NOT NA|       Charged Off|               260|
|      NOT NA|           Current|             97745|
|      NOT NA|        Fully Paid|              6153|
|      NOT NA|   In Grace Period|               736|
|      NOT NA| Late (16-30 days)|               378|
|      NOT NA|Late (31-120 days)|              1078|
+------------+------------------+------------------+

missing column: percent_bc_gt_75 versus output column application_type.
+------------+----------------+-----------------------+
|NA_OR_NOT_NA|applicati

## Missing data inspection - Logistic Regression

In [38]:
def missing_logistic_regression(dataframe, missing_column):
    '''
    dataframe: a pyspark dataframe.
    missing_column: a column name of a missing column.
    Outputs: Return the top three predictors that cause such missing values in the missing
    column estimated by Logistic Regression.
    '''
    # First, drop off missing columns in the dataframe.
    columns = dataframe.columns
    columns.remove(missing_column)
    
    for col in columns:
        count_missing = dataframe.filter(dataframe[col].isNull()).count()
        if count_missing != 0:
            dataframe = dataframe.drop(col)

    # Group missing data into two, 'NA' or 'NOT NA'.
    sqlCtx.registerDataFrameAsTable(dataframe, "dataframe")
    
    dataframe = sqlCtx.sql("SELECT CASE WHEN "+missing_column+ " IS NULL THEN 'NA'\
    ELSE 'NOT NA' END AS NA_OR_NOT_NA, * FROM dataframe")
    
    # Transform strings into numbers using StringIndexer.
    columns = dataframe.columns
    cate_columns = []
    for each in columns:
        if dataframe.select(each).dtypes[0][1] == 'string':
            cate_columns.append(each)

    for each in cate_columns:
        indexer = StringIndexer(inputCol=each, outputCol=each+"_cat")
        dataframe = indexer.fit(dataframe).transform(dataframe)
        dataframe = dataframe.drop(each)
        
    # LR modeling: preprocessing
    columns = dataframe.columns
    columns.remove(missing_column)
    columns.remove('NA_OR_NOT_NA_cat')
    
    # Assemble all the 'predictors' into a vector.
    assembler = VectorAssembler(inputCols=columns, outputCol="features")
    dataframe = assembler.transform(dataframe) 
    
    # Standardization to obtain 'fair' coefficient estimations.
    std_scaler = StandardScaler(inputCol="features", outputCol="scaled_features",\
                                withStd=True, withMean=True)
    dataframe = std_scaler.fit(dataframe).transform(dataframe)
    
    # LR modeling
    lr = LogisticRegression(featuresCol='scaled_features', 
                            labelCol='NA_OR_NOT_NA_cat', 
                            predictionCol='prediction')
    
    final_data = dataframe.select('scaled_features', 'NA_OR_NOT_NA_cat')
    lr_model = lr.fit(final_data)
    
    # Coefficient report
    coeffs = lr_model.coefficients.toArray()
    sorted_coeff_indexes = (abs(coeffs)).argsort()[-3:]
    print(len(coeffs), len(columns))
    print('Modeling Results:')
    for index in sorted_coeff_indexes:
        print('Column name is {}, and the corresponding coeff is {}.'.format(
            columns[index], coeffs[index]))

In [39]:
for missing_num_col in numeric_feat:
    print('Processing missing column {}.'.format(missing_num_col))
    missing_logistic_regression(dataframe=loan,
                            missing_column=missing_num_col)
    print('-'*50)
    print()

Processing missing column dti.
90 90
Modeling Results:
Column name is open_rv_24m, and the corresponding coeff is 312.04082507191424.
Column name is application_type_cat, and the corresponding coeff is 398.7735922209789.
Column name is annual_inc, and the corresponding coeff is -14621.13446432049.
--------------------------------------------------

Processing missing column mths_since_last_delinq.
90 90
Modeling Results:
Column name is num_tl_90g_dpd_24m, and the corresponding coeff is -3.233735676645961.
Column name is num_accts_ever_120_pd, and the corresponding coeff is 4.373896596504162.
Column name is delinq_2yrs, and the corresponding coeff is 6.911252414710629.
--------------------------------------------------

Processing missing column mths_since_last_record.
90 90
Modeling Results:
Column name is tax_liens, and the corresponding coeff is 4.802290436797678.
Column name is pub_rec_bankruptcies, and the corresponding coeff is 9.584382157762667.
Column name is pub_rec, and the co

90 90
Modeling Results:
Column name is num_op_rev_tl, and the corresponding coeff is -18.313897679483695.
Column name is total_bc_limit, and the corresponding coeff is -19.69904919856108.
Column name is num_bc_tl, and the corresponding coeff is -23.462781553639278.
--------------------------------------------------



In [76]:
# 'total_bc_limit': Total bankcard high credit/credit limit.
# 'percent_bc_gt_75': Percentage of all bankcard accounts > 75% of limit.
# When 'percent_bc_gt_75' is missing, the corresponding 'total_bc_limit' value is always 0.
# It may indicate that 0 limit and thus no backcard accounts.
inspect_na_at_random(missing_column='percent_bc_gt_75',
                             output_variable='total_bc_limit')

+------------+--------------+---------------------+
|NA_OR_NOT_NA|total_bc_limit|count(total_bc_limit)|
+------------+--------------+---------------------+
|          NA|             0|                 1514|
|      NOT NA|           100|                    2|
|      NOT NA|           200|                   19|
|      NOT NA|           300|                  106|
|      NOT NA|           400|                   22|
|      NOT NA|           500|                  348|
|      NOT NA|           600|                   61|
|      NOT NA|           700|                   77|
|      NOT NA|           750|                    3|
|      NOT NA|           800|                  202|
|      NOT NA|           900|                   62|
|      NOT NA|          1000|                  424|
|      NOT NA|          1050|                    1|
|      NOT NA|          1100|                   88|
|      NOT NA|          1200|                   97|
|      NOT NA|          1250|                    2|
|      NOT N

In [77]:
# After looking all the top three features for each missing column,
# the pair between 'mths_since_last_record' and 'pub_rec' is the most significant.
# When mths_since_last_record is missing, the corresponding 'pub_rec' is always 0.
# It's reasonable that when 'pub_rec' is 0, there is no record available for 
# 'mths_since_last_record'.
inspect_na_at_random(missing_column='mths_since_last_record',
                             output_variable='pub_rec')

+------------+-------+--------------+
|NA_OR_NOT_NA|pub_rec|count(pub_rec)|
+------------+-------+--------------+
|          NA|      0|         92595|
|      NOT NA|      1|         13922|
|      NOT NA|      2|           866|
|      NOT NA|      3|           265|
|      NOT NA|      4|           115|
|      NOT NA|      5|            51|
|      NOT NA|      6|            20|
|      NOT NA|      7|            11|
|      NOT NA|      8|             6|
|      NOT NA|      9|             5|
|      NOT NA|     10|             2|
|      NOT NA|     13|             3|
|      NOT NA|     19|             1|
|      NOT NA|     24|             1|
|      NOT NA|     52|             1|
+------------+-------+--------------+

