In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.set_option("display.max_columns", 100)

In [53]:
train = pd.read_csv('./cs-training.csv', index_col=0)

In [54]:
test = pd.read_csv('./cs-test.csv', index_col=0)

In [55]:
test.head(2)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,,0.885519,43,0,0.177513,5700.0,4,0,0,0,0.0
2,,0.463295,57,0,0.527237,9141.0,15,0,4,0,2.0


In [56]:
train.head(2)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0


In [57]:
test.shape

(101503, 11)

In [58]:
train.shape

(150000, 11)

In [59]:
combine = pd.concat([train, test], axis=0)

In [60]:
combine.head(2)

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1.0,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0.0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0


In [61]:
combine.columns = ['target', 'unsecuredline_utilization', 'age', 'count_of_30-59_days_past_due_not_worse', 'debt_ratio', 'monthly_income', 'count_open_creditlines_and_loans', 'count_90_days_late',
                'count_real_estate_loans_or_lines', 'count_of_60-89_days_past_due_not_worse', 'count_dependents']

In [62]:
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 251503 entries, 1 to 101503
Data columns (total 11 columns):
target                                    150000 non-null float64
unsecuredline_utilization                 251503 non-null float64
age                                       251503 non-null int64
count_of_30-59_days_past_due_not_worse    251503 non-null int64
debt_ratio                                251503 non-null float64
monthly_income                            201669 non-null float64
count_open_creditlines_and_loans          251503 non-null int64
count_90_days_late                        251503 non-null int64
count_real_estate_loans_or_lines          251503 non-null int64
count_of_60-89_days_past_due_not_worse    251503 non-null int64
count_dependents                          244953 non-null float64
dtypes: float64(5), int64(6)
memory usage: 23.0 MB


In [18]:
target_dist = (
    train
    .groupby("target")
    .agg({"age": "count"})
    .reset_index()
)

In [19]:
target_dist

Unnamed: 0,target,age
0,0,139974
1,1,10026


In [63]:
missing_dpt = combine[combine.count_dependents.isnull()]
missing_dpt_target_dist = (
    missing_dpt
    .groupby("target")
    .agg({"age": "count"})
    .reset_index()
)

In [64]:
missing_dpt_target_dist

Unnamed: 0,target,age
0,0.0,3745
1,1.0,179


In [66]:
combine.count_dependents.describe()

count    244953.000000
mean          0.761995
std           1.123905
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max          43.000000
Name: count_dependents, dtype: float64

In [67]:
# impute null values in count_dependents column with median value, i.e. 0
combine.count_dependents.fillna(0, inplace=True)

In [69]:
missing_income = combine[combine.monthly_income.isnull()]
missing_income_target_dist = (
    missing_income
    .groupby("target")
    .agg({"age": "count"})
    .reset_index()
)
missing_income_target_dist

Unnamed: 0,target,age
0,0.0,28062
1,1.0,1669


In [75]:
combine.age.sort_values().unique()

array([  0,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,
        33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,
        46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
        59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,
        72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
        85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
        98,  99, 100, 101, 102, 103, 104, 105, 107, 109], dtype=int64)

In [84]:
combine.query("monthly_income == 0").debt_ratio.describe()

count      2654.000000
mean       1713.872645
std        5386.139965
min           0.000000
25%          93.000000
50%         858.500000
75%        2175.250000
max      202990.000000
Name: debt_ratio, dtype: float64

In [104]:
combine.query("monthly_income > 0").debt_ratio.describe()

count    199015.000000
mean          5.009825
std         164.243734
min           0.000000
25%           0.141114
50%           0.292142
75%           0.472765
max       61106.500000
Name: debt_ratio, dtype: float64

In [96]:
combine.query("monthly_income > 0").debt_ratio.quantile(q=0.99)

2.9688210751599997

In [105]:
combine.monthly_income.fillna(-1, inplace=True)

In [111]:
combine.loc[combine.query("monthly_income == -1 and debt_ratio > 3").index.values, 'monthly_income'] = 0

In [115]:
len(combine.query("monthly_income == -1"))

3080

In [119]:
combine.query("monthly_income == -1 and (target == 1 or target == 0 )").age.unique()

array([ 62,  63,  28,  29,  83,  97,  91,  82,  86,  34,  90,  70,  84,
        57,  89,  69,  68,  92,  48,  30,  81,  60,  25,  22,  41,  85,
        24,  55,  61,  46,  56,  31,  40,  75,  66,  42,  59,  79,  54,
        50,  88,  73,  38,  80,  76,  71,  78,  44,  74,  52,  51,  64,
        77,  93,  37,  65,  39,  23,  45,  21,  43,  36,  67,  35,  26,
        27,  98,  87,  49,  94,  72,  58,  53,  95,  33,  99,  32,  47,
       105,  96, 102], dtype=int64)