In [30]:
import pandas as pd
import numpy as np
import scipy.stats as st
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('data/credit_scoring_sample.csv', sep=';')
df

Unnamed: 0,SeriousDlqin2yrs,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,MonthlyIncome,NumberOfDependents
0,0,64,0,0.249908,0,0,8158.0,0.0
1,0,58,0,3870.000000,0,0,,0.0
2,0,41,0,0.456127,0,0,6666.0,0.0
3,0,43,0,0.000190,0,0,10500.0,2.0
4,1,49,0,0.271820,0,0,400.0,0.0
...,...,...,...,...,...,...,...,...
45058,1,31,0,0.824725,0,0,3000.0,1.0
45059,0,49,0,6530.000000,0,0,0.0,5.0
45060,1,38,0,0.475841,0,0,3000.0,2.0
45061,0,47,1,0.485198,0,0,11720.0,5.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45063 entries, 0 to 45062
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SeriousDlqin2yrs                      45063 non-null  int64  
 1   age                                   45063 non-null  int64  
 2   NumberOfTime30-59DaysPastDueNotWorse  45063 non-null  int64  
 3   DebtRatio                             45063 non-null  float64
 4   NumberOfTimes90DaysLate               45063 non-null  int64  
 5   NumberOfTime60-89DaysPastDueNotWorse  45063 non-null  int64  
 6   MonthlyIncome                         36420 non-null  float64
 7   NumberOfDependents                    43946 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 2.8 MB


In [4]:
def fill_nan(table):
    for col in table.columns:
        table[col] = table[col].fillna(table[col].median())
    return table

In [5]:
df = fill_nan(df)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45063 entries, 0 to 45062
Data columns (total 8 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   SeriousDlqin2yrs                      45063 non-null  int64  
 1   age                                   45063 non-null  int64  
 2   NumberOfTime30-59DaysPastDueNotWorse  45063 non-null  int64  
 3   DebtRatio                             45063 non-null  float64
 4   NumberOfTimes90DaysLate               45063 non-null  int64  
 5   NumberOfTime60-89DaysPastDueNotWorse  45063 non-null  int64  
 6   MonthlyIncome                         45063 non-null  float64
 7   NumberOfDependents                    45063 non-null  float64
dtypes: float64(3), int64(5)
memory usage: 2.8 MB


In [7]:
# Question 1. There are 5 jurors in a courtroom. Each of them can correctly identify the guilt of the defendant 
# with 70% probability, independent of one another. What is the probability that the jurors will jointly reach 
# the correct verdict if the final decision is by majority vote?

p = 0.7
10*p**3*(1-p)**2 + 5*p**4*(1-p) + p**5

0.8369199999999999

In [8]:
df["SeriousDlqin2yrs"].value_counts(normalize=True)

SeriousDlqin2yrs
0    0.777511
1    0.222489
Name: proportion, dtype: float64

In [9]:
X, y = df.drop('SeriousDlqin2yrs', axis=1), df['SeriousDlqin2yrs']

In [10]:
# Question 2. Make an interval estimate of the average age for the customers who delayed repayment 
# at the 90% confidence level. Use the example from the article as reference, if needed. Also, 
# use np.random.seed(0) as before. What is the resulting interval estimate?

st.t.interval(confidence=0.9, df=X.shape[0], loc=df[df['SeriousDlqin2yrs'] == 1]['age'].mean(), \
              scale=st.sem(df[df['SeriousDlqin2yrs'] == 1]['age']))

(45.714408107037464, 46.13877362047102)

In [11]:
lr = LogisticRegression(random_state=5, class_weight="balanced", max_iter=10000)
parameters = {"C": (0.0001, 0.001, 0.01, 0.1, 1, 10)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

In [12]:
# Question 3. Perform a Grid Search with the scoring metric “roc_auc” for the parameter C. 
# Which value of the parameter C is optimal?
search = GridSearchCV(lr, parameters, n_jobs=-1, scoring="roc_auc", cv=skf)
search.fit(X, y)
search.best_estimator_

In [13]:
# Question 4. Can we consider the best model stable? The model is stable if the standard deviation on validation \
# is less than 0.5%. Save the ROC AUC value of the best model; it will be useful for the following tasks

search.cv_results_['std_test_score'].max()/search.best_score_ * 100

# Answer: No

1.0059881123276648

In [15]:
# Question 5. Feature importance is defined by the absolute value of its corresponding coefficient. 
# First you need to normalize all the feature values so that it will be correct to compare them. 
# What is the most important feature for the best logistic regression model?

log_reg = LogisticRegression(C=0.001, random_state=5, class_weight='balanced')
scl = StandardScaler()
log_reg.fit(scl.fit_transform(X), y)

In [16]:
features_importance = pd.DataFrame({"abs_importance": np.abs(log_reg.coef_[0]), "feature": X.columns})
features_importance.sort_values('abs_importance', ascending=False)

# Answer: NumberOfTime30-59DaysPastDueNotWorse

Unnamed: 0,abs_importance,feature
1,0.723427,NumberOfTime30-59DaysPastDueNotWorse
3,0.516788,NumberOfTimes90DaysLate
0,0.416702,age
4,0.193558,NumberOfTime60-89DaysPastDueNotWorse
5,0.163146,MonthlyIncome
6,0.101443,NumberOfDependents
2,0.024096,DebtRatio


In [17]:
# Question 6. Calculate how much DebtRatio affects our prediction using the softmax function. What is its value?

print((np.exp(log_reg.coef_[0]) / np.sum(np.exp(log_reg.coef_[0])))[2])

0.11426375283065274


In [18]:
# Question 7. Let’s see how we can interpret the impact of our features. 
# For this, recalculate the logistic regression with absolute values, that is without scaling. 
# Next, modify the customer’s age by adding 20 years, keeping the other features unchanged. 
# How many times will the chance that the customer will not repay their debt increase?

log_reg.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
log_reg.coef_

array([[-1.32152905e-02,  4.83302285e-01, -1.18165361e-05,
         4.24456712e-01,  2.22727898e-01, -1.17611138e-05,
         1.25391074e-01]])

In [20]:
features_importance = pd.DataFrame({"abs_importance": np.abs(log_reg.coef_[0]), "feature": X.columns})
features_importance.sort_values('abs_importance', ascending=False)

Unnamed: 0,abs_importance,feature
1,0.483302,NumberOfTime30-59DaysPastDueNotWorse
3,0.424457,NumberOfTimes90DaysLate
4,0.222728,NumberOfTime60-89DaysPastDueNotWorse
6,0.125391,NumberOfDependents
0,0.013215,age
2,1.2e-05,DebtRatio
5,1.2e-05,MonthlyIncome


In [21]:
np.exp(log_reg.coef_[0][0] * 20)

0.7677387222738693

In [22]:
# Question 8. How much higher is the ROC AUC of the best random forest model than that of the best logistic regression 
# on validation?


rf = RandomForestClassifier(
    n_estimators=100, n_jobs=-1, random_state=42, class_weight="balanced"
)

In [23]:
parameters = {
    "max_features": [1, 2, 4],
    "min_samples_leaf": [3, 5, 7, 9],
    "max_depth": [5, 10, 15],
}

In [24]:
rf_search = GridSearchCV(rf, param_grid=parameters, cv=skf, n_jobs=-1, scoring="roc_auc")

In [25]:
rf_search.fit(X, y)

In [27]:
rf_search.best_score_ - search.best_score_
# Answer: 0.269

0.026866475306627002

In [29]:
# Question 9. What feature has the weakest impact in Random Forest model?

best_rf = rf_search.best_estimator_
best_rf.feature_importances_

features_importance = pd.DataFrame({"abs_importance": np.abs(best_rf.feature_importances_), "feature": X.columns})
features_importance.sort_values('abs_importance', ascending=False)

# Answer: NumberOfDependents

Unnamed: 0,abs_importance,feature
1,0.30029,NumberOfTime30-59DaysPastDueNotWorse
3,0.278749,NumberOfTimes90DaysLate
4,0.156534,NumberOfTime60-89DaysPastDueNotWorse
0,0.11586,age
2,0.076082,DebtRatio
5,0.057994,MonthlyIncome
6,0.014491,NumberOfDependents


In [None]:
# Question 10. What is the most significant advantage of using Logistic Regression versus Random Forest for this problem?

# Feature interpretability;

In [34]:
# Question 11. Fit a bagging classifier with random_state=42. For the base classifiers, use 100 logistic regressors 
# and use RandomizedSearchCV instead of GridSearchCV. It will take a lot of time to iterate over all 54 variants, 
# so set the maximum number of iterations for RandomizedSearchCV to 20. Don’t forget to set the parameters cv and 
# random_state=1. What is the best ROC AUC you achieve?

bagg_clf = BaggingClassifier(estimator=LogisticRegression(class_weight='balanced'), random_state=42, n_estimators=100, n_jobs=-1)
parameters = {
    "max_features": [2, 3, 4],
    "max_samples": [0.5, 0.7, 0.9],
    "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100],
}
rscv = RandomizedSearchCV(bagg_clf, param_distributions=parameters, random_state=1, cv=skf, scoring='roc_auc', n_iter=20, n_jobs=-1)
rscv.fit(X, y)
rscv.best_score_

  clone(base_estimator).set_params(**self.best_params_)


0.8087951623513827

In [35]:
# Question 12. Give an interpretation of the best parameters for bagging. 
# Why are these values of max_features and max_samples the best?

rscv.best_params_

# Answer: Less correlation between single models;

{'max_samples': 0.9, 'max_features': 2, 'base_estimator__C': 100}