In [120]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import binom

In [122]:
Loans = pd.read_csv('Task 3 and 4_Loan_Data.csv')

In [124]:
Loans.head(10)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0
5,4661159,0,5376.886873,7189.121298,85529.84591,2,697,0
6,8291909,1,3634.057471,7085.980095,68691.57707,6,722,0
7,4616950,4,3302.172238,13067.57021,50352.16821,3,545,1
8,3395789,0,2938.325123,1918.404472,53497.37754,4,676,0
9,4045948,0,5396.366774,5298.824524,92349.55399,2,447,0


## Maximising the Log-likelihood (LL)

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, roc_auc_score

In [200]:
def calculate_initial_ll(data,no_of_buckets): # this calculates LL using bounds such that each bucket has equal length 
    data['Buckets'] = pd.cut(data['fico_score'], bins = no_of_buckets, labels=False)
    ll = 0
    for i in range(no_of_buckets):
        buckets = data[data['Buckets'] == i]
        if not buckets.empty:
            n_i = len(buckets)
            k_i = buckets['default'].sum()
            p_i = k_i/n_i if n_i > 0 else 0
            ll += binom.logpmf(k_i,n_i,p_i)
    return ll

initial_ll = calculate_initial_ll(Loans,4)
initial_ll

-13.463435690607817

In [178]:
def calculate_ll(data,no_of_buckets,bounds): # more generalised function for calculating ll with given no of buckets and the bounds
    data['Buckets'] = pd.cut(data['fico_score'], bins = bounds, labels=False)
    ll = 0
    for i in range(no_of_buckets):
        buckets = data[data['Buckets'] == i]
        if not buckets.empty:
            n_i = len(buckets)
            k_i = buckets['default'].sum()
            p_i = k_i/n_i if n_i > 0 else 0
            ll += binom.logpmf(k_i,n_i,p_i)
    return ll

bounds = np.linspace(Loans['fico_score'].min(),Loans['fico_score'].max(),15)
calculate_ll(Loans,14,bounds)

-33.598696267881586

In [202]:
def optimize_ll(data,test_bucket_numbers):
    optimal_ll = float('-inf')
    optimal_no_of_buckets = test_bucket_numbers[0]
    optimal_bounds = None

    for n in test_bucket_numbers:
        print(f'Testing with {n} buckets')
        bounds = np.linspace(data['fico_score'].min(),data['fico_score'].max(),n+1)
        initial_ll = calculate_initial_ll(data,n)

        # optimizing the bounds
        
        bounds_optimized = True
        iterations = 0
        final_ll = initial_ll
        
        while bounds_optimized and iterations < 100:
            bounds_optimized = False
            for i in range (n): #isnt that just n
                initial_bound = bounds[i]

                for shift in [-10,10]:
                    bounds[i] += shift
                    data['Buckets'] = pd.cut(data['fico_score'], bins = bounds, labels=False, include_lowest = True)
                    new_ll = calculate_ll(data,n,bounds)
                    if new_ll > final_ll:
                        final_ll = new_ll
                        bounds_optimized = True
                    else:
                        bounds[i] = initial_bound
            iterations += 1

        print(f'Optimized bounds for n={n} are {bounds}, giving semi-optimal LL {final_ll}')

        if final_ll > optimal_ll:
            optimal_ll = final_ll
            optimal_no_of_buckets = n
            optimal_bounds = bounds

    print(f'Optimal LL of {optimal_ll} is obtained when n={optimal_no_of_buckets} and bounds are {optimal_bounds}')
    return optimal_ll, optimal_no_of_buckets, optimal_bounds

optimize_ll(Loans,[2,3,4,5,6,7,8,9,10])

Testing with 2 buckets
Optimized bounds for n=2 are [788. 819. 850.], giving semi-optimal LL -0.9716930352791294
Testing with 3 buckets
Optimized bounds for n=3 are [458.         555.33333333 742.66666667 850.        ], giving semi-optimal LL -9.94262353768576
Testing with 4 buckets
Optimized bounds for n=4 are [438.  518.5 629.  759.5 850. ], giving semi-optimal LL -12.73894431204495
Testing with 5 buckets
Optimized bounds for n=5 are [418.  496.4 584.8 673.2 761.6 850. ], giving semi-optimal LL -15.581549172652764
Testing with 6 buckets
Optimized bounds for n=6 are [418.         481.66666667 555.33333333 629.         702.66666667
 776.33333333 850.        ], giving semi-optimal LL -17.87221718202465
Testing with 7 buckets
Optimized bounds for n=7 are [428.         471.14285714 534.28571429 597.42857143 660.57142857
 733.71428571 796.85714286 850.        ], giving semi-optimal LL -19.7444508005168
Testing with 8 buckets
Optimized bounds for n=8 are [418.   463.25 518.5  573.75 629.   

(-0.9716930352791294, 2, array([788., 819., 850.]))