# Task 4

After experimenting with various approaches, I decided to take the following approach. Similar to leaf purity in decision trees, I minimize the entropy of each bucket. This has the same principle as minimizing MSE or maximizing log likelihood.

In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('Task_3_and_4_Loan_Data.csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [36]:
class FicoBucketerFinal:

    def __init__(self, num_buckets=10, min_bucket_size=100):
        self.num_buckets = num_buckets
        self.min_bucket_size = min_bucket_size
        self.boundaries = []

    def _entropy(self, k, n):
        if k == 0 or k == n or n == 0:
            return 0
        p = k / n
        return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

    def _total_entropy(self, df):
        n = len(df)
        k = df['default'].sum() 

        return self._entropy(k, n) * n

    def _find_best_split(self, df): #like decision tree logic finds splits 
        best_score = float('inf')
        
        best_split = None 
        n = len(df)

        for i in range(self.min_bucket_size, n - self.min_bucket_size):
            left = df.iloc[:i]
            right = df.iloc[i:]

            score = self._total_entropy(left) + self._total_entropy(right)

            if score < best_score:
                best_score = score
                best_split = df.iloc[i]['fico_score']

        return best_split

    def fit(self, fico_scores, defaults):
        df = pd.DataFrame({'fico_score': fico_scores, 'default': defaults})
        df = df.sort_values('fico_score').reset_index(drop=True)

        #print(df)

        from queue import PriorityQueue
        pq = PriorityQueue()
        pq.put((-self._total_entropy(df), df))

        buckets = []

        while not pq.empty() and len(buckets) + pq.qsize() < self.num_buckets:
            blank, current_df = pq.get()

            split_val = self._find_best_split(current_df)
            if split_val is None:
                buckets.append(current_df)
                continue

            left = current_df[current_df['fico_score'] < split_val]
            right = current_df[current_df['fico_score'] >= split_val]

            if len(left) < self.min_bucket_size or len(right) < self.min_bucket_size:
                buckets.append(current_df)
                continue

            pq.put((-self._total_entropy(left), left))
            pq.put((-self._total_entropy(right), right))

        while not pq.empty():
            a, b = pq.get()
            buckets.append(b)

        cutoffs = []
        for b in buckets[1:]:
            min_val = b['fico_score'].min()
            if min_val not in cutoffs:
                cutoffs.append(min_val)
        self.boundaries = sorted(cutoffs)

    def get_boundaries(self):
        return [300] + self.boundaries + [850] #two outside bounds set

    def map(self, score):
        for i, b in enumerate(self.boundaries):
            if score < b:
                return i
        return len(self.boundaries)

    def transform(self, fico_scores):
        return [self.map(s) for s in fico_scores]


In [34]:
bucketer = FicoBucketerFinal(num_buckets=10, min_bucket_size=300)
bucketer.fit(df['fico_score'], df['default'])

boundaries = bucketer.get_boundaries()
bucket_ranges = [(boundaries[i], boundaries[i + 1] - 1 if i + 1 < len(boundaries) else 850) for i in range(len(boundaries) - 1)]

for i, (low, high) in enumerate(bucket_ranges):
    print(f"Bucket {i}: FICO {low}–{high}")

Bucket 0: FICO 300–407
Bucket 1: FICO 408–552
Bucket 2: FICO 553–604
Bucket 3: FICO 605–610
Bucket 4: FICO 611–633
Bucket 5: FICO 634–639
Bucket 6: FICO 640–658
Bucket 7: FICO 659–695
Bucket 8: FICO 696–718
Bucket 9: FICO 719–849


We get meaningful splits