# Task 4

In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('Task_3_and_4_Loan_Data.csv')
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [14]:
class FicoQuantizer:
    def __init__(self, buckets_low=5, buckets_high=5, split_point=600):
        self.buckets_low = buckets_low
        self.buckets_high = buckets_high
        self.split_point = split_point
        self.boundaries = []

    def _log_likelihood(self, y):
        ni = len(y)
        ki = y.sum()
        if ki == 0 or ki == ni:
            return 0
        pi = ki / ni
        return ki * np.log(pi) + (ni - ki) * np.log(1 - pi)

    def _recursive_split(self, data, buckets_left):
        if buckets_left == 1 or len(data) < 2:
            return []
        best_score = -np.inf
        best_split = None
        for i in range(1, len(data)):
            left = data.iloc[:i]
            right = data.iloc[i:]
            score = self._log_likelihood(left['default']) + self._log_likelihood(right['default'])
            if score > best_score:
                best_score = score
                best_split = i
        split_val = data.iloc[best_split]['fico']
        left = data.iloc[:best_split]
        right = data.iloc[best_split:]


        return [split_val] + self._recursive_split(left, buckets_left - 1) + self._recursive_split(right, buckets_left - 1)

    def fit(self, fico_scores, defaults):
        data = pd.DataFrame({'fico': fico_scores, 'default': defaults})
        data = data.sort_values('fico').reset_index(drop=True)

        low_data = data[data['fico'] < self.split_point]
        high_data = data[data['fico'] >= self.split_point]

        low_bounds = self._recursive_split(low_data, self.buckets_low) if len(low_data) > 0 else []
        high_bounds = self._recursive_split(high_data, self.buckets_high) if len(high_data) > 0 else []

        self.boundaries = sorted(set(low_bounds + high_bounds))

    def map_fico_to_rating(self, fico_score):
        for i, b in enumerate(self.boundaries):
            if fico_score < b:
                return i
        return len(self.boundaries)

    def transform(self, fico_scores):
        return [self.map_fico_to_rating(f) for f in fico_scores]






In [16]:

data = df;

quantizer = FicoQuantizer(buckets_low=5, buckets_high=5, split_point=600)
quantizer.fit(data['fico_score'], data['default'])
data['rating_bucket'] = quantizer.transform(data['fico_score'])

print(data[['customer_id', 'fico_score', 'rating_bucket', 'default']])
print("\nBucket boundaries (split at 600):")
print(quantizer.boundaries)


      customer_id  fico_score  rating_bucket  default
0         8153374         605             13        0
1         7442532         572              7        1
2         2256073         602             12        0
3         4885975         612             15        0
4         4700614         631             15        0
...           ...         ...            ...      ...
9995      3972488         697             19        0
9996      6184073         615             15        0
9997      6694516         596              9        0
9998      3942961         647             15        0
9999      5533570         757             21        0

[10000 rows x 4 columns]

Bucket boundaries (split at 600):
[409, 444, 502, 503, 521, 526, 553, 578, 580, 598, 599, 600, 605, 608, 611, 649, 650, 695, 696, 732, 740]


We get meaningful splits