# **Predictive Default Risk Assessor V.01**

# TODO

* Base model 
* Comparison
* Specialised
* For small entities - Examples?
* Backtest - All sectors 
* Understanding the model across all sectors/industries
* Any markets - consumer goods, industries
* UI last step after backtesting

In [1]:
model_inputs = {
    "profitability": {
        "class_weight": 0.30,
        "weights": [1.0], 
        "metrics": {
            "oper_margin": {
                "lower_is_better": False,
                "thresholds": [
                    (40, float("inf")),
                    (35, 39),
                    (30, 34),
                    (25, 29),
                    (20, 24),
                    (15, 19),
                    (10, 14),
                    (5, 9),
                    (float("-inf"), 0)
                ],
            }
        },
    },
    "leverage_coverage": {
        "class_weight": 0.55,
        "weights": [0.4, 0.3, 0.3],
        "metrics": {
            "tot_debt_to_tot_eqy": {
                "lower_is_better": True,
                "thresholds": [
                     (float("-inf"), 2.0),
                     (2.0, 16.0),
                     (16.0, 24.0),
                     (24.0, 33.0),
                     (33.0, 43.0),
                     (43.0, 54.0),
                     (54.0, 68.0),
                     (68.0, 94.0),
                     (94.0, float("inf")),
                ],
            },
            "tot_debt_to_ebitda": {
                "lower_is_better": True,
                "thresholds": [
                    (float("-inf"), 0.09),
                    (0.09, 0.49),
                    (0.49, 0.9),
                    (0.9, 1.36),
                    (1.36, 1.68),
                    (1.68, 2.26),
                    (2.26, 3.27),
                    (3.27, 4.4),
                    (4.4, float("inf")),
                ],
            },
            "ebitda_to_tot_int_exp": {
                "lower_is_better": False,
                "thresholds": [
                    (25, float("inf")),
                    (20, 25),
                    (15, 20),
                    (10, 15),
                    (5, 10),
                    (3, 5),
                    (1, 3),
                    (0, 1),
                    (float("-inf"), 0),
                ],
            },
        },
    },
    "efficiency": {
        "class_weight": 0.15,
        "weights": [0.5, 0.5],
        "metrics": {
            "return_on_asset": {
                "lower_is_better": False,
                "thresholds": [
                    (0.15, float("inf")),
                    (0.10, 0.15),
                    (0.08, 0.10),
                    (0.06, 0.08),
                    (0.04, 0.06),
                    (0.02, 0.04),
                    (0.00, 0.02),
                    (-0.02, 0.00),
                    (float("-inf"), -0.02)
                ],
            },
            "asset_turnover": {
                "lower_is_better": False,
                "thresholds": [
                    (4.0, float("inf")),
                    (3.0, 4.0),
                    (2.0, 3.0),
                    (1.5, 2.0),
                    (1.0, 1.5),
                    (0.75, 1.0),
                    (0.5, 0.75),
                    (0.25, 0.5),
                    (float("-inf"), 0.25)
                ],
            },
        },
    },
}

In [2]:
import pandas as pd
import numpy as np

In [3]:
class CreditRatingCalculator:
    def __init__(self, metrics):
        self.metrics = metrics
        
    def _calculate_metric_score(self, metric, thresholds, inverse):
        for score, (lower, upper) in enumerate(thresholds, start=1):
            if (inverse and metric <= upper) or (not inverse and metric >= lower):
                return score
        return len(thresholds) // 2 # else return the middle score

    def _calculate_category_score(self, category_metrics, ratios):
        total_weighted_score = 0

        for metric, weight in zip(
            category_metrics["metrics"].items(), category_metrics["weights"]
        ):
            metric_name, metric_data = metric
            value = ratios[metric_name]
            score = self._calculate_metric_score(
                value, metric_data["thresholds"], metric_data["lower_is_better"]
            )
            total_weighted_score += score * weight

        return total_weighted_score

    def _calculate_scores(self, ratios):
        scores = {}
        for category, category_data in self.metrics.items():
            category_score = self._calculate_category_score(category_data, ratios)
            scores[category] = category_score
        return scores

    def _calculate_weighted_score(self, scores):
        weights = {
            category: category_data["class_weight"]
            for category, category_data in self.metrics.items()
        }
        return sum(scores[category] * weight for category, weight in weights.items())

    def _determine_credit_rating(self, weighted_score):
        # credit_ratings = [
        #     (1.5, "Aaa"),
        #     (2.5, "Aa"),
        #     (3.5, "A"),
        #     (4.5, "Baa"),
        #     (5.5, "Ba"),
        #     (6.5, "B"),
        #     (7.5, "Caa"),
        #     (8.5, "Ca"),
        #     (float("inf"), "C"),
        # ]
        
        credit_ratings = [
            (2.5, "Aaa"),
            (3.5, "Aa"),
            (4.5, "A"),
            (5.5, "Baa"),
            (6.5, "Ba"),
            (7.5, "B"),
            (8.5, "Caa"),
            (9.5, "Ca"),
            (float("inf"), "C"),
        ]

        for threshold, rating in credit_ratings:
            if weighted_score < threshold:
                return rating

    def calculate_credit_rating(self, ratios):
        self.scores = self._calculate_scores(ratios)
        self.credit_score = self._calculate_weighted_score(self.scores)
        self.credit_rating = self._determine_credit_rating(self.credit_score)

In [4]:
# df = pd.read_csv("research/JALSH Index_dataset_2000_2024_clean.csv", index_col=0, header=[0, 1])
# classfier = pd.read_excel("research/classification_data.xlsx", index_col=0)
metrics = pd.read_excel("research/metrics_full.xlsx", index_col=0)

In [5]:
company = "SAP SJ Equity"

In [6]:
model_metrics = [
    "oper_margin", 
    "tot_debt_to_tot_eqy", 
    "tot_debt_to_ebitda", 
    "ebitda_to_tot_int_exp", 
    "return_on_asset",
    "asset_turnover",
]

ratios = metrics[company].loc[model_metrics].to_dict()

In [7]:
model = CreditRatingCalculator(model_inputs)
model.calculate_credit_rating(ratios)
print(f"Model Inputs:")
display(ratios)
print("")
print(f"Class Scoring: {model.scores}")
print(f"Credit Score: {model.credit_score}")
print(f"Credit Rating: {model.credit_rating}")

Model Inputs:


{'oper_margin': 5.486962000997409,
 'tot_debt_to_tot_eqy': 139.3885721854559,
 'tot_debt_to_ebitda': 3.807108391400997,
 'ebitda_to_tot_int_exp': 5.233890852664169,
 'return_on_asset': 2.051451925429643,
 'asset_turnover': 0.9411697646046825}


Class Scoring: {'profitability': 8.0, 'leverage_coverage': 7.5, 'efficiency': 3.5}
Credit Score: 7.050000000000001
Credit Rating: B


In [2]:
import json

import pandas as pd

In [8]:
ibx = pd.read_excel("dataset/ibx_features_clean.xlsx", index_col=0, header=[0, 1])
jalsh = pd.read_excel("dataset/jalsh_features_clean.xlsx", index_col=0, header=[0, 1])
target = pd.read_excel("dataset/credit.xlsx", index_col=0)

In [11]:
dataset = ibx.join(jalsh)

In [15]:
    dataset[target.index]

SyntaxError: incomplete input (2628743332.py, line 1)

In [44]:
mappings = {
    "Aaa": ["IG1", "IG2", "IG3",],
    "Aa": [ "IG4", "IG5", "IG6"],
    "A": ["IG7", "IG8", "IG9", "IG10"],
    "Baa": ["HY1", "HY2"],
    "Ba": ["HY3", "HY4"],
    "B": ["HY5", "HY6"],
    "Caa": ["DS1", "DS2"],
    "Ca": ["DS3", "DS4"],
    "C": ["DS5"]
}

# Create a reverse mapping dictionary
reverse_mappings = {value: key for key, values in mappings.items() for value in values}

# Map the values in the 'rating' column using the reverse mapping dictionary
prob['rating'] = prob['rsk_bb_issuer_default'].map(lambda x: reverse_mappings.get(x, x))

In [45]:
prob

Unnamed: 0_level_0,rsk_bb_issuer_default,rating
security,Unnamed: 1_level_1,Unnamed: 2_level_1
BTI SJ Equity,IG3,Aaa
PAN SJ Equity,IG8,A
INP SJ Equity,IG7,A
NPN SJ Equity,IG10,A
VKE SJ Equity,IG4,Aa
...,...,...
FFA SJ Equity,IG2,Aaa
SPG SJ Equity,IG10,A
BVT SJ Equity,IG3,Aaa
BTN SJ Equity,IG10,A


In [53]:
credit_ratings = {
    "Aaa": 2.5,
    "Aa": 3.5,
    "A": 4.5,
    "Baa": 5.5,
    "Ba": 6.5,
    "B": 7.5,
    "Caa": 8.5,
    "Ca": 9.5,
    "C": 10.0,
}

# Convert credit ratings to their numeric values
prob['numeric_rating'] = prob['rating'].map(credit_ratings)

In [56]:
y = prob[["numeric_rating"]]

In [69]:
y = y.loc[metrics.columns]

In [73]:
X = metrics.copy()

In [76]:
X = X.loc[model_metrics]

In [86]:
X

Unnamed: 0,ABG SJ Equity,ADH SJ Equity,AEL SJ Equity,AFE SJ Equity,AFH SJ Equity,AFT SJ Equity,AGL SJ Equity,AIL SJ Equity,AIP SJ Equity,AMS SJ Equity,...,TFG SJ Equity,TGA SJ Equity,THA SJ Equity,TKG SJ Equity,TRU SJ Equity,TSG SJ Equity,VKE SJ Equity,VOD SJ Equity,WBO SJ Equity,WHL SJ Equity
oper_margin,28.202722,14.421976,5.841907,8.06148,19.331788,16.912217,18.42702,87.144999,16.014715,22.805459,...,12.665255,14.666294,4.164911,12.965402,19.765067,27.788494,68.366238,26.210947,4.335948,8.61018
tot_debt_to_tot_eqy,242.937113,38.061018,53.463897,47.45933,80.367497,19.268366,43.240614,0.0,18.092223,20.964517,...,59.437803,2.451614,115.038914,56.593172,21.861382,141.812906,73.289447,65.00494,9.188145,124.097944
tot_debt_to_ebitda,,1.329016,1.722145,1.734404,2.281107,0.607097,2.968238,,0.521484,0.779408,...,1.894263,0.279258,2.297316,1.430977,0.565602,2.552621,3.613919,0.793112,0.311257,1.584613
ebitda_to_tot_int_exp,,219.356343,18.225091,7.064499,7.725572,18.198105,8.820816,,25.1399,128.682471,...,9.788604,14.661929,10.029111,8.796523,1496.857073,7.036121,3.830182,13.355185,278.833027,16.34104
return_on_asset,1.233318,8.247937,7.653897,5.113338,0.099075,12.092518,5.62305,4.356048,11.363074,14.951768,...,10.459391,14.996192,4.690065,6.40137,21.802062,8.697806,5.667759,16.143841,5.572577,9.756574
asset_turnover,0.108163,1.11564,1.468364,1.175394,0.016101,1.037714,0.482276,0.063254,1.04675,0.798966,...,1.046913,0.992103,0.724942,0.724252,1.210102,0.573161,0.119474,1.076702,2.148323,1.897337


In [111]:
best.feature_importances_

array([0.08557339, 0.20166739, 0.15088866, 0.16237016, 0.23151656,
       0.16798386])

In [159]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from gplearn.genetic import SymbolicRegressor

In [99]:
X = X.T

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [169]:
model = DecisionTreeRegressor().fit(X_train, y_train)

In [171]:
model.score(X_train, y_train)

1.0

In [157]:
_ = setup(X.join(y), target="numeric_rating")

Unnamed: 0,Description,Value
0,Session id,2305
1,Target,numeric_rating
2,Target type,Regression
3,Original data shape,"(124, 7)"
4,Transformed data shape,"(124, 7)"
5,Transformed train set shape,"(86, 7)"
6,Transformed test set shape,"(38, 7)"
7,Numeric features,6
8,Rows with missing values,14.5%
9,Preprocess,True



The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.



In [158]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.7553,1.028,0.9783,-0.0332,0.2024,0.2183,0.07
rf,Random Forest Regressor,0.794,1.049,0.9973,-0.0727,0.207,0.2312,0.096
ada,AdaBoost Regressor,0.7965,1.0849,1.016,-0.1115,0.2123,0.2367,0.046
catboost,CatBoost Regressor,0.7837,1.107,1.0163,-0.1255,0.2111,0.2248,1.052
gbr,Gradient Boosting Regressor,0.7689,1.0687,1.0138,-0.1632,0.2096,0.2211,0.041
knn,K Neighbors Regressor,0.8342,1.1215,1.0329,-0.1747,0.2171,0.2432,0.026
lightgbm,Light Gradient Boosting Machine,0.8133,1.1295,1.0427,-0.1916,0.2173,0.2352,0.061
br,Bayesian Ridge,0.8893,1.203,1.061,-0.2154,0.2378,0.2645,0.015
dummy,Dummy Regressor,0.938,1.2651,1.0941,-0.2341,0.2331,0.2763,0.016
omp,Orthogonal Matching Pursuit,0.9313,1.2807,1.0987,-0.3095,0.235,0.2775,0.015



Styler.applymap has been deprecated. Use Styler.map instead.



In [141]:
best.feature_importances_

array([0.11391478, 0.22115645, 0.1555532 , 0.17799284, 0.18690308,
       0.14447966])

In [None]:
from pycaret.regression import get_config

In [138]:
best.score(X_test, y_test)

ValueError: Input X contains NaN.
ExtraTreesRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [139]:
best.score(X_train, y_train)

ValueError: Input X contains NaN.
ExtraTreesRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [148]:
create_api(best, "model")

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python model.py


In [149]:
create_docker("model_tree")

Writing requirements.txt
Writing Dockerfile
Dockerfile and requirements.txt successfully created.
    To build image you have to run --> !docker image build -f "Dockerfile" -t IMAGE_NAME:IMAGE_TAG .
            


In [152]:
from pycaret.regression import automl, tune_model, deploy_model, evaluate_model,

In [151]:
tune_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.8815,1.7062,1.3062,-0.0314,0.2513,0.2168
1,1.0241,1.7206,1.3117,-0.2905,0.2535,0.2305
2,0.7574,0.9256,0.9621,0.0628,0.2104,0.2409
3,0.2704,0.1341,0.3662,0.6982,0.0878,0.0889
4,0.6963,0.6424,0.8015,-0.3693,0.1925,0.2484
5,0.9963,2.0935,1.4469,0.0683,0.2548,0.2158
6,0.9312,1.3577,1.1652,-0.8102,0.2374,0.2117
7,0.5229,0.3709,0.609,0.5684,0.1274,0.1551
8,0.8646,0.8698,0.9326,0.3602,0.2048,0.2703
9,0.8438,0.7418,0.8613,0.0109,0.1868,0.2316


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [156]:
automl()


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

