# **Predictive Default Risk Assessor V.01**

# TODO

* Base model 
* Comparison
* Specialised
* For small entities - Examples?
* Backtest - All sectors 
* Understanding the model across all sectors/industries
* Any markets - consumer goods, industries
* UI last step after backtesting

In [1]:
model_inputs = {
    "profitability": {
        "class_weight": 0.30,
        "weights": [1.0], 
        "metrics": {
            "oper_margin": {
                "lower_is_better": False,
                "thresholds": [
                    (40, float("inf")),
                    (35, 39),
                    (30, 34),
                    (25, 29),
                    (20, 24),
                    (15, 19),
                    (10, 14),
                    (5, 9),
                    (float("-inf"), 0)
                ],
            }
        },
    },
    "leverage_coverage": {
        "class_weight": 0.55,
        "weights": [0.4, 0.3, 0.3],
        "metrics": {
            "tot_debt_to_tot_eqy": {
                "lower_is_better": True,
                "thresholds": [
                     (float("-inf"), 2.0),
                     (2.0, 16.0),
                     (16.0, 24.0),
                     (24.0, 33.0),
                     (33.0, 43.0),
                     (43.0, 54.0),
                     (54.0, 68.0),
                     (68.0, 94.0),
                     (94.0, float("inf")),
                ],
            },
            "tot_debt_to_ebitda": {
                "lower_is_better": True,
                "thresholds": [
                    (float("-inf"), 0.09),
                    (0.09, 0.49),
                    (0.49, 0.9),
                    (0.9, 1.36),
                    (1.36, 1.68),
                    (1.68, 2.26),
                    (2.26, 3.27),
                    (3.27, 4.4),
                    (4.4, float("inf")),
                ],
            },
            "ebitda_to_tot_int_exp": {
                "lower_is_better": False,
                "thresholds": [
                    (25, float("inf")),
                    (20, 25),
                    (15, 20),
                    (10, 15),
                    (5, 10),
                    (3, 5),
                    (1, 3),
                    (0, 1),
                    (float("-inf"), 0),
                ],
            },
        },
    },
    "efficiency": {
        "class_weight": 0.15,
        "weights": [0.5, 0.5],
        "metrics": {
            "return_on_asset": {
                "lower_is_better": False,
                "thresholds": [
                    (0.15, float("inf")),
                    (0.10, 0.15),
                    (0.08, 0.10),
                    (0.06, 0.08),
                    (0.04, 0.06),
                    (0.02, 0.04),
                    (0.00, 0.02),
                    (-0.02, 0.00),
                    (float("-inf"), -0.02)
                ],
            },
            "asset_turnover": {
                "lower_is_better": False,
                "thresholds": [
                    (4.0, float("inf")),
                    (3.0, 4.0),
                    (2.0, 3.0),
                    (1.5, 2.0),
                    (1.0, 1.5),
                    (0.75, 1.0),
                    (0.5, 0.75),
                    (0.25, 0.5),
                    (float("-inf"), 0.25)
                ],
            },
        },
    },
}

In [2]:
import pandas as pd
import numpy as np

In [3]:
class CreditRatingCalculator:
    def __init__(self, metrics):
        self.metrics = metrics
        
    def _calculate_metric_score(self, metric, thresholds, inverse):
        for score, (lower, upper) in enumerate(thresholds, start=1):
            if (inverse and metric <= upper) or (not inverse and metric >= lower):
                return score
        return len(thresholds) // 2 # else return the middle score

    def _calculate_category_score(self, category_metrics, ratios):
        total_weighted_score = 0

        for metric, weight in zip(
            category_metrics["metrics"].items(), category_metrics["weights"]
        ):
            metric_name, metric_data = metric
            value = ratios[metric_name]
            score = self._calculate_metric_score(
                value, metric_data["thresholds"], metric_data["lower_is_better"]
            )
            total_weighted_score += score * weight

        return total_weighted_score

    def _calculate_scores(self, ratios):
        scores = {}
        for category, category_data in self.metrics.items():
            category_score = self._calculate_category_score(category_data, ratios)
            scores[category] = category_score
        return scores

    def _calculate_weighted_score(self, scores):
        weights = {
            category: category_data["class_weight"]
            for category, category_data in self.metrics.items()
        }
        return sum(scores[category] * weight for category, weight in weights.items())

    def _determine_credit_rating(self, weighted_score):
        # credit_ratings = [
        #     (1.5, "Aaa"),
        #     (2.5, "Aa"),
        #     (3.5, "A"),
        #     (4.5, "Baa"),
        #     (5.5, "Ba"),
        #     (6.5, "B"),
        #     (7.5, "Caa"),
        #     (8.5, "Ca"),
        #     (float("inf"), "C"),
        # ]
        
        credit_ratings = [
            (2.5, "Aaa"),
            (3.5, "Aa"),
            (4.5, "A"),
            (5.5, "Baa"),
            (6.5, "Ba"),
            (7.5, "B"),
            (8.5, "Caa"),
            (9.5, "Ca"),
            (float("inf"), "C"),
        ]

        for threshold, rating in credit_ratings:
            if weighted_score < threshold:
                return rating

    def calculate_credit_rating(self, ratios):
        self.scores = self._calculate_scores(ratios)
        self.credit_score = self._calculate_weighted_score(self.scores)
        self.credit_rating = self._determine_credit_rating(self.credit_score)

In [4]:
# df = pd.read_csv("research/JALSH Index_dataset_2000_2024_clean.csv", index_col=0, header=[0, 1])
# classfier = pd.read_excel("research/classification_data.xlsx", index_col=0)
metrics = pd.read_excel("research/metrics_full.xlsx", index_col=0)

In [5]:
company = "SAP SJ Equity"

In [6]:
model_metrics = [
    "oper_margin", 
    "tot_debt_to_tot_eqy", 
    "tot_debt_to_ebitda", 
    "ebitda_to_tot_int_exp", 
    "return_on_asset",
    "asset_turnover",
]

ratios = metrics[company].loc[model_metrics].to_dict()

In [7]:
model = CreditRatingCalculator(model_inputs)
model.calculate_credit_rating(ratios)
print(f"Model Inputs:")
display(ratios)
print("")
print(f"Class Scoring: {model.scores}")
print(f"Credit Score: {model.credit_score}")
print(f"Credit Rating: {model.credit_rating}")

Model Inputs:


{'oper_margin': 5.486962000997409,
 'tot_debt_to_tot_eqy': 139.3885721854559,
 'tot_debt_to_ebitda': 3.807108391400997,
 'ebitda_to_tot_int_exp': 5.233890852664169,
 'return_on_asset': 2.051451925429643,
 'asset_turnover': 0.9411697646046825}


Class Scoring: {'profitability': 8.0, 'leverage_coverage': 7.5, 'efficiency': 3.5}
Credit Score: 7.050000000000001
Credit Rating: B


In [2]:
import json

import pandas as pd

In [8]:
ibx = pd.read_excel("dataset/ibx_features_clean.xlsx", index_col=0, header=[0, 1])
jalsh = pd.read_excel("dataset/jalsh_features_clean.xlsx", index_col=0, header=[0, 1])
target = pd.read_excel("dataset/credit.xlsx", index_col=0)

In [17]:
dataset = ibx.join(jalsh)
features = dataset[target.index]

In [159]:
featuress = [
"OPER_MARGIN", 
"TOT_DEBT_TO_TOT_EQY", 
"TOT_DEBT_TO_EBITDA", 
"EBITDA_TO_TOT_INT_EXP", 
"RETURN_ON_ASSET", 
"ASSET_TURNOVER",
"TOT_DEBT_TO_TOT_ASSET",
"CFO_TO_TOT_DEBT",
"EBITDA_TO_REVENUE",
"FCF_TO_TOTAL_DEBT",
]

In [191]:
jse_stocks = [
'AGL SJ Equity', 'ANG SJ Equity', 'ANH SJ Equity', 'BAW SJ Equity',
'BHG SJ Equity', 'BTI SJ Equity', 'BVT SJ Equity', 'CFR SJ Equity',
'DSY SJ Equity', 'FFB SJ Equity', 'FSR SJ Equity', 'GFI SJ Equity',
'GLN SJ Equity', 'GRT SJ Equity', 'HMN SJ Equity', 'MNP SJ Equity',
'MSP SJ Equity', 'MTN SJ Equity', 'NRP SJ Equity', 'PPH SJ Equity',
'PRX SJ Equity', 'RDF SJ Equity', 'S32 SJ Equity', 'SAP SJ Equity',
'SNT SJ Equity', 'SOL SJ Equity', 'SSW SJ Equity', 'TKG SJ Equity',]

In [161]:
df = features.copy()

In [162]:
filtered_df = df.loc[:, (slice(None), featuress)]
sorted_df = df.sort_index(axis=1, level=0)

In [165]:
s = sorted_df.ffill()

In [168]:
f = s.iloc[[-1]]

In [205]:
XX = dd.T.loc[featuress].dropna(axis=1).T

In [216]:
XX = dd.T.loc[featuress].fillna(0).T

In [33]:
f = features.ffill().iloc[[-1]]

In [169]:
dd = pd.concat([f[col] for col in target.index], axis=0)

In [170]:
dd.index = target.index

In [175]:
dd.dropna(axis=1)[featuress]

KeyError: "['TOT_DEBT_TO_EBITDA', 'EBITDA_TO_TOT_INT_EXP', 'CFO_TO_TOT_DEBT', 'EBITDA_TO_REVENUE', 'FCF_TO_TOTAL_DEBT'] not in index"

In [49]:
dd.columns.name = None

In [55]:
target.to_excel("dataset/target.xlsx")

In [56]:
dd.to_excel("dataset/features.xlsx")

In [59]:
target.head()

Unnamed: 0,credit_rating
AGL SJ Equity,Baa2
ANG SJ Equity,Baa3
ANH SJ Equity,A3
BAW SJ Equity,Ba2
BHG SJ Equity,A1


In [66]:
# Create a reverse mapping dictionary
reverse_mappings = {value: key for key, values in mappings.items() for value in values}

# Map the values in the 'rating' column using the reverse mapping dictionary
target['rating'] = target['credit_rating'].map(lambda x: reverse_mappings.get(x, x))

In [67]:
target

Unnamed: 0,credit_rating,rating
AGL SJ Equity,Baa2,Baa
ANG SJ Equity,Baa3,Baa
ANH SJ Equity,A3,A
BAW SJ Equity,Ba2,Ba
BHG SJ Equity,A1,A
...,...,...
UGPA3 BS Equity,Ba1,Ba
USIM5 BS Equity,Ba2,Ba
VALE3 BS Equity,Baa3,Baa
VAMO3 BS Equity,BB-,Ba


In [68]:
credit_ratings = {
    "Aaa": 2.5,
    "Aa": 3.5,
    "A": 4.5,
    "Baa": 5.5,
    "Ba": 6.5,
    "B": 7.5,
    "Caa": 8.5,
    "Ca": 9.5,
    "C": 10.0,
}

# Convert credit ratings to their numeric values
target['numeric_rating'] = target['rating'].map(credit_ratings)

In [141]:
target.to_excel("dataset/target.xlsx")

In [71]:
y = target[["numeric_rating"]]

In [82]:
X = dd.copy()

In [115]:
X = X.dropna(how='all')

In [119]:
X = X.dropna(axis=1)

In [127]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from gplearn.genetic import SymbolicRegressor

from pycaret.regression import *

In [210]:
model = DecisionTreeRegressor().fit(X_train, y_train)

In [130]:
model.score(X_train, y_train)

1.0

In [132]:
best.score(X_test, y_test)

0.6351896103896102

In [134]:
best.

('criterion',
 'max_depth',
 'min_samples_split',
 'min_samples_leaf',
 'min_weight_fraction_leaf',
 'max_features',
 'max_leaf_nodes',
 'min_impurity_decrease',
 'random_state',
 'ccp_alpha',
 'monotonic_cst')

In [221]:
_ = setup(XX.join(y), target="numeric_rating", preprocess=True, fold=10)

Unnamed: 0,Description,Value
0,Session id,6065
1,Target,numeric_rating
2,Target type,Regression
3,Original data shape,"(70, 11)"
4,Transformed data shape,"(70, 11)"
5,Transformed train set shape,"(49, 11)"
6,Transformed test set shape,"(21, 11)"
7,Numeric features,10
8,Preprocess,True
9,Imputation type,simple




In [222]:
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dummy,Dummy Regressor,0.5745,0.5315,0.6771,-0.0902,0.0973,0.0985,0.016
omp,Orthogonal Matching Pursuit,0.5772,0.5315,0.6791,-0.1082,0.0974,0.0987,0.015
br,Bayesian Ridge,0.5978,0.5753,0.7067,-0.2118,0.1012,0.1022,0.016
lasso,Lasso Regression,0.5948,0.587,0.7126,-0.2477,0.1019,0.1014,0.016
llar,Lasso Least Angle Regression,0.5948,0.587,0.7126,-0.2477,0.1019,0.1014,0.017
rf,Random Forest Regressor,0.5919,0.5577,0.7024,-0.2664,0.1005,0.1007,0.1
en,Elastic Net,0.5865,0.5841,0.7147,-0.2696,0.102,0.1001,0.015
ada,AdaBoost Regressor,0.5942,0.5499,0.7087,-0.3358,0.1019,0.1009,0.043
catboost,CatBoost Regressor,0.6117,0.5759,0.722,-0.3812,0.1036,0.104,0.876
lightgbm,Light Gradient Boosting Machine,0.5701,0.5801,0.7286,-0.393,0.1042,0.0977,0.053


  .applymap(highlight_cols, subset=["TT (Sec)"])


In [226]:
best.score(_.X_test, _.y_test)

-0.15497418957328013

In [213]:
pd.DataFrame(best.feature_importances_, best.feature_names_in_).sort_values(0)

Unnamed: 0,0
OPER_MARGIN,0.032521
FCF_TO_TOTAL_DEBT,0.040124
RETURN_ON_ASSET,0.053191
TOT_DEBT_TO_TOT_EQY,0.05482
CFO_TO_TOT_DEBT,0.06494
ASSET_TURNOVER,0.065514
EBITDA_TO_REVENUE,0.067115
TOT_DEBT_TO_EBITDA,0.074195
EBITDA_TO_TOT_INT_EXP,0.235654
TOT_DEBT_TO_TOT_ASSET,0.311926


In [None]:
{'oper_margin': 5.486962000997409,
 'tot_debt_to_tot_eqy': 139.3885721854559,
 'tot_debt_to_ebitda': 3.807108391400997,
 'ebitda_to_tot_int_exp': 5.233890852664169,
 'return_on_asset': 2.051451925429643,
 'asset_turnover': 0.9411697646046825}

In [142]:
tune_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.435,0.3672,0.6059,0.1045,0.0934,0.08
1,0.4053,0.2055,0.4533,0.1438,0.0655,0.0695
2,0.5277,0.3637,0.6031,-0.0103,0.0826,0.0846
3,0.4401,0.341,0.5839,0.1684,0.0901,0.0801
4,0.3435,0.1535,0.3918,0.1118,0.056,0.0571
Mean,0.4303,0.2862,0.5276,0.1036,0.0775,0.0743
Std,0.0596,0.0891,0.0883,0.0614,0.0144,0.0099


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [156]:
automl()


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [143]:
best