In [10]:
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [98]:
# Load the processed dataset that the rest of the notebook works from.
final_df = pd.read_csv(
    r"D:\Canadian Business Analysis Project\data\final_processed_data.csv"
)

# Binary target: a row is "high risk" (1) when its insolvency count lies
# strictly above the 40th percentile of the column, otherwise 0.
threshold = final_df["insolvencies"].quantile(0.4)
final_df["high_risk"] = final_df["insolvencies"].gt(threshold).astype(int)

# Report the cut-off and the resulting class balance.
print("Threshold:", threshold)
print(final_df["high_risk"].value_counts())

Threshold: 86.0
high_risk
1    36
0    24
Name: count, dtype: int64


In [99]:
# Model inputs. CPI_yoy_growth and CPI_volatility_4q were dropped because they
# had no correlation with the output.
FEATURES = [
    "closures_lag1",
    "gdp", "gdp_qoq_growth", "gdp_volatility_4q",
    "cpi", "cpi_qoq_growth",
    "sector_code",
]

# Feature matrix (columns in FEATURES order) and the binary label vector.
X = final_df.loc[:, FEATURES]
y = final_df["high_risk"]



In [100]:
# Hold out 30% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [101]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed makes repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

# Search space: 2 x 2 x 2 x 2 x 1 = 16 candidate settings.
param_grid = dict(
    n_estimators=[200, 400],
    max_depth=[None, 5],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=["balanced"],
)

# Exhaustive 5-fold cross-validated search over param_grid, ranked by recall,
# using every available core.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
    verbose=1,
)



In [102]:
# Run the cross-validated search on the training split only.
grid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'class_weight': ['balanced'], 'max_depth': [None, 5], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'recall'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [103]:
# Keep the winning estimator from the grid search and report the mean
# cross-validated recall it achieved.
best_rf = grid.best_estimator_
print("Best CV recall(1): ", grid.best_score_)

Best CV recall(1):  0.8800000000000001


In [104]:
from sklearn.metrics import confusion_matrix, classification_report
# Hard class predictions for the held-out rows.
preds = best_rf.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print("Confusion Matrix:", confusion_matrix(y_true=y_test, y_pred=preds))

# Per-class precision / recall / F1 for the same predictions.
print(
    "Classification Report:",
    classification_report(y_true=y_test, y_pred=preds),
    sep="\n",
)


Confusion Matrix: [[ 4  3]
 [ 1 10]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.57      0.67         7
           1       0.77      0.91      0.83        11

    accuracy                           0.78        18
   macro avg       0.78      0.74      0.75        18
weighted avg       0.78      0.78      0.77        18



In [106]:
from sklearn.metrics import roc_auc_score

# Second column of predict_proba for each test row, summarised as the area
# under the ROC curve.
probs = best_rf.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_true=y_test, y_score=probs))

ROC AUC: 0.8051948051948052


In [107]:
# Feature importances of the fitted forest, labelled by feature name and
# ordered from most to least important.
importance = pd.Series(data=best_rf.feature_importances_, index=FEATURES)
importance = importance.sort_values(ascending=False)

print("\nFeature Importance (RF)", importance, sep="\n")



Feature Importance (RF)
sector_code          0.264126
gdp_qoq_growth       0.241560
gdp                  0.222975
gdp_volatility_4q    0.149316
closures_lag1        0.066445
cpi                  0.030408
cpi_qoq_growth       0.025168
dtype: float64


In [108]:
# Positive-class probability for every row of the dataset.
# NOTE(review): best_rf was fitted on X_train, which is drawn from these same
# rows, so the scores for training rows are in-sample.
final_df["risk_prob"] = best_rf.predict_proba(final_df[FEATURES])[:, 1]

# Decision cut-off: probabilities at or above it are flagged as risky (1).
THRESHOLD = 0.4
final_df["predicted_risk"] = final_df["risk_prob"].ge(THRESHOLD).astype(int)

In [110]:
# One row per (sector_code, sector): mean predicted probability, share of rows
# flagged as risky, and mean insolvency count, with the riskiest sector first.
sector_risk = (
    final_df
    .groupby(["sector_code", "sector"])
    .agg(
        avg_risk_prob=pd.NamedAgg(column="risk_prob", aggfunc="mean"),
        high_risk_rate=pd.NamedAgg(column="predicted_risk", aggfunc="mean"),
        avg_insolvencies=pd.NamedAgg(column="insolvencies", aggfunc="mean"),
    )
    .reset_index()
    .sort_values(by="avg_risk_prob", ascending=False)
)

In [111]:
# Bucket each sector's mean risk probability into a named band:
#   [0, 0.4]   -> Low Risk
#   (0.4, 0.7] -> Medium Risk
#   (0.7, 1.0] -> High Risk
# include_lowest=True closes the first bin on the left. Without it the first
# bin is (0, 0.4], so a sector whose mean probability is exactly 0.0 falls
# outside every bin and is labelled NaN instead of "Low Risk".
sector_risk["risk_label"] = pd.cut(
    sector_risk["avg_risk_prob"],
    bins=[0, 0.4, 0.7, 1.0],
    labels=["Low Risk", "Medium Risk", "High Risk"],
    include_lowest=True,
)

In [115]:
# Display the per-sector risk summary as this cell's output.
sector_risk

Unnamed: 0,sector_code,sector,avg_risk_prob,high_risk_rate,avg_insolvencies,risk_label
4,31,Manufacturing,0.951667,1.0,290.0,High Risk
3,23,Construction,0.913333,1.0,498.333333,High Risk
10,53,Real Estate and Rental and Leasing,0.901667,1.0,123.0,High Risk
18,81,Other Services (except Public Administration),0.88,1.0,260.666667,High Risk
5,41,Wholesale Trade,0.876667,1.0,186.0,High Risk
7,48,Transportation and Warehousing,0.871667,1.0,302.333333,High Risk
17,72,Accommodation and Food Services,0.793333,1.0,574.333333,High Risk
11,54,"Professional, Scientific and Technical Services",0.755,1.0,275.666667,High Risk
6,44,Retail Trade,0.74,1.0,439.333333,High Risk
13,56,"Administrative and Support, Waste Management a...",0.721667,0.666667,186.666667,High Risk


In [117]:
# Write the per-sector summary table to disk, leaving out the index column.
sector_risk.to_csv(
    r'D:\Canadian Business Analysis Project\data\final_trained_data.csv',
    index=False,
)