In [4]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    OneHotEncoder, LabelEncoder, OrdinalEncoder,
    PolynomialFeatures
)
from sklearn.impute import SimpleImputer, KNNImputer

import scipy.stats as stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Data preparing and preprocessing

In [5]:
df = pd.read_csv('./data/Global_Development_Indicators_2000_2020.csv')

In [6]:
df.fillna(df.mean(numeric_only=True), inplace=True)

In [7]:
print(df.isna().sum())

year                                      0
country_code                              0
country_name                              0
region                                 1008
income_group                           1029
currency_unit                          1008
gdp_usd                                   0
population                                0
gdp_per_capita                            0
inflation_rate                            0
unemployment_rate                         0
fdi_pct_gdp                               0
co2_emissions_kt                          0
energy_use_per_capita                     0
renewable_energy_pct                      0
forest_area_pct                           0
electricity_access_pct                    0
life_expectancy                           0
child_mortality                           0
school_enrollment_secondary               0
health_expenditure_pct_gdp                0
hospital_beds_per_1000                    0
physicians_per_1000             

In [8]:
df = df.drop(columns=['region', 'income_group', 'currency_unit'])

In [9]:
print(df.isna().sum())

year                                   0
country_code                           0
country_name                           0
gdp_usd                                0
population                             0
gdp_per_capita                         0
inflation_rate                         0
unemployment_rate                      0
fdi_pct_gdp                            0
co2_emissions_kt                       0
energy_use_per_capita                  0
renewable_energy_pct                   0
forest_area_pct                        0
electricity_access_pct                 0
life_expectancy                        0
child_mortality                        0
school_enrollment_secondary            0
health_expenditure_pct_gdp             0
hospital_beds_per_1000                 0
physicians_per_1000                    0
internet_usage_pct                     0
mobile_subscriptions_per_100           0
calculated_gdp_per_capita              0
real_economic_growth_indicator         0
econ_opportunity

In [10]:
df_cleared = df.drop(columns=['years_since_2000', 'years_since_century', 'is_pandemic_period', 'human_development_composite', 'year', 'country_code', 'country_name', 'governance_quality_index', 'internet_usage_pct', 'mobile_subscriptions_per_100', 'education_health_ratio', 'global_development_resilience_index', 'co2_intensity_per_million_gdp'])

In [11]:
df_cleared.describe()

Unnamed: 0,gdp_usd,population,gdp_per_capita,inflation_rate,unemployment_rate,fdi_pct_gdp,co2_emissions_kt,energy_use_per_capita,renewable_energy_pct,forest_area_pct,...,green_transition_score,ecological_preservation_index,renewable_energy_efficiency,healthcare_capacity_index,digital_connectivity_index,health_development_ratio,human_development_index,climate_vulnerability_index,digital_readiness_score,global_resilience_score
count,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,...,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0,5556.0
mean,2008355000000.0,280341200.0,13795.817996,5.672921,7.931009,8.257269,1121548.0,2348.102738,30.136454,32.321885,...,30.136454,1.423242,6.451143,32.689495,4.028989,5.692028,0.655851,0.355641,0.370564,0.444183
std,7145199000000.0,883368600.0,21356.152812,15.885464,5.36619,54.873561,3235446.0,2029.383933,27.030396,22.835904,...,27.030396,1.592646,15.14588,67.500723,3.721305,6.33425,0.253182,0.098701,0.293003,0.260637
min,13196540.0,9392.0,111.927225,-18.10863,0.1,-1275.189986,0.0,9.54806,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.175371,-0.259046,0.046064,0.0,-2.748146
25%,6427532000.0,1362497.0,1523.496083,2.028945,4.47925,1.57523,4875.0,1219.826064,7.038275,12.731705,...,7.038275,0.04251,0.934828,0.0,0.385371,1.172355,0.507309,0.279549,0.094469,0.365905
50%,46872790000.0,9460952.0,5071.950964,4.108692,7.016,3.164215,63485.0,2348.102738,27.649244,31.242095,...,27.649244,1.049848,6.451143,0.0,3.151044,3.954014,0.655851,0.367475,0.332576,0.444183
75%,628072400000.0,59159110.0,15383.866888,5.672921,9.50925,7.301829,1121548.0,2348.102738,43.762501,46.377833,...,43.762501,2.136633,6.451143,32.689495,7.317161,5.692028,0.870724,0.436504,0.617325,0.532583
max,87568050000000.0,7761620000.0,189487.147128,557.201817,37.25,1709.765678,34041050.0,22120.430302,98.342903,98.33891,...,98.342903,8.682123,580.855155,521.652851,17.01063,41.280488,1.171897,0.5,1.570052,1.549244


In [12]:
X_reg = df_cleared.loc[:, df_cleared.columns != 'inflation_rate']
y_reg = df_cleared.loc[:, ['inflation_rate']]

In [13]:
from sklearn.model_selection import train_test_split
X_train_linreg, X_test_linreg, y_train_linreg, y_test_linreg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)

# Multicolinearity check

In [14]:
constant_features = [col for col in X_reg.columns if X_reg[col].nunique() == 1]
print("Constant features:", constant_features)

Constant features: []


In [15]:
from statsmodels.tools.tools import add_constant

X_reg_check = X_reg

vif_df = pd.DataFrame()
vif_df["feature"] = X_reg_check.columns
vif_df["VIF"] = [variance_inflation_factor(X_reg_check.values, i) for i in range(X_reg_check.shape[1])]

In [16]:
vif_df

Unnamed: 0,feature,VIF
0,gdp_usd,3.531146
1,population,3.981302
2,gdp_per_capita,2591.562
3,unemployment_rate,1.187858
4,fdi_pct_gdp,1.131844
5,co2_emissions_kt,5.352847
6,energy_use_per_capita,2.27481
7,renewable_energy_pct,167199400000.0
8,forest_area_pct,4.111555
9,electricity_access_pct,5.121818


In [19]:
multicolineared_columns = ['internet_usage_pct', 'mobile_subscriptions_per_100', 'education_health_ratio', 'global_development_resilience_index', 'co2_intensity_per_million_gdp']

# Linear Regresion

In [20]:
model_linreg = sm.OLS(y_train_linreg, X_train_linreg).fit()
print(model_linreg.summary())

                            OLS Regression Results                            
Dep. Variable:         inflation_rate   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     4.186
Date:                Mon, 19 May 2025   Prob (F-statistic):           2.80e-12
Time:                        15:45:48   Log-Likelihood:                -16150.
No. Observations:                3889   AIC:                         3.236e+04
Df Residuals:                    3861   BIC:                         3.253e+04
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
gdp_usd     

In [21]:
preds_new = model_linreg.predict(X_test_linreg)
print(preds_new)

4456     7.300157
4626     9.740071
2802     4.021171
230      6.912679
3872     5.747388
          ...    
3166     3.330062
5512     2.257291
1427     2.380475
3869     4.267966
5290    12.518856
Length: 1667, dtype: float64


In [22]:
r2 = model_linreg.rsquared
print('R^2:', r2)

R^2: 0.028438804428820075


In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [24]:
mae_linreg = mean_absolute_error(y_test_linreg, preds_new)
mse_linreg = mean_squared_error(y_test_linreg, preds_new)
rmse_linreg = mean_squared_error(y_test_linreg, preds_new)
r2_linreg = r2_score(y_test_linreg, preds_new)

print(f'RMSE: {rmse_linreg}')
print(f'R^2: {r2_linreg}')
print(f'MAE: {mae_linreg}')
print(f'MSE: {mse_linreg}')

RMSE: 261.6237722008474
R^2: 0.038026047739232904
MAE: 4.1611325886124755
MSE: 261.6237722008474


# Logistic Regression

In [25]:
df_cleared_class = df.drop(columns=['years_since_2000', 'years_since_century', 'human_development_composite', 'year', 'country_code', 'country_name', 'governance_quality_index', 'internet_usage_pct', 'mobile_subscriptions_per_100', 'education_health_ratio', 'global_development_resilience_index', 'co2_intensity_per_million_gdp'])

In [26]:
X_class = df_cleared_class.loc[:, df_cleared_class.columns != 'is_pandemic_period']
y_class = df_cleared_class.loc[:, ['is_pandemic_period']]

In [27]:
y_class = np.ravel(y_class)
y_reg = np.ravel(y_reg)

In [28]:
X_train_logreg, X_test_logreg, y_train_logreg, y_test_logreg = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

## Logistic regression with LBFGS, L2 regulirization, tolerance 0.001, Regularization factor - 1/C = 0.05 

\begin{aligned}
& \text{Input: Initial guess } w_0, \text{ parameters } m \\
& \text{Initialize: } H_0 = I \text{ (identity matrix)}, s_0 = \text{undefined}, y_0 = \text{undefined} \\
& \text{For } k = 1, 2, \dots \text{ until convergence:} \\
& \quad 1. \text{Compute the gradient: } g_k = \nabla f(w_{k-1}) \\
& \quad 2. \text{Compute the search direction: } p_k = -H_{k-1} g_k \\
& \quad 3. \text{Perform a line search to find a step size } \alpha_k > 0 \text{ such that } f(w_{k-1} + \alpha_k p_k) \text{ is sufficiently decreased.} \\
& \quad 4. \text{Update the parameters: } w_k = w_{k-1} + \alpha_k p_k \\
& \quad 5. \text{Compute } s_{k-1} = w_k - w_{k-1} = \alpha_k p_k \\
& \quad 6. \text{Compute } y_{k-1} = g_k - g_{k-1} \\
& \quad 7. \text{Update the inverse Hessian approximation } H_k \text{ using the L-BFGS update rule based on } (s_{k-1}, y_{k-1}) \text{ and potentially the } m \text{ most recent pairs.} \\
& \quad \quad \text{The two-loop recursion for updating } H_k \text{ efficiently is often used here.} \\
& \text{Output: } w_k \text{ (the approximate minimizer)}
\end{aligned}

In [None]:
model_logit = LogisticRegression(penalty='l2', tol=1e-4, C=20, solver='lbfgs')
model_logit.fit(X_train_logreg, y_train_logreg)

probs_logit = model_logit.predict_proba(X_test_logreg)[:, 1]
preds_logit = model_logit.predict(X_test_logreg)

print('Accuracy:', accuracy_score(y_test_logreg, preds_logit))
print('ROC AUC:', roc_auc_score(y_test_logreg, probs_logit))
print('Log Loss:', log_loss(y_test_logreg, probs_logit))

## Logistic regression with saga, elasticnet, tolerance 0.01, Regularization factor - 1/C = 0.05, l1_ration=0.7
Apparently in those kind of economical and high numerical dimension what really matters it is solver, how actually the gradient in built, is it counting all numerical features, or trying avoid some. We can actually see that Saga performing better than LBFGS, because of stochastic nature of SAGA and elasticnet regulirization factor. Moreover, SAGA saving information of previous step gradients, it is giving better performance of gradient how to understand the distribution and loss function antology. 
$$
\min_{w} \left[ f(w) = \frac{1}{n} \sum_{i=1}^{n} f_i(w) + \lambda R(w) \right]
$$

$$
w^{k+1} = w^k - \eta \left( \nabla f_j(w^k) - \alpha_j + \frac{1}{n} \sum_{i=1}^n \alpha_i + \lambda \nabla R(w^k) \right)
$$


In [None]:
model_logit = LogisticRegression(penalty='elasticnet', tol=1e-3, C=100, solver='saga', l1_ratio=0.7)
model_logit.fit(X_train_logreg, y_train_logreg)

probs = model_logit.predict_proba(X_test_logreg)[:, 1]
preds = model_logit.predict(X_test_logreg)

print('Accuracy:', accuracy_score(y_test_logreg, preds))
print('ROC AUC:', roc_auc_score(y_test_logreg, probs))
print('Log Loss:', log_loss(y_test_logreg, probs))

acc_logit = accuracy_score(y_test_logreg, preds)
roc_auc_score_logit = roc_auc_score(y_test_logreg, probs)
log_loss_logit = log_loss(y_test_logreg, probs)

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score

In [None]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_class, y_class, test_size=0.3, random_state=42)

X_train_tree_clf = X_train_clf
y_train_tree_clf = np.ravel(y_train_clf)

from sklearn.model_selection import cross_val_score

cv_scores_decisionTree_clf = []

max_depth_range = range(1, 26)

for i in max_depth_range:
    tree_clf = DecisionTreeClassifier(max_depth=i, random_state=42, criterion='entropy')
    scores = cross_val_score(tree_clf, X_train_tree_clf, y_train_tree_clf, cv=5, scoring='f1')
    cv_scores_decisionTree_clf.append(scores.mean())
    i+=5

best_maxdepth_tree_clf = max_depth_range[np.argmax(cv_scores_decisionTree_clf)]
print(f'Best k: {best_maxdepth_tree_clf} with f1 score: {max(cv_scores_decisionTree_clf):.4f}')

plt.plot(max_depth_range, cv_scores_decisionTree_clf, marker='o')
plt.xlabel('k')
plt.ylabel('CV f1_score')
plt.title('Decision Tree Classifier Cross-Validation f1_score')
plt.grid(True)
plt.show()

In [None]:
best_f1_score_tree_clf = max(cv_scores_decisionTree_clf)

# Decision Tree Regressor

In [None]:
X_train_tree_reg = X_train_linreg
y_train_tree_reg = np.ravel(y_train_linreg)

from sklearn.model_selection import cross_val_score

cv_scores_decisionTree_reg = []

max_depth_range = range(1, 26)

for i in max_depth_range:
    tree_reg = DecisionTreeRegressor(max_depth=i, random_state=42, criterion='absolute_error')
    scores = cross_val_score(tree_reg, X_train_tree_reg, y_train_tree_reg, cv=5, scoring='d2_absolute_error_score')
    cv_scores_decisionTree_reg.append(scores.mean())
    i+=5

best_maxdepth_tree_reg = max_depth_range[np.argmax(cv_scores_decisionTree_reg)]
print(f'Best k: {best_maxdepth_tree_reg} with d2_absolute_error score: {max(cv_scores_decisionTree_reg):.4f}')

plt.plot(max_depth_range, cv_scores_decisionTree_reg, marker='o')
plt.xlabel('k')
plt.ylabel('CV absolute_error')
plt.title('Decision Tree Regressor Cross-Validation absolute error')
plt.grid(True)
plt.show()


In [None]:
best_d2_mae_dtree_reg = max(cv_scores_decisionTree_reg)
print(best_d2_mae_dtree_reg)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

y_train_clf = np.ravel(y_train_clf)

forest_clf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42, criterion='entropy')
forest_clf.fit(X_train_clf, y_train_clf)
y_pred_forest_clf = forest_clf.predict(X_test_clf)

print('Accuracy:', accuracy_score(y_test_clf, y_pred_forest_clf) )

acc_forest_clf = accuracy_score(y_test_clf, y_pred_forest_clf)

In [None]:
y_train_tree_reg = np.ravel(y_train_tree_reg)

reg = RandomForestRegressor(n_estimators=5, max_depth=5, random_state=42, criterion='absolute_error')
reg.fit(X_train_tree_reg, y_train_tree_reg)
y_pred_forest_reg = reg.predict(X_test_linreg)

print('MAE:', mean_absolute_error(y_test_linreg, y_pred_forest_reg))

mae_random_reg = mean_absolute_error(y_test_linreg, y_pred_forest_reg)

# Naive Bayess Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

X_NB = X_class
y_NB = np.ravel(y_class)

X_train_NB, X_test_NB, y_train_NB, y_test_NB = train_test_split(X_NB, y_NB, random_state=42, test_size=0.3)


NB = GaussianNB()
NB.fit(X_train_NB, y_train_NB)
y_pred_NB = NB.predict(X_test_NB)

print('Accuracy:', accuracy_score(y_test_NB, y_pred_NB))

acc_nb = accuracy_score(y_test_NB, y_pred_NB)

## Results of Naive Bayes Classifier showing that still the dataset having really good composition and distribution, extremely predictable, cleared from colinearity, having thorough feature clearing, and numerical nature. Nevertheless, we can assume that this high numerical dependence giving such a low sensivity for events and probability distribution for this type of dataset. During the whole analysis we can mark up that high level of numerical nature of dataset giving outstanding results for region classifying and regression, but in a case of probability classifying it gives gradually lower results of Accuracy.

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_score

k_range = [i for i in range(1, 22) if i % 2 != 0]
cv_scores_knn_clf = []

X_knn_clf = X_class
y_knn_clf = np.ravel(y_class)


for k in k_range:
    kNN_clf = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
    scores = cross_val_score(kNN_clf, X_knn_clf, y_knn_clf, cv=5, scoring='accuracy')
    cv_scores_knn_clf.append(scores.mean())

best_k_clf = k_range[np.argmax(cv_scores_knn_clf)]
print(f'Best k: {best_k_clf} with accuracy: {max(cv_scores_knn_clf):.4f}')

plt.plot(k_range, cv_scores_knn_clf, marker='o')
plt.xlabel('k')
plt.ylabel('CV Accuracy')
plt.title('kNN_clf Cross-Validation Accuracy')
plt.grid(True)
plt.show()
print(cv_scores_knn_clf)

In [None]:
best_acc_knn_clf = max(cv_scores_knn_clf)

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import matplotlib.pyplot as plt

k_range = [i for i in range(1, 22) if i%2!=0]
cv_scores_knn_reg = []

X_knn_reg = X_reg
y_knn_reg = y_reg


for k in k_range:
    kNN_reg = KNeighborsRegressor(n_neighbors=k, metric='minkowski')
    scores = cross_val_score(kNN_reg, X_knn_reg, y_knn_reg, cv=5, scoring='neg_mean_absolute_error')
    cv_scores_knn_reg.append(scores.mean())

best_k_reg = k_range[np.argmax(cv_scores_knn_reg)]
best_neg_mae_knn_reg = max(cv_scores_knn_reg)
best_mae_knnreg = -best_neg_mae_knn_reg 

print(f'Best k (based on MAE): {best_k_reg} with MAE: {best_mae_knnreg:.4f}')

plt.plot(k_range, [-score for score in cv_scores_knn_reg], marker='o') # Plot the actual MAE values
plt.xlabel('k')
plt.ylabel('CV MAE')
plt.title('kNN_reg Cross-Validation MAE')
plt.grid(True)
plt.show()

cv_scores_knn_reg_cleared = [-score for score in cv_scores_knn_reg]

# SVC

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto']
}

X_svc = X_class
y_svc = np.ravel(y_class)

X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(X_svc, y_svc, test_size=0.3, random_state=42)

grid = GridSearchCV(SVC(class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train_svc, y_train_svc)
best_model = grid.best_estimator_

best_model.fit(X_train_svc, y_train_svc)
y_preds_svc = best_model.predict(X_test_svc)

print('Accuracy:', accuracy_score(y_preds_svc, y_test_svc))
print('F1 score:', f1_score(y_preds_svc, y_test_svc))

acc_svc = accuracy_score(y_preds_svc, y_test_svc)
f1_svc = f1_score(y_preds_svc, y_test_svc)

## Overall we can see that actually making predictions and trying to model the pandemic period it is actually wrong from the view of statistical distribution, because model not trying to predict the pandemic period, it is just choosing certain region where exactly it should be with certain economical, social and political features. Definitely, model not relying on year, country, or others. It is just a modeling of economical features which could marking up the situation of pandemic crisis. Nevertheless, it is better not to model the region of features where you having same class. 

In [None]:
import json

# ===== Regression ===== #
# Linear regression 
## mae_linreg
## mse_linreg
## rmse_linreg
## r2_linreg

# Decision tree regressor
## best_maxdepth_tree_reg
## best_d2_mae_dtree_reg
## cv_scores_decisionTree_reg

# Random forest regressor
## mae_random_reg

# kNN regressor
## cv_scores_knn_reg_cleared
## best_k_reg
## best_mae_knnreg

# ===== Classification ===== #
# Logistic regression
## acc_logit
## roc_auc_score_logit
## log_loss_logit

# Decision tree classifier
## best_f1_score_tree_clf
## best_maxdepth_tree_clf
## cv_scores_decisionTree_clf

# Random forest classifier
## acc_forest_clf

# Naive Bayes classifier
## acc_nb

# kNN classifier
## best_acc_knn_clf
## cv_scores_knn_clf
## best_k_clf

# SVC
## acc_svc
## f1_svc

In [None]:
import json

def export_model_results(
        # Регрессия — linear
        linear_r2, linear_rmse, linear_mae, linear_mse,

        # Регрессия — dtree
        dtree_d2mae, dtree_cv_d2mae, dtree_best_maxdepth,

        # Регрессия — random forest
        rf_reg_mae,

        # Регрессия — knn
        knn_reg_mae, knn_reg_cv_mae, knn_reg_best_k,

        # Классификация — logistic
        log_accuracy, log_roc_auc, log_loss,

        # Классификация — dtree
        dtree_clf_f1, dtree_clf_cv_f1, dtree_clf_best_maxdepth,

        # Классификация — random forest
        rf_clf_accuracy,

        # Классификация — naive Bayes
        naiveb_accuracy,

        # Классификация — knn
        knn_clf_accuracy, knn_clf_cv_accuracy, knn_clf_best_k,

        # Классификация — SVC
        svc_accuracy, svc_f1,

        # Имя файла (по умолчанию)
        filename="supervised.json"
):
    results = {
        "regression": {
            "linear": {
                "r2": linear_r2,
                "rmse": linear_rmse,
                "mae": linear_mae,
                "mse": linear_mse
            },
            "dtree_reg": {
                "d2mae": dtree_d2mae,
                "cv_results": {
                    "d2mae_results": dtree_cv_d2mae,
                    "best_maxdepth": dtree_best_maxdepth
                }
            },
            "randomforest_reg": {
                "mae": rf_reg_mae
            },
            "knn": {
                "mae": knn_reg_mae,
                "cv_results": {
                    "mae_results": knn_reg_cv_mae,
                    "best_k": knn_reg_best_k
                }
            }
        },
        "classification": {
            "logistic": {
                "accuracy": log_accuracy,
                "roc_auc": log_roc_auc,
                "log_loss": log_loss
            },
            "dtree_clf": {
                "accuracy": dtree_clf_f1,
                "cv_results": {
                    "accuracy_results": dtree_clf_cv_f1,
                    "best_maxdepth": dtree_clf_best_maxdepth
                }
            },
            "randomforest_clf": {
                "accuracy": rf_clf_accuracy
            },
            "naiveB": {
                "accuracy": naiveb_accuracy
            },
            "knn": {
                "accuracy": knn_clf_accuracy,
                "cv_results": {
                    "accuracy_results": knn_clf_cv_accuracy,
                    "best_k": knn_clf_best_k
                }
            },
            "svc": {
                "accuracy": svc_accuracy,
                "f1_score": svc_f1
            }
        }
    }

    with open(filename, "w") as f:
        json.dump(results, f, indent=2)

    print(f"[INFO] Results exported to {filename}")

In [None]:
export_model_results(
    linear_r2=r2_linreg, linear_rmse=rmse_linreg, linear_mae=mae_linreg, linear_mse=mse_linreg,

    dtree_d2mae=best_d2_mae_dtree_reg, dtree_cv_d2mae=cv_scores_decisionTree_reg, dtree_best_maxdepth=best_maxdepth_tree_reg,

    rf_reg_mae=mae_random_reg,

    knn_reg_mae=best_mae_knnreg, knn_reg_cv_mae=cv_scores_knn_reg_cleared, knn_reg_best_k=best_k_reg,

    log_accuracy=acc_logit, log_roc_auc=roc_auc_score_logit, log_loss=log_loss_logit,

    dtree_clf_f1=best_f1_score_tree_clf, dtree_clf_cv_f1=cv_scores_decisionTree_clf, dtree_clf_best_maxdepth=best_maxdepth_tree_clf,

    rf_clf_accuracy=acc_forest_clf,

    naiveb_accuracy=acc_nb,

    knn_clf_accuracy=best_acc_knn_clf, knn_clf_cv_accuracy=cv_scores_knn_clf, knn_clf_best_k=best_k_clf,

    svc_accuracy=acc_svc, svc_f1=f1_svc,

    filename="results/supervised.json"
)

# Saving preprocessed data and VIF table

In [35]:
def save_csv(df_to_save, name):
    df_to_save.to_csv(f"../resources/{name}.csv", index=False)
    print(f"[INFO] DataFrame сохранён в resources/{name}.csv")

In [36]:
save_csv(df_cleared_class, "preprocessed")
save_csv(vif_df, 'vif')

[INFO] DataFrame сохранён в resources/preprocessed.csv
[INFO] DataFrame сохранён в resources/vif.csv
