In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

variables_dict = {
    0: "bankruptcy",
    1: "net profit / total assets",
    2: "total liabilities / total assets",
    3: "working capital / total assets",
    4: "current assets / short-term liabilities",
    5: "[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365",
    6: "retained earnings / total assets",
    7: "EBIT / total assets",
    8: "book value of equity / total liabilities",
    9: "sales / total assets",
    10: "equity / total assets",
    11: "(gross profit + extraordinary items + financial expenses) / total assets",
    12: "gross profit / short-term liabilities",
    13: "(gross profit + depreciation) / sales",
    14: "(gross profit + interest) / total assets",
    15: "(total liabilities * 365) / (gross profit + depreciation)",
    16: "(gross profit + depreciation) / total liabilities",
    17: "total assets / total liabilities",
    18: "gross profit / total assets",
    19: "gross profit / sales",
    20: "(inventory * 365) / sales",
    21: "sales (n) / sales (n-1)",
    22: "profit on operating activities / total assets",
    23: "net profit / sales",
    24: "gross profit (in 3 years) / total assets",
    25: "(equity - share capital) / total assets",
    26: "(net profit + depreciation) / total liabilities",
    27: "profit on operating activities / financial expenses",
    28: "working capital / fixed assets",
    29: "logarithm of total assets",
    30: "(total liabilities - cash) / sales",
    31: "(gross profit + interest) / sales",
    32: "(current liabilities * 365) / cost of products sold",
    33: "operating expenses / short-term liabilities",
    34: "operating expenses / total liabilities",
    35: "profit on sales / total assets",
    36: "total sales / total assets",
    37: "(current assets - inventories) / long-term liabilities",
    38: "constant capital / total assets",
    39: "profit on sales / sales",
    40: "(current assets - inventory - receivables) / short-term liabilities",
    41: "total liabilities / ((profit on operating activities + depreciation) * (12/365))",
    42: "profit on operating activities / sales",
    43: "rotation receivables + inventory turnover in days",
    44: "(receivables * 365) / sales",
    45: "net profit / inventory",
    46: "(current assets - inventory) / short-term liabilities",
    47: "(inventory * 365) / cost of products sold",
    48: "EBITDA (profit on operating activities - depreciation) / total assets",
    49: "EBITDA (profit on operating activities - depreciation) / sales",
    50: "current assets / total liabilities",
    51: "short-term liabilities / total assets",
    52: "(short-term liabilities * 365) / cost of products sold)",
    53: "equity / fixed assets",
    54: "constant capital / fixed assets",
    55: "working capital",
    56: "(sales - cost of products sold) / sales",
    57: "(current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)",
    58: "total costs /total sales",
    59: "long-term liabilities / equity",
    60: "sales / inventory",
    61: "sales / receivables",
    62: "(short-term liabilities *365) / sales",
    63: "sales / short-term liabilities",
    64: "sales / fixed asset"
}



# Read Data
data_files = [f"../data/{year}.csv" for year in range(2017, 2022)]
data_frames = [pd.read_csv(file, na_values='?') for file in data_files]
df = pd.concat(data_frames, ignore_index=True)



In [6]:


# Preprocess
# 'bankruptcy' is the target variable, others are features
X = df.drop('class', axis=1)
y = df['class']

# Fill missing values with the mean
X.fillna(X.mean(), inplace=True)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



In [7]:
# Create and train the model
model = LogisticRegression(max_iter=1000) 
model.fit(X_train_scaled, y_train)

In [8]:
# Asess the model
y_pred = model.coef_
print("Coeff:", y_pred)


Coeff: [[-7.04936188e-01  2.63584325e-01 -7.34950459e-01 -1.72863203e+00
   2.31234104e-02  3.79520121e-02 -3.55984306e-01 -3.46731729e-04
  -2.73950383e+00 -4.54444138e-01 -7.16850937e-01  5.24431483e-01
  -1.55969812e-01 -3.55997081e-01  9.22577430e-03 -1.70681160e+00
  -3.54669090e-02 -3.59807901e-01 -2.91028507e-01  1.72789732e-01
  -3.21373162e-02 -3.53580906e-01 -2.66956445e-01 -3.69635652e-02
   4.94546699e-01  9.94021341e-01 -5.84930760e-02 -9.55226149e-02
  -2.14922177e-01  4.45345624e-02  5.21603453e-01  3.33921823e-01
   1.18573746e+00  1.63960563e-01 -3.17058969e+00  7.23221017e-01
  -1.60734574e-02 -1.30570178e+00 -7.41197452e-01  7.33553561e-01
  -7.60029639e-01  3.04758480e-01 -2.34782243e-02 -9.12449166e-02
   2.97879900e-03 -2.20388467e+00 -9.20369446e-02  3.73616962e+00
   3.86930186e-02 -1.01504961e+00 -1.28903661e+00 -1.16330146e+00
   1.41128601e-01  1.00888426e-01 -2.16734122e-01  4.11930846e-02
  -4.84566341e-02 -8.96574930e-01 -1.45772087e-02 -3.02215061e-02
  -

In [9]:
# Get the weights of the features
weights = model.coef_[0]
features = X.columns
feature_weights = pd.DataFrame({'Feature': features, 'Weight': weights})

# Get the top 5 features
top_features = feature_weights.reindex(feature_weights.Weight.abs().sort_values(ascending=False).index).head(5)
print("MS 5 features：")
print(top_features)

predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))


MS 5 features：
   Feature    Weight
47     X48  3.736170
34     X35 -3.170590
8       X9 -2.739504
45     X46 -2.203885
3       X4 -1.728632
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      8240
           1       0.50      0.01      0.02       441

    accuracy                           0.95      8681
   macro avg       0.72      0.50      0.50      8681
weighted avg       0.93      0.95      0.93      8681

Confusion Matrix:
[[8236    4]
 [ 437    4]]


In [13]:
for i in [48, 35, 9, 45, 46, 4]:
    print(variables_dict[i])

EBITDA (profit on operating activities - depreciation) / total assets
profit on sales / total assets
sales / total assets
net profit / inventory
(current assets - inventory) / short-term liabilities
current assets / short-term liabilities
