In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

In [2]:
# Read the panel CSV; the path is relative to the notebook's working directory
PANEL_CSV = 'cs_bisnode_panel.csv'
data = pd.read_csv(PANEL_CSV)

# 1. Data Exploration & Feature Engineering

### 1.1. Construct Hold-out sample

In [3]:
# Hold-out sample: industry-26 SMEs observed in 2014, labelled by their 2015 status.

# Rows of industry 26 (manufacture of computer, electronic and optical products) in 2014
is_ind26 = data['ind2'].eq(26)
data_filtered = data.loc[is_ind26 & data['year'].eq(2014)]

# SMEs only: 2014 sales within [1,000 EUR, 10 million EUR], both bounds inclusive
sme_2014 = data_filtered.loc[data_filtered['sales'].between(1000, 10e6)]

# 2015 records of the same industry, reduced to the columns needed for the default check
sales_2015_full = data.loc[is_ind26 & data['year'].eq(2015), ['comp_id', 'sales']]

# 2015 records that belong to one of the 2014 SMEs
sme_2014_comp_ids = sme_2014['comp_id'].unique()
has_2014_sme_id = sales_2015_full['comp_id'].isin(sme_2014_comp_ids)
sales_2015_existence = sales_2015_full.loc[has_2014_sme_id]

# Default case 1: the firm has no 2015 record at all
defaulted_firms_ids = sme_2014_comp_ids[
    np.isin(sme_2014_comp_ids, sales_2015_existence['comp_id'], invert=True)
]

# Default case 2: the firm has a 2015 record, but its sales are missing or not positive
sales_in_2015 = sales_2015_existence['sales']
no_sales_in_2015 = sales_in_2015.isna() | sales_in_2015.le(0)
defaulted_due_to_zero_sales = sales_2015_existence.loc[no_sales_in_2015, 'comp_id'].unique()

# Sorted, de-duplicated union of both default cases
all_defaulted_firms_ids = np.union1d(defaulted_firms_ids, defaulted_due_to_zero_sales)

# Summary figures (sales reported in million EUR)
total_firms = len(sme_2014_comp_ids)
num_defaulted_final = len(all_defaulted_firms_ids)
num_survived_final = total_firms - num_defaulted_final
sme_sales_2014 = sme_2014['sales']
average_sales = sme_sales_2014.mean() / 1e6
min_sales = sme_sales_2014.min() / 1e6
max_sales = sme_sales_2014.max() / 1e6

# Report the summary
print(f"Number of defaulted firms: {num_defaulted_final}")
print(f"Number of survived firms: {num_survived_final}")
print(f"Total firms: {total_firms}")
print(f"Average sales (million EUR): {average_sales}")
print(f"Minimum sales (million EUR): {min_sales}")
print(f"Maximum sales (million EUR): {max_sales}")


Number of defaulted firms: 56
Number of survived firms: 981
Total firms: 1037
Average sales (million EUR): 0.49020221792682
Minimum sales (million EUR): 0.001070370361328125
Maximum sales (million EUR): 9.576485


### 1.2. Design data sample for training and testing
- Filter Data for Years Before 2014: Select data from 2013 and earlier. This ensures that we do not use any information from the hold-out sample for training the model.
- Select Industry: Focus on industry ind2 == 26, similar to what we did with the hold-out sample.
- Identifying SMEs: Just like with the hold-out sample, we identify SMEs by revenue (sales between 1,000 EUR and 10 million EUR); the filter is applied to every firm-year before 2014.

In [4]:
# Training pool: industry 26, all firm-years strictly before 2014
is_ind26_pre_2014 = data['ind2'].eq(26) & data['year'].lt(2014)
data_pre_2014 = data.loc[is_ind26_pre_2014]

# Keep firm-years with sales from 1,000 EUR to 10 million EUR (both bounds inclusive)
sme_pre_2014 = data_pre_2014.loc[data_pre_2014['sales'].between(1000, 10e6)]

# Sample size and a peek at the identifying columns
sme_pre_2014_shape = sme_pre_2014.shape
sme_pre_2014_info = sme_pre_2014.loc[:, ['comp_id', 'year', 'sales']].head()

(sme_pre_2014_shape, sme_pre_2014_info)


((9689, 48),
        comp_id  year         sales
 960  6538183.0  2005  29288.888672
 961  6538183.0  2006  35929.628906
 962  6538183.0  2007  31729.628906
 963  6538183.0  2008  35703.703125
 964  6538183.0  2009  43062.964844)

### 1.3. Feature Engineering and Missing Values

In [8]:
# Build the training features on a copy so the filtered sample `sme_pre_2014` stays untouched
# and this cell can be re-run safely.
sme_pre_2014_filled = sme_pre_2014.copy()

# Parse founded_date from string to datetime; values that cannot be parsed become NaT
sme_pre_2014_filled['founded_date'] = pd.to_datetime(sme_pre_2014_filled['founded_date'], errors='coerce')

# Firm age derived from the founding year.
# NOTE(review): the reference year 2013 is applied to every firm-year row (2005, 2006, ...),
# so the age is constant within a firm - confirm this is intended.
sme_pre_2014_filled['Firm Age'] = 2013 - sme_pre_2014_filled['founded_date'].dt.year

# Ratio denominators that may be zero. `sales` is left out because the SME filter
# already guarantees sales >= 1000.
# NOTE(review): this list was not defined anywhere in the notebook (NameError on a fresh
# kernel); it is set here to the denominators used below - adjust if more columns were meant.
columns_with_zeros = ['curr_liab', 'share_eq']

# For each such column: record where it is zero in an indicator column, then turn the zeros
# into NaN so the divisions below give NaN instead of +/-inf.
for col in columns_with_zeros:
    flag_col = col + '_flag'
    sme_pre_2014_filled[flag_col] = (sme_pre_2014_filled[col] == 0).astype(int)
    # Assign the result back: an in-place replace on `df[col]` is a chained operation that
    # does not update the frame when pandas Copy-on-Write is active.
    sme_pre_2014_filled[col] = sme_pre_2014_filled[col].replace(0, np.nan)

# Financial ratios, computed from the zero-free denominators
sme_pre_2014_filled['Current Ratio'] = sme_pre_2014_filled['curr_assets'] / sme_pre_2014_filled['curr_liab']
sme_pre_2014_filled['Net Profit Margin'] = sme_pre_2014_filled['profit_loss_year'] / sme_pre_2014_filled['sales']
sme_pre_2014_filled['Debt to Equity Ratio'] = sme_pre_2014_filled['curr_liab'] / sme_pre_2014_filled['share_eq']

# Show a few rows of the newly created features
sme_pre_2014_filled[['Current Ratio', 'Net Profit Margin', 'Debt to Equity Ratio', 'Firm Age']].head()


Unnamed: 0,Current Ratio,Net Profit Margin,Debt to Equity Ratio,Firm Age
960,17.914893,-0.039074,0.046305,21
961,13.295082,0.015772,0.055886,21
962,3.41573,0.026497,0.298407,21
963,4.665919,0.010477,0.179333,21
964,2.087973,0.034661,0.814187,21


In [7]:
# Rows for the year 2014. Taken as an explicit copy so the feature columns added below are
# written to an independent frame rather than to a slice of `data`.
data_2014 = data[data['year'] == 2014].copy()

# Liquidity: Current Ratio = Current Assets / Current Liabilities
data_2014['current_ratio'] = data_2014['curr_assets'] / data_2014['curr_liab']

# Profitability: Net Profit Margin = Profit/Loss for the Year / Total Sales
data_2014['net_profit_margin'] = data_2014['profit_loss_year'] / data_2014['sales']

# Leverage: Debt to Equity Ratio = Current Liabilities / Shareholder's Equity
data_2014['debt_to_equity_ratio'] = data_2014['curr_liab'] / data_2014['share_eq']

# A zero denominator yields +/-inf, which scikit-learn's imputer and estimators reject.
# Treat those ratios as missing so the downstream median imputation can handle them.
ratio_cols = ['current_ratio', 'net_profit_margin', 'debt_to_equity_ratio']
data_2014[ratio_cols] = data_2014[ratio_cols].replace([np.inf, -np.inf], np.nan)

# Firm Age = Year of Data (2014) - Year Firm was Founded
data_2014['firm_age'] = 2014 - data_2014['founded_year']

# SMEs in the 'Manufacture of computer, electronic and optical products' industry (ind2 == 26)
# with sales between 1000 EUR and 10 million EUR.
# This rebinds `sme_2014` from the hold-out cell with the same row filter, now including the feature columns.
sme_2014 = data_2014[(data_2014['sales'] >= 1000) & (data_2014['sales'] <= 1e7) & (data_2014['ind2'] == 26)]

# Keep only the firm identifier and the model features
sme_features_2014 = sme_2014[['comp_id', 'current_ratio', 'net_profit_margin', 'debt_to_equity_ratio', 'firm_age']]

# Display the first few rows of the calculated features
print(sme_features_2014.head())


         comp_id  current_ratio  net_profit_margin  debt_to_equity_ratio  \
969    6538183.0       5.598051           0.083408              0.175352   
1128   8416055.0      15.283726          -0.072277              0.064558   
1467  12428378.0       0.000000          -2.670686             -0.185762   
1706  17776540.0       0.866408           0.017085              1.144598   
1735  18626760.0       2.079793           0.037247              0.634148   

      firm_age  
969       22.0  
1128      19.0  
1467      23.0  
1706      17.0  
1735       4.0  


In [None]:
# IDs of the firms flagged as defaulted when the hold-out sample was constructed
defaulted_ids = all_defaulted_firms_ids

# Label the data: 1 if the company defaulted in 2015, 0 otherwise.
# `.assign` returns a new frame instead of writing a column into a frame derived from a
# slice, and re-running this cell simply recomputes the same column.
sme_features_2014 = sme_features_2014.assign(
    default=sme_features_2014['comp_id'].isin(defaulted_ids).astype(int)
)

# NOTE(review): the model below is trained and tested on the 2014 sample, which section 1.1
# introduces as the hold-out sample, while section 1.2 says training should use pre-2014
# data only - confirm which design is intended.

# Features (X) are every column except the identifier and the label; the label (y) is `default`
X = sme_features_2014.drop(columns=['comp_id', 'default'])
y = sme_features_2014['default']

# 70/30 split, stratified on the label so both parts keep the same default rate
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Shapes of the resulting splits, to verify the sizes
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# 2. Modeling

### 2.1. Logistic regression

In [None]:
# SimpleImputer, Pipeline, StandardScaler and LogisticRegression are imported in the
# notebook's first (imports) cell.

# Median imputation for missing feature values; the medians are learned from the training
# split only and then applied unchanged to the test split, so no test information leaks in.
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Baseline classifier: standardize the features (zero mean, unit variance), then fit a
# logistic regression. The fixed random_state keeps the fit reproducible.
logreg_pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                                  ('logistic_regression', LogisticRegression(random_state=42))])

# Fit on the imputed training data; the scaler inside the pipeline is fitted here as well.
# Evaluation on the test split happens in later cells.
logreg_pipeline.fit(X_train_imputed, y_train)
