required dependencies

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, roc_auc_score

Data Collection and Preprocessing

In [None]:
merged_file1= pd.read_csv('data/merged_file.csv')
data = merged_file1.replace({0: False, 1: True})

In [None]:
df = pd.read_csv("merged_file.csv")


In [7]:
df.head()

Unnamed: 0,net profit / total assets,total liabilities / total assets,working capital / total assets,current assets / short-term liabilities,[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365,retained earnings / total assets,EBIT / total assets,book value of equity / total liabilities,sales / total assets,equity / total assets,...,(sales - cost of products sold) / sales,(current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation),total costs /total sales,long-term liabilities / equity,sales / inventory,sales / receivables,(short-term liabilities *365) / sales,sales / short-term liabilities,sales / fixed assets,output
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.12196,0.39718,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,0
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.1213,0.42002,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,0
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.24114,0.81774,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,0
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.054015,0.14207,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,0
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.13485,0.48431,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,0


In [8]:
columns_to_drop = ['(current assets - inventories) / long-term liabilities', 'sales (n) / sales (n-1)','profit on operating activities / financial expenses','sales / inventory','net profit / inventory']
data = data.drop(columns=columns_to_drop)

In [9]:
data = data.replace("?", 0)

In [10]:
question_marks_count = data.eq('?').sum()
question_marks_count_sorted = question_marks_count.sort_values(ascending=False)
print(question_marks_count_sorted)

net profit / total assets                                                                                              0
total liabilities / total assets                                                                                       0
profit on sales / total assets                                                                                         0
total sales / total assets                                                                                             0
constant capital / total assets                                                                                        0
profit on sales / sales                                                                                                0
(current assets - inventory - receivables) / short-term liabilities                                                    0
total liabilities / ((profit on operating activities + depreciation) * (12/365))                                       0
profit on operating activities /

Split the data into training and testing sets

In [11]:

y = data['output']
X = data.drop('output', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Feature Selection


In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Model Selection

**RandomForestClassifier**


In [13]:
model = RandomForestClassifier()

In [None]:
model.fit(X_train, y_train)

In [15]:
pi_predict=model.predict(X_test)

In [16]:
y_pred = model.predict(X_test)
accuracy= accuracy_score(y_test, y_pred)

In [17]:
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_rep)


Accuracy: 0.9502361479092271
Classification Report:
              precision    recall  f1-score   support

       False       0.95      1.00      0.97      8240
        True       0.56      0.10      0.17       441

    accuracy                           0.95      8681
   macro avg       0.76      0.55      0.57      8681
weighted avg       0.93      0.95      0.93      8681



**LOGISTICREGRESSION**


In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [21]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

In [22]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("ROC-AUC:", roc_auc)

Accuracy: 0.9491994009906692
Precision: 0.5
Recall: 0.009070294784580499
F1-Score: 0.017817371937639197
ROC-AUC: 0.5042924289456883


**DECISIONTREE**

In [None]:
classifier = DecisionTreeClassifier(max_depth=5)
classifier.fit(X_train, y_train)

In [24]:
y_pred = classifier.predict(X_test)  
accuracy = accuracy_score(y_test, y_pred)  
print("Accuracy:", accuracy)

Accuracy: 0.9527704181545905


**GRADIENTBOOSTINGMETHOD**

In [25]:

param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5],
    'n_estimators': [100, 200]
}

In [26]:
gbm = xgb.XGBClassifier()

In [None]:
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, scoring='accuracy', cv=3)
grid_search.fit(X_train, y_train)

In [28]:
best_params = grid_search.best_params_

In [29]:
gbm = xgb.XGBClassifier(**best_params)
gbm.fit(X_train, y_train)

In [30]:
y_pred = gbm.predict(X_test)

In [31]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC Score:", roc_auc)

Accuracy: 0.9604884229927427
Precision: 0.8888888888888888
Recall: 0.25396825396825395
F1 Score: 0.3950617283950617
ROC-AUC Score: 0.6261346124210202


**SVM**

In [32]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [None]:
svm = SVC()
svm.fit(X_train, y_train)

In [34]:
y_pred = svm.predict(X_test)

In [35]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9493145950927313
Classification Report:
              precision    recall  f1-score   support

       False       0.95      1.00      0.97      8240
        True       1.00      0.00      0.00       441

    accuracy                           0.95      8681
   macro avg       0.97      0.50      0.49      8681
weighted avg       0.95      0.95      0.92      8681

