In [1]:
import time
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import stats
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pprint import pprint
import math
from scipy.io import arff


In [2]:
# params
seed = 42

[Описание данных](https://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data#)

In [3]:
dfs = [arff.loadarff("./Data/" + table_name) for table_name in os.listdir("./Data")]
dfs = [pd.DataFrame(data[0]) for data in dfs]

dfs_names = [table_name.replace(".csv", "") for table_name in os.listdir("./Data")]

Датасет изначально состоит из пяти таблиц. Вот какое описание дано для них в базе: 
- `1year.arff` the data contains financial rates from 1st year of the forecasting period and corresponding class label that indicates bankruptcy status after 5 years. The data contains 7027 instances (financial statements), 271 represents bankrupted companies, 6756 firms that did not bankrupt in the forecasting period.
- `2year.arff` the data contains financial rates from 2nd year of the forecasting period and corresponding class label that indicates bankruptcy status after 4 years. The data contains 10173 instances (financial statements), 400 represents bankrupted companies, 9773 firms that did not bankrupt in the forecasting period.

и так далее. Описание немного туманное. Тем не менее, по всей видимости, таблицы нельзя считать относящимися к одним и тем же entities, поэтому мы просто вертикально сложим их, создав новую переменную table_year, чтобы отличать данные из разных таблиц. 

In [4]:
# сохранение отдельных выборок для каждого года -- AM
for i in range(len(dfs)):
    X = dfs[i].iloc[:, :-1]
    y = dfs[i].iloc[:,  -1].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    
    year = dfs_names[i].split('.')[0]
    
    X_train.reset_index(drop=True).to_parquet(f'./samples/{year}_X_train.parquet')
    X_test.reset_index(drop=True).to_parquet(f'./samples/{year}_X_test.parquet')
    pd.DataFrame(y_train).reset_index(drop=True).to_parquet(f'./samples/{year}_y_train.parquet')
    pd.DataFrame(y_test).reset_index(drop=True).to_parquet(f'./samples/{year}_y_test.parquet')

In [5]:
def add_year_column(data, name):
    data["table_year"] = name.replace(".arff", "")
    return data

for df_num in range(len(dfs)):
    dfs[df_num] = add_year_column(dfs[df_num], dfs_names[df_num])
    
df = pd.DataFrame(columns = dfs[0].columns)

for df_num in range(len(dfs)):
    df = df.append(dfs[df_num], ignore_index = True)

In [6]:
df["class"] = df["class"].str.decode("utf-8")
df["class"] = pd.to_numeric(df['class'], errors='coerce')

### Список факторов:
|    | Variable Name   | Description                                                                                                         | Unnamed: 2   |
|---:|:----------------|:--------------------------------------------------------------------------------------------------------------------|:-------------|
|  0 | Attr1           | net profit / total assets                                                                                           |              |
|  1 | Attr2           | total liabilities / total assets                                                                                    |              |
|  2 | Attr3           | working capital / total assets                                                                                      |              |
|  3 | Attr4           | current assets / short-term liabilities                                                                             |              |
|  4 | Attr5           | [(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365 |              |
|  5 | Attr6           | retained earnings / total assets                                                                                    |              |
|  6 | Attr7           | EBIT / total assets                                                                                                 |              |
|  7 | Attr8           | book value of equity / total liabilities                                                                            |              |
|  8 | Attr9           | sales / total assets                                                                                                |              |
|  9 | Attr10          | equity / total assets                                                                                               |              |
| 10 | Attr11          | (gross profit + extraordinary items + financial expenses) / total assets                                            |              |
| 11 | Attr12          | gross profit / short-term liabilities                                                                               |              |
| 12 | Attr13          | (gross profit + depreciation) / sales                                                                               |              |
| 13 | Attr14          | (gross profit + interest) / total assets                                                                            |              |
| 14 | Attr15          | (total liabilities * 365) / (gross profit + depreciation)                                                           |              |
| 15 | Attr16          | (gross profit + depreciation) / total liabilities                                                                   |              |
| 16 | Attr17          | total assets / total liabilities                                                                                    |              |
| 17 | Attr18          | gross profit / total assets                                                                                         |              |
| 18 | Attr19          | gross profit / sales                                                                                                |              |
| 19 | Attr20          | (inventory * 365) / sales                                                                                           |              |
| 20 | Attr21          | sales (n) / sales (n-1)                                                                                             |              |
| 21 | Attr22          | profit on operating activities / total assets                                                                       |              |
| 22 | Attr23          | net profit / sales                                                                                                  |              |
| 23 | Attr24          | gross profit (in 3 years) / total assets                                                                            |              |
| 24 | Attr25          | (equity - share capital) / total assets                                                                             |              |
| 25 | Attr26          | (net profit + depreciation) / total liabilities                                                                     |              |
| 26 | Attr27          | profit on operating activities / financial expenses                                                                 |              |
| 27 | Attr28          | working capital / fixed assets                                                                                      |              |
| 28 | Attr29          | logarithm of total assets                                                                                           |              |
| 29 | Attr30          | (total liabilities - cash) / sales                                                                                  |              |
| 30 | Attr31          | (gross profit + interest) / sales                                                                                   |              |
| 31 | Attr32          | (current liabilities * 365) / cost of products sold                                                                 |              |
| 32 | Attr33          | operating expenses / short-term liabilities                                                                         |              |
| 33 | Attr34          | operating expenses / total liabilities                                                                              |              |
| 34 | Attr35          | profit on sales / total assets                                                                                      |              |
| 35 | Attr36          | total sales / total assets                                                                                          |              |
| 36 | Attr37          | (current assets - inventories) / long-term liabilities                                                              |              |
| 37 | Attr38          | constant capital / total assets                                                                                     |              |
| 38 | Attr39          | profit on sales / sales                                                                                             |              |
| 39 | Attr40          | (current assets - inventory - receivables) / short-term liabilities                                                 |              |
| 40 | Attr41          | total liabilities / ((profit on operating activities + depreciation) * (12/365))                                    |              |
| 41 | Attr42          | profit on operating activities / sales                                                                              |              |
| 42 | Attr43          | rotation receivables + inventory turnover in days                                                                   |              |
| 43 | Attr44          | (receivables * 365) / sales                                                                                         |              |
| 44 | Attr45          | net profit / inventory                                                                                              |              |
| 45 | Attr46          | (current assets - inventory) / short-term liabilities                                                               |              |
| 46 | Attr47          | (inventory * 365) / cost of products sold                                                                           |              |
| 47 | Attr48          | EBITDA (profit on operating activities - depreciation) / total assets                                               |              |
| 48 | Attr49          | EBITDA (profit on operating activities - depreciation) / sales                                                      |              |
| 49 | Attr50          | current assets / total liabilities                                                                                  |              |
| 50 | Attr51          | short-term liabilities / total assets                                                                               |              |
| 51 | Attr52          | (short-term liabilities * 365) / cost of products sold)                                                             |              |
| 52 | Attr53          | equity / fixed assets                                                                                               |              |
| 53 | Attr54          | constant capital / fixed assets                                                                                     |              |
| 54 | Attr55          | working capital                                                                                                     |              |
| 55 | Attr56          | (sales - cost of products sold) / sales                                                                             |              |
| 56 | Attr57          | (current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)                       |              |
| 57 | Attr58          | total costs /total sales                                                                                            |              |
| 58 | Attr59          | long-term liabilities / equity                                                                                      |              |
| 59 | Attr60          | sales / inventory                                                                                                   |              |
| 60 | Attr61          | sales / receivables                                                                                                 |              |
| 61 | Attr62          | (short-term liabilities *365) / sales                                                                               |              |
| 62 | Attr63          | sales / short-term liabilities                                                                                      |              |
| 63 | Attr64          | sales / fixed assets                                                                                                |              |

In [7]:
#get description of factors as tabulated markdown
df_names = pd.read_excel('description.xlsx', header = [0]) ## сделай git add --force description.xlsx или вбей руками, плиз
df_names = df_names.fillna("")
df_names["Variable Name"] = df_names["Variable Name"].str.replace("X", "Attr")
df_names["Variable Name"] = df_names["Variable Name"].str.replace(" ", "")
df_names["Variable Name"] = df_names["Variable Name"].str.upper()
#print(df_names.to_markdown())

FileNotFoundError: [Errno 2] No such file or directory: 'description.xlsx'

In [None]:
df["class"].value_counts().plot.barh()
plt.title("Distribution of status")
plt.show()

df["class"].isnull().value_counts().plot.barh()
plt.title("Number of accounts with missing default status (True for missing)")
plt.show()

Для единообразия нотации, создадим новую зависимую переменную target:

In [None]:
df.rename(columns = {"class":"target"}, inplace = True)

Пропуски заполнены NaN:

In [None]:
df[df.Attr1.isnull()]

Выделяем test & train сеты

In [None]:
X = df.loc[:, df.columns != 'target']
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)


Сохраняем данные

In [None]:
!pip install pyarrow
!pip install fastparquet

In [None]:
# Save data & info ===
# parquet is optimized for large volumes of data
!mkdir samples
X_train.to_parquet('./samples/X_train.parquet')
X_test.to_parquet('./samples/X_test.parquet')
# переводим pd.Series в pd.DataFrame для удобного экспорта
pd.DataFrame(y_train).to_parquet('./samples/y_train.parquet')
pd.DataFrame(y_test).to_parquet('./samples/y_test.parquet')

#списки категориальных и количественных переменных
df_number_of_uniques = df.nunique()
presumably_continuous = df_number_of_uniques[df_number_of_uniques >= 15]
presumably_discrete = df_number_of_uniques[df_number_of_uniques < 15]

presumably_continuous_names = list(presumably_continuous.index)
presumably_discrete_names = list(presumably_discrete.index)

with open('factors.json', 'w') as f:
    json.dump({'cat_vals': presumably_discrete_names, "num_vals": presumably_continuous_names}, f)

### Статистики

Целевых событий немного.

In [None]:
print(f'Количество наблюдений: {X.shape[0]}')
print(f'Количество наблюдений, где имеются данные о дефолте или его отсутствии: {X[y.isnull() == False].shape[0]}')
print(f'Количество факторов: {X.shape[1]}')
print(f'Количество целевых событий: {y.sum()}')
print(f'Доля целевых событий: {y.sum() / X[y.isnull() == False].shape[0] * 100}%')

### Пропуски

В датасете присутствуют пропуски, но их доля несущественна: больше всего их в переменной, где их 6%, но в остальных почти всегда меньше процента

In [None]:
for col in X.columns:
    print(f'Количество пропусков по фактору {col}:\n\t{X[col].isna().sum()} или {X[col].isna().sum() / df.shape[0] * 100}%')

### Визуальный анализ

Изготовим маску для осмысленного наименования факторов:

In [None]:
from pprint import pprint 
df_names_short = df_names[["Variable Name", "Description"]][(df_names["Variable Name"] != "")&(df_names["Description"] != "")]
short_names = df_names_short["Variable Name"]
short_names = short_names.str.upper()
meaningful_names = df_names_short["Description"]
names_mask = dict(zip(short_names, meaningful_names))
names_mask["TABLE_YEAR"] = "Table of origin"
pprint(names_mask)

In [None]:
X_cat = X.copy()
X_cat.columns = X_cat.columns.str.upper()

Из графиков ниже видно, что у переменных все ОЧЕНЬ плохо с выбросами. Например:

In [None]:
print(np.max(X_cat.ATTR1))
print(np.mean(X_cat.ATTR1))
print(np.std(X_cat.ATTR1))

Их нужно будет радикально убирать

In [None]:
def plot_categorical(column, column_name, names_mask):
    column.value_counts().plot.barh()
    plt.title(f'Distribution of {column_name} ({names_mask[column_name]})')
    plt.show()

def plot_mixed(column, column_name, names_mask):
    print("This is a column mixed of both numeric and non-numeric values")
    check_numeric = column.str.isnumeric()
    check_numeric[check_numeric.isnull()] = True
    check_numeric[column.isnull()] = False

    plt.hist(column[check_numeric], orientation='horizontal')
    plt.title(f'Distribution of {column_name} ({names_mask[column_name]}) without non-numeric values')
    plt.show()

    check_numeric.value_counts().plot.barh()
    plt.title(f'Distribution of non-missing (numeric) values in {column_name} ({names_mask[column_name]})')
    plt.show()

def plot_numeric(column, column_name, names_mask):
    plt.hist(column, orientation='horizontal')
    plt.title(f'Distribution of {column_name} ({names_mask[column_name]})')
    plt.show()    

def plot_datetime(column, column_name, names_mask):
    column.hist()
    plt.title(f'Distribution of {column_name} ({names_mask[column_name]})')
    plt.show()
    
def check_mixed_or_categorical(column):
    if is_string_dtype(column) == False:
        return(False)
    else:
        check_numeric = column.str.isnumeric()
        check_numeric[check_numeric.isnull()] = True
        check_numeric[column.isnull()] = False
        if len(column[check_numeric]) == 0:
            return("categorical")
        else:
            return("mixed")

def check_factor_plotting_type(column, column_name, names_mask):
    if column.dtype == pd.Series([np.datetime64("1995-05-05")]).dtype:
        return("datetime")
    elif check_mixed_or_categorical(column) != False:
        return(check_mixed_or_categorical(column))
    else:
        return("numeric")

def plot_all(column, column_name, names_mask, presumably_discrete_names = presumably_discrete_names):
    map_functions_to_types = {
        "categorical" : plot_categorical,
        "mixed" : plot_mixed,
        "numeric" : plot_numeric, 
        "datetime" : plot_datetime
    }
    
    type_checked = check_factor_plotting_type(column, column_name, names_mask)
    print(type_checked)
    map_functions_to_types[type_checked](column, column_name, names_mask)


for col in X_cat.columns:
    
    plot_all(X_cat[col], col, names_mask)


### Выводы

Большой датасет по польским корпоратам с большим количеством переменных и низкой долей пропусков. Из недостатков нужно отметить низкую долю целевых событий (4%) и очень большие проблемы с аутлаерами.