In [None]:
%load_ext nb_black

In [None]:
import pandas as pd
import numpy as np

In [None]:
# NOTE: despite its name, DATA_DIR holds the path of a CSV file, not a directory.
DATA_DIR = "../train.csv"
# Read the CSV into the frame that every later cell refers to as `df`.
df = pd.read_csv(DATA_DIR)
# Preview the first rows.
df.head()

In [None]:
# Column names ordered by their number of missing values (fewest first).
df.isna().sum().sort_values().index.to_list()

In [None]:
def get_raw_columns(df):
    """Recover absolute financial figures from the ratio columns of ``df``.

    The only absolute figure read directly is ``"working capital"``.  Total
    assets are obtained by dividing it by ``"working capital / total assets"``,
    and every other figure is derived by inverting the ratio it appears in,
    using figures that were derived earlier in the same ``assign`` call
    (``assign`` evaluates its callables in insertion order, so the order of
    the entries below matters).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``company_id``, ``bankruptcy_label`` and the ratio
        columns referenced below.

    Returns
    -------
    pandas.DataFrame
        Only the upper-case derived columns, from ``COMPANY_ID`` through
        ``BANKRUPTCY_LABEL``.  Rows whose ratios are zero or missing can yield
        ``NaN``/``inf`` values.
    """
    ebitda_ratio = (
        "EBITDA (profit on operating activities - depreciation) / total assets"
    )

    def scaled(base_col, ratio_col):
        """Build a callable multiplying an already derived column by a ratio."""
        return lambda _df: _df[base_col] * _df[ratio_col]

    def of_total_assets(ratio_col):
        """Shortcut for ratios whose denominator is total assets."""
        return scaled("TOTAL_ASSETS", ratio_col)

    return df.assign(
        **{
            "COMPANY_ID": df["company_id"],
            "WORKING_CAPITAL": df["working capital"],
            "TOTAL_ASSETS": lambda _df: _df["WORKING_CAPITAL"]
            / _df["working capital / total assets"],
            "LOG_TOTAL_ASSETS": df["logarithm of total assets"],
            "SALES": of_total_assets("sales / total assets"),
            "SHORT_TERM_LIABILITIES": of_total_assets(
                "short-term liabilities / total assets"
            ),
            "CONSTANT_CAPITAL": of_total_assets("constant capital / total assets"),
            "NET_PROFIT": of_total_assets("net profit / total assets"),
            "TOTAL_SALES": of_total_assets("total sales / total assets"),
            "PROFIT_ON_SALES": of_total_assets("profit on sales / total assets"),
            "PROFIT_ON_OPERATING_ACTIVITIES": of_total_assets(
                "profit on operating activities / total assets"
            ),
            "GROSS_PROFIT": of_total_assets("gross profit / total assets"),
            # Absolute EBITDA, like every sibling column (not the raw ratio).
            "EBITDA": of_total_assets(ebitda_ratio),
            # EBITDA = profit on operating activities - depreciation.
            "DEPRECIATION": lambda _df: _df["PROFIT_ON_OPERATING_ACTIVITIES"]
            - _df["EBITDA"],
            "RETAINED_EARNINGS": of_total_assets("retained earnings / total assets"),
            "EBIT": of_total_assets("EBIT / total assets"),
            "INTEREST": lambda _df: _df["TOTAL_ASSETS"]
            * _df["(gross profit + interest) / total assets"]
            - _df["GROSS_PROFIT"],
            "TOTAL_LIABILITIES": of_total_assets("total liabilities / total assets"),
            "EQUITY": of_total_assets("equity / total assets"),
            "LONG_TERM_LIABILITIES": scaled(
                "EQUITY", "long-term liabilities / equity"
            ),
            "SHARE_CAPITAL": lambda _df: _df["EQUITY"]
            - _df["TOTAL_ASSETS"] * _df["(equity - share capital) / total assets"],
            "TOTAL_COSTS": scaled("TOTAL_SALES", "total costs /total sales"),
            "CURRENT_ASSETS": scaled(
                "TOTAL_LIABILITIES", "current assets / total liabilities"
            ),
            # ratio * (S - GP - D) = CA - INV - STL
            #   =>  INV = CA - STL - ratio * (S - GP - D)
            "INVENTORY": lambda _df: _df["CURRENT_ASSETS"]
            - _df["SHORT_TERM_LIABILITIES"]
            - _df[
                "(current assets - inventory - short-term liabilities) / (sales - gross profit - depreciation)"
            ]
            * (_df["SALES"] - _df["GROSS_PROFIT"] - _df["DEPRECIATION"]),
            # The ratio's denominator is total liabilities, not total assets.
            "BOOK_VALUE_OF_EQUITY": scaled(
                "TOTAL_LIABILITIES", "book value of equity / total liabilities"
            ),
            "OPERATING_EXPENSES": scaled(
                "TOTAL_LIABILITIES", "operating expenses / total liabilities"
            ),
            "CASH": lambda _df: _df["TOTAL_LIABILITIES"]
            - _df["SALES"] * _df["(total liabilities - cash) / sales"],
            "RECEIVABLES": lambda _df: _df["SALES"]
            * _df["(receivables * 365) / sales"]
            / 365,
            "SHORT_TERM_SECURITIES": lambda _df: (
                _df[
                    "[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365"
                ]
                / 365
            )
            * (_df["OPERATING_EXPENSES"] - _df["DEPRECIATION"])
            - _df["CASH"]
            - _df["RECEIVABLES"]
            + _df["SHORT_TERM_LIABILITIES"],
            "COST_OF_PRODUCTS_SOLD": lambda _df: -(
                _df["SALES"] * (_df["(sales - cost of products sold) / sales"] - 1)
            ),
            "SALES_N_OVER_LAST_SALES": df["sales (n) / sales (n-1)"],
            "GROSS_PROFIT_IN_3_YEARS": of_total_assets(
                "gross profit (in 3 years) / total assets"
            ),
            "FIXED_ASSETS": lambda _df: _df["EQUITY"] / _df["equity / fixed assets"],
            # A zero numerator makes the ratio uninformative; 0 is used then.
            "FINANCIAL_EXPENSES": lambda _df: np.where(
                _df["PROFIT_ON_OPERATING_ACTIVITIES"] == 0,
                0,
                _df["PROFIT_ON_OPERATING_ACTIVITIES"]
                / _df["profit on operating activities / financial expenses"],
            ),
            "EXTRAORDINARY_ITEMS": lambda _df: _df["TOTAL_ASSETS"]
            * _df[
                "(gross profit + extraordinary items + financial expenses) / total assets"
            ]
            - _df["GROSS_PROFIT"]
            - _df["FINANCIAL_EXPENSES"],
            # ratio * LTL = CA - INVENTORIES  =>  INVENTORIES = CA - ratio * LTL
            "INVENTORIES": lambda _df: _df["CURRENT_ASSETS"]
            - _df["LONG_TERM_LIABILITIES"]
            * _df["(current assets - inventories) / long-term liabilities"],
            "ROTATION_RECEIVABLES_PLUS_INVENTORY_TURNOVER_IN_DAYS": df[
                "rotation receivables + inventory turnover in days"
            ],
            "BANKRUPTCY_LABEL": df["bankruptcy_label"],
        }
    ).loc[:, "COMPANY_ID":]

In [None]:
def assign_number_of_null_values_per_row(df):
    """Return ``df`` plus a ``NULL_VALUE_COUNT`` column counting missing cells per row."""
    missing_per_row = df.isna().sum(axis=1)
    return df.assign(NULL_VALUE_COUNT=missing_per_row)


# def remove_companies_with_null_values(df):
#     return df.loc[
#         (~df["TOTAL_ASSETS"].isna())
#         & (~df["LONG_TERM_LIABILITIES"].isna())
#         & (~df["SHORT_TERM_SECURITIES"].isna())
#         & (~df["TOTAL_COSTS"].isna())
#     ]


def remove_companies_with_null_values(df):
    """Keep only the rows whose ``TOTAL_ASSETS`` value is not missing."""
    # todo: maybe we should explicitly remove the row using that company_id?
    has_total_assets = df["TOTAL_ASSETS"].notna()
    return df.loc[has_total_assets]


def remove_duplicates(df):
    """Drop rows that repeat an earlier row in every column except ``COMPANY_ID``.

    The first occurrence of each duplicated row is kept.  Raises ``ValueError``
    when ``df`` has no ``COMPANY_ID`` column.
    """
    # todo: need to try different combination of cols to find other possible duplicates
    comparison_cols = df.columns.to_list()
    comparison_cols.remove("COMPANY_ID")
    return df.drop_duplicates(subset=comparison_cols, keep="first")

In [None]:
# Preprocessing pipeline: derive the raw (upper-case) columns, count the missing
# values per row, drop rows without TOTAL_ASSETS, then drop rows that are
# identical in everything except COMPANY_ID.
preprocessed_df = (
    df.pipe(get_raw_columns)
    .pipe(assign_number_of_null_values_per_row)
    .pipe(remove_companies_with_null_values)
    .pipe(remove_duplicates)
)

In [None]:
# Missing values per derived column, worst column first.
preprocessed_df.isna().sum().sort_values(ascending=False)

In [None]:
# Rows sharing a WORKING_CAPITAL value with at least one other row, sorted so
# that the matching rows appear next to each other.
preprocessed_df.loc[
    preprocessed_df.duplicated("WORKING_CAPITAL", keep=False)
].sort_values("WORKING_CAPITAL")

In [None]:
# Notes on the columns that still contain missing values:
# - couldn't recover fixed assets
# - can we recalculate gross profit in 3 years?
# - extraordinary items depend on FINANCIAL_EXPENSES
# NOTE(review): preprocessed_df has already been passed through
# remove_companies_with_null_values, so the TOTAL_ASSETS filter below looks
# redundant - confirm.

preprocessed_df.loc[(~preprocessed_df["TOTAL_ASSETS"].isna())].isna().sum().sort_values(
    ascending=False
)

# Checks

### Check that a ratio column equals the inverse of its reciprocal column

In [None]:
def check_equal_values_by_inverting(
    df, orig_col, col_to_be_inverted, divide_by_365=False
):
    """Compare ``orig_col`` with the reciprocal of ``col_to_be_inverted``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    orig_col : str
        Column holding the reference values.
    col_to_be_inverted : str
        Column whose reciprocal is expected to match ``orig_col``.
    divide_by_365 : bool, default False
        When true the reciprocal is multiplied by 365 before comparing
        (note: multiplied, despite the parameter name).

    Returns
    -------
    pandas.DataFrame
        The two input columns plus ``INVERTED_COL``, ``DIFF``
        (``orig_col - INVERTED_COL``) and ``DIFF_RATIO``
        (``|DIFF / orig_col|``).
    """
    reciprocal = 1 / df[col_to_be_inverted]
    if divide_by_365:
        reciprocal = reciprocal * 365

    def difference(_df):
        return _df[orig_col] - _df["INVERTED_COL"]

    def relative_difference(_df):
        return abs(_df["DIFF"] / _df[orig_col])

    selected = df[[orig_col, col_to_be_inverted]]
    return selected.assign(
        INVERTED_COL=reciprocal,
        DIFF=difference,
        DIFF_RATIO=relative_difference,
    )


In [None]:
# Is "total liabilities / total assets" the reciprocal of
# "total assets / total liabilities"?  Show the 60 largest relative differences.
check_equal_values_by_inverting(
    df, "total liabilities / total assets", "total assets / total liabilities"
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Is "(short-term liabilities *365) / sales" equal to
# 365 / "sales / short-term liabilities"?  Show the 60 largest relative differences.
check_equal_values_by_inverting(
    df,
    "(short-term liabilities *365) / sales",
    "sales / short-term liabilities",
    divide_by_365=True,
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Is "(receivables * 365) / sales" equal to 365 / "sales / receivables"?
# Show the 60 largest relative differences.
check_equal_values_by_inverting(
    df, "(receivables * 365) / sales", "sales / receivables", divide_by_365=True,
).sort_values("DIFF_RATIO", ascending=False)[:60]

### Check different derivations

In [None]:
def check_different_depreciation_derivations(df):
    """Derive depreciation in five independent ways for side-by-side comparison.

    Each ``DEPRECIATION<n>`` column inverts a different ratio column that
    contains depreciation:

    1. EBITDA / total assets
    2. EBITDA / sales
    3. (gross profit + depreciation) / sales
    4. (gross profit + depreciation) / total liabilities
    5. (net profit + depreciation) / total liabilities

    Returns a frame holding only ``DEPRECIATION1`` .. ``DEPRECIATION5``.
    """
    working_capital = df["working capital"]
    total_assets = working_capital / df["working capital / total assets"]
    sales = total_assets * df["sales / total assets"]
    net_profit = total_assets * df["net profit / total assets"]
    total_liabilities = total_assets * df["total liabilities / total assets"]
    gross_profit = total_assets * df["gross profit / total assets"]
    operating_profit = (
        total_assets * df["profit on operating activities / total assets"]
    )

    via_ebitda_over_assets = -(
        total_assets
        * df["EBITDA (profit on operating activities - depreciation) / total assets"]
        - operating_profit
    )
    via_ebitda_over_sales = -(
        sales
        * df["EBITDA (profit on operating activities - depreciation) / sales"]
        - operating_profit
    )
    via_gross_over_sales = (
        sales * df["(gross profit + depreciation) / sales"] - gross_profit
    )
    via_gross_over_liabilities = (
        total_liabilities * df["(gross profit + depreciation) / total liabilities"]
        - gross_profit
    )
    via_net_over_liabilities = (
        total_liabilities * df["(net profit + depreciation) / total liabilities"]
        - net_profit
    )

    # The intermediate columns are assigned in the same order as the
    # derivations and then sliced away, leaving only the DEPRECIATION columns.
    enriched = df.assign(
        WORKING_CAPITAL=working_capital,
        TOTAL_ASSETS=total_assets,
        SALES=sales,
        NET_PROFIT=net_profit,
        TOTAL_LIABILITIES=total_liabilities,
        GROSS_PROFIT=gross_profit,
        PROFIT_ON_OPERATING_ACTIVITIES=operating_profit,
        DEPRECIATION1=via_ebitda_over_assets,
        DEPRECIATION2=via_ebitda_over_sales,
        DEPRECIATION3=via_gross_over_sales,
        DEPRECIATION4=via_gross_over_liabilities,
        DEPRECIATION5=via_net_over_liabilities,
    )
    return enriched.loc[:, "DEPRECIATION1":]

In [None]:
# First 60 rows of the five depreciation derivations, for eyeballing agreement.
check_different_depreciation_derivations(df)[:60]

In [None]:
def check_equal_values_derived_from_sales_and_total_assets(
    df, total_assets_col, sales_col, invert_sales_col=False
):
    """Derive one quantity from a sales-based and an assets-based ratio and compare.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``"working capital"``, ``"working capital / total assets"``,
        ``"sales / total assets"`` and the two ratio columns named below.
    total_assets_col : str
        Ratio column that is multiplied by the derived total assets.
    sales_col : str
        Ratio column that is multiplied by the derived sales.
    invert_sales_col : bool, default False
        When true, sales is multiplied by ``1 / sales_col`` instead.

    Returns
    -------
    pandas.DataFrame
        Columns ``DERIVED_FROM_SALES``, ``DERIVED_FROM_TA``, ``DIFF``
        (sales-based minus assets-based) and ``DIFF_RATIO``
        (``|DIFF / DERIVED_FROM_TA|``).
    """

    def derived_from_sales(_df):
        if invert_sales_col:
            return _df["SALES"] * (1 / _df[sales_col])
        return _df["SALES"] * _df[sales_col]

    def derived_from_total_assets(_df):
        return _df["TOTAL_ASSETS"] * _df[total_assets_col]

    derivations = {
        "TOTAL_ASSETS": 1
        / (df["working capital / total assets"] / df["working capital"]),
        "SALES": lambda _df: _df["TOTAL_ASSETS"] * _df["sales / total assets"],
        "DERIVED_FROM_SALES": derived_from_sales,
        "DERIVED_FROM_TA": derived_from_total_assets,
        "DIFF": lambda _df: _df["DERIVED_FROM_SALES"] - _df["DERIVED_FROM_TA"],
        "DIFF_RATIO": lambda _df: abs(_df["DIFF"] / _df["DERIVED_FROM_TA"]),
    }
    report_columns = ["DERIVED_FROM_SALES", "DERIVED_FROM_TA", "DIFF", "DIFF_RATIO"]

    return df.assign(**derivations)[report_columns]


In [None]:
# Profit on operating activities: sales-based vs assets-based derivation,
# 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df,
    "profit on operating activities / total assets",
    "profit on operating activities / sales",
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Gross profit: sales-based vs assets-based derivation, 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df, "gross profit / total assets", "gross profit / sales"
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Profit on sales: sales-based vs assets-based derivation, 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df, "profit on sales / total assets", "profit on sales / sales"
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Net profit: sales-based vs assets-based derivation, 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df, "net profit / total assets", "net profit / sales"
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# EBITDA: sales-based vs assets-based derivation, 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df,
    "EBITDA (profit on operating activities - depreciation) / total assets",
    "EBITDA (profit on operating activities - depreciation) / sales",
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
# Gross profit + interest: sales-based vs assets-based derivation,
# 60 largest relative differences.
check_equal_values_derived_from_sales_and_total_assets(
    df, "(gross profit + interest) / total assets", "(gross profit + interest) / sales"
).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
def check_log_total_assets(df):
    """Compare ``"logarithm of total assets"`` with log10 of the derived total assets.

    Total assets are derived as
    ``(1 / "working capital / total assets") * "working capital"``.

    Returns a frame with ``TOTAL_ASSETS``, the reported logarithm,
    ``LOG_TOTAL_ASSETS`` (base-10 log of the derived value), ``DIFF``
    (reported minus derived) and ``DIFF_RATIO`` (``|DIFF / reported|``).
    """
    reported_col = "logarithm of total assets"

    total_assets = (1 / df["working capital / total assets"]) * df["working capital"]
    derived_log = np.log10(total_assets)
    difference = df[reported_col] - derived_log
    relative_difference = abs(difference / df[reported_col])

    enriched = df.assign(
        TOTAL_ASSETS=total_assets,
        LOG_TOTAL_ASSETS=derived_log,
        DIFF=difference,
        DIFF_RATIO=relative_difference,
    )
    report_columns = [
        "TOTAL_ASSETS",
        reported_col,
        "LOG_TOTAL_ASSETS",
        "DIFF",
        "DIFF_RATIO",
    ]
    return enriched[report_columns]

In [None]:
# Rows where the reported logarithm deviates most from log10 of the derived total assets.
check_log_total_assets(df).sort_values("DIFF_RATIO", ascending=False)[:60]

In [None]:
def check_long_and_short_term_liabilities_equate_total_liabilities(df):
    """Compare total liabilities with short-term plus long-term liabilities.

    All three figures are derived from ratio columns via the derived total
    assets (and, for long-term liabilities, the derived equity).

    Returns a frame with ``SHORT_TERM_LIABILITIES``, ``LONG_TERM_LIABILITIES``,
    ``TOTAL_LIABILITIES``, ``CALCULATED_TOTAL_LIABILITIES`` (short + long),
    ``DIFF`` (total minus calculated) and ``DIFF_RATIO`` (``|DIFF / total|``).
    """
    total_assets = df["working capital"] / df["working capital / total assets"]
    equity = total_assets * df["equity / total assets"]
    short_term = total_assets * df["short-term liabilities / total assets"]
    long_term = equity * df["long-term liabilities / equity"]
    total_liabilities = total_assets * df["total liabilities / total assets"]
    summed_liabilities = short_term + long_term
    difference = total_liabilities - summed_liabilities
    relative_difference = abs(difference / total_liabilities)

    # TOTAL_ASSETS and EQUITY are assigned first so the slice below drops them.
    enriched = df.assign(
        TOTAL_ASSETS=total_assets,
        EQUITY=equity,
        SHORT_TERM_LIABILITIES=short_term,
        LONG_TERM_LIABILITIES=long_term,
        TOTAL_LIABILITIES=total_liabilities,
        CALCULATED_TOTAL_LIABILITIES=summed_liabilities,
        DIFF=difference,
        DIFF_RATIO=relative_difference,
    )
    return enriched.loc[:, "SHORT_TERM_LIABILITIES":]

In [None]:
# Rows where short-term + long-term liabilities deviate most from total liabilities.
check_long_and_short_term_liabilities_equate_total_liabilities(df).sort_values(
    "DIFF_RATIO", ascending=False
)[:60]