In [None]:
import pandas as pd

pd.set_option("display.max_columns", None)
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np

import plotly.figure_factory as ff
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext nb_black

In [None]:
# Load the training set; `company_id` becomes the row index.
df = pd.read_csv("../data/train.csv", index_col="company_id")

In [None]:
# Summarise the columns (dtype and non-null count) to spot missing values.
df.info()

### calculate current assets

1. use current assets / total liabilities (19 missing and 0 zero)
2. use current assets / short-term liabilities (43 missing and 0 zero)

Excluding missing values, both ways of calculating current assets give the same answer (within an absolute tolerance of 100).

However, method 1 gives us fewer missing values, so we will choose method 1.

In [None]:
def calculate_current_assets(df):
    """Rebuild absolute balance-sheet figures and derive current assets two ways.

    Total assets are recovered as ``10 ** df["logarithm of total assets"]``.
    Total and short-term liabilities are their ratios to total assets
    multiplied back by total assets. Current assets are then computed from
    ``"current assets / total liabilities"`` (method 1) and from
    ``"current assets / short-term liabilities"`` (method 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns named above and ``"bankruptcy_label"``.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df`` with the intermediate figures, both
        current-asset estimates and ``"bankruptcy_label"``. ``df`` itself is
        left untouched.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    total_liabilities = df["total liabilities / total assets"] * total_assets
    short_term_liabilities = (
        df["short-term liabilities / total assets"] * total_assets
    )
    via_total_liabilities = (
        df["current assets / total liabilities"] * total_liabilities
    )
    via_short_term_liabilities = (
        df["current assets / short-term liabilities"] * short_term_liabilities
    )

    return pd.DataFrame(
        {
            "total_assets": total_assets,
            "total_liabilities": total_liabilities,
            "short_term_liabilities": short_term_liabilities,
            "current_assets_method_1": via_total_liabilities,
            "current_assets_method_2": via_short_term_liabilities,
            "bankruptcy_label": df["bankruptcy_label"],
        },
        index=df.index,
    )

In [None]:
# Derive both current-asset estimates and preview 10 random rows.
preprocessed_df = calculate_current_assets(df)
preprocessed_df.sample(10)

In [None]:
def compare_col(df, col1, col2, atol, rtol=0, sample=True):
    """Report and return the rows of ``df`` where two columns disagree.

    Two values agree when ``np.isclose`` accepts them under ``atol``/``rtol``.
    A row in which either value is NaN is always reported as a mismatch,
    because ``np.isclose`` does not treat NaN as equal by default.

    Parameters
    ----------
    df : pandas.DataFrame
    col1, col2 : column label, or a list of labels
        When lists are given, a row matches only if every pair of values in
        that row is close.
    atol : float
        Absolute tolerance forwarded to ``np.isclose``.
    rtol : float, default 0
        Relative tolerance forwarded to ``np.isclose``.
    sample : bool, default True
        If True, return a random sample of at most 10 mismatching rows;
        otherwise return all of them.

    Returns
    -------
    pandas.DataFrame
        The mismatching rows (all columns of ``df``).
    """
    close = np.asarray(np.isclose(df[col1], df[col2], atol=atol, rtol=rtol))
    # List-style column arguments yield an (n, k) array; reduce it to one
    # boolean per row so the row filter below is a plain 1-D mask.
    if close.ndim > 1:
        close = close.all(axis=1)

    mismatched = df[~close]
    print(
        f"Number of rows which have absolute difference more than {atol}: {len(mismatched)}"
    )
    # DataFrame.sample(10) raises when fewer than 10 rows are available, so
    # only sample when there is something to cut down.
    if sample and len(mismatched) >= 10:
        return mismatched.sample(10)
    return mismatched


# Request every mismatching row (sample=False) and only then drop the rows
# where either estimate is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "current_assets_method_1",
    "current_assets_method_2",
    atol=100,
    sample=False,
).dropna(subset=["current_assets_method_1", "current_assets_method_2"]).head(10)

### calculate inventory

1. use (inventory * 365) / sales (21 missing and 599 zeros)
2. use (inventory * 365) / cost of products sold (73 missing and 603 zeros)
3. use sales / inventory (614 missing and 6 zeros)
4. use net profit / inventory (613 missing and 44 zeros)
5. use (current assets - inventory) / short-term liabilities (43 missing and 0 zero)

Compare method 1 and method 3 (same total number of missing values and zeros).

Excluding missing values, both ways of calculating inventory give the same answer (within an absolute tolerance of 100).

However, we will use method 1 because it preserves the original missing and zeros.

In [None]:
def calculate_inventory(df):
    """Derive absolute inventory from two different ratio columns.

    Sales are reconstructed first: gross profit (``"gross profit / total
    assets"`` times total assets) divided by ``"gross profit / sales"``,
    falling back to ``"sales / total assets"`` times total assets wherever
    that value is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"logarithm of total assets"`` and the ratio columns
        referenced above and below.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with two columns:
        ``"inventory_method_1"`` = ``"(inventory * 365) / sales"`` / 365 * sales
        and ``"inventory_method_3"`` = sales / ``"sales / inventory"``.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    sales_from_turnover = df["sales / total assets"] * total_assets
    gross_profit = df["gross profit / total assets"] * total_assets
    sales_from_margin = gross_profit / df["gross profit / sales"]
    # Prefer the margin-based figure; use the turnover-based one where it is NaN.
    sales = sales_from_margin.combine_first(sales_from_turnover)

    days_based = (df["(inventory * 365) / sales"] / 365) * sales
    turnover_based = sales / df["sales / inventory"]

    return pd.DataFrame(
        {"inventory_method_1": days_based, "inventory_method_3": turnover_based},
        index=df.index,
    )

In [None]:
preprocessed_df = calculate_inventory(df)
# Only a cell's last expression is rendered automatically, so the random
# preview has to be displayed explicitly to be visible.
display(preprocessed_df.sample(10))
# Rows for which method 3 yields no inventory value.
preprocessed_df[preprocessed_df["inventory_method_3"].isna()]

In [None]:
# Request every mismatching row (sample=False) and only then drop the rows
# where either estimate is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "inventory_method_1",
    "inventory_method_3",
    atol=100,
    sample=False,
).dropna(subset=["inventory_method_1", "inventory_method_3"]).head(10)

### calculate receivables

1. use (receivables * 365) / sales (21 missing and 31 zeros)
2. use sales / receivables (32 missing and 20 zeros)
3. use (current assets - inventory - receivables) / short-term liabilities (43 missing and 8 zeros)

Compare method 1 and 2 (same total number of missing values and zeros)

Excluding missing values, both ways of calculating receivables give the same answer (within an absolute tolerance of 100).

However, we will use method 1 because it preserves the original missing and zeros.

In [None]:
def calculate_receivables(df):
    """Derive absolute receivables from two different ratio columns.

    Sales are reconstructed first: gross profit (``"gross profit / total
    assets"`` times total assets) divided by ``"gross profit / sales"``,
    falling back to ``"sales / total assets"`` times total assets wherever
    that value is NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"logarithm of total assets"`` and the ratio columns
        referenced above and below.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with two columns:
        ``"receivables_method_1"`` = ``"(receivables * 365) / sales"`` / 365 * sales
        and ``"receivables_method_2"`` = sales / ``"sales / receivables"``.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    sales_from_turnover = df["sales / total assets"] * total_assets
    gross_profit = df["gross profit / total assets"] * total_assets
    sales_from_margin = gross_profit / df["gross profit / sales"]
    # Prefer the margin-based figure; use the turnover-based one where it is NaN.
    sales = sales_from_margin.combine_first(sales_from_turnover)

    days_based = (df["(receivables * 365) / sales"] / 365) * sales
    turnover_based = sales / df["sales / receivables"]

    return pd.DataFrame(
        {
            "receivables_method_1": days_based,
            "receivables_method_2": turnover_based,
        },
        index=df.index,
    )

In [None]:
# Derive both receivables estimates and preview 10 random rows.
preprocessed_df = calculate_receivables(df)
preprocessed_df.sample(10)

In [None]:
# Request every mismatching row (sample=False) and only then drop the rows
# where either estimate is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "receivables_method_1",
    "receivables_method_2",
    atol=100,
    sample=False,
).dropna(subset=["receivables_method_1", "receivables_method_2"]).head(10)

### calculate fixed assets

1. use working capital / fixed assets (231 missing and 0 zero)
2. use equity / fixed assets (231 missing and 1 zero) 
3. use constant capital / fixed assets (231 missing and 1 zero)
4. use sales / fixed assets (231 missing and 19 zero)

Compare method 1, 2 and 3.

Method 2 and 3 almost always reconcile but method 1 deviates quite a lot sometimes. This further confirms that working capital might not be correct.

We will use method 2.

In [None]:
def calculate_fixed_assets(df):
    """Derive absolute fixed assets from three different ratio columns.

    Equity and constant capital are first rebuilt as their ratio to total
    assets times total assets (``10 ** df["logarithm of total assets"]``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"working capital"``, ``"bankruptcy_label"`` and the
        ratio columns referenced below.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with:
        ``"fixed_assets_method_1"`` = working capital / ``"working capital / fixed assets"``,
        ``"fixed_assets_method_2"`` = equity / ``"equity / fixed assets"``,
        ``"fixed_assets_method_3"`` = constant capital / ``"constant capital / fixed assets"``,
        plus ``"bankruptcy_label"``.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    equity = df["equity / total assets"] * total_assets
    constant_capital = df["constant capital / total assets"] * total_assets

    from_working_capital = (
        df["working capital"] / df["working capital / fixed assets"]
    )
    from_equity = equity / df["equity / fixed assets"]
    from_constant_capital = (
        constant_capital / df["constant capital / fixed assets"]
    )

    return pd.DataFrame(
        {
            "fixed_assets_method_1": from_working_capital,
            "fixed_assets_method_2": from_equity,
            "fixed_assets_method_3": from_constant_capital,
            "bankruptcy_label": df["bankruptcy_label"],
        },
        index=df.index,
    )

In [None]:
# Derive the three fixed-asset estimates and preview 10 random rows.
preprocessed_df = calculate_fixed_assets(df)
preprocessed_df.sample(10)

In [None]:
# Request every mismatching row (sample=False) and only then drop the rows
# where either estimate is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "fixed_assets_method_1",
    "fixed_assets_method_2",
    atol=100,
    sample=False,
).dropna(subset=["fixed_assets_method_1", "fixed_assets_method_2"]).head(10)

In [None]:
# Request every mismatching row (sample=False) and only then drop the rows
# where either estimate is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "fixed_assets_method_3",
    "fixed_assets_method_2",
    atol=100,
    sample=False,
).dropna(subset=["fixed_assets_method_3", "fixed_assets_method_2"]).head(10)

### calculate working capital

1. calculate working capital using current assets - short-term liabilities
2. use the original working capital column

213 rows have working capital differences of more than 100.

Use method 1 and fall back to method 2 where method 1 is missing (i.e. `combine_first`), since the original working capital column might be wrong according to the other reconciliation methods.

In [None]:
def calculate_working_capital(df):
    """Recompute working capital as current assets minus short-term liabilities.

    Total assets are ``10 ** df["logarithm of total assets"]``; total and
    short-term liabilities are their ratios to total assets multiplied back
    by total assets; current assets are
    ``"current assets / total liabilities"`` times total liabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns above plus ``"working capital"`` and
        ``"bankruptcy_label"``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with ``"calculated_working_capital"``, the
        original ``"working capital"`` column and ``"bankruptcy_label"``.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    total_liabilities = df["total liabilities / total assets"] * total_assets
    short_term_liabilities = (
        df["short-term liabilities / total assets"] * total_assets
    )
    current_assets = (
        df["current assets / total liabilities"] * total_liabilities
    )

    return pd.DataFrame(
        {
            "calculated_working_capital": current_assets - short_term_liabilities,
            "working capital": df["working capital"],
            "bankruptcy_label": df["bankruptcy_label"],
        },
        index=df.index,
    )

In [None]:
# Recompute working capital next to the original column and preview 10 random rows.
preprocessed_df = calculate_working_capital(df)
preprocessed_df.sample(10)

In [None]:
# Request every mismatching row (sample=False) and only then drop the rows
# where either value is missing: sampling first lets NaN rows crowd out the
# real discrepancies and makes the number of displayed rows random.
# Note: the count printed by compare_col still includes the rows with NaN.
compare_col(
    preprocessed_df,
    "calculated_working_capital",
    "working capital",
    atol=100,
    sample=False,
).dropna(subset=["calculated_working_capital", "working capital"]).head(10)

In [None]:
# Rows for which the recomputed working capital is NaN.
preprocessed_df[preprocessed_df["calculated_working_capital"].isna()]