In [None]:
import pandas as pd

pd.set_option("display.max_columns", None)
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np

import plotly.figure_factory as ff
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext nb_black

In [None]:
# Load the training data, using the `company_id` column as the row index.
df = pd.read_csv("../data/train.csv", index_col="company_id")

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

## calculate long-term liabilities

1. Use total liabilities - short-term liabilities.
2. Use equity * (long-term liabilities / equity) (1 missing and 4444 zeros).

Long-term liabilities calculated using method 1 are usually larger than those calculated using method 2.
Could it be that short-term liabilities are just a subset of current liabilities?

We can test this by calculating total liabilities minus current liabilities (method 3) and comparing the result to method 2.

To get current liabilities, we need cost of products sold, so that we can invert the ratio (current liabilities * 365) / cost of products sold.

To get cost of products sold, we need sales, so that we can invert the ratio (sales - cost of products sold) / sales.

In [None]:
def calculate_long_term_liabilities(df):
    """Turn balance-sheet ratios into absolute amounts and estimate long-term liabilities.

    Total assets are recovered as ``10 ** df["logarithm of total assets"]``
    (i.e. the column is treated as a base-10 logarithm). Every
    ``... / total assets`` ratio is then multiplied by that figure.

    Long-term liabilities are estimated in two ways:

    * method 1: total liabilities minus short-term liabilities
    * method 2: the "long-term liabilities / equity" ratio times equity

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns read below plus ``bankruptcy_label``.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, restricted to the derived columns and
        ``bankruptcy_label``.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    total_liabilities = df["total liabilities / total assets"] * total_assets
    short_term_liabilities = (
        df["short-term liabilities / total assets"] * total_assets
    )
    equity = df["equity / total assets"] * total_assets

    enriched = df.assign(
        total_assets=total_assets,
        total_liabilities=total_liabilities,
        short_term_liabilities=short_term_liabilities,
        equity=equity,
        long_term_liabilities_method_1=total_liabilities - short_term_liabilities,
        long_term_liabilities_method_2=df["long-term liabilities / equity"] * equity,
    )

    output_columns = [
        "total_assets",
        "total_liabilities",
        "short_term_liabilities",
        "equity",
        "long_term_liabilities_method_1",
        "long_term_liabilities_method_2",
        "bankruptcy_label",
    ]
    return enriched[output_columns]

In [None]:
# Derive the absolute amounts and both long-term-liability estimates.
preprocessed_df = calculate_long_term_liabilities(df)
# Display ten randomly chosen rows for a quick visual check.
preprocessed_df.sample(10)

In [None]:
def compare_col(
    df, col1, col2, atol, rtol=0, sample=True, n_samples=10, random_state=None
):
    """Print how many rows differ between two columns and return those rows.

    Two values count as equal when ``np.isclose`` says so for the given
    ``atol``/``rtol``. ``equal_nan`` is left at its default, so a row with a
    NaN on either side is always reported as a mismatch.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    col1, col2 : column label or list of column labels
        Columns to compare. When lists are given, the columns are compared
        pairwise by position and a row is a mismatch if any pair differs.
    atol : float
        Absolute tolerance passed to ``np.isclose``.
    rtol : float, default 0
        Relative tolerance passed to ``np.isclose``.
    sample : bool, default True
        If True, return a random sample of the mismatching rows; otherwise
        return all of them.
    n_samples : int, default 10
        Upper bound on the number of sampled rows. Fewer rows are returned
        when fewer mismatches exist (previously this raised ``ValueError``).
    random_state : optional
        Forwarded to ``DataFrame.sample`` for a reproducible draw.

    Returns
    -------
    pandas.DataFrame
        The mismatching rows of ``df`` (all of them, or a sample).
    """
    close = np.asarray(np.isclose(df[col1], df[col2], atol=atol, rtol=rtol))
    if close.ndim > 1:
        # List-valued col1/col2 select DataFrames, so the comparison is 2-D;
        # collapse it to one flag per row before using it as a row mask.
        close = close.all(axis=1)

    mismatches = df[~close]
    print(
        f"Number of rows which have absolute difference more than {atol}: {len(mismatches)}"
    )
    if not sample:
        return mismatches
    # Never ask for more rows than exist: sampling without replacement would fail.
    return mismatches.sample(
        min(n_samples, len(mismatches)), random_state=random_state
    )


# Pass plain column names (not one-element lists) so the comparison runs on
# two Series and produces a one-dimensional row mask.
compare_col(
    preprocessed_df,
    "long_term_liabilities_method_1",
    "long_term_liabilities_method_2",
    atol=1000,
)

### The definition of short-term liabilities might differ depending on context

The dataset gives us three ways of calculating accounts payable days:

1. (short-term liabilities * 365) / cost of products sold)
2. (short-term liabilities *365) / sales
3. (current liabilities * 365) / cost of products sold

Let's define the short-term liabilities calculated from **short-term liabilities / total assets** as the **true** short-term liabilities.

Observation:
- The short-term liabilities implied by the first column are much smaller than the true short-term liabilities.
- The short-term liabilities implied by the second and third columns are equal to the true short-term liabilities.

Based on domain knowledge, **payable days = (payables * 365) / cost of products sold**, and payables are usually a subset of short-term liabilities.

Therefore, short-term liabilities in the first column might actually be payables

In [None]:
# The two "... * 365 / cost of products sold" ratio columns discussed above.
# The first name really does end in an unbalanced ")".
cols = [
    "(short-term liabilities * 365) / cost of products sold)",
    "(current liabilities * 365) / cost of products sold",
]

# Show both ratio columns side by side.
df[cols]

In [None]:
def calculate_current_liabilities(df):
    """Back out absolute current liabilities from the ratio columns.

    Steps:

    1. total assets = ``10 ** df["logarithm of total assets"]``
    2. sales, preferably as gross profit / "gross profit / sales", falling
       back to "sales / total assets" * total assets where that is missing
    3. cost of products sold = sales - "(sales - cost of products sold) / sales" * sales
    4. current liabilities =
       "(current liabilities * 365) / cost of products sold" / 365 * cost of products sold

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns read below.

    Returns
    -------
    pandas.DataFrame
        Single column ``current_liabilities`` on the index of ``df``.
    """
    total_assets = 10 ** df["logarithm of total assets"]

    sales_from_turnover = df["sales / total assets"] * total_assets
    gross_profit = df["gross profit / total assets"] * total_assets
    sales_from_margin = gross_profit / df["gross profit / sales"]
    # Margin-based sales take precedence; turnover-based sales fill the NaNs.
    sales = sales_from_margin.combine_first(sales_from_turnover)

    cost_of_products_sold = (
        df["(sales - cost of products sold) / sales"] * sales * -1
    ) + sales

    liability_days = df["(current liabilities * 365) / cost of products sold"]
    current_liabilities = (liability_days / 365) * cost_of_products_sold

    return df.assign(current_liabilities=current_liabilities)[
        ["current_liabilities"]
    ]

In [None]:
def calculate_short_term_liabilities_from_APD2(df):
    """Back out short-term liabilities from the sales-based days ratio.

    Sales are taken as gross profit / "gross profit / sales" where that is
    available and as "sales / total assets" * total assets otherwise, with
    total assets = ``10 ** df["logarithm of total assets"]``. The result is
    "(short-term liabilities *365) / sales" / 365 * sales.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns read below.

    Returns
    -------
    pandas.DataFrame
        Single column ``short_term_liabilities_from_APD2`` on the index of ``df``.
    """
    total_assets = 10 ** df["logarithm of total assets"]

    sales_from_turnover = df["sales / total assets"] * total_assets
    gross_profit = df["gross profit / total assets"] * total_assets
    sales_from_margin = gross_profit / df["gross profit / sales"]
    # Margin-based sales take precedence; turnover-based sales fill the NaNs.
    sales = sales_from_margin.combine_first(sales_from_turnover)

    liability_days = df["(short-term liabilities *365) / sales"]
    implied_liabilities = (liability_days / 365) * sales

    return df.assign(short_term_liabilities_from_APD2=implied_liabilities)[
        ["short_term_liabilities_from_APD2"]
    ]

In [None]:
def calculate_short_term_liabilities_from_APD1(df):
    """Back out the liabilities figure implied by the cost-based days ratio.

    NOTE(review): this notebook calls the helper but never defines it, so a
    fresh kernel failed with ``NameError``. The body is reconstructed by
    mirroring ``calculate_current_liabilities`` with the
    "(short-term liabilities * 365) / cost of products sold)" column swapped
    in -- confirm it matches the original implementation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ratio columns read below.

    Returns
    -------
    pandas.DataFrame
        Single column ``short_term_liabilities_from_APD1`` on the index of ``df``.
    """
    total_assets = 10 ** df["logarithm of total assets"]

    sales_from_turnover = df["sales / total assets"] * total_assets
    gross_profit = df["gross profit / total assets"] * total_assets
    sales_from_margin = gross_profit / df["gross profit / sales"]
    # Margin-based sales take precedence; turnover-based sales fill the NaNs.
    sales = sales_from_margin.combine_first(sales_from_turnover)

    cost_of_products_sold = (
        df["(sales - cost of products sold) / sales"] * sales * -1
    ) + sales

    # The trailing ")" is part of the dataset's column name.
    liability_days = df["(short-term liabilities * 365) / cost of products sold)"]
    implied_liabilities = (liability_days / 365) * cost_of_products_sold

    return df.assign(short_term_liabilities_from_APD1=implied_liabilities)[
        ["short_term_liabilities_from_APD1"]
    ]


df_with_short_term_liabilities_from_APD1 = calculate_short_term_liabilities_from_APD1(
    df
)
# This frame is joined below but was never assigned anywhere in the notebook.
df_with_current_liabilities = calculate_current_liabilities(df)
df_with_short_term_liabilities_from_APD2 = calculate_short_term_liabilities_from_APD2(
    df
)
# Line up the "true" short-term liabilities with the three implied figures.
short_term_liabilities_df = (
    df_with_short_term_liabilities_from_APD1.join(df_with_current_liabilities)
    .join(df_with_short_term_liabilities_from_APD2)
    .join(preprocessed_df)[
        [
            "short_term_liabilities",
            "short_term_liabilities_from_APD1",
            "short_term_liabilities_from_APD2",
            "current_liabilities",
        ]
    ]
)
short_term_liabilities_df.sample(10)

In [None]:
# Drop rows without a current-liabilities figure *before* comparing, so NaN
# rows are neither counted as mismatches nor use up the sample of ten.
# `sample=False` plus an explicit, size-capped sample keeps the cell working
# when fewer than ten real mismatches remain.
compare_col(
    short_term_liabilities_df.dropna(subset=["current_liabilities"]),
    "short_term_liabilities_from_APD2",
    "current_liabilities",
    atol=100,
    sample=False,
).pipe(lambda rows: rows.sample(min(10, len(rows))))

In [None]:
# Drop rows without a current-liabilities figure *before* comparing, so NaN
# rows are neither counted as mismatches nor use up the sample of ten.
# `sample=False` plus an explicit, size-capped sample keeps the cell working
# when fewer than ten real mismatches remain.
compare_col(
    short_term_liabilities_df.dropna(subset=["current_liabilities"]),
    "short_term_liabilities",
    "current_liabilities",
    atol=100,
    sample=False,
).pipe(lambda rows: rows.sample(min(10, len(rows))))