In [None]:
import pandas as pd

pd.set_option("display.max_columns", None)
from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np

import plotly.figure_factory as ff
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

%load_ext nb_black

In [None]:
# Load the training data, using the `company_id` column as the row index.
df = pd.read_csv("../data/input/train.csv", index_col="company_id")

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
def compare_col(df, col1, col2, atol, rtol=0, sample=True, n_samples=10):
    """Report and return the rows where two columns are not numerically close.

    Closeness is decided by ``np.isclose(df[col1], df[col2], atol=atol, rtol=rtol)``.
    Rows where either value is NaN are never "close", so they are counted and
    returned as mismatches as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the columns to compare.
    atol : float
        Absolute tolerance passed to ``np.isclose``.
    rtol : float, default 0
        Relative tolerance passed to ``np.isclose``.
    sample : bool, default True
        If True, return a random sample of the mismatching rows instead of
        all of them.
    n_samples : int, default 10
        Maximum number of rows to return when ``sample`` is True.

    Returns
    -------
    pandas.DataFrame
        The mismatching rows (all of them, or at most ``n_samples`` random ones).
    """
    is_close = np.isclose(df[col1], df[col2], atol=atol, rtol=rtol)
    mismatched = df[~is_close]
    print(
        f"Number of rows which have absolute difference more than {atol}: {len(mismatched)}"
    )
    if not sample:
        return mismatched

    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so never request more than the number of mismatching rows.
    n_rows = min(n_samples, len(mismatched))
    return mismatched.sample(n_rows) if n_rows else mismatched

### calculate interest

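1. Use `(gross profit + interest) / total assets`: multiply by total assets, then subtract gross profit.
2. Use `(gross profit + interest) / sales`: multiply by sales, then subtract gross profit.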

- `interest_method_1` has only one missing value, but also only one non-zero value.
- `interest_method_2` has 22 missing values but 5914 non-zero values.

In [None]:
def calculate_interest(df):
    """Estimate interest from the ratio columns in two different ways.

    Total assets are rebuilt as ``10 ** df["logarithm of total assets"]`` and
    gross profit as ``gross profit / total assets`` times total assets.

    * ``interest_method_1``: ``(gross profit + interest) / total assets`` times
      total assets, minus gross profit.
    * ``interest_method_2``: ``(gross profit + interest) / sales`` times sales,
      minus gross profit. Sales is gross profit divided by
      ``gross profit / sales``; where that is missing it falls back to
      ``sales / total assets`` times total assets.

    Returns a new DataFrame on the same index holding only the two
    ``interest_method_*`` columns; the input frame is not modified.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    gross_profit = df["gross profit / total assets"] * total_assets

    # Two reconstructions of sales; the profit-based one takes precedence and
    # the asset-based one only fills its missing entries.
    sales_from_assets = df["sales / total assets"] * total_assets
    sales_from_profit = gross_profit / df["gross profit / sales"]
    sales = sales_from_profit.combine_first(sales_from_assets)

    interest_via_assets = (
        df["(gross profit + interest) / total assets"] * total_assets
    ) - gross_profit
    interest_via_sales = (
        df["(gross profit + interest) / sales"] * sales
    ) - gross_profit

    result = df.assign(
        interest_method_1=interest_via_assets,
        interest_method_2=interest_via_sales,
    )
    return result[["interest_method_1", "interest_method_2"]]

In [None]:
# Compute both interest estimates for every row, then preview 10 random rows.
df_with_interest = calculate_interest(df)
df_with_interest.sample(10)

In [None]:
# Exclude very small numbers which are close to zero by rounding them.
# NaN values also pass the `!= 0` comparison, so rows with a missing
# interest_method_2 are removed separately with dropna.
df_with_interest[df_with_interest.interest_method_2.round() != 0].dropna(
    subset=["interest_method_2"]
)

### Compare interest_method_2 with financial expenses

`financial_expenses` has 1310 missing values but only 3 zeros.

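There are 5374 rows where the two values differ by more than 100, so interest and financial expenses might represent different accounting values. Note that this count also includes rows where either value is missing, because `np.isclose` never treats NaN as close.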

In [None]:
def calculate_financial_expenses(df):
    """Derive financial expenses from the ratio columns.

    Total assets are rebuilt as ``10 ** df["logarithm of total assets"]``.
    Profit on operating activities is the
    ``profit on operating activities / total assets`` ratio times total assets.
    Dividing that profit by the
    ``profit on operating activities / financial expenses`` ratio gives the
    financial expenses.

    Returns a new single-column DataFrame (``financial_expenses``) on the same
    index; the input frame is not modified.
    """
    total_assets = 10 ** df["logarithm of total assets"]
    operating_profit = (
        df["profit on operating activities / total assets"] * total_assets
    )
    expenses = (
        operating_profit / df["profit on operating activities / financial expenses"]
    )
    return df.assign(financial_expenses=expenses)[["financial_expenses"]]

In [None]:
# Compute financial expenses for every row, then preview 10 random rows.
df_with_financial_expenses = calculate_financial_expenses(df)
df_with_financial_expenses.sample(10)

In [None]:
# Join the two derived frames on their index and list rows where
# financial_expenses and interest_method_2 are not within 100 of each other
# (rows with a NaN in either column are reported as well).
joined_df = df_with_interest.join(df_with_financial_expenses)
compare_col(
    joined_df, "financial_expenses", "interest_method_2", atol=100,
)