In [1]:
%%capture
import warnings
# Install a blanket 'ignore' filter so no warning is shown by later cells.
# NOTE(review): this also hides deprecation/FutureWarning messages from the
# libraries imported below -- consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import math

import seaborn as sns
import matplotlib.pyplot as plt

from py_helper_functions import *

from datetime import datetime

from patsy.highlevel import dmatrices

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.inspection import PartialDependenceDisplay
from sklearn.inspection import partial_dependence
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

In [60]:
def get_comp_default(df):
    '''
    Balance the firm-year panel and flag firm-years that are followed by a default.

    Every combination of the ``year`` and ``comp_id`` values found in ``df``
    gets a row; combinations that were missing from the input have NaN in all
    other columns. Two indicator columns are then added:

    * ``status_alive``: 1 if ``sales`` is present and strictly positive, else 0.
    * ``default``: 1 if the firm is alive in this row but not alive in the next
      row of the same ``comp_id`` (the next year present in the panel), else 0.
      The last year of each firm has no successor and is therefore 0.

    :param df: dataframe with at least ``year``, ``comp_id`` and ``sales``
        columns; each (year, comp_id) pair must occur at most once, because
        pandas cannot reindex duplicated keys
    :return: new dataframe ordered by year, then comp_id, with a fresh integer
        index and the two indicator columns appended
    '''
    # Add all missing year and comp_id combinations by reindexing onto the full
    # grid. Unlike a string-sentinel round trip through unstack/stack, this
    # keeps the original column dtypes and never touches genuine cell values.
    indexed = df.set_index(["year", "comp_id"])
    full_grid = pd.MultiIndex.from_product(
        indexed.index.levels, names=indexed.index.names
    )
    df = indexed.reindex(full_grid).reset_index()

    # Firm is alive if sales are not-NA and larger than zero. Both conditions
    # are parenthesised explicitly because `&` binds tighter than `>`.
    sales = df["sales"]
    df["status_alive"] = (sales.notna() & (sales > 0)).astype(int)

    # Defaults in one year if there are sales in this year but none one year
    # later. Rows are ordered by year within each comp_id, so shift(-1) yields
    # the following year's status (NaN for the firm's last year).
    status_next_year = df.groupby("comp_id")["status_alive"].shift(-1)
    df["default"] = (
        (df["status_alive"] == 1) & (status_next_year == 0)
    ).astype(int)

    return df

def get_cleaned_data(local=True) -> pd.DataFrame:
    '''
    Read the raw panel csv files, clean them and return the working sample.

    The two panel files are read via relative paths and stacked, a fixed list
    of columns is dropped, the rows with ``ind2 == 26`` are passed through
    ``get_comp_default``, and finally only rows with a non-missing ``begin``
    value are kept.

    :param local: default True; currently has no effect, the same file names
        are read for either value
    :return: dataframe with cleaned data
    '''
    # The file list does not depend on `local`.
    panel_files = ['cs_bisnode_panel1.csv', 'cs_bisnode_panel2.csv']

    # variables with many NAs
    sparse_columns = [
        "COGS",
        "finished_prod",
        "net_dom_sales",
        "net_exp_sales",
        "wages",
        "D",
        "exit_year",
        "exit_date",
    ]

    panel = pd.concat(
        [pd.read_csv(path) for path in panel_files], ignore_index=True
    )
    panel = panel.drop(columns=sparse_columns)

    balanced = get_comp_default(panel[panel.ind2 == 26])

    # Rows created purely by balancing have no `begin` value and are removed
    # here, after the default flag has been computed.
    has_begin = balanced["begin"].notna()
    return balanced[has_begin]

In [61]:
# Build the working sample and preview its first rows.
work_df = get_cleaned_data(local=True)
work_df.head(n=5)

Unnamed: 0,year,comp_id,begin,end,amort,curr_assets,curr_liab,extra_exp,extra_inc,extra_profit_loss,...,origin,nace_main,ind2,ind,urban_m,region_m,founded_date,labor_avg,status_alive,default
0,2005,6538183.0,01/01/2005,31/12/2005,792.59259,6237.037109,348.148163,0.0,0.0,0.0,...,Foreign,2630.0,26.0,2.0,2.0,East,25/08/1992,,1,0
1,2005,6934257.0,13/05/2005,31/12/2005,803.703674,4648.147949,9311.111328,0.0,0.0,0.0,...,,2660.0,26.0,2.0,3.0,East,26/04/2005,,1,1
2,2005,8416055.0,01/01/2005,31/12/2005,3155.555664,71070.36719,25514.81445,74.074074,0.0,-74.074074,...,Domestic,2651.0,26.0,2.0,1.0,Central,28/08/1995,,1,0
5,2005,12428378.0,01/01/2005,31/12/2005,3137.037109,1740.740723,2192.592529,0.0,0.0,0.0,...,Domestic,2660.0,26.0,2.0,2.0,Central,27/06/1991,,1,0
7,2005,15711554.0,01/01/2005,31/12/2005,1592.592651,4740.740723,4970.370605,0.0,7407.407227,7407.407227,...,Domestic,2630.0,26.0,2.0,1.0,Central,01/09/2001,,1,0


In [62]:
# Column dtypes and non-null counts of the working sample.
work_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14877 entries, 0 to 21947
Data columns (total 42 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  14877 non-null  int64  
 1   comp_id               14877 non-null  float64
 2   begin                 14877 non-null  object 
 3   end                   14877 non-null  object 
 4   amort                 14589 non-null  float64
 5   curr_assets           14872 non-null  float64
 6   curr_liab             14872 non-null  float64
 7   extra_exp             14007 non-null  float64
 8   extra_inc             14007 non-null  float64
 9   extra_profit_loss     14056 non-null  float64
 10  fixed_assets          14872 non-null  float64
 11  inc_bef_tax           14644 non-null  float64
 12  intang_assets         14872 non-null  float64
 13  inventories           14872 non-null  float64
 14  liq_assets            14872 non-null  float64
 15  material_exp          14

In [63]:
# Hold-out sample: ind2 == 26, year 2014, sales between 1,000 and 10,000,000
# (both bounds inclusive).
holdout_set = work_df.loc[
    (work_df["ind2"] == 26)
    & (work_df["year"] == 2014)
    & work_df["sales"].between(1000, 10_000_000)
]
holdout_set.shape

(1037, 42)

In [64]:
# Number of rows flagged as default in the hold-out sample.
holdout_set["default"].sum()

56

In [65]:
# Number of rows flagged as default in the full working sample.
work_df["default"].sum()

1369