In [1]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import patsy 

In [4]:
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
)
from sklearn.metrics import (
    auc,
    brier_score_loss,
    confusion_matrix,
    mean_squared_error,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler



In [5]:
data = pd.read_csv('data/cs_bisnode_panel.csv')
data.shape

(287829, 48)

We are trying to categorize firms as fast growing or not. My criterion for fast growth is the change on return on sales.
I decided that if a firm's income to sales ratio is above median, those firms are labelled as fast growing. The justification
for the median value as threshold is that we have a balanced data set to train our models. Now that we know our criterion is the income to sales ratio change from 2014 to 2015, let's manipulate our data accordingly.

Filter the data from 2014 to 2015

In [6]:
data = data.query("year in (2014,2015)")
data.shape

(55657, 48)

Get firms that are operating

In [7]:
data["status_alive"] = (data["sales"] > 0 & (False == data["sales"].isna())).astype(int)

In [8]:
data = data.query('status_alive == 1')
data.shape

(44386, 49)

In [9]:
data = data[data.duplicated(subset=['comp_id'],keep = False)]
data.shape

(39770, 49)

In [10]:
data[['curr_assets','curr_liab','share_eq','liq_assets','inc_bef_tax','sales','fixed_assets']].isnull().sum()

curr_assets     15
curr_liab       15
share_eq        15
liq_assets      15
inc_bef_tax      0
sales            0
fixed_assets    15
dtype: int64

In [11]:
data.dropna(subset = ['curr_assets','curr_liab','share_eq','liq_assets','inc_bef_tax','sales','fixed_assets',
                     'inventories','tang_assets','intang_assets'],
           inplace = True)

In [12]:
data = data.replace(0,1)

## FEATURE ENGINEERING

In [13]:
data['current_ratio'] = (data['curr_assets'] / data['curr_liab'])

In [14]:
data['debt_to_equity'] = (data['curr_liab'] / data['share_eq'])

In [15]:
data['quick_ratio'] = (data['liq_assets'] / data['curr_liab'])

In [16]:
data['return_on_sales'] = (data['inc_bef_tax'] / data['sales'])

In [17]:
data['fixed_asset_turnover'] = (data['fixed_assets'] / data['sales'])

In [18]:
df =pd.pivot_table(data[['comp_id','year','inventories','tang_assets','intang_assets',
                     'current_ratio','debt_to_equity','quick_ratio','return_on_sales','fixed_asset_turnover']],
              index = 'comp_id', columns = 'year')

In [19]:
df['inventory_c'] = ((df['inventories'][2015] - df['inventories'][2014]) / (df['inventories'][2014])) * 100

In [20]:
df['tang_assets_c'] = ((df['tang_assets'][2015] - df['tang_assets'][2014]) / (df['tang_assets'][2014])) * 100

In [21]:
df['intang_assets_c'] = ((df['intang_assets'][2015] - df['intang_assets'][2014]) / (df['intang_assets'][2014])) * 100

In [22]:
df['current_ratio_c'] = ((df['current_ratio'][2015] - df['current_ratio'][2014]) / (df['current_ratio'][2014])) * 100

In [23]:
df['debt_to_equity_c'] = ((df['debt_to_equity'][2015] - df['debt_to_equity'][2014]) / (df['debt_to_equity'][2014])) * 100

In [24]:
df['quick_ratio_c'] = ((df['quick_ratio'][2015] - df['quick_ratio'][2014]) / (df['quick_ratio'][2014])) * 100

In [25]:
df['return_on_sales_c'] = ((df['return_on_sales'][2015] - df['return_on_sales'][2014]) / (df['return_on_sales'][2014])) * 100

In [26]:
df['fixed_asset_turnover_c'] = ((df['fixed_asset_turnover'][2015] - df['fixed_asset_turnover'][2014]) / (df['fixed_asset_turnover'][2014])) * 100

## DEFINE TARGET VARIABLE

When we examine the distribution of income to sales we see that some firms are in loss margin. If those firms are making loss,
we cannot talk about a growth at all. Therefore, I will build my model considering firms that generate at least 10% income on 
sales.

Now we have income/sales for each firm. Let's assign the labels. 

In [27]:
df = df[['inventory_c','tang_assets_c','intang_assets_c','current_ratio_c','debt_to_equity_c',
        'quick_ratio_c','return_on_sales_c','fixed_asset_turnover_c']]

In [28]:
df['fast_growth'] = np.where(df['return_on_sales_c'] > df['return_on_sales_c'].quantile(.75),1,0)

Now that we have our classification variable. We can continue with model building.

In [29]:
df.dropna(inplace = True)
df.shape

(19872, 9)

In [30]:
variables = ['inventory_c','tang_assets_c','intang_assets_c','current_ratio_c',
             'debt_to_equity_c','quick_ratio_c','fixed_asset_turnover_c']

In [31]:
def default_cost_function(tn, fp, fn, tp):
    
    cost = fp*2 + fn
    
    return cost

## RANDOM FOREST

In [32]:
from sklearn.model_selection import train_test_split
# implementing train-test-split
X = df.drop('fast_growth',axis = 1)
y = df.fast_growth
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20230226)

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
max_features = ['auto', 'sqrt']
# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 50, cv =5, verbose=2, random_state=42, n_jobs = -1)
# Fit the model
rfc_random.fit(X_train, y_train)
# print results
print(rfc_random.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'n_estimators': 1000, 'max_features': 'auto', 'max_depth': 140}


In [46]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [47]:
rfc = RandomForestClassifier(n_estimators=1000, max_depth=140, max_features='auto')
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("=== Confusion Matrix ===")
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))

=== Confusion Matrix ===
[[2962    1]
 [   0 1012]]


=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2963
           1       1.00      1.00      1.00      1012

    accuracy                           1.00      3975
   macro avg       1.00      1.00      1.00      3975
weighted avg       1.00      1.00      1.00      3975



## LOGIT

In [48]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix,ConfusionMatrixDisplay

In [49]:
class LogisticRegressionWithThreshold(LogisticRegressionCV):
    def predict(self, X, threshold=None):
        if threshold == None: # If no threshold passed in, simply call the base class predict, effectively threshold=0.5
            return LogisticRegressionCV.predict(self, X)
        else:
            y_scores = LogisticRegressionCV.predict_proba(self, X)[:, 1]
            y_pred_with_threshold = (y_scores >= threshold).astype(int)

            return y_pred_with_threshold
        
    def threshold_from_cost_function(self, X, y, cost_function):
        y_scores = LogisticRegressionCV.predict_proba(self, X)[:, 1]
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        costs = []
        tns = []
        fps = []
        fns = []
        tps = []
        
        for threshold in thresholds:
            y_pred_with_threshold = (y_scores >= threshold).astype(int)
            tn, fp, fn, tp = confusion_matrix(y, y_pred_with_threshold).ravel()
            costs.append(cost_function(tn, fp, fn, tp))
            tns.append(tn), fps.append(fp), fns.append(fn), tps.append(tp)

        df_cost = pd.DataFrame({'precision':precisions[:-1], 'recall':recalls[:-1], 'threshold':thresholds, 'cost':costs, 'tn':tns, 'fp':fps, 'fn':fns, 'tp':tps})
        
        min_cost = df_cost['cost'].min()
        threshold = df_cost[df_cost['cost']==min_cost].iloc[0]['threshold']

        return threshold,min_cost,df_cost


In [50]:
lrt = LogisticRegressionWithThreshold(cv = 5)
lrt.fit(X_train, y_train)

threshold, min_cost, df_cost,cf= lrt.threshold_from_cost_function(X_train, y_train, default_cost_function)
y_pred = lrt.predict(X_train, threshold)

In [51]:
df_cost[df_cost['cost'] == df_cost['cost'].min()]

Unnamed: 0,precision,recall,threshold,cost,tn,fp,fn,tp
11877,1.0,0.994692,0.877853,21,11941,0,21,3935


## LOGIT-LASSO

In [41]:
logit_lasso = LogisticRegressionWithThreshold(penalty = 'l1',cv = 5,solver="liblinear")
logit_lasso.fit(X_train, y_train)

threshold, min_cost, df_cost,cf = logit_lasso.threshold_from_cost_function(X_train, y_train, default_cost_function)
y_pred = logit_lasso.predict(X_train, threshold)

In [42]:
df_cost[df_cost['cost'] == df_cost['cost'].min()]

Unnamed: 0,precision,recall,threshold,cost,tn,fp,fn,tp
12026,0.99974,0.973205,0.508404,108,11940,1,106,3850
