In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import set_config

from sklearn.pipeline import make_pipeline 
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

from df_after_transform import df_after_transform
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate, GridSearchCV

In [15]:
firm_performance = pd.read_csv('Input_data_clean/Firm_Performance.csv')
firm_data = pd.read_csv('input_data/Firm_data.csv')

In [3]:
firm_performance

Unnamed: 0,tic,fyear,roa,EBITDA_margin,NI_rev,roe,eps,chng_price,div_yield,vol,mva,tobinsQ,asset_g,revenue_g,ni_g,employee_g
0,AAL,2010,-0.018774,0.058773,-0.021245,0.119392,-1.412506,1.123689,0.000000,0.604973,6542.5755,1.260785,-0.013759,0.113119,-0.679155,-0.008238
1,AAL,2011,-0.082984,0.028932,-0.082383,0.278301,-5.902740,-0.523944,0.000000,0.642134,7228.3438,1.303101,-0.049426,0.083536,3.201699,0.023642
2,AAL,2012,-0.079796,0.057695,-0.075478,0.234882,-5.595123,1.498047,0.000000,0.572826,8253.5571,1.351066,-0.014173,0.034677,-0.052046,-0.029338
3,AAL,2013,-0.043380,0.110662,-0.068658,0.671549,-7.024963,0.772532,0.000000,0.362859,9322.9923,1.220516,0.798299,0.074713,-0.022388,0.419936
4,AAL,2014,0.065843,0.154396,0.067573,1.426027,4.132048,1.117107,0.003729,0.386805,35384.5843,1.808402,0.035314,0.596661,-2.571429,0.026268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4877,ALLE,2015,0.067343,0.214738,0.074416,6.011719,1.603275,0.217927,0.006068,0.203976,6302.1267,3.757680,0.133638,-0.023698,-0.121575,0.105882
4878,ALLE,2016,0.101940,0.218945,0.102368,2.022065,2.404643,-0.000967,0.007500,0.221979,5984.2360,3.662737,-0.016584,0.082153,0.488629,0.000000
4879,ALLE,2017,0.107514,0.235570,0.113487,0.680528,2.874966,0.239666,0.008044,0.173708,7161.5327,3.817283,0.131085,0.076050,0.192929,0.063830
4880,ALLE,2018,0.154758,0.226599,0.159205,0.668049,4.595454,-0.007339,0.010538,0.227619,6892.5153,3.452678,0.105507,0.134333,0.591292,0.100000


In [4]:
set_config(display="diagram")

firm_performance = firm_performance.dropna(subset=['tobinsQ'])
# load data and split off X and y
y = firm_performance.tobinsQ
firm_performance = firm_performance.drop('tobinsQ',axis=1)

performance_score = firm_performance

firm_performance = firm_performance.drop('fyear',axis=1)

In [5]:
# create test set for use later
rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(firm_performance, y, random_state=rng)

In [6]:
numer_pipe = make_pipeline(SimpleImputer(), 
                           StandardScaler())

preproc_pipe = ColumnTransformer(
    [ 
    # numerical vars
    ("num_impute", numer_pipe, make_column_selector(dtype_include=np.number)),
    ]
    , remainder = 'drop'
)

In [7]:
preproc_pipe

In [8]:
linear_pipe = make_pipeline(preproc_pipe,
                           LinearRegression())

results = linear_pipe.fit(X_train, y_train)

In [9]:
coefficients = linear_pipe.named_steps['linearregression'].coef_
coefficients

array([ 0.88051688,  0.01133028, -0.41044578, -0.06893226, -0.13392542,
        0.30846905, -0.20893141,  0.20238912,  0.18325009, -0.04538935,
        0.11856892,  0.01630705,  0.17562288])

In [10]:
coef_df = pd.DataFrame({'metric':X_train.columns[1:],
                        'weight':coefficients})
coef_df

Unnamed: 0,metric,weight
0,roa,0.880517
1,EBITDA_margin,0.01133
2,NI_rev,-0.410446
3,roe,-0.068932
4,eps,-0.133925
5,chng_price,0.308469
6,div_yield,-0.208931
7,vol,0.202389
8,mva,0.18325
9,asset_g,-0.045389


In [12]:
weights = np.abs(coefficients) / np.sum(np.abs(coefficients))

In [13]:
weight_df = pd.DataFrame({'metric':X_train.columns[1:],
                        'weight':weights})
weight_df

Unnamed: 0,metric,weight
0,roa,0.318557
1,EBITDA_margin,0.004099
2,NI_rev,0.148493
3,roe,0.024939
4,eps,0.048452
5,chng_price,0.111599
6,div_yield,0.075588
7,vol,0.073221
8,mva,0.066297
9,asset_g,0.016421


In [17]:
weight_dict = weight_df.set_index('metric').T.to_dict('list')

In [21]:
scaler = StandardScaler()
performance_score = pd.DataFrame(scaler.fit_transform(performance_score.iloc[:, 2:]), columns=performance_score.iloc[:, 2:].columns)

In [22]:
performance_score

Unnamed: 0,roa,EBITDA_margin,NI_rev,roe,eps,chng_price,div_yield,vol,mva,tobinsQ,asset_g,revenue_g,ni_g,employee_g
0,-1.064598,-0.779009,-0.588044,0.002450,-0.003444,3.046857,-0.655657,3.428324,-0.319913,-0.606562,-0.372228,0.054612,-0.069031,-0.369472
1,-1.900607,-0.909013,-0.870503,0.058396,-0.003869,-2.309650,-0.655657,3.796819,-0.307645,-0.582438,-0.472503,-0.032340,0.318936,-0.195798
2,-1.859098,-0.783708,-0.838602,0.043110,-0.003840,4.263905,-0.655657,3.109552,-0.289305,-0.555093,-0.373393,-0.175950,-0.006339,-0.484418
3,-1.384961,-0.552956,-0.807096,0.196843,-0.003976,1.905234,-0.655657,1.027504,-0.270173,-0.629519,1.910785,-0.058273,-0.003374,1.963075
4,0.037104,-0.362427,-0.177700,0.462465,-0.002919,3.025456,-0.523556,1.264954,0.196055,-0.294367,-0.234265,1.475852,-0.258201,-0.181493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4877,0.056644,-0.099547,-0.146086,2.076907,-0.003158,0.102196,-0.440713,-0.547998,-0.324215,0.816907,0.042161,-0.347527,-0.013290,0.252218
4878,0.507089,-0.081218,-0.016946,0.672307,-0.003082,-0.609436,-0.389986,-0.369481,-0.329902,0.762780,-0.380172,-0.036407,0.047712,-0.324593
4879,0.579659,-0.008793,0.034424,0.200004,-0.003038,0.172867,-0.370708,-0.848140,-0.308841,0.850886,0.034984,-0.054344,0.018151,0.023130
4880,1.194771,-0.047876,0.245642,0.195611,-0.002875,-0.630153,-0.282365,-0.313556,-0.313653,0.643026,-0.036924,0.116962,0.057975,0.220173


In [23]:
for metric in weight_dict:
    performance_score[metric] = performance_score[metric] * weight_dict[metric]

In [25]:
performance_score['Performance Score'] = performance_score.sum(axis=1)

In [36]:
performance_score

Unnamed: 0,roa,EBITDA_margin,NI_rev,roe,eps,chng_price,div_yield,vol,mva,tobinsQ,asset_g,revenue_g,ni_g,employee_g,Performance Score
0,-0.339135,-0.003193,-0.087320,0.000061,-0.000167,0.340027,-0.049560,0.251026,-0.021209,-0.606562,-0.006112,0.002343,-0.000407,-0.023475,-0.543685
1,-0.605452,-0.003726,-0.129264,0.001456,-0.000187,-0.257755,-0.049560,0.278008,-0.020396,-0.582438,-0.007759,-0.001387,0.001882,-0.012441,-1.389019
2,-0.592229,-0.003213,-0.124526,0.001075,-0.000186,0.475849,-0.049560,0.227685,-0.019180,-0.555093,-0.006132,-0.007548,-0.000037,-0.030779,-0.683873
3,-0.441189,-0.002267,-0.119848,0.004909,-0.000193,0.212623,-0.049560,0.075235,-0.017912,-0.629519,0.031377,-0.002500,-0.000020,0.124729,-0.814133
4,0.011820,-0.001486,-0.026387,0.011533,-0.000141,0.337639,-0.039575,0.092621,0.012998,-0.294367,-0.003847,0.063309,-0.001523,-0.011532,0.151062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4877,0.018044,-0.000408,-0.021693,0.051795,-0.000153,0.011405,-0.033313,-0.040125,-0.021494,0.816907,0.000692,-0.014908,-0.000078,0.016025,0.782697
4878,0.161537,-0.000333,-0.002516,0.016766,-0.000149,-0.068013,-0.029478,-0.027054,-0.021871,0.762780,-0.006243,-0.001562,0.000281,-0.020624,0.763522
4879,0.184654,-0.000036,0.005112,0.004988,-0.000147,0.019292,-0.028021,-0.062102,-0.020475,0.850886,0.000574,-0.002331,0.000107,0.001470,0.953971
4880,0.380603,-0.000196,0.036476,0.004878,-0.000139,-0.070325,-0.021343,-0.022959,-0.020794,0.643026,-0.000606,0.005017,0.000342,0.013989,0.947969


In [33]:
scores_df = firm_performance[['tic', 'fyear']].copy()

In [34]:
scores_df['Performance Score'] = performance_score['Performance Score']

In [35]:
scores_df

Unnamed: 0,tic,fyear,Performance Score
0,AAL,2010,-0.543685
1,AAL,2011,-1.389019
2,AAL,2012,-0.683873
3,AAL,2013,-0.814133
4,AAL,2014,0.151062
...,...,...,...
4877,ALLE,2015,0.782697
4878,ALLE,2016,0.763522
4879,ALLE,2017,0.953971
4880,ALLE,2018,0.947969
