In [184]:
# Data Manipulation libraries
import numpy as np
import pandas as pd

# Sci-Kit Learn Processing and Evaluating
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, QuantileTransformer


# Supervised Learning Models  
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso  
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor  
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.svm import SVR  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt

# Single seed for the whole notebook: pass it to anything stochastic
# (samplers, CV splitters, estimators) so re-runs give the same results.
random_state = 9

In [None]:
# File locations for the three tables compared in this notebook:
# the current feature file, the target file, and an older training file.
root_path = '../../datasets/'
X_train_file = 'X_train_filled_KPIs_QoQ_PCA.csv'
y_train_file = 'y_train.csv'
X_old = 'train_1003.csv'

# Read all three CSVs in one pass; the unpacking order matches the tuple order.
X, y, old = (
    pd.read_csv(f'{root_path}{file_name}')
    for file_name in (X_train_file, y_train_file, X_old)
)
print(X.shape, y.shape, old.shape)



(1910, 265) (1974, 19) (1450, 287)


In [186]:
# Preview the first rows of the feature table.
X.head()


Unnamed: 0,Ticker,Name,Sector,CapitalExpenditure_2024Q2,CapitalExpenditure_2024Q3,CapitalExpenditure_2024Q4,CapitalExpenditure_2025Q1,CashAndSTInvestments_2024Q2,CashAndSTInvestments_2024Q3,CashAndSTInvestments_2024Q4,...,KPI_TotalAssetTurnover_Rate,KPI_GrossProfitMargin_Rate,NetIncome_Rate,InterestExpense_Rate,CurrentAssets_Rate,LongTermDebt_Rate,TotalEquity_Rate,KPI_Leverage_Rate,Revenue_Rate,IncomeTaxExpense_Rate
0,ACIW,ACI WORLDWIDE INC,Information Technology,-3777000.0,-4045500.0,-4663500.0,-4112000.0,185108000.0,197075000.0,188364000.0,...,-0.003947,0.0151,-759200.0,34750.0,7415600.0,18436000.0,17312350.0,0.026378,3923100.0,-277950.0
1,HONE,HARBORONE BANCORP INC,Financials,-220000.0,-569000.0,-339000.0,-208000.0,235062000.0,224279000.0,231071000.0,...,-7e-05,0.0,-42500.0,-1234200.0,9490765.0,-89428400.0,-1327700.0,-0.026218,-290200.0,-30400.0
2,REPL,REPLIMUNE GROUP INC,Health Care,-1618000.0,-1114000.0,-2266000.0,-1503000.0,82785000.0,75247000.0,78303500.0,...,0.00764,-0.040655,2600350.0,32650.0,-4396650.0,-2281000.0,30492100.0,-0.164635,3422350.0,-8850.0
3,RBRK,RUBRIK INC CLASS A,Information Technology,-15766000.0,-4929000.0,-7527000.0,-8401000.0,429302000.0,142349000.0,103896000.0,...,0.069558,0.078404,-58974600.0,28700.0,-228753000.0,-334860250.0,-1030123000.0,-1.456054,-174352950.0,-7468150.0
4,CSL,CARLISLE COMPANIES INC,Industrials,-24900000.0,-19300000.0,-36600000.0,-29000000.0,1736300000.0,1530600000.0,753500000.0,...,-0.003726,-0.014521,-178880000.0,-1330000.0,-584660000.0,1010000.0,-281300000.0,0.084583,-127510000.0,-19280000.0


In [187]:
# Preview the first rows of the target table (one *_2025Q2 column per metric).
y.head()

Unnamed: 0,Ticker,CapitalExpenditure_2025Q2,CashAndSTInvestments_2025Q2,CashFromOps_2025Q2,CostOfRevenue_2025Q2,CurrentAssets_2025Q2,CurrentLiabilities_2025Q2,EPS_2025Q2,IncomeTaxExpense_2025Q2,InterestExpense_2025Q2,LongTermDebt_2025Q2,NetIncome_2025Q2,OperatingIncome_2025Q2,OtherOperatingExpense_2025Q2,Revenue_2025Q2,TotalAssets_2025Q2,TotalDebt_2025Q2,TotalEquity_2025Q2,TotalLiabilities_2025Q2
0,ACIW,,,,,,,,,,,,,,,,,,
1,HONE,-139000.0,203053000.0,-4987000.0,44301000.0,,,0.2,2569000.0,32385000.0,439652000.0,8058000.0,,32385000.0,44301000.0,5609075000.0,439652000.0,580147000.0,5028928000.0
2,REPL,,,,,,,,,,,,,,,,,,
3,RBRK,-6315000.0,283998000.0,39655000.0,60483000.0,1116895000.0,961289000.0,-0.53,1274000.0,9813000.0,322821000.0,-102104000.0,-93091000.0,311089000.0,278481000.0,1474606000.0,334571000.0,-556530000.0,2031136000.0
4,BRCC,,,,,,,,,,,,,,,,,,


In [188]:
# Attach the target columns to the feature rows by ticker. The join is an
# inner join (pandas' default, spelled out here), so tickers missing from
# either side are dropped.
new = pd.merge(X, y, how='inner', on='Ticker')
print(old.shape, new.shape)

(1450, 287) (1910, 283)


In [189]:
def _indexed_by_ticker(frame):
    """Return `frame` indexed by 'Ticker' and sorted on that index.

    Safe to call on a frame that is already indexed by 'Ticker', so this cell
    can be re-run; the original in-place `set_index` raised KeyError on a
    second run because the 'Ticker' column was already gone.
    """
    if frame.index.name != 'Ticker':
        frame = frame.set_index('Ticker')
    return frame.sort_index()


# Rebind instead of mutating in place so both tables share the same ticker
# index and ordering for the label-based comparisons below.
old = _indexed_by_ticker(old)
new = _indexed_by_ticker(new)



In [190]:
# First row of the older table (now indexed by Ticker) to eyeball its columns.
old.head(1)

Unnamed: 0_level_0,ID,Name,Sector,CapitalExpenditure_2024Q2,CapitalExpenditure_2024Q3,CapitalExpenditure_2024Q4,CapitalExpenditure_2025Q1,CashAndSTInvestments_2024Q2,CashAndSTInvestments_2024Q3,CashAndSTInvestments_2024Q4,...,InterestExpense_Rate,CashFromOps_Rate,TotalAssets_Rate,TotalLiabilities_Rate,OperatingIncome_Rate,TotalDebt_Rate,KPI_NetProfitMargin_Rate,NetIncome_2025Q2,Revenue_2025Q2,KPI_NetProfitMargin_2025Q2
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,262,AGILENT TECHNOLOGIES INC,Health Care,-39486000,-92000000.0,-93000000.0,-97000000,796798000.0,1779000000.0,1329000000.0,...,1984900.0,82214900.0,992170000.0,661200000.0,66848100.0,309299600.0,0.030873,215000000,1668000000.0,0.128897


In [191]:
# First row of the merged table (now indexed by Ticker) to eyeball its columns.
new.head(1)

Unnamed: 0_level_0,Name,Sector,CapitalExpenditure_2024Q2,CapitalExpenditure_2024Q3,CapitalExpenditure_2024Q4,CapitalExpenditure_2025Q1,CashAndSTInvestments_2024Q2,CashAndSTInvestments_2024Q3,CashAndSTInvestments_2024Q4,CashAndSTInvestments_2025Q1,...,InterestExpense_2025Q2,LongTermDebt_2025Q2,NetIncome_2025Q2,OperatingIncome_2025Q2,OtherOperatingExpense_2025Q2,Revenue_2025Q2,TotalAssets_2025Q2,TotalDebt_2025Q2,TotalEquity_2025Q2,TotalLiabilities_2025Q2
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,AGILENT TECHNOLOGIES INC,Health Care,-39486000.0,-92000000.0,-93000000.0,-97000000.0,796798000.0,1779000000.0,1329000000.0,1467000000.0,...,29000000.0,3349000000.0,215000000.0,300000000.0,566000000.0,1668000000.0,12158000000.0,3495000000.0,6136000000.0,6022000000.0


In [192]:
# Pull one random row as a sanity check. Seeded with the notebook-wide
# `random_state` so the same row comes back on every Restart & Run All.
old.sample(1, random_state=random_state)

Unnamed: 0_level_0,ID,Name,Sector,CapitalExpenditure_2024Q2,CapitalExpenditure_2024Q3,CapitalExpenditure_2024Q4,CapitalExpenditure_2025Q1,CashAndSTInvestments_2024Q2,CashAndSTInvestments_2024Q3,CashAndSTInvestments_2024Q4,...,InterestExpense_Rate,CashFromOps_Rate,TotalAssets_Rate,TotalLiabilities_Rate,OperatingIncome_Rate,TotalDebt_Rate,KPI_NetProfitMargin_Rate,NetIncome_2025Q2,Revenue_2025Q2,KPI_NetProfitMargin_2025Q2
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
IOT,631,SAMSARA INC CLASS A,Information Technology,-16051500,-4992000.0,-4776000.0,-5347000,436700000.0,159272000.0,160348000.0,...,617900.0,-30172700.0,-1394057000.0,-647180600.0,-49164600.0,-564424000.0,-0.043154,-22121000,366884000.0,-0.060294


In [193]:
# Spot-check a single ticker: read the same KPI cell from both tables and
# print the two values side by side.
compare1, compare2 = (
    frame.loc['SSNC', 'KPI_NetProfitMargin_Rate'] for frame in (old, new)
)
print(compare1, compare2, sep=' / ')

0.007886824 / 0.0078868240160804


In [194]:
# Largest per-column count of missing values in the older table.
old.isna().sum().max()

0

In [195]:
# Largest per-column count of missing values in the merged table.
# NOTE(review): a blanket `new.dropna(inplace=True)` used to sit here commented
# out; rows are filtered on specific target columns in a later cell instead.
new.isna().sum().max()

691

In [196]:
# Shapes again now that 'Ticker' is the index rather than a column.
print(old.shape,new.shape)

(1450, 286) (1910, 282)


In [197]:
# Distinct market-cap categories present in the older table.
old['Market Cap'].unique()

array(['Mid-Cap', 'Small-Cap', 'Mega-Cap', 'Large-Cap', 'Micro-Cap'],
      dtype=object)

In [198]:
# Distinct market-cap categories present in the merged table.
new['Market Cap'].unique()

array(['Mid-Cap', 'Small-Cap', 'Mega-Cap', 'Large-Cap', 'Micro-Cap'],
      dtype=object)

In [199]:
# Number of distinct company names in the merged table. dropna=False keeps a
# missing name counted as its own value, as `len(.unique())` does.
new['Name'].nunique(dropna=False)

1909

In [200]:
# Count the Mega-Cap rows in the older table by summing the boolean mask.
int((old['Market Cap'] == 'Mega-Cap').sum())

7

In [201]:
# Count the Mega-Cap rows in the merged table by summing the boolean mask.
int((new['Market Cap'] == 'Mega-Cap').sum())

7

In [202]:
# Tickers present in the older table but absent from the merged one.
old_tickers = list(old.index)
new_tickers = list(new.index)
# Set lookup is O(1) per ticker; `ticker not in new_tickers` rescanned the
# whole list on every iteration. Order and duplicates of `old_tickers` are kept.
new_ticker_set = set(new_tickers)
in_old_not_new = [ticker for ticker in old_tickers if ticker not in new_ticker_set]
print(len(in_old_not_new))

68


In [203]:
# Tickers present in the merged table but absent from the older one.
# Set lookup replaces the per-ticker list scan; order and duplicates of
# `new_tickers` are kept.
old_ticker_set = set(old_tickers)
in_new_not_old = [ticker for ticker in new_tickers if ticker not in old_ticker_set]
print(len(in_new_not_old))

528


In [204]:
# Inspect the tickers that appear in `new` but not in `old`, starting with
# their descriptive columns.
sample = new.loc[new.index.isin(in_new_not_old)]
sample.loc[:, ['Sector', 'Exchange', 'Market Cap', 'Location']].head()

Unnamed: 0_level_0,Sector,Exchange,Market Cap,Location
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAP,Consumer Discretionary,New York Stock Exchange Inc.,Small-Cap,1
ABM,Industrials,New York Stock Exchange Inc.,Small-Cap,1
ABR,Financials,New York Stock Exchange Inc.,Small-Cap,1
ABSI,Health Care,NASDAQ,Micro-Cap,1
ACA,Industrials,New York Stock Exchange Inc.,Small-Cap,1


In [205]:
# Row count of the new-only subset.
print(sample.shape[0])

528


In [206]:
# Missing-value counts for the last 14 columns of the new-only subset.
# NOTE(review): the target file has 18 *_2025Q2 columns, so this slice starts at
# CurrentAssets_2025Q2 and skips the first four — confirm -14 is intentional.
sample.iloc[:,-14:].isna().sum()

CurrentAssets_2025Q2            457
CurrentLiabilities_2025Q2       452
EPS_2025Q2                      452
IncomeTaxExpense_2025Q2         452
InterestExpense_2025Q2          450
LongTermDebt_2025Q2             447
NetIncome_2025Q2                452
OperatingIncome_2025Q2          466
OtherOperatingExpense_2025Q2    453
Revenue_2025Q2                  452
TotalAssets_2025Q2              443
TotalDebt_2025Q2                444
TotalEquity_2025Q2              443
TotalLiabilities_2025Q2         443
dtype: int64

In [207]:
# Load the test feature file to check whether the tickers missing from `new`
# ended up there instead.
test_path = 'X_test_filled_KPIs_QoQ_PCA.csv'
test = pd.read_csv(root_path+test_path)
# Bare expression gives a rich table; print() produced line-wrapped,
# hard-to-read text for this wide frame.
test.head()

  Ticker                   Name         Sector  CapitalExpenditure_2024Q2  \
0     SF  STIFEL FINANCIAL CORP     Financials                -33952000.0   
1     GH    GUARDANT HEALTH INC    Health Care                 -5077000.0   
2   TGLS         TECNOGLASS INC    Industrials                -20302000.0   
3   CARG   CARGURUS INC CLASS A  Communication                -31226000.0   
4    BKU         BANKUNITED INC     Financials                 51939000.0   

   CapitalExpenditure_2024Q3  CapitalExpenditure_2024Q4  \
0                -12769000.0                -17431000.0   
1                 -4199000.0                -18875000.0   
2                -23685000.0                -25690000.0   
3                -14895000.0                -13698000.0   
4                 -2547000.0                 -3407000.0   

   CapitalExpenditure_2025Q1  CashAndSTInvestments_2024Q2  \
0                -16573000.0                 2.615670e+09   
1                 -4459000.0                 1.035239e+09   

In [208]:
# Compare the number of old-only tickers with the number of test rows whose
# ticker is one of them.
print(len(in_old_not_new), int(test['Ticker'].isin(in_old_not_new).sum()))

68 68


In [209]:
# Keep only the rows where both Revenue_2025Q2 and NetIncome_2025Q2 are present.
has_both_targets = new[['Revenue_2025Q2', 'NetIncome_2025Q2']].notna().all(axis=1)
new = new.loc[has_both_targets]

In [210]:
# Rows/columns left after dropping rows without revenue or net-income targets.
new.shape

(1388, 282)

In [211]:
# Recompute the old-only tickers now that `new` has been filtered.
old_tickers = list(old.index)
new_tickers = list(new.index)
# Set lookup is O(1) per ticker; `ticker not in new_tickers` rescanned the
# whole list on every iteration. Order and duplicates of `old_tickers` are kept.
new_ticker_set = set(new_tickers)
in_old_not_new = [ticker for ticker in old_tickers if ticker not in new_ticker_set]
print(len(in_old_not_new))

138


In [212]:
# Recompute the new-only tickers now that `new` has been filtered.
# Set lookup replaces the per-ticker list scan; order and duplicates of
# `new_tickers` are kept.
old_ticker_set = set(old_tickers)
in_new_not_old = [ticker for ticker in new_tickers if ticker not in old_ticker_set]
print(len(in_new_not_old))

76


In [213]:
# Shape check repeated after the ticker comparison, which only reads `new`.
new.shape

(1388, 282)