# Processing data for modelling

## - Stock price

In [1]:
import pandas as pd
from tqdm import tqdm

from functions import print_shape


In [2]:
# test with feature_engineering.py
prices = pd.read_csv('data/train_files/stock_prices.csv', parse_dates=[1])
print_shape(prices)
prices.head(2)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Open missing Values: 7608 (0.3%)
High missing Values: 7608 (0.3%)
Low missing Values: 7608 (0.3%)
Close missing Values: 7608 (0.3%)
Volume missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 2313666 (99.2%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)


Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324


In [3]:

from feature_engineering import fill_and_drop_na_values

fill_prices = fill_and_drop_na_values(prices, drop=False)
fill_prices.to_csv('data/curr_fill_prices.csv')
print_shape(fill_prices)
fill_prices.head(2)
del fill_prices


100%|██████████| 2000/2000 [03:29<00:00,  9.57it/s]


 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Open missing Values: 264 (0.0%)
High missing Values: 264 (0.0%)
Low missing Values: 264 (0.0%)
Close missing Values: 264 (0.0%)
Volume missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)


In [4]:
fill_prices = pd.read_csv('data/curr_fill_prices.csv', parse_dates=['Date'], index_col=[0])
print_shape(fill_prices)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Open missing Values: 264 (0.0%)
High missing Values: 264 (0.0%)
Low missing Values: 264 (0.0%)
Close missing Values: 264 (0.0%)
Volume missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)


In [5]:
from feature_engineering import adjust_price


ad_price = adjust_price(fill_prices)
ad_price.to_csv('data/curr_ad_price.csv')
print_shape(ad_price)
ad_price.head(2)
del ad_price

100%|██████████| 2000/2000 [03:32<00:00,  9.39it/s]


 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       13
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)
ad_Open missing Values: 264 (0.0%)
ad_High missing Values: 264 (0.0%)
ad_Low missing Values: 264 (0.0%)
ad_Close missing Values: 264 (0.0%)
ad_Volume missing Values: 0 (0.0%)
ad_Target missing Values: 4238 (0.2%)


In [6]:
ad_price = pd.read_csv('data/curr_ad_price.csv', parse_dates=['Date'], index_col=[0])
print_shape(ad_price)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       13
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)
ad_Open missing Values: 264 (0.0%)
ad_High missing Values: 264 (0.0%)
ad_Low missing Values: 264 (0.0%)
ad_Close missing Values: 264 (0.0%)
ad_Volume missing Values: 0 (0.0%)
ad_Target missing Values: 4238 (0.2%)


In [7]:
from feature_engineering import price_new_features, encode_flag

ad_price_feat = price_new_features(ad_price)

ad_price_feat['SupervisionFlag'] = encode_flag(ad_price_feat)
ad_price_feat.to_csv('data/curr_ad_price_feat.csv')
print_shape(ad_price_feat)
del ad_price_feat

100%|██████████| 2000/2000 [6:51:17<00:00, 12.34s/it]    


 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       36
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Close_sma10', 'ad_Open_lag1', 'ad_Open_sma10', 'ad_High_lag1',
       'ad_High_sma10', 'ad_Low_lag1', 'ad_Low_sma10', 'ad_Volume_lag1',
       'ad_Volume_sma10', 'RSI', 'Return', 'logreturn', 'Log_Return', 'MACD',
       'MACD_h', 'MACD_s', 'Year', 'week', 'Volatility_week', 'macd', 'macd_h',
       'macd_s'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Targ

In [8]:
ad_price_feat =  pd.read_csv('data/curr_ad_price_feat.csv', parse_dates=['Date'], index_col=[0])
print_shape(ad_price_feat)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       36
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Close_sma10', 'ad_Open_lag1', 'ad_Open_sma10', 'ad_High_lag1',
       'ad_High_sma10', 'ad_Low_lag1', 'ad_Low_sma10', 'ad_Volume_lag1',
       'ad_Volume_sma10', 'RSI', 'Return', 'logreturn', 'Log_Return', 'MACD',
       'MACD_h', 'MACD_s', 'Year', 'week', 'Volatility_week', 'macd', 'macd_h',
       'macd_s'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127359 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Targ

In [9]:
ad_price_feat.columns

Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Close_sma10', 'ad_Open_lag1', 'ad_Open_sma10', 'ad_High_lag1',
       'ad_High_sma10', 'ad_Low_lag1', 'ad_Low_sma10', 'ad_Volume_lag1',
       'ad_Volume_sma10', 'RSI', 'Return', 'logreturn', 'Log_Return', 'MACD',
       'MACD_h', 'MACD_s', 'Year', 'week', 'Volatility_week', 'macd', 'macd_h',
       'macd_s'],
      dtype='object')

## - Financials

In [10]:
#from feature_engineering import fill_finances

financial = financial = pd.read_csv('data/train_files/financials.csv',parse_dates=['Date'])
financial.head(2)

Unnamed: 0,DisclosureNumber,DateCode,Date,SecuritiesCode,DisclosedDate,DisclosedTime,DisclosedUnixTime,TypeOfDocument,CurrentPeriodEndDate,TypeOfCurrentPeriod,...,ForecastEarningsPerShare,ApplyingOfSpecificAccountingOfTheQuarterlyFinancialStatements,MaterialChangesInSubsidiaries,ChangesBasedOnRevisionsOfAccountingStandard,ChangesOtherThanOnesBasedOnRevisionsOfAccountingStandard,ChangesInAccountingEstimates,RetrospectiveRestatement,NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock,NumberOfTreasuryStockAtTheEndOfFiscalYear,AverageNumberOfShares
0,20161210000000.0,20170104_2753,2017-01-04,2753.0,2017-01-04,07:30:00,1483483000.0,3QFinancialStatements_Consolidated_JP,2016-12-31,3Q,...,319.76,,False,True,False,False,False,6848800.0,－,6848800.0
1,20170100000000.0,20170104_3353,2017-01-04,3353.0,2017-01-04,15:00:00,1483510000.0,3QFinancialStatements_Consolidated_JP,2016-11-30,3Q,...,485.36,,False,True,False,False,False,2035000.0,118917,1916083.0


In [11]:
from feature_engineering import fill_finances_knn



filled_finances = fill_finances_knn(financial, prices)
filled_finances.to_csv('data/curr_filled_finances.csv')
print_shape(filled_finances)
filled_finances.head()
del filled_finances

100%|██████████| 2000/2000 [00:04<00:00, 425.65it/s]


 Shape:
 ----------------------------------------
 Observations:   39.9K
 Features:       7
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['Date', 'Day', 'Month', 'Year', 'SecuritiesCode', 'Profit', 'NetSales'], dtype='object')
----------------------------------------
Date missing Values: 0 (0.0%)
Day missing Values: 0 (0.0%)
Month missing Values: 0 (0.0%)
Year missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Profit missing Values: 0 (0.0%)
NetSales missing Values: 0 (0.0%)


In [12]:
filled_finances = pd.read_csv('data/curr_filled_finances.csv', parse_dates=['Date'], index_col=[0])
print_shape(filled_finances)

 Shape:
 ----------------------------------------
 Observations:   39.9K
 Features:       7
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['Date', 'Day', 'Month', 'Year', 'SecuritiesCode', 'Profit', 'NetSales'], dtype='object')
----------------------------------------
Date missing Values: 0 (0.0%)
Day missing Values: 0 (0.0%)
Month missing Values: 0 (0.0%)
Year missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Profit missing Values: 0 (0.0%)
NetSales missing Values: 0 (0.0%)


In [14]:
from feature_engineering import new_features_financial

filled_financial_features = new_features_financial(filled_finances)
filled_financial_features.to_csv('data/curr_filled_financial_features.csv')
print_shape(filled_financial_features)
filled_financial_features.head()



100%|██████████| 2000/2000 [01:08<00:00, 29.11it/s]


 Shape:
 ----------------------------------------
 Observations:   39.9K
 Features:       17
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['Date', 'Day', 'Month', 'Year', 'SecuritiesCode', 'Profit', 'NetSales',
       'margin', 'profit_ttm', 'rev_ttm', 'win_quarter_growth',
       'rev_quarter_growth', 'win_yoy_growth', 'rev_yoy_growth',
       'win_ttm_growth', 'rev_ttm_growth', 'margin_growth'],
      dtype='object')
----------------------------------------
Date missing Values: 0 (0.0%)
Day missing Values: 0 (0.0%)
Month missing Values: 0 (0.0%)
Year missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
Profit missing Values: 0 (0.0%)
NetSales missing Values: 0 (0.0%)
margin missing Values: 0 (0.0%)
profit_ttm missing Values: 6000 (15.0%)
rev_ttm missing Values: 6000 (15.0%)
win_quarter_growth missing Values: 2000 (5.0%)
rev_quarter_growth missing Values: 2000 (5.0%)
win_yoy_growth missing Values: 8000 (20.0%)
rev_yoy_growth missing Value

Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales,margin,profit_ttm,rev_ttm,win_quarter_growth,rev_quarter_growth,win_yoy_growth,rev_yoy_growth,win_ttm_growth,rev_ttm_growth,margin_growth
0,2017-02-10,10.0,2.0,2017.0,1301.0,2449000000.0,179975000000.0,1.360745,,,,,,,,,
1,2017-05-11,11.0,5.0,2017.0,1301.0,2422000000.0,236561000000.0,1.023837,,,-1.102491,31.441033,,,,,-24.75903
2,2017-08-04,4.0,8.0,2017.0,1301.0,754000000.0,56844000000.0,1.326437,,,-68.868704,-75.97068,,,,,29.555461
3,2017-11-06,6.0,11.0,2017.0,1301.0,1633000000.0,120458000000.0,1.355659,7258000000.0,593838000000.0,116.578249,111.909788,,,,,2.203042
4,2018-02-09,9.0,2.0,2018.0,1301.0,2784000000.0,198323000000.0,1.403771,7593000000.0,612186000000.0,70.483772,64.640788,13.679053,10.194749,4.615597,3.089732,3.548929


In [15]:
filled_financial_features.SecuritiesCode = filled_financial_features.SecuritiesCode.astype(int)

In [16]:
filled_financial_features.head(2)

Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales,margin,profit_ttm,rev_ttm,win_quarter_growth,rev_quarter_growth,win_yoy_growth,rev_yoy_growth,win_ttm_growth,rev_ttm_growth,margin_growth
0,2017-02-10,10.0,2.0,2017.0,1301,2449000000.0,179975000000.0,1.360745,,,,,,,,,
1,2017-05-11,11.0,5.0,2017.0,1301,2422000000.0,236561000000.0,1.023837,,,-1.102491,31.441033,,,,,-24.75903


In [17]:
ad_price.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,ad_Open,ad_High,ad_Low,...,ad_Volume,ad_Target,ad_Close_lag1,ad_Open_lag1,ad_High_lag1,ad_Low_lag1,ad_Volume_lag1,macd,macd_h,macd_s
1483,20170104_8194,2017-01-04,8194,1.0,,False,-0.002963,3325.0,3390.0,3300.0,...,36100.0,-0.002963,,,,,,,,
3348,20170105_8194,2017-01-05,8194,1.0,,False,-0.020802,3395.0,3420.0,3360.0,...,19400.0,-0.020802,,,,,,,,


## - financials + price

In [18]:
# create key on financial : RowId
filled_financial_features['RowId'] = filled_financial_features.Date.dt.strftime('%Y%m%d').astype(str) + '_' + filled_financial_features.SecuritiesCode.astype(str)

In [19]:
filled_financial_features.sort_values('Date').head(20)

Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales,margin,profit_ttm,rev_ttm,win_quarter_growth,rev_quarter_growth,win_yoy_growth,rev_yoy_growth,win_ttm_growth,rev_ttm_growth,margin_growth,RowId
0,2017-01-04,4.0,1.0,2017.0,2753,1494000000.0,22761000000.0,6.563859,,,,,,,,,,20170104_2753
0,2017-01-05,5.0,1.0,2017.0,9977,1059000000.0,78930000000.0,1.341695,,,,,,,,,,20170105_9977
0,2017-01-05,5.0,1.0,2017.0,9974,4487000000.0,143111000000.0,3.135329,,,,,,,,,,20170105_9974
0,2017-01-05,5.0,1.0,2017.0,2659,7171000000.0,134781000000.0,5.320483,,,,,,,,,,20170105_2659
0,2017-01-05,5.0,1.0,2017.0,7453,19996000000.0,247027000000.0,8.094662,,,,,,,,,,20170105_7453
0,2017-01-05,5.0,1.0,2017.0,8168,1044000000.0,112305000000.0,0.929611,,,,,,,,,,20170105_8168
0,2017-01-05,5.0,1.0,2017.0,7463,3002000000.0,14619000000.0,20.53492,,,,,,,,,,20170105_7463
0,2017-01-06,6.0,1.0,2017.0,9993,451000000.0,84244000000.0,0.53535,,,,,,,,,,20170106_9993
0,2017-01-06,6.0,1.0,2017.0,5982,2244000000.0,36499000000.0,6.148114,,,,,,,,,,20170106_5982
0,2017-01-06,6.0,1.0,2017.0,3222,6121000000.0,510318000000.0,1.199448,,,,,,,,,,20170106_3222


In [20]:
filled_financial_features.RowId.info()

<class 'pandas.core.series.Series'>
Int64Index: 39912 entries, 0 to 3
Series name: RowId
Non-Null Count  Dtype 
--------------  ----- 
39912 non-null  object
dtypes: object(1)
memory usage: 623.6+ KB


In [21]:
# concat financials and price
price_financial = pd.merge(ad_price, filled_financial_features, how='left', on='RowId', suffixes=[None, 'f_'])
#price_financial = pd.concat([ad_price, filled_financial_features], keys='RowID')

In [22]:
print_shape(price_financial)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       38
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Open_lag1', 'ad_High_lag1', 'ad_Low_lag1', 'ad_Volume_lag1', 'macd',
       'macd_h', 'macd_s', 'Datef_', 'Day', 'Month', 'Year',
       'SecuritiesCodef_', 'Profit', 'NetSales', 'margin', 'profit_ttm',
       'rev_ttm', 'win_quarter_growth', 'rev_quarter_growth', 'win_yoy_growth',
       'rev_yoy_growth', 'win_ttm_growth', 'rev_ttm_growth', 'margin_growth'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127369 (5

In [23]:
price_financial.head()

Unnamed: 0,RowId,Date,SecuritiesCode,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,ad_Open,ad_High,ad_Low,...,margin,profit_ttm,rev_ttm,win_quarter_growth,rev_quarter_growth,win_yoy_growth,rev_yoy_growth,win_ttm_growth,rev_ttm_growth,margin_growth
0,20170104_8194,2017-01-04,8194,1.0,,False,-0.002963,3325.0,3390.0,3300.0,...,,,,,,,,,,
1,20170105_8194,2017-01-05,8194,1.0,,False,-0.020802,3395.0,3420.0,3360.0,...,,,,,,,,,,
2,20170106_8194,2017-01-06,8194,1.0,,False,-0.00607,3365.0,3385.0,3335.0,...,,,,,,,,,,
3,20170110_8194,2017-01-10,8194,1.0,,False,-0.022901,3350.0,3360.0,3290.0,...,,,,,,,,,,
4,20170111_8194,2017-01-11,8194,1.0,,False,0.054688,3330.0,3330.0,3260.0,...,,,,,,,,,,


In [24]:
price_financial.columns


Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Open_lag1', 'ad_High_lag1', 'ad_Low_lag1', 'ad_Volume_lag1', 'macd',
       'macd_h', 'macd_s', 'Datef_', 'Day', 'Month', 'Year',
       'SecuritiesCodef_', 'Profit', 'NetSales', 'margin', 'profit_ttm',
       'rev_ttm', 'win_quarter_growth', 'rev_quarter_growth', 'win_yoy_growth',
       'rev_yoy_growth', 'win_ttm_growth', 'rev_ttm_growth', 'margin_growth'],
      dtype='object')

In [25]:
from functions import missingValues

missingValues(price_financial)

RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127369 (5.5%)
SupervisionFlag missing Values: 0 (0.0%)
Target missing Values: 238 (0.0%)
ad_Open missing Values: 264 (0.0%)
ad_High missing Values: 264 (0.0%)
ad_Low missing Values: 264 (0.0%)
ad_Close missing Values: 264 (0.0%)
ad_Volume missing Values: 0 (0.0%)
ad_Target missing Values: 4238 (0.2%)
ad_Close_lag1 missing Values: 2331644 (99.9%)
ad_Open_lag1 missing Values: 2331644 (99.9%)
ad_High_lag1 missing Values: 2331644 (99.9%)
ad_Low_lag1 missing Values: 2331644 (99.9%)
ad_Volume_lag1 missing Values: 2331644 (99.9%)
macd missing Values: 25 (0.0%)
macd_h missing Values: 33 (0.0%)
macd_s missing Values: 33 (0.0%)
Datef_ missing Values: 2292963 (98.3%)
Day missing Values: 2292963 (98.3%)
Month missing Values: 2292963 (98.3%)
Year missing Values: 2292963 (98.3%)
SecuritiesCodef_ missing Values: 2292963 (98.3%)

In [27]:
#price_financial_fill = ffill_pro_code(price_financial)
price_financial_fill = fill_and_drop_na_values(price_financial, drop=False)


100%|██████████| 2000/2000 [1:17:32<00:00,  2.33s/it]


In [28]:
print_shape(price_financial_fill)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       38
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Open_lag1', 'ad_High_lag1', 'ad_Low_lag1', 'ad_Volume_lag1', 'macd',
       'macd_h', 'macd_s', 'Datef_', 'Day', 'Month', 'Year',
       'SecuritiesCodef_', 'Profit', 'NetSales', 'margin', 'profit_ttm',
       'rev_ttm', 'win_quarter_growth', 'rev_quarter_growth', 'win_yoy_growth',
       'rev_yoy_growth', 'win_ttm_growth', 'rev_ttm_growth', 'margin_growth'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127369 (5

In [29]:
# save dataframe to csv
price_financial_fill.to_csv('data/curr_filled_financial_features.csv')

In [30]:
print_shape(price_financial_fill)

 Shape:
 ----------------------------------------
 Observations:   2.33M
 Features:       38
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'Target', 'ad_Open', 'ad_High',
       'ad_Low', 'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1',
       'ad_Open_lag1', 'ad_High_lag1', 'ad_Low_lag1', 'ad_Volume_lag1', 'macd',
       'macd_h', 'macd_s', 'Datef_', 'Day', 'Month', 'Year',
       'SecuritiesCodef_', 'Profit', 'NetSales', 'margin', 'profit_ttm',
       'rev_ttm', 'win_quarter_growth', 'rev_quarter_growth', 'win_yoy_growth',
       'rev_yoy_growth', 'win_ttm_growth', 'rev_ttm_growth', 'margin_growth'],
      dtype='object')
----------------------------------------
RowId missing Values: 0 (0.0%)
Date missing Values: 0 (0.0%)
SecuritiesCode missing Values: 0 (0.0%)
AdjustmentFactor missing Values: 0 (0.0%)
ExpectedDividend missing Values: 127369 (5