# Processing data for modelling

In [1]:
import pandas as pd
from tqdm import tqdm

from functions import print_shape, df_security_code

from feature_engineering import (fill_and_drop_na_values,
 adjust_price, 
 price_new_features, 
 encode_flag,
 fill_finances_knn,
 new_features_financial
 )

MISSINGVALUES = True

## import data

In [2]:
def data_selection(selection = True):
    if selection:
        prices = df_security_code(pd.read_csv('data/train_files/stock_prices.csv', parse_dates=[1]))
        financial = df_security_code(pd.read_csv('data/train_files/financials.csv',parse_dates=['Date']))
    else: 
        prices = pd.read_csv('data/train_files/stock_prices.csv', parse_dates=[1])
        financial = pd.read_csv('data/train_files/financials.csv',parse_dates=['Date'])
    return prices, financial

In [3]:
# helper function
def save_and_load(df):
    df_name = name =[x for x in globals() if globals()[x] is df][0]
    name = 'data/curr_' + df_name + '.csv'
    df.to_csv(name)
    del df
    return pd.read_csv(name, parse_dates=['Date'], index_col=[0] )

In [4]:
prices, financial = data_selection(True)

In [5]:
prices.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
1209,20170104_7203,2017-01-04,7203,7010.0,7103.0,6975.0,7097.0,9547600,1.0,,False,-0.016882
3074,20170105_7203,2017-01-05,7203,7090.0,7091.0,7023.0,7049.0,7644000,1.0,,False,-0.009957


In [6]:
financial.head(2)

Unnamed: 0,DisclosureNumber,DateCode,Date,SecuritiesCode,DisclosedDate,DisclosedTime,DisclosedUnixTime,TypeOfDocument,CurrentPeriodEndDate,TypeOfCurrentPeriod,...,ForecastEarningsPerShare,ApplyingOfSpecificAccountingOfTheQuarterlyFinancialStatements,MaterialChangesInSubsidiaries,ChangesBasedOnRevisionsOfAccountingStandard,ChangesOtherThanOnesBasedOnRevisionsOfAccountingStandard,ChangesInAccountingEstimates,RetrospectiveRestatement,NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock,NumberOfTreasuryStockAtTheEndOfFiscalYear,AverageNumberOfShares
2014,20161220000000.0,20170206_7203,2017-02-06,7203.0,2017-02-06,16:30:00,1486366000.0,3QFinancialStatements_Consolidated_US,2016-12-31,3Q,...,561.38,,False,True,True,,,3262997492.0,274703146,3017815402.0
6778,20170300000000.0,20170510_7203,2017-05-10,7203.0,2017-05-10,15:00:00,1494396000.0,FYFinancialStatements_Consolidated_US,2017-03-31,FY,...,500.05,,False,True,True,,,3262997492.0,288274636,3008088275.0


## - Stock price

In [7]:
# test with feature_engineering.py
#prices = pd.read_csv('data/train_files/stock_prices.csv', parse_dates=[1])
print_shape(prices, MISSINGVALUES)
#prices.head(2)

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
----------------------------------------


Unnamed: 0,N_missing,Percentage
ExpectedDividend,1192.0,99.17
Open,1.0,0.08
High,1.0,0.08
Low,1.0,0.08
Close,1.0,0.08
RowId,0.0,0.0
Date,0.0,0.0
SecuritiesCode,0.0,0.0
Volume,0.0,0.0
AdjustmentFactor,0.0,0.0


In [8]:

#from feature_engineering import fill_and_drop_na_values

# fill na
fill_prices = fill_and_drop_na_values(prices, drop=False)

# save and load

fill_prices = save_and_load(fill_prices)

fill_prices.to_csv('data/curr_fill_prices.csv')
print_shape(fill_prices, MISSINGVALUES)
fill_prices.head(2)



100%|██████████| 1/1 [00:00<00:00, 80.04it/s]

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'ExpectedDividend', 'SupervisionFlag',
       'Target'],
      dtype='object')
----------------------------------------





Unnamed: 0,N_missing,Percentage
RowId,0.0,0.0
Date,0.0,0.0
SecuritiesCode,0.0,0.0
Open,0.0,0.0
High,0.0,0.0
Low,0.0,0.0
Close,0.0,0.0
Volume,0.0,0.0
AdjustmentFactor,0.0,0.0
ExpectedDividend,0.0,0.0


Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
1209,20170104_7203,2017-01-04,7203,7010.0,7103.0,6975.0,7097.0,9547600,1.0,0.0,False,-0.016882
3074,20170105_7203,2017-01-05,7203,7090.0,7091.0,7023.0,7049.0,7644000,1.0,0.0,False,-0.009957


In [9]:
#fill_prices = pd.read_csv('data/curr_fill_prices.csv', parse_dates=['Date'], index_col=[0])
#print_shape(fill_prices)

In [10]:



ad_price = adjust_price(fill_prices)
ad_price = save_and_load(ad_price)
print_shape(ad_price, MISSINGVALUES)
ad_price.head(2)
#del ad_price

100%|██████████| 1/1 [00:00<00:00, 136.06it/s]

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'ad_Open', 'ad_High', 'ad_Low',
       'ad_Close', 'ad_Volume', 'ad_Target'],
      dtype='object')
----------------------------------------





Unnamed: 0,N_missing,Percentage
ad_Target,2.0,0.17
RowId,0.0,0.0
Date,0.0,0.0
SecuritiesCode,0.0,0.0
AdjustmentFactor,0.0,0.0
ExpectedDividend,0.0,0.0
SupervisionFlag,0.0,0.0
ad_Open,0.0,0.0
ad_High,0.0,0.0
ad_Low,0.0,0.0


Unnamed: 0,RowId,Date,SecuritiesCode,AdjustmentFactor,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,ad_Volume,ad_Target
1209,20170104_7203,2017-01-04,7203,1.0,0.0,False,7010.0,7103.0,6975.0,7097.0,9547600.0,-0.016882
3074,20170105_7203,2017-01-05,7203,1.0,0.0,False,7090.0,7091.0,7023.0,7049.0,7644000.0,-0.009957


In [11]:
#ad_price = pd.read_csv('data/curr_ad_price.csv', parse_dates=['Date'], index_col=[0])
#print_shape(ad_price)

In [12]:
#from feature_engineering import price_new_features, encode_flag

ad_price_feat = price_new_features(ad_price)

ad_price_feat['SupervisionFlag'] = encode_flag(ad_price_feat)
#ad_price_feat.to_csv('data/curr_ad_price_feat.csv')

ad_price_feat = save_and_load(ad_price_feat)

print_shape(ad_price_feat, MISSINGVALUES)


100%|██████████| 1/1 [00:01<00:00,  1.26s/it]

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       53
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'AdjustmentFactor',
       'ExpectedDividend', 'SupervisionFlag', 'ad_Open', 'ad_High', 'ad_Low',
       'ad_Close', 'ad_Volume', 'ad_Target', 'ad_Close_lag1', 'ad_Close_sma10',
       'logprice_ad_Close', 'trend_ad_Close', 'detrend_ad_Close',
       'season_ad_Close', 'error_ad_Close', 'ad_Open_lag1', 'ad_Open_sma10',
       'logprice_ad_Open', 'trend_ad_Open', 'detrend_ad_Open',
       'season_ad_Open', 'error_ad_Open', 'ad_High_lag1', 'ad_High_sma10',
       'logprice_ad_High', 'trend_ad_High', 'detrend_ad_High',
       'season_ad_High', 'error_ad_High', 'ad_Low_lag1', 'ad_Low_sma10',
       'logprice_ad_Low', 'trend_ad_Low', 'detrend_ad_Low', 'season_ad_Low',
       'error_ad_Low', 'ad_Volume_lag1', 'ad_Volume_sma10', 'RSI', 'Return',
       'Log_Return', 'macd', 'macd_h', 'mac




Unnamed: 0,N_missing,Percentage
macd_s,33.0,2.75
macd_h,33.0,2.75
error_ad_Close,29.0,2.41
detrend_ad_Close,29.0,2.41
season_ad_Low,29.0,2.41
error_ad_High,29.0,2.41
season_ad_High,29.0,2.41
detrend_ad_High,29.0,2.41
trend_ad_High,29.0,2.41
error_ad_Low,29.0,2.41


In [13]:
dateL = ['Date', 'Year',
       'week', 'Day', 'Month' ]

In [14]:
ad_price_feat[dateL]

Unnamed: 0,Date,Year,week,Day,Month
1209,2017-01-04,2017.0,1.0,4.0,1.0
3074,2017-01-05,2017.0,1.0,5.0,1.0
4939,2017-01-06,2017.0,1.0,6.0,1.0
6804,2017-01-10,2017.0,2.0,10.0,1.0
8669,2017-01-11,2017.0,2.0,11.0,1.0
...,...,...,...,...,...
2323846,2021-11-29,2021.0,48.0,29.0,11.0
2325846,2021-11-30,2021.0,48.0,30.0,11.0
2327846,2021-12-01,2021.0,48.0,1.0,12.0
2329846,2021-12-02,2021.0,48.0,2.0,12.0


In [15]:
#ad_price_feat =  pd.read_csv('data/curr_ad_price_feat.csv', parse_dates=['Date'], index_col=[0])
#print_shape(ad_price_feat)

## - Financials

In [16]:
#from feature_engineering import fill_finances

#financial = financial = pd.read_csv('data/train_files/financials.csv',parse_dates=['Date'])
#financial.head(2)

In [17]:
print_shape(financial)

 Shape:
 ----------------------------------------
 Observations:   21
 Features:       45
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['DisclosureNumber', 'DateCode', 'Date', 'SecuritiesCode',
       'DisclosedDate', 'DisclosedTime', 'DisclosedUnixTime', 'TypeOfDocument',
       'CurrentPeriodEndDate', 'TypeOfCurrentPeriod',
       'CurrentFiscalYearStartDate', 'CurrentFiscalYearEndDate', 'NetSales',
       'OperatingProfit', 'OrdinaryProfit', 'Profit', 'EarningsPerShare',
       'TotalAssets', 'Equity', 'EquityToAssetRatio', 'BookValuePerShare',
       'ResultDividendPerShare1stQuarter', 'ResultDividendPerShare2ndQuarter',
       'ResultDividendPerShare3rdQuarter',
       'ResultDividendPerShareFiscalYearEnd', 'ResultDividendPerShareAnnual',
       'ForecastDividendPerShare1stQuarter',
       'ForecastDividendPerShare2ndQuarter',
       'ForecastDividendPerShare3rdQuarter',
       'ForecastDividendPerShareFiscalYearEnd',
       'ForecastDividendPerS

Unnamed: 0,N_missing,Percentage
RetrospectiveRestatement,21.0,100.0
ApplyingOfSpecificAccountingOfTheQuarterlyFinancialStatements,21.0,100.0
ResultDividendPerShareFiscalYearEnd,15.0,71.43
ResultDividendPerShareAnnual,15.0,71.43
ForecastDividendPerShare1stQuarter,15.0,71.43
BookValuePerShare,15.0,71.43
ChangesInAccountingEstimates,14.0,66.67
ResultDividendPerShare3rdQuarter,10.0,47.62
EquityToAssetRatio,10.0,47.62
ForecastDividendPerShare2ndQuarter,10.0,47.62


In [18]:
filled_financial = fill_finances_knn(financial, prices)

100%|██████████| 1/1 [00:00<00:00, 241.94it/s]


In [19]:
filled_financial.head()

Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales
0,2017-02-06,6.0,2.0,2017.0,7203.0,1432704000000.0,20154720000000.0
1,2017-05-10,10.0,5.0,2017.0,7203.0,1831109000000.0,27597190000000.0
2,2017-08-04,4.0,8.0,2017.0,7203.0,613056000000.0,7047606000000.0
3,2017-11-07,7.0,11.0,2017.0,7203.0,1071328000000.0,14191210000000.0
4,2018-02-06,6.0,2.0,2018.0,7203.0,2013177000000.0,21796970000000.0


In [20]:
from feature_engineering import fill_finances_knn



filled_financial = fill_finances_knn(financial, prices)
#filled_financial.to_csv('data/curr_filled_finances.csv')

filled_financial = save_and_load(filled_financial)

print_shape(filled_financial, MISSINGVALUES)
filled_financial.head(2)
#del filled_finances

100%|██████████| 1/1 [00:00<00:00, 266.68it/s]

 Shape:
 ----------------------------------------
 Observations:   21
 Features:       7
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['Date', 'Day', 'Month', 'Year', 'SecuritiesCode', 'Profit', 'NetSales'], dtype='object')
----------------------------------------





Unnamed: 0,N_missing,Percentage
Date,0.0,0.0
Day,0.0,0.0
Month,0.0,0.0
Year,0.0,0.0
SecuritiesCode,0.0,0.0
Profit,0.0,0.0
NetSales,0.0,0.0


Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales
0,2017-02-06,6.0,2.0,2017.0,7203.0,1432704000000.0,20154720000000.0
1,2017-05-10,10.0,5.0,2017.0,7203.0,1831109000000.0,27597190000000.0


In [21]:
#filled_finances = pd.read_csv('data/curr_filled_finances.csv', parse_dates=['Date'], index_col=[0])
#print_shape(filled_finances)

In [22]:
from feature_engineering import new_features_financial

filled_financial_features = new_features_financial(filled_financial)

#filled_financial_features.to_csv('data/curr_filled_financial_features.csv')
filled_financial_features = save_and_load(filled_financial_features)
print_shape(filled_financial_features)
filled_financial_features.head()



100%|██████████| 1/1 [00:00<00:00, 293.95it/s]

 Shape:
 ----------------------------------------
 Observations:   21
 Features:       12
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['Date', 'Day', 'Month', 'Year', 'SecuritiesCode', 'Profit', 'NetSales',
       'margin', 'win_quarter_growth', 'rev_quarter_growth', 'margin_growth',
       'RowId'],
      dtype='object')
----------------------------------------





Unnamed: 0,N_missing,Percentage
win_quarter_growth,1.0,4.76
rev_quarter_growth,1.0,4.76
margin_growth,1.0,4.76
Date,0.0,0.0
Day,0.0,0.0
Month,0.0,0.0
Year,0.0,0.0
SecuritiesCode,0.0,0.0
Profit,0.0,0.0
NetSales,0.0,0.0


Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth,RowId
0,2017-02-06,6.0,2.0,2017.0,7203,1432704000000.0,20154720000000.0,7.108528,,,,20170206_7203
1,2017-05-10,10.0,5.0,2017.0,7203,1831109000000.0,27597190000000.0,6.635128,27.807907,36.9267,-6.659616,20170510_7203
2,2017-08-04,4.0,8.0,2017.0,7203,613056000000.0,7047606000000.0,8.698784,-66.519961,-74.462598,31.101978,20170804_7203
3,2017-11-07,7.0,11.0,2017.0,7203,1071328000000.0,14191210000000.0,7.549238,74.752062,101.362094,-13.215016,20171107_7203
4,2018-02-06,6.0,2.0,2018.0,7203,2013177000000.0,21796970000000.0,9.236039,87.914159,53.594927,22.343988,20180206_7203


In [23]:
filled_financial_features[['Date', 'Year', 'Day', 'Month' ]]

Unnamed: 0,Date,Year,Day,Month
0,2017-02-06,2017.0,6.0,2.0
1,2017-05-10,2017.0,10.0,5.0
2,2017-08-04,2017.0,4.0,8.0
3,2017-11-07,2017.0,7.0,11.0
4,2018-02-06,2018.0,6.0,2.0
5,2018-05-09,2018.0,9.0,5.0
6,2018-08-03,2018.0,3.0,8.0
7,2018-11-06,2018.0,6.0,11.0
8,2019-02-06,2019.0,6.0,2.0
9,2019-05-08,2019.0,8.0,5.0


In [24]:
#filled_financial_features.SecuritiesCode = filled_financial_features.SecuritiesCode.astype(int)

In [25]:
filled_financial_features.head(2)

Unnamed: 0,Date,Day,Month,Year,SecuritiesCode,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth,RowId
0,2017-02-06,6.0,2.0,2017.0,7203,1432704000000.0,20154720000000.0,7.108528,,,,20170206_7203
1,2017-05-10,10.0,5.0,2017.0,7203,1831109000000.0,27597190000000.0,6.635128,27.807907,36.9267,-6.659616,20170510_7203


In [26]:
ad_price_feat.head(2)

Unnamed: 0,RowId,Date,SecuritiesCode,AdjustmentFactor,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,...,Return,Log_Return,macd,macd_h,macd_s,Day,Month,Year,week,Volatility_week
1209,20170104_7203,2017-01-04,7203,1.0,0.0,0,7010.0,7103.0,6975.0,7097.0,...,,,,,,4.0,1.0,2017.0,1.0,1.619015
3074,20170105_7203,2017-01-05,7203,1.0,0.0,0,7090.0,7091.0,7023.0,7049.0,...,0.993237,-0.006786,,,,5.0,1.0,2017.0,1.0,1.619015


## - financials + price

In [27]:
# create key on financial : RowId
#filled_financial_features['RowId'] = filled_financial_features.Date.dt.strftime('%Y%m%d').astype(str) + '_' + filled_financial_features.SecuritiesCode.astype(str)

In [28]:
#filled_financial_features.sort_values('Date').head(20)

In [29]:
#filled_financial_features.RowId.info()

In [30]:
from feature_engineering import price_financial_function

In [31]:
# concat financials and price
#price_financial = price_financial_function(ad_price_feat, filled_financial_features)
price_financial = pd.merge(ad_price_feat, filled_financial_features, how='left', on='RowId', suffixes=[None, '_f_'])
#price_financial = pd.concat([ad_price, filled_financial_features], keys='RowID')

fea_to_remove = ['Date_f_', 'Day_f_', 'Month_f_', 'Year_f_', 'SecuritiesCode_f_','Log_Return', 'AdjustmentFactor']
price_financial.drop(fea_to_remove, axis=1, inplace=True)

price_financial = save_and_load(price_financial)
print_shape(price_financial)
price_financial.head()

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       57
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'ExpectedDividend',
       'SupervisionFlag', 'ad_Open', 'ad_High', 'ad_Low', 'ad_Close',
       'ad_Volume', 'ad_Target', 'ad_Close_lag1', 'ad_Close_sma10',
       'logprice_ad_Close', 'trend_ad_Close', 'detrend_ad_Close',
       'season_ad_Close', 'error_ad_Close', 'ad_Open_lag1', 'ad_Open_sma10',
       'logprice_ad_Open', 'trend_ad_Open', 'detrend_ad_Open',
       'season_ad_Open', 'error_ad_Open', 'ad_High_lag1', 'ad_High_sma10',
       'logprice_ad_High', 'trend_ad_High', 'detrend_ad_High',
       'season_ad_High', 'error_ad_High', 'ad_Low_lag1', 'ad_Low_sma10',
       'logprice_ad_Low', 'trend_ad_Low', 'detrend_ad_Low', 'season_ad_Low',
       'error_ad_Low', 'ad_Volume_lag1', 'ad_Volume_sma10', 'RSI', 'Return',
       'macd', 'macd_h', 'macd_s', 'Day', 'Month', 'Year', 'wee

Unnamed: 0,N_missing,Percentage
margin_growth,1182.0,98.34
rev_quarter_growth,1182.0,98.34
win_quarter_growth,1182.0,98.34
margin,1181.0,98.25
NetSales,1181.0,98.25
Profit,1181.0,98.25
macd_s,33.0,2.75
macd_h,33.0,2.75
error_ad_High,29.0,2.41
season_ad_Open,29.0,2.41


Unnamed: 0,RowId,Date,SecuritiesCode,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,ad_Volume,...,Month,Year,week,Volatility_week,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth
0,20170104_7203,2017-01-04,7203,0.0,0,7010.0,7103.0,6975.0,7097.0,9547600.0,...,1.0,2017.0,1.0,1.619015,,,,,,
1,20170105_7203,2017-01-05,7203,0.0,0,7090.0,7091.0,7023.0,7049.0,7644000.0,...,1.0,2017.0,1.0,1.619015,,,,,,
2,20170106_7203,2017-01-06,7203,0.0,0,6840.0,6948.0,6830.0,6930.0,11586100.0,...,1.0,2017.0,1.0,1.619015,,,,,,
3,20170110_7203,2017-01-10,7203,0.0,0,6922.0,6958.0,6861.0,6861.0,8931100.0,...,1.0,2017.0,2.0,2.108875,,,,,,
4,20170111_7203,2017-01-11,7203,0.0,0,6899.0,6924.0,6875.0,6912.0,6605300.0,...,1.0,2017.0,2.0,2.108875,,,,,,


In [32]:
price_financial[dateL]

Unnamed: 0,Date,Year,week,Day,Month
0,2017-01-04,2017.0,1.0,4.0,1.0
1,2017-01-05,2017.0,1.0,5.0,1.0
2,2017-01-06,2017.0,1.0,6.0,1.0
3,2017-01-10,2017.0,2.0,10.0,1.0
4,2017-01-11,2017.0,2.0,11.0,1.0
...,...,...,...,...,...
1197,2021-11-29,2021.0,48.0,29.0,11.0
1198,2021-11-30,2021.0,48.0,30.0,11.0
1199,2021-12-01,2021.0,48.0,1.0,12.0
1200,2021-12-02,2021.0,48.0,2.0,12.0


In [33]:
#price_financial_fill = ffill_pro_code(price_financial)
price_financial_fill = fill_and_drop_na_values(price_financial, drop=False)

price_financial_fill = save_and_load(price_financial_fill)

print_shape(price_financial_fill)
price_financial_fill.head(2)


100%|██████████| 1/1 [00:00<00:00, 175.41it/s]

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       57
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'ExpectedDividend',
       'SupervisionFlag', 'ad_Open', 'ad_High', 'ad_Low', 'ad_Close',
       'ad_Volume', 'ad_Target', 'ad_Close_lag1', 'ad_Close_sma10',
       'logprice_ad_Close', 'trend_ad_Close', 'detrend_ad_Close',
       'season_ad_Close', 'error_ad_Close', 'ad_Open_lag1', 'ad_Open_sma10',
       'logprice_ad_Open', 'trend_ad_Open', 'detrend_ad_Open',
       'season_ad_Open', 'error_ad_Open', 'ad_High_lag1', 'ad_High_sma10',
       'logprice_ad_High', 'trend_ad_High', 'detrend_ad_High',
       'season_ad_High', 'error_ad_High', 'ad_Low_lag1', 'ad_Low_sma10',
       'logprice_ad_Low', 'trend_ad_Low', 'detrend_ad_Low', 'season_ad_Low',
       'error_ad_Low', 'ad_Volume_lag1', 'ad_Volume_sma10', 'RSI', 'Return',
       'macd', 'macd_h', 'macd_s', 'Day', 'Month', 'Year', 'wee




Unnamed: 0,N_missing,Percentage
margin_growth,85.0,7.07
rev_quarter_growth,85.0,7.07
win_quarter_growth,85.0,7.07
macd_s,33.0,2.75
macd_h,33.0,2.75
season_ad_High,29.0,2.41
detrend_ad_Open,29.0,2.41
season_ad_Open,29.0,2.41
error_ad_Open,29.0,2.41
detrend_ad_High,29.0,2.41


Unnamed: 0,RowId,Date,SecuritiesCode,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,ad_Volume,...,Month,Year,week,Volatility_week,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth
0,20170104_7203,2017-01-04,7203,0.0,0,7010.0,7103.0,6975.0,7097.0,9547600.0,...,1.0,2017.0,1.0,1.619015,,,,,,
1,20170105_7203,2017-01-05,7203,0.0,0,7090.0,7091.0,7023.0,7049.0,7644000.0,...,1.0,2017.0,1.0,1.619015,,,,,,


In [34]:
price_financial_fill[dateL]

Unnamed: 0,Date,Year,week,Day,Month
0,2017-01-04,2017.0,1.0,4.0,1.0
1,2017-01-05,2017.0,1.0,5.0,1.0
2,2017-01-06,2017.0,1.0,6.0,1.0
3,2017-01-10,2017.0,2.0,10.0,1.0
4,2017-01-11,2017.0,2.0,11.0,1.0
...,...,...,...,...,...
1197,2021-11-29,2021.0,48.0,29.0,11.0
1198,2021-11-30,2021.0,48.0,30.0,11.0
1199,2021-12-01,2021.0,48.0,1.0,12.0
1200,2021-12-02,2021.0,48.0,2.0,12.0


In [35]:
# save dataframe to csv
#price_financial_fill.to_csv('data/curr_filled_financial_features.csv')

In [36]:
print_shape(price_financial_fill, False)

 Shape:
 ----------------------------------------
 Observations:   1.2K
 Features:       57
 Feature Date:    datetime64[ns]
----------------------------------------
Index(['RowId', 'Date', 'SecuritiesCode', 'ExpectedDividend',
       'SupervisionFlag', 'ad_Open', 'ad_High', 'ad_Low', 'ad_Close',
       'ad_Volume', 'ad_Target', 'ad_Close_lag1', 'ad_Close_sma10',
       'logprice_ad_Close', 'trend_ad_Close', 'detrend_ad_Close',
       'season_ad_Close', 'error_ad_Close', 'ad_Open_lag1', 'ad_Open_sma10',
       'logprice_ad_Open', 'trend_ad_Open', 'detrend_ad_Open',
       'season_ad_Open', 'error_ad_Open', 'ad_High_lag1', 'ad_High_sma10',
       'logprice_ad_High', 'trend_ad_High', 'detrend_ad_High',
       'season_ad_High', 'error_ad_High', 'ad_Low_lag1', 'ad_Low_sma10',
       'logprice_ad_Low', 'trend_ad_Low', 'detrend_ad_Low', 'season_ad_Low',
       'error_ad_Low', 'ad_Volume_lag1', 'ad_Volume_sma10', 'RSI', 'Return',
       'macd', 'macd_h', 'macd_s', 'Day', 'Month', 'Year', 'wee

final


In [37]:
df = pd.read_csv('data/curr_price_financial_fill.csv', parse_dates=['Date'], index_col=[0])

In [38]:
from functions import missingValues
missingValues(df)

Unnamed: 0,N_missing,Percentage
margin_growth,85.0,7.07
rev_quarter_growth,85.0,7.07
win_quarter_growth,85.0,7.07
macd_s,33.0,2.75
macd_h,33.0,2.75
season_ad_High,29.0,2.41
detrend_ad_Open,29.0,2.41
season_ad_Open,29.0,2.41
error_ad_Open,29.0,2.41
detrend_ad_High,29.0,2.41


In [39]:
import numpy as np


In [40]:
#df['Date'] = pd.to_datetime(df.Date)
df['logprice'] = np.log(df.ad_Close)
df_trend = pd.DataFrame()

for i in df.SecuritiesCode.unique()[:3]:
    df_current = df.query('SecuritiesCode == @i')
    df_current = df_current.reset_index()
    df_current['trend'] = df_current.ad_Close.rolling(30).mean()
    df_current['detrend'] = df_current.ad_Close - df_current.trend


    df_trend = pd.concat([df_trend ,df_current])

In [41]:
df.tail()

Unnamed: 0,RowId,Date,SecuritiesCode,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,ad_Volume,...,Year,week,Volatility_week,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth,logprice
1197,20211129_7203,2021-11-29,7203,0.0,0,10155.0,10205.0,10025.0,10060.0,5595780.0,...,2021.0,48.0,4.626824,1524484000000.0,15481300000000.0,9.847262,69.796131,95.087718,-12.964213,9.216322
1198,20211130_7203,2021-11-30,7203,0.0,0,10257.5,10307.5,9995.0,10005.0,8527800.0,...,2021.0,48.0,4.626824,1524484000000.0,15481300000000.0,9.847262,69.796131,95.087718,-12.964213,9.21084
1199,20211201_7203,2021-12-01,7203,0.0,0,10037.5,10370.0,10035.0,10227.5,5058100.0,...,2021.0,48.0,4.626824,1524484000000.0,15481300000000.0,9.847262,69.796131,95.087718,-12.964213,9.232835
1200,20211202_7203,2021-12-02,7203,0.0,0,10150.0,10255.0,10025.0,10227.5,4871400.0,...,2021.0,48.0,4.626824,1524484000000.0,15481300000000.0,9.847262,69.796131,95.087718,-12.964213,9.232835
1201,20211203_7203,2021-12-03,7203,0.0,0,10262.5,10410.0,10202.5,10395.0,3804740.0,...,2021.0,48.0,4.626824,1524484000000.0,15481300000000.0,9.847262,69.796131,95.087718,-12.964213,9.24908


In [42]:
df_trend

Unnamed: 0,index,RowId,Date,SecuritiesCode,ExpectedDividend,SupervisionFlag,ad_Open,ad_High,ad_Low,ad_Close,...,Volatility_week,Profit,NetSales,margin,win_quarter_growth,rev_quarter_growth,margin_growth,logprice,trend,detrend
0,0,20170104_7203,2017-01-04,7203,0.0,0,7010.0,7103.0,6975.0,7097.0,...,1.619015,,,,,,,8.867427,,
1,1,20170105_7203,2017-01-05,7203,0.0,0,7090.0,7091.0,7023.0,7049.0,...,1.619015,,,,,,,8.860641,,
2,2,20170106_7203,2017-01-06,7203,0.0,0,6840.0,6948.0,6830.0,6930.0,...,1.619015,,,,,,,8.843615,,
3,3,20170110_7203,2017-01-10,7203,0.0,0,6922.0,6958.0,6861.0,6861.0,...,2.108875,,,,,,,8.833608,,
4,4,20170111_7203,2017-01-11,7203,0.0,0,6899.0,6924.0,6875.0,6912.0,...,2.108875,,,,,,,8.841014,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,1197,20211129_7203,2021-11-29,7203,0.0,0,10155.0,10205.0,10025.0,10060.0,...,4.626824,1.524484e+12,1.548130e+13,9.847262,69.796131,95.087718,-12.964213,9.216322,10231.416667,-171.416667
1198,1198,20211130_7203,2021-11-30,7203,0.0,0,10257.5,10307.5,9995.0,10005.0,...,4.626824,1.524484e+12,1.548130e+13,9.847262,69.796131,95.087718,-12.964213,9.210840,10232.666667,-227.666667
1199,1199,20211201_7203,2021-12-01,7203,0.0,0,10037.5,10370.0,10035.0,10227.5,...,4.626824,1.524484e+12,1.548130e+13,9.847262,69.796131,95.087718,-12.964213,9.232835,10233.583333,-6.083333
1200,1200,20211202_7203,2021-12-02,7203,0.0,0,10150.0,10255.0,10025.0,10227.5,...,4.626824,1.524484e+12,1.548130e+13,9.847262,69.796131,95.087718,-12.964213,9.232835,10236.166667,-8.666667


In [43]:
test = df.detrend.groupby(df.index//30).mean()
test = test.to_list()
tempList = test
count = 30
test = test + 29 * test

AttributeError: 'DataFrame' object has no attribute 'detrend'

In [None]:
test = test[ : len(df)]
df['season'] = test
df['error'] = df.Close - df.trend - df.season

In [None]:
df.drop(fea_to_remove, axis=1, inplace=True)