In [1]:
import pandas as pd
import numpy as np

In [2]:
def rolling_feature(df, window=50, op='mean'):
#input: a stock_prices-like df, the window param for rolling and type of aggregation
#output: df with a merged column of the needed rolling feature
    if op == 'mean':
        sma = df.set_index('Date').groupby(
            ['SecuritiesCode'])['Close'].rolling(
            window=window,min_periods=1).mean()
        sma.name = 'Sliding_mean_' + str(window)
        result = df.merge(sma, how='left', left_on=['SecuritiesCode','Date'], right_index=True)
        return result
    elif op == 'median':
        sma = df.set_index('Date').groupby(
            ['SecuritiesCode'])['Close'].rolling(
            window=window,min_periods=1).apply(np.median, raw=True, engine='numba')
        sma.name = 'Sliding_median_' + str(window)
        result = df.merge(sma, how='left', left_on=['SecuritiesCode','Date'], right_index=True)
        return result
    else:
        print('Invalid input')

In [20]:
def lags(df, lag=1, column='Close'):
#input: stock-price-like df with the columns Date, SecuritiesCode and the column to lag
#output: input df with the additional lagged column
    if 'Day_number' in df.columns:
        result = df
    else:
        prices_by_date = df.groupby(['Date']).agg({'SecuritiesCode' : 'count'}).reset_index()
        prices_by_date['Day_number'] = prices_by_date.index
        result = df.merge(prices_by_date[['Date', 'Day_number']], how='left', on='Date')
        del(prices_by_date)
        
    stock_prices_to_lag = result[['Day_number', 'SecuritiesCode', column]]
    stock_prices_to_lag['Day_number'] += lag
    stock_prices_to_lag.rename(columns={column : column + '_lag_' + str(lag)}, inplace=True)
    result = result.merge(
        stock_prices_to_lag[['Day_number', 'SecuritiesCode', column + '_lag_' + str(lag)]], 
        how='left', on=['Day_number', 'SecuritiesCode'])
    del(stock_prices_to_lag)
    return result

In [21]:
#weighted difference between yesterday's close and today's open
stock_prices = pd.read_csv('files/train_files/stock_prices.csv', index_col=0, parse_dates=[1])
stock_prices = lags(stock_prices, lag = 1, column='Close')
stock_prices['Gap'] = (stock_prices['Open'] - stock_prices['Close_lag_1']) / stock_prices['Close_lag_1']
stock_prices.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_prices_to_lag['Day_number'] += lag
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_prices_to_lag.rename(columns={column : column + '_lag_' + str(lag)}, inplace=True)


Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Day_number,Close_lag_1,Gap
1225643,2019-08-20,9001,3205.0,3225.0,3200.0,3220.0,199600,1.0,,False,0.004695,643,3230.0,-0.00774
2084102,2021-06-03,8098,1724.0,1740.0,1718.0,1734.0,120700,1.0,,False,-0.001151,1077,1724.0,0.0
2123484,2021-07-01,6036,2920.0,2920.0,2833.0,2836.0,125100,1.0,,False,-0.012007,1097,2900.0,0.006897
1233189,2019-08-26,8005,315.0,318.0,314.0,317.0,108200,1.0,,False,-0.006309,647,321.0,-0.018692
922630,2018-12-21,5949,1823.0,1832.0,1790.0,1804.0,334700,1.0,,False,0.029046,487,1840.0,-0.009239
2204413,2021-08-31,9532,2068.0,2076.0,2052.0,2063.0,1106800,1.0,,False,0.004812,1137,2083.0,-0.007201
1462713,2020-02-21,2002,1946.0,1955.0,1928.0,1931.0,810400,1.0,,False,0.007551,765,1956.0,-0.005112
2133440,2021-07-08,5801,2721.0,2729.0,2682.0,2708.0,592400,1.0,,False,0.034019,1102,2738.0,-0.006209
428719,2017-12-05,6151,2891.0,2928.0,2850.0,2919.0,36500,1.0,,False,0.005478,228,2904.0,-0.004477
583824,2018-04-06,4350,542.0,545.0,532.0,532.0,53600,1.0,,False,0.003788,310,543.0,-0.001842


In [22]:
#candles
def candle(row):
#function to use in .apply
#row must contain 3 lags for open and close
#source: https://www.investopedia.com/trading/candlestick-charting-what-is-it/
    seq = ''
    if row['Close_lag_3'] > row['Open_lag_3']:
        seq += 'g'
    elif row['Close_lag_3'] < row['Open_lag_3']:
        seq += 'r'
        
    if row['Close_lag_2'] > row['Open_lag_2']:
        seq += 'g'
    elif row['Close_lag_2'] < row['Open_lag_2']:
        seq += 'r'
        
    if row['Close_lag_1'] > row['Open_lag_1']:
        if abs(row['Close_lag_1'] - row['Open_lag_1']) < abs(row['Close_lag_2'] - row['Open_lag_2']):
            seq += 'g'
        else: 
            seq += 'G'
    elif row['Close_lag_1'] < row['Open_lag_1']:
        if abs(row['Close_lag_1'] - row['Open_lag_1']) < abs(row['Close_lag_2'] - row['Open_lag_2']):
            seq += 'r'
        else:
            seq += 'R'
    
    if row['Close'] > row['Open']:
        if (row['Close_lag_1'] < row['Close'] < row['Open_lag_1'] and
            row['Close_lag_1'] < row['Open'] < row['Open_lag_1']):
            seq += 'g_e'
        else:
            seq += 'g'
    else:
        if (row['Open_lag_1'] < row['Close'] < row['Close_lag_1'] and
            row['Open_lag_1'] < row['Open'] < row['Close_lag_1']):
            seq += 'r_e'
        else:
            seq += 'r'
            
    #outcomes
    if seq == 'ggGr_e':#Bearish Harami
        return -1
    elif seq == 'gggr': #Bearish Engulfing Pattern
        return -1
    elif seq == 'ggrr': #Bearish Evening Star
        return -1
    elif seq == 'rrrg': #Bullish Engulfing Pattern
        return 1
    elif seq == 'rrRg_e': #Bullish Harami
        return 1
    else:
        return 0
        

In [26]:
%%time
stock_prices = pd.read_csv('files/train_files/stock_prices.csv', index_col=0, parse_dates=[1])
for i in range(1, 4):
    stock_prices = lags(stock_prices, lag=i, column='Close')
    stock_prices = lags(stock_prices, lag=i, column='Open')
stock_prices

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_prices_to_lag['Day_number'] += lag
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_prices_to_lag.rename(columns={column : column + '_lag_' + str(lag)}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_prices_to_lag['Day_number'] += lag
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the d

CPU times: user 4.99 s, sys: 2.22 s, total: 7.21 s
Wall time: 7.87 s


Unnamed: 0,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target,Day_number,Close_lag_1,Open_lag_1,Close_lag_2,Open_lag_2,Close_lag_3,Open_lag_3
0,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.000730,0,,,,,,
1,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324,0,,,,,,
2,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154,0,,,,,,
3,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053,0,,,,,,
4,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026,0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816,1201,507.0,518.0,522.0,520.0,520.0,529.0
2332527,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478,1201,778.0,777.0,770.0,768.0,768.0,786.0
2332528,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302,1201,1650.0,1672.0,1671.0,1676.0,1680.0,1680.0
2332529,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098,1201,2362.0,2340.0,2340.0,2327.0,2328.0,2376.0


In [25]:
stock_prices['Candle'].value_counts()

 0    2133928
-1     127704
 1      70899
Name: Candle, dtype: int64