# Feature engineering

Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Daily data

In [3]:
# Read the cleaned daily price table from CSV, parsing the Date column as datetimes.
daily_data = pd.read_csv('daily_data_cleaned.csv', parse_dates=['Date'])

In [4]:
# Display the loaded daily frame for a visual check.
daily_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,transactions,ticker
0,2015-02-17 05:00:00,31.8725,32.2200,31.730000,31.9575,252516820.0,310744,AAPL
1,2015-02-18 05:00:00,31.9063,32.1950,31.862500,32.1788,179558052.0,226461,AAPL
2,2015-02-19 05:00:00,32.1200,32.2575,32.082500,32.1125,149449524.0,199195,AAPL
3,2015-02-20 05:00:00,32.1550,32.3750,32.012500,32.3738,195793676.0,236741,AAPL
4,2015-02-23 05:00:00,32.5050,33.2500,32.415000,33.2500,283896440.0,340905,AAPL
...,...,...,...,...,...,...,...,...
224025,2025-02-07 05:00:00,207.0200,210.2300,204.420000,205.5300,1476998.0,38798,ZS
224026,2025-02-10 05:00:00,209.2900,212.6900,207.480000,212.5300,1350518.0,32807,ZS
224027,2025-02-11 05:00:00,210.0000,211.3011,207.060000,209.2400,1672527.0,30743,ZS
224028,2025-02-12 05:00:00,206.7700,211.3700,204.640000,211.1400,1363301.0,31255,ZS


### Creating lag feature

In [5]:
# Previous-row Close / Volume, computed separately for every ticker.
# daily_data stacks many tickers in one frame, so a plain `.shift(1)` would
# hand the last row of one ticker to the first row of the next one.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
ticker_key = daily_data['ticker']
for source_col, lag_col in (('Close', 'close_lag'), ('Volume', 'volume_lag')):
    lagged = daily_data[source_col].groupby(ticker_key).shift(1)
    # A ticker's first row has no predecessor; back-filling inside the ticker
    # gives that row the value of the row after it, i.e. its own Close / Volume.
    daily_data[lag_col] = lagged.groupby(ticker_key).bfill()

### Creating difference feature

In [6]:
# Row-to-row change in Close / Volume, computed separately for every ticker so
# that the first row of a ticker is not differenced against the previous
# ticker's last row.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
ticker_key = daily_data['ticker']
for source_col, diff_col in (('Close', 'close_diff'), ('Volume', 'volume_diff')):
    # The first row of each ticker has no previous value. It is filled with 0
    # (no observed change) rather than back-filled: a back-fill would copy the
    # NEXT row's change into this row, which is look-ahead, and would disagree
    # with the lag features, where the first row's lag equals its own value.
    daily_data[diff_col] = (
        daily_data[source_col].groupby(ticker_key).diff().fillna(0.0)
    )

### Technical indicator: Creating rolling window feature

In [7]:
# Rolling mean / std / max of Close over 10- and 30-row windows, computed
# separately for every ticker so a window never mixes prices of two tickers.
# Produces close_{mean,std,max}_{10,30}_days.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
ticker_key = daily_data['ticker']
close_by_ticker = daily_data['Close'].groupby(ticker_key)
for window in (10, 30):
    for stat in ('mean', 'std', 'max'):
        rolled = close_by_ticker.transform(
            lambda close: getattr(close.rolling(window=window), stat)()
        )
        # The first (window - 1) rows of each ticker have no full window yet.
        # They are back-filled from that ticker's first complete window.
        # NOTE(review): this back-fill gives warm-up rows a statistic that
        # includes later prices; drop those rows instead if that matters.
        daily_data['close_{}_{}_days'.format(stat, window)] = (
            rolled.groupby(ticker_key).bfill()
        )


### Technical indicator: Exponential Moving Average

In [8]:
# Exponential moving average of Close (spans 10 and 30), restarted for every
# ticker. With adjust=False each value is a recursion on the previous one, so
# running it over the whole column would seed each ticker with the previous
# ticker's average.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
close_by_ticker = daily_data['Close'].groupby(daily_data['ticker'])
for span in (10, 30):
    daily_data['EMA_close_{}_days'.format(span)] = close_by_ticker.transform(
        lambda close: close.ewm(span=span, adjust=False).mean()
    )

### Momentum indicator: RSI

In [9]:
def calculate_rsi_optimized(prices, period=14):
    """
    Relative Strength Index built on pandas' exponentially weighted mean.

    Upward and downward moves of ``prices`` are smoothed separately with a
    recursive EWM (``alpha = 1 / period``, ``adjust=False``), which is the
    recursion used by Wilder's smoothing, and then combined as
    ``100 - 100 / (1 + RS)`` with ``RS = smoothed gain / smoothed loss``.

    Parameters
    ----------
    prices : pandas.Series
        Price series; every value is compared with the one before it.
    period : int, default 14
        Smoothing period; the weight of the newest value is ``1 / period``.

    Returns
    -------
    pandas.Series
        RSI values on the index of ``prices``. Entries where both smoothed
        averages are zero, such as the first one, come out as NaN.
    """
    change = prices.diff()

    # Split every change into its upward and its downward part. The NaN change
    # of the first row fails both comparisons and is therefore counted as 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    def _smooth(moves):
        # Recursive EWM: new = (1 - 1/period) * old + (1/period) * value.
        return moves.ewm(alpha=1.0 / period, adjust=False).mean()

    relative_strength = _smooth(up_moves) / _smooth(down_moves)
    return 100 - (100 / (1 + relative_strength))

In [10]:
# 14-period RSI of Close, computed separately for every ticker: the RSI is
# built from row-to-row price changes and a recursive average, both of which
# would otherwise run across ticker boundaries.
# The first row of each ticker stays NaN (no previous price to compare with).
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
daily_data['RSI_14_days'] = (
    daily_data['Close']
    .groupby(daily_data['ticker'])
    .transform(lambda close: calculate_rsi_optimized(close, period=14))
)

### Volatility measure: Bollinger bands

In [11]:
def fast_bollinger_features(prices, period=20, std_dev=2):
    """
    Bollinger-band features of a price series.

    Parameters
    ----------
    prices : pandas.Series
        Price series the rolling window is applied to.
    period : int, default 20
        Rolling window length; also used as the suffix of the column names.
    std_dev : float, default 2
        Number of rolling standard deviations between the middle band and
        each outer band.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``bb_percent_b_<period>`` (position of the price
        between the lower and upper band), ``bb_bandwidth_<period>`` (band
        width divided by the middle band), ``bb_upper_<period>``,
        ``bb_lower_<period>`` and ``bb_middle_<period>`` (rolling mean).
        Rows without a full window are NaN.
    """
    window = prices.rolling(period)
    middle = window.mean()
    offset = window.std() * std_dev

    upper = middle + offset
    lower = middle - offset
    band_width = upper - lower

    suffix = '_{}'.format(period)
    features = pd.DataFrame({
        'bb_percent_b' + suffix: (prices - lower) / band_width,
        'bb_bandwidth' + suffix: band_width / middle,
        'bb_upper' + suffix: upper,
        'bb_lower' + suffix: lower,
        'bb_middle' + suffix: middle,
    })
    return features

In [12]:
# 20-period Bollinger features, computed one ticker at a time so that no
# rolling window mixes the prices of two different tickers. The per-ticker
# pieces are stacked and put back into daily_data's row order, so the result
# lines up with daily_data row for row.
# NOTE(review): assumes daily_data has a unique row index and rows are in
# chronological order within each ticker — confirm.
fast_bollinger_features_20 = pd.concat(
    [
        fast_bollinger_features(ticker_close, period=20, std_dev=2)
        for _, ticker_close in daily_data['Close'].groupby(daily_data['ticker'])
    ]
).reindex(daily_data.index)

In [13]:
# Append the 20-period Bollinger columns to daily_data (axis=1 places them
# side by side, aligned on the row index). Running this cell a second time
# appends the same columns again.
daily_data = pd.concat([daily_data, fast_bollinger_features_20], axis=1)

In [14]:
# Fill the rows that have no complete 20-row window yet. The back-fill is done
# inside each ticker, so a ticker with too few rows keeps NaN instead of
# borrowing the next ticker's bands.
# NOTE(review): back-filled rows carry values computed from later prices.
bb_cols_20 = ['bb_percent_b_20', 'bb_bandwidth_20', 'bb_upper_20', 'bb_lower_20', 'bb_middle_20']
daily_data[bb_cols_20] = daily_data.groupby('ticker')[bb_cols_20].bfill()

In [15]:
# 50-period Bollinger features, computed one ticker at a time so that no
# rolling window mixes the prices of two different tickers, then put back into
# daily_data's row order.
# NOTE(review): assumes daily_data has a unique row index and rows are in
# chronological order within each ticker — confirm.
fast_bollinger_features_50 = pd.concat(
    [
        fast_bollinger_features(ticker_close, period=50, std_dev=2)
        for _, ticker_close in daily_data['Close'].groupby(daily_data['ticker'])
    ]
).reindex(daily_data.index)
daily_data = pd.concat([daily_data, fast_bollinger_features_50], axis=1)

# Fill the rows without a complete 50-row window, staying inside each ticker
# so a short ticker keeps NaN instead of borrowing the next ticker's bands.
# NOTE(review): back-filled rows carry values computed from later prices.
bb_cols_50 = ['bb_percent_b_50', 'bb_bandwidth_50', 'bb_upper_50', 'bb_lower_50', 'bb_middle_50']
daily_data[bb_cols_50] = daily_data.groupby('ticker')[bb_cols_50].bfill()

In [16]:
# Display daily_data with the feature columns added so far.
daily_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,transactions,ticker,close_lag,volume_lag,...,bb_percent_b_20,bb_bandwidth_20,bb_upper_20,bb_lower_20,bb_middle_20,bb_percent_b_50,bb_bandwidth_50,bb_upper_50,bb_lower_50,bb_middle_50
0,2015-02-17 05:00:00,31.8725,32.2200,31.730000,31.9575,252516820.0,310744,AAPL,31.9575,252516820.0,...,0.250741,0.086711,33.311811,30.543329,31.92757,0.852169,0.075919,32.996786,30.583322,31.790054
1,2015-02-18 05:00:00,31.9063,32.1950,31.862500,32.1788,179558052.0,226461,AAPL,31.9575,252516820.0,...,0.250741,0.086711,33.311811,30.543329,31.92757,0.852169,0.075919,32.996786,30.583322,31.790054
2,2015-02-19 05:00:00,32.1200,32.2575,32.082500,32.1125,149449524.0,199195,AAPL,32.1788,179558052.0,...,0.250741,0.086711,33.311811,30.543329,31.92757,0.852169,0.075919,32.996786,30.583322,31.790054
3,2015-02-20 05:00:00,32.1550,32.3750,32.012500,32.3738,195793676.0,236741,AAPL,32.1125,149449524.0,...,0.250741,0.086711,33.311811,30.543329,31.92757,0.852169,0.075919,32.996786,30.583322,31.790054
4,2015-02-23 05:00:00,32.5050,33.2500,32.415000,33.2500,283896440.0,340905,AAPL,32.3738,195793676.0,...,0.250741,0.086711,33.311811,30.543329,31.92757,0.852169,0.075919,32.996786,30.583322,31.790054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224025,2025-02-07 05:00:00,207.0200,210.2300,204.420000,205.5300,1476998.0,38798,ZS,204.2300,1201540.0,...,0.828766,0.149001,210.529013,181.334987,195.93200,0.750671,0.194046,215.012585,176.980215,195.996400
224026,2025-02-10 05:00:00,209.2900,212.6900,207.480000,212.5300,1350518.0,32807,ZS,205.5300,1476998.0,...,0.981333,0.163103,213.129989,180.989011,197.05950,0.930457,0.195428,215.194284,176.882916,196.038600
224027,2025-02-11 05:00:00,210.0000,211.3011,207.060000,209.2400,1672527.0,30743,ZS,212.5300,1350518.0,...,0.842081,0.162877,214.337893,182.056107,198.19700,0.846259,0.194814,215.110908,176.923892,196.017400
224028,2025-02-12 05:00:00,206.7700,211.3700,204.640000,211.1400,1363301.0,31255,ZS,209.2400,1672527.0,...,0.867455,0.158502,215.331635,183.707365,199.51950,0.886428,0.197891,215.548224,176.733776,196.141000


In [17]:
# Write the daily frame with its feature columns to CSV; index=False leaves
# the row index out of the file.
daily_data.to_csv('daily_data_with_features.csv', index=False)

## Minute data

In [18]:
# Read the cleaned minute-level table from CSV, parsing the Date column as datetimes.
minute_data = pd.read_csv('cleaned_minute_data.csv', parse_dates=['Date'])

### Price movement at different intervals

In [19]:
# Percentage change of Close over the previous 1, 5, 15 and 30 rows, computed
# separately for every ticker. Over the whole column, the first rows of a
# ticker would be compared with the last prices of the previous ticker, which
# yields meaningless (and potentially huge) returns.
# NOTE(review): the lookback counts rows, so it equals minutes only if there is
# one row per minute; also assumes chronological order within each ticker — confirm.
ticker_key = minute_data['ticker']
close_by_ticker = minute_data['Close'].groupby(ticker_key)
for lookback in (1, 5, 15, 30):
    row_return = close_by_ticker.pct_change(lookback)
    # The first `lookback` rows of each ticker have nothing to compare with;
    # back-fill them inside the ticker.
    # NOTE(review): the back-filled rows carry a later row's return.
    minute_data['return_{}min'.format(lookback)] = (
        row_return.groupby(ticker_key).bfill()
    )

### Price velocity and acceleration

In [20]:
# Velocity = 5-row rolling mean of the 1-row return; acceleration = row-to-row
# change of that velocity. Both are computed separately for every ticker so a
# window or difference never spans two tickers.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
ticker_key = minute_data['ticker']
velocity = (
    minute_data['return_1min']
    .groupby(ticker_key)
    .transform(lambda returns: returns.rolling(5).mean())
)
# Difference the un-filled velocity, then fill both series.
acceleration = velocity.groupby(ticker_key).diff()

# Warm-up rows of each ticker are back-filled from within that ticker.
minute_data['velocity_5min'] = velocity.groupby(ticker_key).bfill()
minute_data['acceleration_5min'] = acceleration.groupby(ticker_key).bfill()

In [21]:
# Volume features, computed separately for every ticker so that neither the
# rate of change nor the moving average mixes the volumes of two tickers.
# Warm-up rows are left as NaN here; a later cell of this notebook fills them.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
volume_by_ticker = minute_data['Volume'].groupby(minute_data['ticker'])

# Percentage change of Volume versus 5 and 15 rows earlier.
minute_data['volume_roc_5min'] = volume_by_ticker.pct_change(5)
minute_data['volume_roc_15min'] = volume_by_ticker.pct_change(15)

# 20-row moving average of Volume, and the current Volume relative to it.
minute_data['volume_ma_20'] = volume_by_ticker.transform(
    lambda volume: volume.rolling(20).mean()
)
minute_data['volume_ratio'] = minute_data['Volume'] / minute_data['volume_ma_20']

In [22]:
# Realized volatility: rolling standard deviation of the 1-row return over
# 5 / 15 / 60 rows, scaled by sqrt(window). Computed separately for every
# ticker so a window never spans two tickers.
# Warm-up rows are left as NaN here; a later cell of this notebook fills them.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
returns_by_ticker = minute_data['return_1min'].groupby(minute_data['ticker'])
for window in (5, 15, 60):
    rolling_std = returns_by_ticker.transform(
        lambda returns: returns.rolling(window).std()
    )
    minute_data['realized_vol_{}min'.format(window)] = rolling_std * np.sqrt(window)

In [23]:
# Fill the NaN warm-up rows of the rolling / rate-of-change features. The
# back-fill is done inside each ticker, so a ticker with fewer rows than a
# window keeps NaN instead of receiving values from the next ticker.
# NOTE(review): back-filled rows carry values computed from later rows.
warmup_cols = [
    'realized_vol_5min',
    'realized_vol_15min',
    'realized_vol_60min',
    'volume_ratio',
    'volume_ma_20',
    'volume_roc_15min',
    'volume_roc_5min',
]
minute_data[warmup_cols] = minute_data.groupby('ticker')[warmup_cols].bfill()

In [24]:
# Show the first rows of minute_data with the new feature columns.
minute_data.head()

Unnamed: 0,Date,ticker,Open,High,Low,Close,Volume,Transactions,TradeDate,return_1min,...,return_30min,velocity_5min,acceleration_5min,volume_roc_5min,volume_roc_15min,volume_ma_20,volume_ratio,realized_vol_5min,realized_vol_15min,realized_vol_60min
0,2015-02-17 09:00:00,AAPL_minute_data,31.65,32.0,31.65,32.0,6176.0,10.0,2015-02-17,-0.007031,...,-0.004375,-0.002388,0.001406,-0.935233,-0.45013,2850.4,1.683974,0.009675,0.010117,0.010475
1,2015-02-17 09:01:00,AAPL_minute_data,31.775,31.8375,31.77,31.775,3700.0,8.0,2015-02-17,-0.007031,...,-0.004375,-0.002388,0.001406,-0.935233,-0.45013,2850.4,1.683974,0.009675,0.010117,0.010475
2,2015-02-17 09:02:00,AAPL_minute_data,31.775,31.8375,31.77,31.775,3700.0,8.0,2015-02-17,0.0,...,-0.004375,-0.002388,0.001406,-0.935233,-0.45013,2850.4,1.683974,0.009675,0.010117,0.010475
3,2015-02-17 09:03:00,AAPL_minute_data,31.8425,31.8425,31.8425,31.8425,400.0,1.0,2015-02-17,0.002124,...,-0.004375,-0.002388,0.001406,-0.935233,-0.45013,2850.4,1.683974,0.009675,0.010117,0.010475
4,2015-02-17 09:04:00,AAPL_minute_data,31.8425,31.8425,31.8425,31.8425,400.0,1.0,2015-02-17,0.0,...,-0.004375,-0.002388,0.001406,-0.935233,-0.45013,2850.4,1.683974,0.009675,0.010117,0.010475


In [25]:
# Remove the literal '_minute_data' suffix from the ticker labels (for example
# 'AAPL_minute_data' -> 'AAPL') so they match the tickers used in daily_data.
# Only the `ticker` column is touched: looping over every object-dtype column
# would also hit columns that do not hold strings, such as the `timestamp`
# column of date objects added further down, and `.str` raises on those if
# this cell is run again.
minute_data['ticker'] = minute_data['ticker'].str.replace('_minute_data', '', regex=False)


In [26]:
# List the column names currently present in minute_data.
minute_data.columns

Index(['Date', 'ticker', 'Open', 'High', 'Low', 'Close', 'Volume',
       'Transactions', 'TradeDate', 'return_1min', 'return_5min',
       'return_15min', 'return_30min', 'velocity_5min', 'acceleration_5min',
       'volume_roc_5min', 'volume_roc_15min', 'volume_ma_20', 'volume_ratio',
       'realized_vol_5min', 'realized_vol_15min', 'realized_vol_60min'],
      dtype='object')

In [27]:
# Make sure Date is a datetime column so the `.dt` accessor can be used on it.
# The loading cell already passes parse_dates=['Date'], so this presumably
# leaves the values unchanged.
minute_data['Date'] = pd.to_datetime(minute_data['Date'])

In [28]:
# Calendar date (time of day dropped) of every minute row; used below as the
# per-day grouping key and as the merge key with the daily data.
# NOTE(review): the aggregated output contains 2015-02-21, a Saturday, so some
# rows presumably land on a different calendar date than their trading
# session — confirm the timezone of these timestamps.
minute_data['timestamp'] = minute_data['Date'].dt.date

In [29]:
# Every minute-level feature is reduced to one value per (ticker, calendar
# day) by taking its arithmetic mean over that day's rows. For `return_1min`
# this is the average per-row return of the day, not the day's total return.
agg_funcs = dict.fromkeys(
    [
        'return_1min',
        'return_5min',
        'return_15min',
        'return_30min',
        'velocity_5min',
        'acceleration_5min',
        'volume_roc_5min',
        'volume_roc_15min',
        'volume_ma_20',
        'volume_ratio',
        'realized_vol_5min',
        'realized_vol_15min',
        'realized_vol_60min',
    ],
    'mean',
)

# One row per (ticker, timestamp); reset_index turns the two keys back into
# ordinary columns.
minute_to_daily = (
    minute_data
    .groupby(['ticker', 'timestamp'])
    .agg(agg_funcs)
    .reset_index()
)

In [30]:
#minute_data.to_csv('minute_data_with_features.csv', index=False)

In [31]:
# Show the first rows of the per-day aggregates built from the minute data.
minute_to_daily.head()

Unnamed: 0,ticker,timestamp,return_1min,return_5min,return_15min,return_30min,velocity_5min,acceleration_5min,volume_roc_5min,volume_roc_15min,volume_ma_20,volume_ratio,realized_vol_5min,realized_vol_15min,realized_vol_60min
0,AAPL,2015-02-17,-1.002309e-05,-1.2e-05,-2.9e-05,-5.9e-05,-1.016938e-05,1.044803e-05,1.424515,3.450577,268059.308667,1.074401,0.000882,0.001692,0.003843
1,AAPL,2015-02-18,4.662713e-06,2.3e-05,6.9e-05,0.000137,4.66271e-06,2.167705e-08,2.519191,3.893001,121365.304444,1.04007,0.000415,0.000823,0.001882
2,AAPL,2015-02-19,-7.276972e-07,-3e-06,-9e-06,-1.6e-05,-6.196074e-07,-2.160685e-08,2.540214,4.339169,101424.814306,1.048567,0.000312,0.000603,0.001348
3,AAPL,2015-02-20,5.525064e-06,2.7e-05,8e-05,0.000159,5.422556e-06,1.362485e-09,3.471187,4.291842,142732.5425,1.05025,0.000317,0.000619,0.001347
4,AAPL,2015-02-21,6.500368e-06,2.6e-05,4.6e-05,0.000105,5.296778e-06,5.006509e-07,2.074813,0.823907,3884.010345,1.107219,0.000124,0.00025,0.00058


In [32]:
# Calendar date of every daily row, built the same way as the `timestamp` key
# of the minute aggregates so the two frames can be merged on (ticker, timestamp).
daily_data['timestamp'] = daily_data['Date'].dt.date

### Merging

In [33]:
# Attach the per-day minute aggregates to the daily rows. The left join keeps
# every daily row; days without minute aggregates get NaN in the new columns.
merged_data = daily_data.merge(
    minute_to_daily,
    how='left',
    on=['ticker', 'timestamp'],
)

In [34]:
# Prediction target: the Close of the NEXT row of the same ticker.
# The shift is done per ticker; a plain `.shift(-1)` would give the last row
# of each ticker the first Close of the following ticker as its target.
# The last row of every ticker has no next row and gets NaN — drop those rows
# before training.
# NOTE(review): assumes rows are in chronological order within each ticker — confirm.
merged_data['Target'] = merged_data.groupby('ticker')['Close'].shift(-1)

In [35]:
# Display the merged frame, including the new Target column.
merged_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,transactions,ticker,close_lag,volume_lag,...,velocity_5min,acceleration_5min,volume_roc_5min,volume_roc_15min,volume_ma_20,volume_ratio,realized_vol_5min,realized_vol_15min,realized_vol_60min,Target
0,2015-02-17 05:00:00,31.8725,32.2200,31.730000,31.9575,252516820.0,310744,AAPL,31.9575,252516820.0,...,-1.016938e-05,1.044803e-05,1.424515,3.450577,268059.308667,1.074401,0.000882,0.001692,0.003843,32.1788
1,2015-02-18 05:00:00,31.9063,32.1950,31.862500,32.1788,179558052.0,226461,AAPL,31.9575,252516820.0,...,4.662710e-06,2.167705e-08,2.519191,3.893001,121365.304444,1.040070,0.000415,0.000823,0.001882,32.1125
2,2015-02-19 05:00:00,32.1200,32.2575,32.082500,32.1125,149449524.0,199195,AAPL,32.1788,179558052.0,...,-6.196074e-07,-2.160685e-08,2.540214,4.339169,101424.814306,1.048567,0.000312,0.000603,0.001348,32.3738
3,2015-02-20 05:00:00,32.1550,32.3750,32.012500,32.3738,195793676.0,236741,AAPL,32.1125,149449524.0,...,5.422556e-06,1.362485e-09,3.471187,4.291842,142732.542500,1.050250,0.000317,0.000619,0.001347,33.2500
4,2015-02-23 05:00:00,32.5050,33.2500,32.415000,33.2500,283896440.0,340905,AAPL,32.3738,195793676.0,...,2.928986e-05,-3.252393e-08,3.103435,8.557640,293445.295778,1.051464,0.000598,0.001138,0.002426,33.0425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224025,2025-02-07 05:00:00,207.0200,210.2300,204.420000,205.5300,1476998.0,38798,ZS,204.2300,1201540.0,...,-2.757734e-06,-4.321096e-22,0.758244,1.852283,1327.261594,1.030114,0.000702,0.001361,0.003100,212.5300
224026,2025-02-10 05:00:00,209.2900,212.6900,207.480000,212.5300,1350518.0,32807,ZS,205.5300,1476998.0,...,6.608178e-05,0.000000e+00,0.273509,0.918532,603.234669,1.082622,0.000925,0.002445,0.008193,209.2400
224027,2025-02-11 05:00:00,210.0000,211.3011,207.060000,209.2400,1672527.0,30743,ZS,212.5300,1350518.0,...,,,,,,,,,,211.1400
224028,2025-02-12 05:00:00,206.7700,211.3700,204.640000,211.1400,1363301.0,31255,ZS,209.2400,1672527.0,...,,,,,,,,,,214.8400


In [36]:
# Write the merged feature table (including Target) to CSV; index=False
# leaves the row index out of the file.
merged_data.to_csv('merged_data_with_features.csv', index=False)