In [1]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np

In [10]:
data = pd.read_csv('/Users/amulya/Desktop/Capstone/DSCI-601-Amy/Data/Combined/Cleaned_AAPL.csv')
data['date'] = pd.to_datetime(data['date']) 

In [11]:
data

Unnamed: 0,date,RET,VOL_CHANGE,BA_SPREAD,ILLIQUIDITY,sprtrn,TURNOVER,DJI_Return
0,1992-01-02,0.055432,0.717745,0.008403,4.510000e-10,0.000408,17.419850,0.000000
1,1992-01-03,-0.008403,-0.172890,0.004237,-8.340000e-11,0.004985,14.408127,0.009173
2,1992-01-06,-0.016949,-0.399632,0.004310,-2.850000e-10,-0.003291,8.650181,-0.000437
3,1992-01-07,0.019397,0.237283,0.004228,2.590000e-10,-0.001340,10.702726,0.001469
4,1992-01-08,0.023256,0.645321,0.004132,1.840000e-10,0.001677,17.609419,-0.000281
...,...,...,...,...,...,...,...,...
7801,2022-12-23,-0.002798,-0.181476,0.000076,-3.330000e-13,0.005868,4.008909,0.005342
7802,2022-12-27,-0.013878,0.081093,0.000231,-1.550000e-12,-0.004050,4.334004,0.001133
7803,2022-12-28,-0.030685,0.238299,0.000079,-2.850000e-12,-0.012021,5.366792,-0.011006
7804,2022-12-29,0.028324,-0.115337,0.000231,2.890000e-12,0.017461,4.747802,0.010497


### Time based features

In [12]:
data['day_of_week'] = data['date'].dt.dayofweek
data['day_of_month'] = data['date'].dt.day
data['month'] = data['date'].dt.month


In [14]:
data['is_month_start'] = data['date'].dt.is_month_start.astype(int)
data['is_month_end'] = data['date'].dt.is_month_end.astype(int)


In [15]:
data['year'] = data['date'].dt.year
data['week'] = data['date'].dt.isocalendar().week


In [29]:
data.isnull().sum()

date                       0
RET                        0
VOL_CHANGE                 0
BA_SPREAD                  0
ILLIQUIDITY                0
sprtrn                     0
TURNOVER                   0
DJI_Return                 0
day_of_week                0
day_of_month               0
month                      0
is_month_start             0
is_month_end               0
year                       0
week                       0
RET_ema_12                 0
RET_ema_26                 0
RET_cumulative_sum         0
RET_cumulative_product     0
RET_skew_10                9
RET_kurtosis_10            9
RET_entropy_10             9
RSI                       13
OBV                        1
dtype: int64

### Cumulative features

In [18]:
#Exponential Moving Average (EMA):
#Smooths out short-term fluctuations and gives more weight to recent observations.

data['RET_ema_12'] = data['RET'].ewm(span=12, adjust=False).mean()
data['RET_ema_26'] = data['RET'].ewm(span=26, adjust=False).mean()


### Statistical Features

In [23]:
# Skewness and Kurtosis: Measure the asymmetry and "tailedness" of the distribution.

data['RET_skew_10'] = data['RET'].rolling(window=10).skew()
data['RET_kurtosis_10'] = data['RET'].rolling(window=10).kurt()


In [24]:
# Entropy:Measures the uncertainty or randomness in the series.
from scipy.stats import entropy
data['RET_entropy_10'] = data['RET'].rolling(window=10).apply(lambda x: entropy(pd.value_counts(x)))


  data['RET_entropy_10'] = data['RET'].rolling(window=10).apply(lambda x: entropy(pd.value_counts(x)))


### Technical Indicators

In [26]:
# RSI: Measures the speed and change of price movements to identify overbought or oversold conditions.
delta = data['RET'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
data['RSI'] = 100 - (100 / (1 + gain / loss))


In [27]:
#On-Balance Volume (OBV): A momentum indicator that relates volume to price changes.
data['OBV'] = (data['RET'].diff() * data['VOL_CHANGE']).cumsum()


In [34]:
data.to_csv('f_eng_data_AAPL.csv', index=False)

In [35]:
data

Unnamed: 0,date,RET,VOL_CHANGE,BA_SPREAD,ILLIQUIDITY,sprtrn,TURNOVER,DJI_Return,day_of_week,day_of_month,...,week,RET_ema_12,RET_ema_26,RET_cumulative_sum,RET_cumulative_product,RET_skew_10,RET_kurtosis_10,RET_entropy_10,RSI,OBV
0,1992-01-02,0.055432,0.717745,0.008403,4.510000e-10,0.000408,17.419850,0.000000,3,2,...,1,0.055432,0.055432,0.055432,5.543200e-02,,,,,
1,1992-01-03,-0.008403,-0.172890,0.004237,-8.340000e-11,0.004985,14.408127,0.009173,4,3,...,1,0.045611,0.050703,0.047029,-4.657951e-04,,,,,0.011036
2,1992-01-06,-0.016949,-0.399632,0.004310,-2.850000e-10,-0.003291,8.650181,-0.000437,0,6,...,2,0.035987,0.045692,0.030080,7.894761e-06,,,,,0.014452
3,1992-01-07,0.019397,0.237283,0.004228,2.590000e-10,-0.001340,10.702726,0.001469,1,7,...,2,0.033434,0.043744,0.049477,1.531347e-07,,,,,0.023076
4,1992-01-08,0.023256,0.645321,0.004132,1.840000e-10,0.001677,17.609419,-0.000281,2,8,...,2,0.031868,0.042227,0.072733,3.561300e-09,,,,,0.025566
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7801,2022-12-23,-0.002798,-0.181476,0.000076,-3.330000e-13,0.005868,4.008909,0.005342,4,23,...,51,-0.006299,-0.004706,9.107344,-0.000000e+00,-0.338420,0.287636,2.302585,50.877597,13.783051
7802,2022-12-27,-0.013878,0.081093,0.000231,-1.550000e-12,-0.004050,4.334004,0.001133,1,27,...,52,-0.007465,-0.005385,9.093466,0.000000e+00,-0.112210,1.256378,2.302585,51.987534,13.782153
7803,2022-12-28,-0.030685,0.238299,0.000079,-2.850000e-12,-0.012021,5.366792,-0.011006,2,28,...,52,-0.011037,-0.007259,9.062781,-0.000000e+00,0.364249,1.456694,2.302585,47.129014,13.778148
7804,2022-12-29,0.028324,-0.115337,0.000231,2.890000e-12,0.017461,4.747802,0.010497,3,29,...,52,-0.004982,-0.004623,9.091105,-0.000000e+00,0.335839,-0.117453,2.302585,52.472550,13.771342
