# Preprocess test data

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBRegressor

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error

from joblib import dump
from joblib import load

In [7]:
df = pd.read_csv("../data/Mastercard_test.csv")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,12/10/2021 16:00:00,348.65,350.0,344.62,345.34,3042883
1,13/10/2021 16:00:00,346.25,349.5,336.98,342.35,3755921
2,14/10/2021 16:00:00,343.84,345.81,340.83,344.58,4085934
3,15/10/2021 16:00:00,349.0,356.61,347.74,356.0,4053607
4,18/10/2021 16:00:00,353.96,359.94,350.23,358.44,3280888


In [8]:
# convert Date into datetime, and set the date as the index of the dataframe
df['Date'] = pd.to_datetime(df['Date'], format = "%d/%m/%Y %H:%M:%S").dt.date
# set date as the index
df.set_index('Date', inplace=True)

df = df.asfreq("D")
df = df.interpolate() # interpolate data to remove NaN values

## Create new features for the model

We will create features for:

1. month
2. day of the week
3. moving average
4. moving standard deviation
5. previous days' lagged close prices
6. open price
7. quarter of the year
8. Relative Strength Indicator

In [9]:
df['month'] = df.index.month
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-10-12,348.65,350.0,344.62,345.34,3042883.0,10
2021-10-13,346.25,349.5,336.98,342.35,3755921.0,10
2021-10-14,343.84,345.81,340.83,344.58,4085934.0,10
2021-10-15,349.0,356.61,347.74,356.0,4053607.0,10
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10


In [10]:
df['day'] = df.index.day_of_week
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-10-12,348.65,350.0,344.62,345.34,3042883.0,10,1
2021-10-13,346.25,349.5,336.98,342.35,3755921.0,10,2
2021-10-14,343.84,345.81,340.83,344.58,4085934.0,10,3
2021-10-15,349.0,356.61,347.74,356.0,4053607.0,10,4
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10,5


In [11]:
df['quarter'] = df.index.quarter
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-10-12,348.65,350.0,344.62,345.34,3042883.0,10,1,4
2021-10-13,346.25,349.5,336.98,342.35,3755921.0,10,2,4
2021-10-14,343.84,345.81,340.83,344.58,4085934.0,10,3,4
2021-10-15,349.0,356.61,347.74,356.0,4053607.0,10,4,4
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10,5,4


In [12]:
df['lag_1'] = df['Close'].shift(1)
df['lag_2'] = df['Close'].shift(2)

df = df.dropna()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-10-14,343.84,345.81,340.83,344.58,4085934.0,10,3,4,342.35,345.34
2021-10-15,349.0,356.61,347.74,356.0,4053607.0,10,4,4,344.58,342.35
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10,5,4,356.0,344.58
2021-10-17,352.306667,358.83,349.4,357.626667,3538461.0,10,6,4,356.813333,356.0
2021-10-18,353.96,359.94,350.23,358.44,3280888.0,10,0,4,357.626667,356.813333


In [13]:
# create RSI indicator
df['change'] = df['Close'].diff()
df['change'] = df['change'].shift(1) # lag the change column by 1 time step

df['gain'] = df['change'].apply(lambda x: x if x > 0 else 0)
df['loss'] = df['change'].apply(lambda x: -x if x < 0 else 0)

df['avg_gain'] = df['gain'].rolling(window = 30, min_periods = 1).mean()
df['avg_loss'] = df['loss'].rolling(window = 30, min_periods = 1).mean()

df['RSI'] = 100 - (100 / (1 + (df["avg_gain"] / df["avg_loss"])))
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2,change,gain,loss,avg_gain,avg_loss,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-10-14,343.84,345.81,340.83,344.58,4085934.0,10,3,4,342.35,345.34,,0.0,0.0,0.0,0.0,
2021-10-15,349.0,356.61,347.74,356.0,4053607.0,10,4,4,344.58,342.35,,0.0,0.0,0.0,0.0,
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10,5,4,356.0,344.58,11.42,11.42,0.0,3.806667,0.0,100.0
2021-10-17,352.306667,358.83,349.4,357.626667,3538461.0,10,6,4,356.813333,356.0,0.813333,0.813333,0.0,3.058333,0.0,100.0
2021-10-18,353.96,359.94,350.23,358.44,3280888.0,10,0,4,357.626667,356.813333,0.813333,0.813333,0.0,2.609333,0.0,100.0


In [14]:
# add ARBR with n = 6, non-lagged
def compute_arbr(df: pd.DataFrame, n = 6, in_place=False, include_lagged=True):
  if not in_place:
    df = df.copy()

  sum_high_open_diff = (df["High"] - df["Open"]).rolling(window=n, min_periods=1).sum()
  sum_open_low_diff = (df["Open"] - df["Low"]).rolling(window=n, min_periods=1).sum()
  df['AR'] = 100 * (sum_high_open_diff / sum_open_low_diff)

  lag_1_close = df["Close"].shift(1)
  max_high_close_diff = (df['High'] - lag_1_close).apply(lambda v: max(0, v))
  max_close_low_diff = (lag_1_close - df['Low']).apply(lambda v: max(0, v))
  df['BR'] = 100 * (max_high_close_diff.rolling(window=n, min_periods=1).sum()
                    / max_close_low_diff.rolling(window=n, min_periods=1).sum())
  if include_lagged:
    df['AR-LAG1'] = df['AR'].shift(1)
    df['BR-LAG1'] = df['BR'].shift(1)

  return df

# add MACD columns (MACD, DEA, MACD-DEA), non-lagged
def compute_macd(df: pd.DataFrame, in_place=False, include_lagged=True):
  ema_12 = df["Close"].ewm(span=12, adjust=False).mean()
  ema_26 = df["Close"].ewm(span=26, adjust=False).mean()
  macd= ema_26 - ema_12
  dea = macd.ewm(span=9, adjust=False).mean()
  df_cp = df

  if not in_place:
    df_cp = df.copy()

  df_cp['DEA'] = dea
  df_cp['MACD'] = macd
  df_cp['MACD-DEA'] = macd - dea
  if include_lagged:
    df_cp['DEA-LAG1'] = df_cp['DEA'].shift(1)
    df_cp['MACD-LAG1'] = df_cp['MACD'].shift(1)
    df_cp['MACD-DEA-LAG1'] = df_cp['MACD-DEA'].shift(1)

  return df_cp 

df_arbr = compute_arbr(df)
df_macd = compute_macd(df_arbr)

df_macd = df_macd.dropna()

# robust scaling on these features: rmse = 7.92
df = df_macd
df_macd[:7]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2,...,AR,BR,AR-LAG1,BR-LAG1,DEA,MACD,MACD-DEA,DEA-LAG1,MACD-LAG1,MACD-DEA-LAG1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-10-16,350.653333,357.72,348.57,356.813333,3796034.0,10,5,4,356.0,344.58,...,262.01469,185.060565,224.355972,inf,-0.481608,-1.679241,-1.197633,-0.182199,-0.910997,-0.728798
2021-10-17,352.306667,358.83,349.4,357.626667,3538461.0,10,6,4,356.813333,356.0,...,250.215983,106.220525,262.01469,185.060565,-0.850663,-2.326886,-1.476223,-0.481608,-1.679241,-1.197633
2021-10-18,353.96,359.94,350.23,358.44,3280888.0,10,0,4,357.626667,356.813333,...,224.403387,81.294964,250.215983,106.220525,-1.255064,-2.872664,-1.617601,-0.850663,-2.326886,-1.476223
2021-10-19,358.4,362.29,357.61,362.1,2835441.0,10,1,4,358.44,357.626667,...,239.767779,95.058518,224.403387,81.294964,-1.71595,-3.559497,-1.843547,-1.255064,-2.872664,-1.617601
2021-10-20,361.7,362.49,356.0,356.77,2796212.0,10,2,4,362.1,358.44,...,193.442623,76.516969,239.767779,95.058518,-2.099133,-3.631865,-1.532732,-1.71595,-3.559497,-1.843547
2021-10-21,354.6,356.38,347.27,356.21,2516486.0,10,3,4,356.77,362.1,...,115.483585,26.609775,193.442623,76.516969,-2.399807,-3.602502,-1.202695,-2.099133,-3.631865,-1.532732
2021-10-22,356.68,361.64,354.9,358.67,2774374.0,10,4,4,356.21,356.77,...,107.58507,43.010753,115.483585,26.609775,-2.666782,-3.734682,-1.0679,-2.399807,-3.602502,-1.202695


In [15]:
# create lag-1 30 day rolling mean and standard deviations
df['MA'] = df["lag_1"].rolling(30).mean()
df['M_STD'] = df["lag_1"].rolling(30).std()

df = df.dropna() # remove NaNs after rolling functions have been applied

In [16]:
columns_to_keep = ["Open", "Volume", "Close", "month", "day", "quarter", "lag_1", "lag_2", "MA", "M_STD"] # ignore RSI, since it makes predictions worse
df1 = df[columns_to_keep]
df1.head()

Unnamed: 0_level_0,Open,Volume,Close,month,day,quarter,lag_1,lag_2,MA,M_STD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-11-14,362.113333,4703532.0,362.073333,11,6,4,362.786667,363.5,349.390222,11.515804
2021-11-15,365.14,4693196.0,361.36,11,0,4,362.073333,362.786667,349.592667,11.688103
2021-11-16,364.77,4969009.0,369.56,11,1,4,361.36,362.073333,349.744222,11.813762
2021-11-17,359.58,11727270.0,359.17,11,2,4,369.56,361.36,350.142,12.280022
2021-11-18,360.73,6534921.0,348.22,11,3,4,359.17,369.56,350.166333,12.297742


## Standardize data

In [17]:
# scaler = StandardScaler()
# robust_scaler = RobustScaler()
# cols_to_standardize = ['Open', "Volume", "lag_1", "lag_2", "MA", "M_STD"]
# df_scaled = df1.copy()
# df_scaled[cols_to_standardize] = scaler.fit_transform(df1[cols_to_standardize])
# # cols_to_scale = ['AR', 'BR', "RSI"]
# # df1[cols_to_scale] = scaler.fit_transform(df1[cols_to_scale])
# df_scaled.head()

In [18]:
df1.to_csv("../data/processed/mastercard_test_processed.csv", index = True, index_label = "Date")