# Preprocess train data

In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBRegressor

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error

from joblib import dump
from joblib import load

In [34]:
df = pd.read_csv("../data/Mastercard_stock_history.csv")
df = df.drop(columns=["Dividends", "Stock Splits"])

# df['Date'] = pd.to_datetime(df['Date'])
df.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume
3867,2021-10-05,347.121403,348.130138,342.497241,342.776886,4724100
3868,2021-10-06,339.58096,348.439763,338.682072,348.25,3712000
3869,2021-10-07,349.0,357.899994,349.0,353.910004,3209200
3870,2021-10-08,356.0,360.369995,354.209991,354.959991,2336700
3871,2021-10-11,353.950012,354.880005,346.899994,347.149994,2766800


In [35]:
df_1 = pd.read_csv("../data/Mastercard_test.csv")

# convert Date into datetime, and set the date as the index of the dataframe
df_1['Date'] = pd.to_datetime(df_1['Date'], format = "%d/%m/%Y %H:%M:%S").dt.date
# df_1['Date'] = pd.to_datetime(df_1['Date'])
# set date as the index
# df_1.set_index('Date', inplace=True)

# df_1 = df_1.asfreq("D")
# df_1 = df_1.interpolate() # interpolate data to remove NaN values

df_1.dtypes

Date       object
Open      float64
High      float64
Low       float64
Close     float64
Volume      int64
dtype: object

In [36]:
# concatenate the 2 dataframes

df = pd.concat([df, df_1], axis = 0)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2006-05-25,3.748967,4.283869,3.739664,4.279217,395343000
1,2006-05-26,4.307126,4.348058,4.103398,4.17968,103044000
2,2006-05-30,4.1834,4.18433,3.986184,4.093164,49898000
3,2006-05-31,4.125723,4.219679,4.125723,4.180608,30002000
4,2006-06-01,4.179678,4.474572,4.176887,4.419686,62344000


In [37]:
df['Date'] = pd.to_datetime(df['Date'])

# set date as the index
df.set_index('Date', inplace=True)

In [38]:
df.shape

(4671, 5)

In [39]:
df = df.asfreq("D")
df = df.ffill() # forward fill
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-12-09,529.02,531.59,521.88,522.82,2942047.0
2024-12-10,522.51,529.66,520.22,529.01,2003265.0
2024-12-11,529.19,535.89,527.27,534.45,2181241.0
2024-12-12,536.29,536.75,531.97,532.41,1860815.0
2024-12-13,534.17,534.68,528.91,529.0,1608196.0


Once again, we try to forecast the close price using previous date features, as well as generating new features for our model to predict the close price

## Create new features for the model

We will create features for:

1. month
2. day of the week
3. moving average
4. moving standard deviation
5. previous days' lagged close prices
6. open price
7. quarter of the year
8. Relative Strength Indicator

In [40]:
df['month'] = df.index.month
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-05-25,3.748967,4.283869,3.739664,4.279217,395343000.0,5
2006-05-26,4.307126,4.348058,4.103398,4.17968,103044000.0,5
2006-05-27,4.307126,4.348058,4.103398,4.17968,103044000.0,5
2006-05-28,4.307126,4.348058,4.103398,4.17968,103044000.0,5
2006-05-29,4.307126,4.348058,4.103398,4.17968,103044000.0,5


In [41]:
df['day'] = df.index.day_of_week
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-05-25,3.748967,4.283869,3.739664,4.279217,395343000.0,5,3
2006-05-26,4.307126,4.348058,4.103398,4.17968,103044000.0,5,4
2006-05-27,4.307126,4.348058,4.103398,4.17968,103044000.0,5,5
2006-05-28,4.307126,4.348058,4.103398,4.17968,103044000.0,5,6
2006-05-29,4.307126,4.348058,4.103398,4.17968,103044000.0,5,0


In [42]:
df['quarter'] = df.index.quarter
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006-05-25,3.748967,4.283869,3.739664,4.279217,395343000.0,5,3,2
2006-05-26,4.307126,4.348058,4.103398,4.17968,103044000.0,5,4,2
2006-05-27,4.307126,4.348058,4.103398,4.17968,103044000.0,5,5,2
2006-05-28,4.307126,4.348058,4.103398,4.17968,103044000.0,5,6,2
2006-05-29,4.307126,4.348058,4.103398,4.17968,103044000.0,5,0,2


In [43]:
df['lag_1'] = df['Close'].shift(1)
df['lag_2'] = df['Close'].shift(2)

df = df.dropna()
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-05-27,4.307126,4.348058,4.103398,4.17968,103044000.0,5,5,2,4.17968,4.279217
2006-05-28,4.307126,4.348058,4.103398,4.17968,103044000.0,5,6,2,4.17968,4.17968
2006-05-29,4.307126,4.348058,4.103398,4.17968,103044000.0,5,0,2,4.17968,4.17968
2006-05-30,4.1834,4.18433,3.986184,4.093164,49898000.0,5,1,2,4.17968,4.17968
2006-05-31,4.125723,4.219679,4.125723,4.180608,30002000.0,5,2,2,4.093164,4.17968


In [44]:
# create RSI indicator
df['change'] = df['Close'].diff()
df['change'] = df['change'].shift(1) # lag the change column by 1 time step

df['gain'] = df['change'].apply(lambda x: x if x > 0 else 0)
df['loss'] = df['change'].apply(lambda x: -x if x < 0 else 0)

df['avg_gain'] = df['gain'].rolling(window = 30, min_periods = 1).mean()
df['avg_loss'] = df['loss'].rolling(window = 30, min_periods = 1).mean()

df['RSI'] = 100 - (100 / (1 + (df["avg_gain"] / df["avg_loss"])))
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2,change,gain,loss,avg_gain,avg_loss,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2006-05-27,4.307126,4.348058,4.103398,4.17968,103044000.0,5,5,2,4.17968,4.279217,,0.0,0.0,0.0,0.0,
2006-05-28,4.307126,4.348058,4.103398,4.17968,103044000.0,5,6,2,4.17968,4.17968,,0.0,0.0,0.0,0.0,
2006-05-29,4.307126,4.348058,4.103398,4.17968,103044000.0,5,0,2,4.17968,4.17968,0.0,0.0,0.0,0.0,0.0,
2006-05-30,4.1834,4.18433,3.986184,4.093164,49898000.0,5,1,2,4.17968,4.17968,0.0,0.0,0.0,0.0,0.0,
2006-05-31,4.125723,4.219679,4.125723,4.180608,30002000.0,5,2,2,4.093164,4.17968,-0.086515,0.0,0.086515,0.0,0.017303,0.0


In [45]:
# add ARBR with n = 6, non-lagged
def compute_arbr(df: pd.DataFrame, n = 6, in_place=False, include_lagged=True):
  if not in_place:
    df = df.copy()

  sum_high_open_diff = (df["High"] - df["Open"]).rolling(window=n, min_periods=1).sum()
  sum_open_low_diff = (df["Open"] - df["Low"]).rolling(window=n, min_periods=1).sum()
  df['AR'] = 100 * (sum_high_open_diff / sum_open_low_diff)

  lag_1_close = df["Close"].shift(1)
  max_high_close_diff = (df['High'] - lag_1_close).apply(lambda v: max(0, v))
  max_close_low_diff = (lag_1_close - df['Low']).apply(lambda v: max(0, v))
  df['BR'] = 100 * (max_high_close_diff.rolling(window=n, min_periods=1).sum()
                    / max_close_low_diff.rolling(window=n, min_periods=1).sum())
  if include_lagged:
    df['AR-LAG1'] = df['AR'].shift(1)
    df['BR-LAG1'] = df['BR'].shift(1)

  return df

# add MACD columns (MACD, DEA, MACD-DEA), non-lagged
def compute_macd(df: pd.DataFrame, in_place=False, include_lagged=True):
  ema_12 = df["Close"].ewm(span=12, adjust=False).mean()
  ema_26 = df["Close"].ewm(span=26, adjust=False).mean()
  macd= ema_26 - ema_12
  dea = macd.ewm(span=9, adjust=False).mean()
  df_cp = df

  if not in_place:
    df_cp = df.copy()

  df_cp['DEA'] = dea
  df_cp['MACD'] = macd
  df_cp['MACD-DEA'] = macd - dea
  if include_lagged:
    df_cp['DEA-LAG1'] = df_cp['DEA'].shift(1)
    df_cp['MACD-LAG1'] = df_cp['MACD'].shift(1)
    df_cp['MACD-DEA-LAG1'] = df_cp['MACD-DEA'].shift(1)

  return df_cp 

df_arbr = compute_arbr(df)
df_macd = compute_macd(df_arbr)

df_macd = df_macd.dropna()

# robust scaling on these features: rmse = 7.92
df = df_macd
df_macd[:7]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,month,day,quarter,lag_1,lag_2,...,AR,BR,AR-LAG1,BR-LAG1,DEA,MACD,MACD-DEA,DEA-LAG1,MACD-LAG1,MACD-DEA-LAG1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-05-31,4.125723,4.219679,4.125723,4.180608,30002000.0,5,2,2,4.093164,4.17968,...,26.927315,135.213759,15.304803,98.655137,0.002155,0.005254,0.003099,0.00138,0.006902,0.005521
2006-06-01,4.179678,4.474572,4.176887,4.419686,62344000.0,6,3,2,4.180608,4.093164,...,63.187787,217.817473,26.927315,135.213759,-0.001309,-0.015168,-0.013858,0.002155,0.005254,0.003099
2006-06-02,4.511782,4.530387,4.352707,4.371312,37253000.0,6,4,2,4.419686,4.180608,...,63.956075,209.373319,63.187787,217.817473,-0.006475,-0.027136,-0.020661,-0.001309,-0.015168,-0.013858
2006-06-03,4.511782,4.530387,4.352707,4.371312,37253000.0,6,5,2,4.371312,4.419686,...,64.81941,240.412611,63.956075,209.373319,-0.012421,-0.036204,-0.023783,-0.006475,-0.027136,-0.020661
2006-06-04,4.511782,4.530387,4.352707,4.371312,37253000.0,6,6,2,4.371312,4.371312,...,65.796592,283.331137,64.81941,240.412611,-0.018516,-0.042896,-0.02438,-0.012421,-0.036204,-0.023783
2006-06-05,4.376895,4.581554,4.372244,4.572251,37188000.0,6,0,2,4.371312,4.371312,...,133.973041,981.892858,65.796592,283.331137,-0.027548,-0.063679,-0.036131,-0.018516,-0.042896,-0.02438
2006-06-06,4.649463,4.70993,4.446665,4.493178,49045000.0,6,1,2,4.572251,4.371312,...,89.580507,458.566148,133.973041,981.892858,-0.036624,-0.072929,-0.036304,-0.027548,-0.063679,-0.036131


In [46]:
# create lag-1 30 day rolling mean and standard deviations
df['MA'] = df["lag_1"].rolling(30).mean()
df['M_STD'] = df["lag_1"].rolling(30).std()

df = df.dropna() # remove NaNs after rolling functions have been applied

In [47]:
columns_to_keep = ["Open", "Volume", "Close", "month", "day", "quarter", "lag_1", "lag_2", "MA", "M_STD"] # ignore RSI, since it makes predictions worse
df1 = df[columns_to_keep]
df1.head()

Unnamed: 0_level_0,Open,Volume,Close,month,day,quarter,lag_1,lag_2,MA,M_STD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2006-06-29,4.367592,14470000.0,4.46527,6,3,2,4.368524,4.401083,4.325699,0.125787
2006-06-30,4.43271,25964000.0,4.46527,6,4,2,4.46527,4.368524,4.338103,0.120293
2006-07-01,4.43271,25964000.0,4.46527,7,5,3,4.46527,4.46527,4.347592,0.118657
2006-07-02,4.43271,25964000.0,4.46527,7,6,3,4.46527,4.46527,4.349111,0.119898
2006-07-03,4.455966,7029000.0,4.40015,7,0,3,4.46527,4.46527,4.352243,0.121711


## Standardize data

In [48]:
# scaler = StandardScaler()
# robust_scaler = RobustScaler()
# cols_to_standardize = ['Open', "Volume", "lag_1", "lag_2", "MA", "M_STD"]
# df_scaled = df1.copy()
# df_scaled[cols_to_standardize] = scaler.fit_transform(df1[cols_to_standardize])
# # cols_to_scale = ['AR', 'BR', "RSI"]
# # df1[cols_to_scale] = scaler.fit_transform(df1[cols_to_scale])
# df_scaled.head()

In [49]:
df1.to_csv("../data/processed/mastercard_processed.csv", index = True, index_label = "Date")