In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from datetime import datetime, timedelta
from seaborn import set_style
set_style("whitegrid")
from seaborn import set_style

In [2]:
## Import ARIMA from statsmodels.tsa.arima.model
from statsmodels.tsa.arima.model import ARIMA

In [3]:
# Select seaborn's "whitegrid" plot style (this repeats the set_style call made in the import cell).
set_style("whitegrid")

In [4]:
# Apple (AAPL) stock history: https://finance.yahoo.com/quote/AAPL/history?p=AAPL

# Macroeconomic Factors

# Treasury. https://home.treasury.gov/interest-rates-data-csv-archive
# Fed Fund Effective Rate. https://fred.stlouisfed.org/series/FEDFUNDS


# Market Factors

# S&P500. https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks

In [5]:
# Show the kernel's working directory as a string.
# os.getcwd() replaces the bare `pwd`, which only worked through IPython
# automagic (single-line cells only, and only while no variable is named `pwd`).
# NOTE(review): the relative '../../Dataset/...' paths in later cells presumably
# resolve against this directory — confirm before running from another location.
import os

os.getcwd()

'/Users/ziyuan/Library/CloudStorage/OneDrive-WashingtonUniversityinSt.Louis/Machine_Learning/Erdos_Bootcamp/Project/modeling/modeling'

In [6]:
# Read the Apple price history CSV into a DataFrame.
apple_raw = pd.read_csv(filepath_or_buffer='../../Dataset/AAPL.csv')

In [7]:
# Convert the Date column to pandas datetimes so it can be used as a merge key.
apple_raw["Date"] = apple_raw["Date"].pipe(pd.to_datetime)

In [8]:
# Remove the price/volume columns that are not used downstream; the displayed
# result keeps Date and Close.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been dropped.
apple_raw = apple_raw.drop(
    columns=['Open', 'High', 'Low', 'Adj Close', 'Volume'],
    errors='ignore',
)

In [9]:
# Display the Apple frame after the column drop (pandas truncates the middle rows).
apple_raw

Unnamed: 0,Date,Close
0,2010-11-26,11.250000
1,2010-11-29,11.316786
2,2010-11-30,11.112500
3,2010-12-01,11.300000
4,2010-12-02,11.362500
...,...,...
3016,2022-11-18,151.289993
3017,2022-11-21,148.009995
3018,2022-11-22,150.179993
3019,2022-11-23,151.070007


In [18]:
# Read the S&P 500 index history and preview its first three rows.
sp500_raw = pd.read_csv(filepath_or_buffer='../../Dataset/Market_Factor/sp500_index.csv')
sp500_raw.head(n=3)

Unnamed: 0,Date,S&P500
0,2012-11-12,1380.03
1,2012-11-13,1374.53
2,2012-11-14,1355.49


In [19]:
# Convert the Date column to pandas datetimes so it can be used as a merge key.
sp500_raw["Date"] = pd.to_datetime(arg=sp500_raw["Date"])

In [20]:
# Rename the index-level column to SP500 (no ampersand) and preview two rows.
sp500_raw = sp500_raw.rename({"S&P500": 'SP500'}, axis="columns")
sp500_raw.head(n=2)

Unnamed: 0,Date,SP500
0,2012-11-12,1380.03
1,2012-11-13,1374.53


In [23]:
# Display the S&P 500 frame after the date conversion and column rename.
sp500_raw

Unnamed: 0,Date,SP500
0,2012-11-12,1380.03
1,2012-11-13,1374.53
2,2012-11-14,1355.49
3,2012-11-15,1353.33
4,2012-11-16,1359.88
...,...,...
2514,2022-11-07,3806.80
2515,2022-11-08,3828.11
2516,2022-11-09,3748.57
2517,2022-11-10,3956.37


In [12]:
# Read the annual US GDP growth table.
gdp_raw = pd.read_csv(filepath_or_buffer='../../Dataset/Macroeconomic_Factor/GDP_US_annual.csv')

In [13]:
# Parse the Year column with the '%Y' format; the displayed result shows each
# year as a 1 January timestamp.
gdp_raw["Year"] = pd.to_datetime(gdp_raw["Year"], format='%Y')

In [14]:
# Rename Year to Date so the GDP table shares its key name with the other frames.
gdp_raw = gdp_raw.rename({"Year": 'Date'}, axis="columns")
gdp_raw.head(n=2)

Unnamed: 0,Date,GDP_growth
0,1961-01-01,2.3
1,1962-01-01,6.1


In [15]:
# Display the GDP frame after the year parsing and column rename.
gdp_raw

Unnamed: 0,Date,GDP_growth
0,1961-01-01,2.300000
1,1962-01-01,6.100000
2,1963-01-01,4.400000
3,1964-01-01,5.800000
4,1965-01-01,6.400000
...,...,...
56,2017-01-01,2.255680
57,2018-01-01,2.918857
58,2019-01-01,2.288870
59,2020-01-01,-3.404590


In [28]:
# Tag every row of both tables with its calendar-year period, then left-join on
# that tag so each Apple row picks up the GDP row of the same year.
# Apple rows whose year has no GDP entry keep NaT/NaN in the GDP columns.
merge_test1 = apple_raw.assign(
    grouper=lambda frame: frame['Date'].dt.to_period('Y')
).merge(
    gdp_raw.assign(grouper=lambda frame: frame['Date'].dt.to_period('Y')),
    on='grouper',
    how='left',
)
merge_test1

Unnamed: 0,Date_x,Close,grouper,Date_y,GDP_growth
0,2010-11-26,11.250000,2010,2010-01-01,2.708857
1,2010-11-29,11.316786,2010,2010-01-01,2.708857
2,2010-11-30,11.112500,2010,2010-01-01,2.708857
3,2010-12-01,11.300000,2010,2010-01-01,2.708857
4,2010-12-02,11.362500,2010,2010-01-01,2.708857
...,...,...,...,...,...
3016,2022-11-18,151.289993,2022,NaT,
3017,2022-11-21,148.009995,2022,NaT,
3018,2022-11-22,150.179993,2022,NaT,
3019,2022-11-23,151.070007,2022,NaT,


In [30]:
# Spot-check a single trading day to confirm it received that year's GDP value.
merge_test1.loc[lambda frame: frame["Date_x"] == "2015-11-02"]

Unnamed: 0,Date_x,Close,grouper,Date_y,GDP_growth
1241,2015-11-02,30.295,2015,2015-01-01,2.70637


In [31]:
# Discard rows where the GDP side of the join is missing (NaT in Date_y or NaN in GDP_growth).
merge_test1_1 = merge_test1.dropna(axis=0, how='any', subset=['Date_y', 'GDP_growth'])
merge_test1_1

Unnamed: 0,Date_x,Close,grouper,Date_y,GDP_growth
0,2010-11-26,11.250000,2010,2010-01-01,2.708857
1,2010-11-29,11.316786,2010,2010-01-01,2.708857
2,2010-11-30,11.112500,2010,2010-01-01,2.708857
3,2010-12-01,11.300000,2010,2010-01-01,2.708857
4,2010-12-02,11.362500,2010,2010-01-01,2.708857
...,...,...,...,...,...
2789,2021-12-27,180.330002,2021,2021-01-01,5.671107
2790,2021-12-28,179.289993,2021,2021-01-01,5.671107
2791,2021-12-29,179.380005,2021,2021-01-01,5.671107
2792,2021-12-30,178.199997,2021,2021-01-01,5.671107


In [32]:
# Tidy the joined frame: restore the Date column name and remove the join helpers.
# errors='ignore' makes the cell safe to re-run: without it a second execution
# raises KeyError because 'grouper' and 'Date_y' have already been dropped.
# rename() silently skips a missing 'Date_x', so it is already re-runnable.
merge_test1_1 = (
    merge_test1_1
    .rename(columns={"Date_x": 'Date'})
    .drop(columns=['grouper', 'Date_y'], errors='ignore')
)
# Date is converted again here exactly as in the original cell.
merge_test1_1["Date"] = pd.to_datetime(merge_test1_1["Date"])

In [33]:
# Display the tidied Apple/GDP frame (Date, Close, GDP_growth).
merge_test1_1

Unnamed: 0,Date,Close,GDP_growth
0,2010-11-26,11.250000,2.708857
1,2010-11-29,11.316786,2.708857
2,2010-11-30,11.112500,2.708857
3,2010-12-01,11.300000,2.708857
4,2010-12-02,11.362500,2.708857
...,...,...,...
2789,2021-12-27,180.330002,5.671107
2790,2021-12-28,179.289993,5.671107
2791,2021-12-29,179.380005,5.671107
2792,2021-12-30,178.199997,5.671107


In [None]:
# Join the annual GDP table onto the tidied Apple/GDP frame, keyed on calendar year.
# The year key now comes from merge_test1_1's own Date column; the original took
# it from apple_raw, a different (longer) frame, and only worked through index
# alignment between the two.
# Both inputs carry Date and GDP_growth, so pandas suffixes those overlapping
# columns with _x / _y in the result.
merge_test1_2 = pd.merge(merge_test1_1.assign(grouper=merge_test1_1['Date'].dt.to_period('Y')),
                         gdp_raw.assign(grouper=gdp_raw['Date'].dt.to_period('Y')),
                         how='left', on='grouper')
merge_test1_2

In [21]:
# As-of join: each Apple row takes the most recent GDP row dated on or before it.
# Rows later than the last GDP date therefore carry the final GDP value forward
# (visible in the 2022 rows of the output).
merge_test2_1 = pd.merge_asof(
    left=apple_raw,
    right=gdp_raw,
    on='Date',
    direction='backward',
)
merge_test2_1

Unnamed: 0,Date,Close,GDP_growth
0,2010-11-26,11.250000,2.708857
1,2010-11-29,11.316786,2.708857
2,2010-11-30,11.112500,2.708857
3,2010-12-01,11.300000,2.708857
4,2010-12-02,11.362500,2.708857
...,...,...,...
3016,2022-11-18,151.289993,5.671107
3017,2022-11-21,148.009995,5.671107
3018,2022-11-22,150.179993,5.671107
3019,2022-11-23,151.070007,5.671107


In [22]:
# As-of join of the S&P 500 level onto the Apple/GDP frame (Apple dates on the left).
# Apple rows dated before the first S&P 500 observation get NaN in SP500.
merge_test2_2 = pd.merge_asof(
    left=merge_test2_1,
    right=sp500_raw,
    on='Date',
    direction='backward',
)
merge_test2_2

Unnamed: 0,Date,Close,GDP_growth,SP500
0,2010-11-26,11.250000,2.708857,
1,2010-11-29,11.316786,2.708857,
2,2010-11-30,11.112500,2.708857,
3,2010-12-01,11.300000,2.708857,
4,2010-12-02,11.362500,2.708857,
...,...,...,...,...
3016,2022-11-18,151.289993,5.671107,3992.93
3017,2022-11-21,148.009995,5.671107,3992.93
3018,2022-11-22,150.179993,5.671107,3992.93
3019,2022-11-23,151.070007,5.671107,3992.93


In [25]:
# Same as-of join with the sides swapped: S&P 500 dates drive the rows, so the
# result spans only the S&P 500 date range.
# This rebinds merge_test2_2, replacing the frame built by the earlier join.
merge_test2_2 = pd.merge_asof(
    left=sp500_raw,
    right=merge_test2_1,
    on='Date',
    direction='backward',
)
merge_test2_2

Unnamed: 0,Date,SP500,Close,GDP_growth
0,2012-11-12,1380.03,19.386786,2.280688
1,2012-11-13,1374.53,19.389286,2.280688
2,2012-11-14,1355.49,19.174286,2.280688
3,2012-11-15,1353.33,18.772142,2.280688
4,2012-11-16,1359.88,18.845715,2.280688
...,...,...,...,...
2514,2022-11-07,3806.80,138.919998,5.671107
2515,2022-11-08,3828.11,139.500000,5.671107
2516,2022-11-09,3748.57,134.869995,5.671107
2517,2022-11-10,3956.37,146.869995,5.671107


In [21]:
# Write the merged frame to disk (row index included, as to_csv does by default).
# The original line wrote `merge_test`, a name that is never assigned in the
# visible notebook cells, so a fresh-kernel run stopped here with NameError.
# NOTE(review): merge_test2_2 (S&P 500 dates joined with Apple close and GDP
# growth) is assumed to be the frame that was meant — confirm.
merge_test2_2.to_csv('../../Dataset/merge_data.csv')