In [4]:
# Apple (AAPL) stock price history. https://finance.yahoo.com/quote/AAPL/history?p=AAPL


# Macroeconomic Factors

# 1yr Treasury rate. https://www.macrotrends.net/2492/1-year-treasury-rate-yield-chart
# 10yr Treasury rate. https://www.macrotrends.net/2016/10-year-treasury-bond-rate-yield-chart
# Fed Funds Rate (daily data). https://www.macrotrends.net/2015/fed-funds-rate-historical-chart
# GDP (absolute value). https://fred.stlouisfed.org/series/GDP
# GDP growth (yearly). https://data.worldbank.org/indicator/NY.GDP.MKTP.KD.ZG?locations=US
## GDP growth (quarterly). https://data.oecd.org/gdp/real-gdp-forecast.htm
# CPI (monthly). https://fred.stlouisfed.org/series/CPIAUCSL
## Treasury bill. https://home.treasury.gov/interest-rates-data-csv-archive
## Fed Funds Effective Rate (monthly). https://fred.stlouisfed.org/series/FEDFUNDS


# Financial Factors

# Gross Profit. https://www.macrotrends.net/stocks/charts/AAPL/apple/gross-profit
# P/E. https://www.macrotrends.net/stocks/charts/AAPL/apple/pe-ratio
# P/S. https://www.macrotrends.net/stocks/charts/AAPL/apple/ps-ratio


# Market Factors

# Dow Jones Industrial Avg. https://www.macrotrends.net/1319/dow-jones-100-year-historical-chart
# S&P500. https://www.kaggle.com/datasets/andrewmvd/sp-500-stocks

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
from seaborn import set_style

In [2]:
# Apply seaborn's "whitegrid" theme to any figures drawn later in the notebook.
set_style(style="whitegrid")

## Load All Factors

In [3]:
# Show the kernel's working directory: the relative '../../Dataset/...' paths
# used by the read_csv / to_csv cells below are resolved against it.
pwd

'/Users/ziyuan/Library/CloudStorage/OneDrive-WashingtonUniversityinSt.Louis/Machine_Learning/Erdos_Bootcamp/Project/modeling/modeling'

In [4]:
# Load the financial-factor CSV (path is relative to the working directory).
fin_factor = pd.read_csv(filepath_or_buffer='../../Dataset/merge_data_fin_factor.csv')

In [5]:
# Convert the Date column to pandas datetimes; it is the key of the later as-of merge.
fin_factor["Date"] = fin_factor["Date"].pipe(pd.to_datetime)

In [6]:
# Preview the first three rows of the financial factors.
fin_factor.head(n=3)

Unnamed: 0,Date,Close,Gross_Profit,ROE,ROI,PE_ratio,PS_ratio
0,2010-11-26,11.25,7512,0.34,0.73,15.96,3.43
1,2010-11-29,11.316786,7512,0.34,0.7,15.96,3.43
2,2010-11-30,11.1125,7512,0.34,0.69,15.96,3.43


In [7]:
# check if there's any NaN
# Evaluates to True when at least one cell anywhere in the frame is missing.
fin_factor.isna().to_numpy().any()

True

In [9]:
# Remove the ROI column from the financial factors.
# errors='ignore' keeps this cell idempotent: it rebinds fin_factor, so
# re-running it after ROI is already gone would otherwise raise KeyError.
fin_factor = fin_factor.drop(columns=['ROI'], errors='ignore')
fin_factor.head(3)

Unnamed: 0,Date,Close,Gross_Profit,ROE,PE_ratio,PS_ratio
0,2010-11-26,11.25,7512,0.34,15.96,3.43
1,2010-11-29,11.316786,7512,0.34,15.96,3.43
2,2010-11-30,11.1125,7512,0.34,15.96,3.43


In [10]:
# Load the market-factor CSV (path is relative to the working directory).
market_factor = pd.read_csv(filepath_or_buffer='../../Dataset/merge_data_market_factor.csv')

In [11]:
# Convert the Date column to pandas datetimes; it is the key of the later as-of merge.
market_factor["Date"] = market_factor["Date"].pipe(pd.to_datetime)

In [12]:
# Preview the first three rows of the market factors.
market_factor.head(n=3)

Unnamed: 0,Date,Close,Volume,Dow_Jones,SP500
0,2012-11-12,19.386786,515802000,12815.08,1380.03
1,2012-11-13,19.389286,532949200,12756.18,1374.53
2,2012-11-14,19.174286,477170400,12570.95,1355.49


In [13]:
# Load the macroeconomic-factor CSV (path is relative to the working directory).
macro_factor = pd.read_csv(filepath_or_buffer='../../Dataset/merge_data_macro_factor.csv')

In [14]:
# Convert the Date column to pandas datetimes; it is the key of the later as-of merge.
macro_factor["Date"] = macro_factor["Date"].pipe(pd.to_datetime)

In [15]:
# Preview the first three rows of the macroeconomic factors.
macro_factor.head(n=3)

Unnamed: 0,Date,tnote,tbill,Fed_rate,GDP,GDP_growth,CPI
0,1962-01-02,4.06,3.22,2.75,594.013,6.1,30.04
1,1962-01-03,4.03,3.24,2.5,594.013,6.1,30.04
2,1962-01-04,3.99,3.24,2.75,594.013,6.1,30.04


## Merge

In [16]:
# As-of join on Date: every market_factor row is kept and paired with the most
# recent fin_factor row whose Date is not later (merge_asof's default backward
# direction). Both inputs carry a Close column, so the result holds Close_x
# (market side) and Close_y (financial side).
merge_test1 = pd.merge_asof(left=market_factor, right=fin_factor, on='Date')
merge_test1

Unnamed: 0,Date,Close_x,Volume,Dow_Jones,SP500,Close_y,Gross_Profit,ROE,PE_ratio,PS_ratio
0,2012-11-12,19.386786,515802000,12815.08,1380.03,19.386786,14401,0.40,12.93,3.45
1,2012-11-13,19.389286,532949200,12756.18,1374.53,19.389286,14401,0.40,12.93,3.45
2,2012-11-14,19.174286,477170400,12570.95,1355.49,19.174286,14401,0.40,12.93,3.45
3,2012-11-15,18.772142,789910800,12542.38,1353.33,18.772142,14401,0.40,12.93,3.45
4,2012-11-16,18.845715,1266893600,12588.31,1359.88,18.845715,14401,0.40,12.93,3.45
...,...,...,...,...,...,...,...,...,...,...
2523,2022-11-18,151.289993,74794600,33745.69,3992.93,151.289993,38095,1.61,22.58,5.74
2524,2022-11-21,148.009995,58724100,33700.28,3992.93,148.009995,38095,1.61,22.58,5.74
2525,2022-11-22,150.179993,51804100,34098.10,3992.93,150.179993,38095,1.61,22.58,5.74
2526,2022-11-23,151.070007,58301400,34194.06,3992.93,151.070007,38095,1.61,22.58,5.74


In [17]:
# Discard the financial-side duplicate of Close produced by the merge.
# errors='ignore' keeps this cell idempotent: it rebinds merge_test1, so
# re-running it after Close_y is already gone would otherwise raise KeyError.
merge_test1 = merge_test1.drop(columns=['Close_y'], errors='ignore')

In [18]:
# Give the remaining market-side Close_x column its plain name back.
merge_test1 = merge_test1.rename(columns=dict(Close_x="Close"))

In [19]:
# Preview the first three rows after the Close clean-up.
merge_test1.head(n=3)

Unnamed: 0,Date,Close,Volume,Dow_Jones,SP500,Gross_Profit,ROE,PE_ratio,PS_ratio
0,2012-11-12,19.386786,515802000,12815.08,1380.03,14401,0.4,12.93,3.45
1,2012-11-13,19.389286,532949200,12756.18,1374.53,14401,0.4,12.93,3.45
2,2012-11-14,19.174286,477170400,12570.95,1355.49,14401,0.4,12.93,3.45


In [20]:
# As-of join on Date: attach to each row of merge_test1 the most recent
# macro_factor row whose Date is not later (merge_asof's default backward
# direction).
merge_test2 = pd.merge_asof(left=merge_test1, right=macro_factor, on='Date')
merge_test2

Unnamed: 0,Date,Close,Volume,Dow_Jones,SP500,Gross_Profit,ROE,PE_ratio,PS_ratio,tnote,tbill,Fed_rate,GDP,GDP_growth,CPI
0,2012-11-12,19.386786,515802000,12815.08,1380.03,14401,0.40,12.93,3.45,1.61,0.18,0.16,16420.386,2.280688,231.249
1,2012-11-13,19.389286,532949200,12756.18,1374.53,14401,0.40,12.93,3.45,1.59,0.18,0.16,16420.386,2.280688,231.249
2,2012-11-14,19.174286,477170400,12570.95,1355.49,14401,0.40,12.93,3.45,1.59,0.18,0.16,16420.386,2.280688,231.249
3,2012-11-15,18.772142,789910800,12542.38,1353.33,14401,0.40,12.93,3.45,1.58,0.17,0.16,16420.386,2.280688,231.249
4,2012-11-16,18.845715,1266893600,12588.31,1359.88,14401,0.40,12.93,3.45,1.58,0.16,0.16,16420.386,2.280688,231.249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2523,2022-11-18,151.289993,74794600,33745.69,3992.93,38095,1.61,22.58,5.74,3.82,4.74,3.83,25663.289,1.800000,298.062
2524,2022-11-21,148.009995,58724100,33700.28,3992.93,38095,1.61,22.58,5.74,3.83,4.75,3.83,25663.289,1.800000,298.062
2525,2022-11-22,150.179993,51804100,34098.10,3992.93,38095,1.61,22.58,5.74,3.76,4.79,3.83,25663.289,1.800000,298.062
2526,2022-11-23,151.070007,58301400,34194.06,3992.93,38095,1.61,22.58,5.74,3.71,4.75,3.83,25663.289,1.800000,298.062


In [21]:
# check if there's any NaN
# Evaluates to True when at least one cell anywhere in the merged frame is missing.
merge_test2.isna().to_numpy().any()

False

In [22]:
# Write the fully merged factor table to disk, leaving out the row index.
merge_test2.to_csv(path_or_buf='../../Dataset/merge_data_all_factor.csv', index=False)