In [1]:
import datetime as dt
import pandas as pd
from pandas_datareader import data as pdr
import plotly.offline as plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import yfinance as yf
yf.pdr_override()
import os
plotly.init_notebook_mode(connected=True)
pd.options.plotting.backend = 'plotly'

## Specifying date range

In [2]:
# selecting dates to be 10 year window after Feng et al. 2012's paper
end = dt.datetime(2023, 1, 1)
start = dt.datetime(2013, 1, 1)
start, end

(datetime.datetime(2013, 1, 1, 0, 0), datetime.datetime(2023, 1, 1, 0, 0))

## Select stock/ticker

# Importing stock tickers from 
from https://stockmarketmba.com/stocksinthesp500.php


In [3]:
working_dir = os.getcwd()

In [4]:
os.listdir(working_dir)

['.ipynb_checkpoints',
 'data',
 'DataImport-1.1.ipynb',
 'model_test.py',
 'PCS_Update.ipynb',
 'README.md',
 'Trail_Agent_code.ipynb']

In [5]:
path = "data/Stocks_in_SP_500_Index.xlsx"
data = pd.read_excel(path,skiprows=1)
data.head()

Unnamed: 0,Symbol,Description,Category2,Category3,GICS Sector,Market cap,Dividend yield,Price to TTM earnings,Price to TTM sales,Price to book value,Action
0,AAPL,Apple Inc,Common stocks,Large cap,Information Technology,1988832912360,0.0073,0.0,0.0,0.0,Analyze
1,MSFT,Microsoft Corp,Common stocks,Large cap,Information Technology,1657655067218,0.0114,0.0,0.0,0.0,Analyze
2,GOOG,Alphabet Inc Class C,Common stocks,Large cap,Communication Services,1042954820000,0.0,0.0,0.0,0.0,Analyze
3,GOOGL,Alphabet Inc Class A,Common stocks,Large cap,Communication Services,1042954820000,0.0,0.0,0.0,0.0,Analyze
4,AMZN,Amazon.Com Inc.,Common stocks,Large cap,Consumer Discretionary,847961495109,0.0,0.0,0.0,0.0,Analyze


In [6]:
tickers = data['Symbol'].to_list()

In [7]:
tickers

['AAPL',
 'MSFT',
 'GOOG',
 'GOOGL',
 'AMZN',
 'BRK.B',
 'JNJ',
 'UNH',
 'XOM',
 'V',
 'JPM',
 'WMT',
 'PG',
 'NVDA',
 'TSLA',
 'META',
 'LLY',
 'CVX',
 'MA',
 'HD',
 'ABBV',
 'MRK',
 'PFE',
 'BAC',
 'KO',
 'PEP',
 'AVGO',
 'ORCL',
 'TMO',
 'COST',
 'ABT',
 'CSCO',
 'MCD',
 'NKE',
 'DHR',
 'TMUS',
 'VZ',
 'ACN',
 'DIS',
 'NEE',
 'WFC',
 'CMCSA',
 'PM',
 'BMY',
 'ADBE',
 'TXN',
 'SCHW',
 'LIN',
 'UPS',
 'RTX',
 'COP',
 'MS',
 'AMGN',
 'HON',
 'T',
 'NFLX',
 'CRM',
 'IBM',
 'CAT',
 'LMT',
 'UNP',
 'LOW',
 'DE',
 'QCOM',
 'BA',
 'SBUX',
 'CVS',
 'GS',
 'INTC',
 'ELV',
 'SPGI',
 'AXP',
 'GILD',
 'MDT',
 'INTU',
 'BLK',
 'PLD',
 'AMD',
 'AMT',
 'ADP',
 'ISRG',
 'SYK',
 'EL',
 'TJX',
 'CI',
 'CB',
 'C',
 'MDLZ',
 'PYPL',
 'BKNG',
 'AMAT',
 'MMC',
 'NOC',
 'ADI',
 'MO',
 'DUK',
 'GE',
 'REGN',
 'PGR',
 'SO',
 'SLB',
 'VRTX',
 'NOW',
 'EOG',
 'BDX',
 'TGT',
 'HCA',
 'ZTS',
 'MMM',
 'USB',
 'ITW',
 'GD',
 'APD',
 'MRNA',
 'CL',
 'BSX',
 'WM',
 'CSX',
 'PNC',
 'FISV',
 'ETN',
 'AON',
 'HUM',
 'E

## pandas_datareader module

In [8]:
# 1. pdr.DataReader(stocks, 'yahoo', start, end)
# 2. pdr.get_data_yahoo(stocks, start, end)

df = pdr.get_data_yahoo(tickers, start, end)

[*********************100%***********************]  503 of 503 completed

2 Failed downloads:
- BF.B: No data found for this date range, symbol may be delisted
- BRK.B: No timezone found, symbol may be delisted


# Making a DataFrame containing Adjusted close Daily returns

In [9]:
df['Adj Close'].columns

Index(['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACGL', 'ACN', 'ADBE',
       ...
       'WYNN', 'XEL', 'XOM', 'XRAY', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZION',
       'ZTS'],
      dtype='object', length=503)

In [10]:
#checking for missing values & printing the columns with missing values
col_with_missing_values = {}
for col in df['Adj Close'].columns:
    # counting the columns with missing values in Adj Close
    sum_of_missing_values = df['Adj Close'][col].isnull().sum()
    # also counting the columns with missing values in Volume
    sum_of_missing_values += df['Volume'][col].isnull().sum()
    if sum_of_missing_values > 0:
        col_with_missing_values[col] = sum_of_missing_values
        print(col)

ALLE
ANET
BF.B
BRK.B
CARR
CDAY
CDW
CEG
CFG
CTLT
CTVA
CZR
DOW
ETSY
FOX
FOXA
FTV
HLT
HPE
HWM
INVH
IQV
IR
KEYS
KHC
LW
MRNA
NCLH
NWS
NWSA
OGN
OTIS
PAYC
PYPL
QRVO
SEDG
SYF
VICI
WRK
ZTS


In [11]:
df_col_missing_values = pd.DataFrame(col_with_missing_values, index = [0]).T
df_col_missing_values.columns = ['Missing Values']

In [12]:
list_of_tickers_with_missing_values = df_col_missing_values.index.to_list()

In [13]:
print(f'stocks with missing values {list_of_tickers_with_missing_values}')
print(f'stocks to de removed: {len(list_of_tickers_with_missing_values)}')


stocks with missing values ['ALLE', 'ANET', 'BF.B', 'BRK.B', 'CARR', 'CDAY', 'CDW', 'CEG', 'CFG', 'CTLT', 'CTVA', 'CZR', 'DOW', 'ETSY', 'FOX', 'FOXA', 'FTV', 'HLT', 'HPE', 'HWM', 'INVH', 'IQV', 'IR', 'KEYS', 'KHC', 'LW', 'MRNA', 'NCLH', 'NWS', 'NWSA', 'OGN', 'OTIS', 'PAYC', 'PYPL', 'QRVO', 'SEDG', 'SYF', 'VICI', 'WRK', 'ZTS']
stocks to de removed: 40


In [14]:
df_adj_close_copy = df['Adj Close'].copy()
df_volume_copy = df['Volume'].copy()

In [15]:
# removing stocks with missing values
df_adj_close_copy.drop(list_of_tickers_with_missing_values, axis=1, inplace=True)
df_volume_copy.drop(list_of_tickers_with_missing_values, axis=1, inplace=True)
df_adj_close_clean = df_adj_close_copy
df_volume_clean = df_volume_copy


In [16]:
df_adj_close_clean

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,27.435246,13.179525,67.488419,16.862816,23.105904,37.189941,26.401583,14.793333,57.290276,38.340000,...,20.031267,96.279884,19.881603,58.083370,37.638947,24.058149,39.864067,60.554214,40.959999,18.467676
2013-01-03,27.533506,12.877846,67.488419,16.649975,22.915112,37.112972,27.406565,14.750000,57.082897,37.750000,...,20.031267,97.315521,19.823509,57.978600,37.536770,23.979418,40.113365,61.421837,41.000000,18.492868
2013-01-04,28.077238,13.886581,68.540459,16.186203,22.625629,37.360950,27.241819,14.876667,57.398132,38.130001,...,20.196188,98.489769,19.896122,58.247032,37.861889,23.944416,40.552605,61.734921,40.669998,19.021954
2013-01-07,27.874157,13.990284,68.307701,16.090988,22.671680,37.480675,27.464231,14.730000,57.149261,37.939999,...,20.258034,98.824089,19.685545,57.572647,37.991920,23.629471,40.297363,61.886948,40.900002,18.929575
2013-01-08,27.651423,14.291960,67.190483,16.134289,22.178246,37.429375,27.472469,14.750000,57.481094,38.139999,...,20.395468,98.742569,19.721851,57.932781,37.555344,23.340776,38.605713,61.976398,40.930000,18.551653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,149.007767,12.710000,143.279999,131.860001,161.564163,170.009995,107.692398,63.380001,265.006012,338.450012,...,31.490000,80.720001,70.446365,108.680000,31.701571,109.730003,128.899994,126.690002,248.220001,48.450001
2022-12-27,149.327301,12.530000,145.020004,130.029999,161.455200,169.000000,108.080643,63.619999,264.229187,335.089996,...,31.469999,84.330002,71.082001,110.190002,31.940601,110.720001,129.899994,127.279999,251.000000,48.840000
2022-12-28,147.869461,12.320000,145.300003,126.040001,160.702347,167.360001,107.343979,62.599998,262.048126,328.329987,...,30.629999,80.089996,70.570000,108.379997,30.855000,108.940002,129.309998,125.989998,246.839996,47.970001
2022-12-29,150.865005,12.700000,146.309998,129.610001,161.029236,166.050003,109.812798,63.110001,267.286682,337.579987,...,31.320000,81.260002,71.070000,109.199997,32.279999,111.639999,129.990005,127.830002,257.529999,49.080002


In [17]:
df_volume_clean

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,8790205,6662000,800500,560518000,13767900,1972400,20266400,2932800,4040500,6483800,...,6595000,2487000,2702000,16143700,856600,927300,6091328,1439425,213800,2551100
2013-01-03,5751791,5398400,520600,352965200,16739300,2026700,22148200,1966200,3340700,3906000,...,3967200,1731900,2403600,13268200,525500,610800,3654574,1622250,99900,2267900
2013-01-04,6432897,12048300,615000,594333600,21372100,2461500,15820100,1591800,3145600,3809300,...,3639400,2129100,1750100,11427900,660400,563400,3782685,1226524,146000,3577700
2013-01-07,3589505,5730600,1054400,484156400,17897100,1803600,13120000,1296900,2262800,3632100,...,2460400,1501000,2856800,11799800,554100,481300,5289417,985710,90600,2286000
2013-01-08,3896925,8034400,840000,458707200,17863300,1533000,15042300,1810800,2502800,3080900,...,5202600,976300,3761500,14226400,698900,1084500,17253686,932356,112600,3758600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,779400,17181200,524500,63814900,2685100,528000,2618400,890300,1213300,1629800,...,1932800,1134800,1201300,11539400,1266800,321100,700800,789800,180900,1263300
2022-12-27,879500,18878200,717100,69007800,2669200,740300,2927400,666700,1009600,1464300,...,2266000,4026500,1580600,11962100,998800,468200,1300200,755600,252500,704200
2022-12-28,784300,20470400,685500,85438400,2944500,804700,3265200,1107900,1387000,1672100,...,2244100,2131500,1513700,10702100,1430500,480400,964800,750100,241200,680300
2022-12-29,854400,19706300,719100,75703700,3112500,1480000,3047800,1063400,1516800,1793100,...,1846000,1431100,1398000,10534000,1139500,516300,875700,686600,274900,685300


In [19]:
# taking Daily returns (percent change of  Adj close of the stocks)  and storing in a new dataframe
# also dropping the first row as it will be NaN after percent change transformaion
df_clean_daily_returns = df_adj_close_clean.pct_change().dropna()
df_clean_daily_returns

Unnamed: 0_level_0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WY,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-03,0.003582,-0.022890,0.000000,-0.012622,-0.008257,-0.002070,0.038065,-0.002929,-0.003620,-0.015389,...,0.000000,0.010757,-0.002922,-0.001804,-0.002715,-0.003273,0.006254,0.014328,0.000977,0.001364
2013-01-04,0.019748,0.078331,0.015588,-0.027854,-0.012633,0.006682,-0.006011,0.008588,0.005522,0.010066,...,0.008233,0.012066,0.003663,0.004630,0.008661,-0.001460,0.010950,0.005097,-0.008049,0.028610
2013-01-07,-0.007233,0.007468,-0.003396,-0.005882,0.002035,0.003205,0.008164,-0.009859,-0.004336,-0.004983,...,0.003062,0.003394,-0.010584,-0.011578,0.003434,-0.013153,-0.006294,0.002463,0.005655,-0.004856
2013-01-08,-0.007991,0.021563,-0.016356,0.002691,-0.021764,-0.001369,0.000300,0.001358,0.005806,0.005272,...,0.006784,-0.000825,0.001844,0.006255,-0.011491,-0.012218,-0.041979,0.001445,0.000733,-0.019965
2013-01-09,0.027008,-0.001979,0.003049,-0.015629,0.005636,0.000000,0.006597,0.003616,0.007072,0.013634,...,0.022911,0.003551,0.001472,-0.003843,-0.000495,0.015742,0.001777,0.018474,0.010750,-0.010864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,0.001476,0.011943,0.008446,-0.002798,-0.001041,0.004372,0.001388,0.008433,0.005023,0.005735,...,0.014171,-0.006401,0.012852,0.026445,0.011118,-0.000728,0.000621,0.001106,0.002869,0.003521
2022-12-27,0.002144,-0.014162,0.012144,-0.013878,-0.000674,-0.005941,0.003605,0.003787,-0.002931,-0.009928,...,-0.000635,0.044723,0.009023,0.013894,0.007540,0.009022,0.007758,0.004657,0.011200,0.008050
2022-12-28,-0.009763,-0.016760,0.001931,-0.030685,-0.004663,-0.009704,-0.006816,-0.016033,-0.008254,-0.020174,...,-0.026692,-0.050279,-0.007203,-0.016426,-0.033988,-0.016077,-0.004542,-0.010135,-0.016574,-0.017813
2022-12-29,0.020258,0.030844,0.006951,0.028324,0.002034,-0.007827,0.022999,0.008147,0.019991,0.028173,...,0.022527,0.014609,0.007085,0.007566,0.046184,0.024784,0.005259,0.014604,0.043307,0.023139


In [20]:
# Plotting mean (adj close) daily returns, averaged per day across all stocks:
df_clean_daily_returns.mean(1).plot(title='Mean Daily Returns across all stocks')

In [21]:
# Plotting mean daily volume, averaged per day across all stocks:
df_volume_clean.mean(1).plot(title='Mean Daily Volume across all stocks')


In [22]:
# saving the dataframes to csv files
df_clean_daily_returns.to_csv('data/df_clean_daily_returns.csv')
df_volume_clean.to_csv('data/df_volume_clean.csv')

# Code below is unused so far

# Making a DataFrame containing Daily volume

In [23]:
df['Volume'].columns

Index(['A', 'AAL', 'AAP', 'AAPL', 'ABBV', 'ABC', 'ABT', 'ACGL', 'ACN', 'ADBE',
       ...
       'WYNN', 'XEL', 'XOM', 'XRAY', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZION',
       'ZTS'],
      dtype='object', length=503)

In [24]:
col_with_missing_values = {}
for col in df['Volume'].columns:
    sum_of_missing_values = df['Adj Close'][col].isnull().sum()
    if sum_of_missing_values > 0:
        col_with_missing_values[col] = sum_of_missing_values
        print(col)

ALLE
ANET
BF.B
BRK.B
CARR
CDAY
CDW
CEG
CFG
CTLT
CTVA
CZR
DOW
ETSY
FOX
FOXA
FTV
HLT
HPE
HWM
INVH
IQV
IR
KEYS
KHC
LW
MRNA
NCLH
NWS
NWSA
OGN
OTIS
PAYC
PYPL
QRVO
SEDG
SYF
VICI
WRK
ZTS


## Acessing attributes of dataframe

In [25]:
close = df.Close

In [26]:
close[close.index > end - dt.timedelta(days=100)].describe()

Unnamed: 0,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
count,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,...,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0
mean,141.275,13.461912,161.760147,143.183087,152.418089,156.387354,103.165441,54.966765,275.259706,316.742792,...,72.613823,66.590735,105.655882,30.081324,103.628971,120.13647,115.295441,259.058235,50.205147,148.118677
std,11.258239,0.815157,15.984451,6.673953,9.121142,11.898258,4.260518,6.262076,14.522118,22.157599,...,10.343927,3.823638,7.284724,1.282007,9.611404,8.740005,7.909126,13.208136,2.208463,5.278865
min,121.550003,11.86,139.380005,126.040001,134.210007,135.330002,95.059998,41.669998,250.070007,275.200012,...,54.48,57.939999,83.980003,26.83,86.199997,105.25,104.550003,226.880005,45.740002,131.139999
25%,131.987495,12.7275,146.934998,138.91,144.4175,142.935005,99.482498,48.537499,262.894997,296.777504,...,64.297499,64.247501,103.17,29.1725,93.08,110.884998,108.012503,251.347496,48.414999,145.729996
50%,144.790001,13.65,159.485001,143.805,151.805,158.555,103.449997,57.01,273.380005,326.179993,...,74.254997,67.904999,106.994999,30.295,108.605003,122.684998,113.329998,260.744995,50.375,148.575005
75%,150.880001,14.1025,174.884998,148.287498,161.615002,167.177498,106.912502,60.01,288.087509,336.522491,...,81.787498,69.674997,110.802502,30.98,111.257502,128.607498,123.559999,267.182503,51.835,151.697498
max,157.660004,14.93,190.669998,155.740005,165.990005,173.990005,111.529999,63.619999,302.829987,345.959991,...,86.669998,71.599998,114.18,32.279999,114.989998,131.220001,128.429993,288.0,55.389999,157.419998


## Creating Moving Averages

In [27]:
df['MA20'] = df['Close'].rolling(window = 20, min_periods = 1).mean()
df['MA9'] = df['Close'].rolling(window = 9, min_periods = 1).mean()

ValueError: Wrong number of items passed 503, placement implies 1

## Plotting data

In [None]:
## https://www.youtube.com/watch?v=yQtT_4RgT2k
# fig = make_subplots(rows = 2, cols = 1, shared_xaxes= True, vertical_spacing = 0.1, subplot_titles = ('QQQ', 'Volume'),
#                     row_width = [0.2, 0.7])
# fig.add_trace(go.Candlestick(x=df.index, open=df["Open"], high=df["High"],
#                 low=df["Low"], close=df["Close"], name="OHLC"), 
#                 row=1, col=1)

In [None]:
figure = go.Figure(
        data = [
            go.Candlestick(
            x=df.index, 
            close=df["Close"], 
            open=df["Open"], 
            high=df["High"],
            low=df["Low"],
            increasing_line_color = 'red',
            decreasing_line_color = 'red',
            )
        ]
)
figure.show()

In [None]:
figure.show()

In [None]:
close.plot()

In [28]:
# figure.add_trace()

In [29]:
close['QQQ'].pct_change().plot(kind='hist')

KeyError: 'QQQ'