In [5]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pandas_datareader import data as pdr
yf.pdr_override() # <== that's all it takes :-)
import numpy as np
import statistics
from pprint import pprint
from math import isnan

sp500_history = pd.read_csv('./sp500_history.csv')
sp500_history['date'] = pd.to_datetime(sp500_history['date']) 
sp500_history = sp500_history[['date', 'value', 'variable']]

start_date = datetime.strptime('2016-1-1', '%Y-%m-%d')
end_date = datetime.strptime('2020-12-31', '%Y-%m-%d')

# greater than the start date and smaller than the end date
mask = (sp500_history['date'] > start_date) & (sp500_history['date'] <= end_date)
sp500_history = sp500_history.loc[mask]

# Sort
sp500_history.sort_values(['date', 'variable'], ascending=[True, True])

Unnamed: 0,date,value,variable
261,2016-01-05,WLTW,added_ticker
260,2016-01-05,FOSL,removed_ticker
262,2016-01-19,EXR,added_ticker
263,2016-01-19,CB,removed_ticker
264,2016-02-01,FRT,added_ticker
...,...,...,...
453,2019-12-23,ZBRA,added_ticker
454,2019-12-23,STE,added_ticker
449,2019-12-23,AMG,removed_ticker
451,2019-12-23,MAC,removed_ticker


In [6]:
calendar = pd.read_excel('./Nasdaq_Trading_Calendar.xlsx', sheet_name=None)
rebal_dates = []
for year in calendar:
    sheet = calendar[year]
    mask = sheet['S&P Indexes Rebalance S&P 500, S&P 400, and S&P 600'] == 1
    year_rebal_dates = sheet.loc[mask]
    for date in year_rebal_dates['Date'].values:
        rebal_dates.append(date)
pprint(rebal_dates)

[numpy.datetime64('2020-03-20T00:00:00.000000000'),
 numpy.datetime64('2020-06-19T00:00:00.000000000'),
 numpy.datetime64('2020-09-18T00:00:00.000000000'),
 numpy.datetime64('2020-12-18T00:00:00.000000000'),
 numpy.datetime64('2019-03-15T00:00:00.000000000'),
 numpy.datetime64('2019-06-21T00:00:00.000000000'),
 numpy.datetime64('2019-09-20T00:00:00.000000000'),
 numpy.datetime64('2019-12-20T00:00:00.000000000'),
 numpy.datetime64('2018-03-16T00:00:00.000000000'),
 numpy.datetime64('2018-06-15T00:00:00.000000000'),
 numpy.datetime64('2018-09-21T00:00:00.000000000'),
 numpy.datetime64('2018-12-21T00:00:00.000000000'),
 numpy.datetime64('2017-03-17T00:00:00.000000000'),
 numpy.datetime64('2017-06-16T00:00:00.000000000'),
 numpy.datetime64('2017-09-15T00:00:00.000000000'),
 numpy.datetime64('2017-12-15T00:00:00.000000000'),
 numpy.datetime64('2016-03-18T00:00:00.000000000'),
 numpy.datetime64('2016-06-17T00:00:00.000000000'),
 numpy.datetime64('2016-09-16T00:00:00.000000000'),
 numpy.datet

In [40]:
# rebal_type is 'regular' or 'ad_hoc'
# add_delete is 'add' or 'delete'
# entry_date and exit_date are in terms of number of days before (-) or after (+) effective date
# entry_time and exit_time are 'Open' or 'Close'
strategy_attributes = ['rebal_type', 'add_delete', 'entry_date', 'entry_time', 'exit_date', 'exit_time']

output_columns = ['eff_date', 'original_date']
output_columns.extend(strategy_attributes)
output_columns.extend(['total', 'up', 'count'])

df_output = pd.DataFrame(columns=output_columns)


In [41]:
MAX_DATE_EXTENSION = 10
days_bef_aft = 20 # the range of days before and after effective date to pull from Yahoo finance
last_date = sp500_history.iloc[0, sp500_history.columns.get_loc('date')]
last_changes = []
is_begin = True

rebal_type = ''
last_rebal_type = ''

add_delete = ''
last_add_delete = ''

entry_date = -1
entry_time = 'Close'
exit_date = 0
exit_time = 'Close'

for idx, row in sp500_history.iterrows():
    
    # End date is exclusive, so need to increase by 1
    prices = pdr.get_data_yahoo(row['value'], start=row['date'] - timedelta(days=days_bef_aft), end=row['date'] + timedelta(days=days_bef_aft + 1))
    if len(prices) == 0:
        continue
        
    # Find the adjusted effective date 
    # which is one trading day before the wikipedia effective (i.e. market opening) date
    eff_date_index = prices.index.get_loc(row['date'],method='pad')
    if eff_date_index < 0:
        print('SOMETHINGS WRONG')
        print(prices)
        print(row['date'])
        continue
#     print(prices)
#     print(eff_date_index)
    eff_date = prices.iloc[eff_date_index].name
#     print(eff_date)
    
#     if row['date'] not in prices.index:
#         print('SOMETHINGS WRONG')
#         print(prices)
#         print(row['date'])
#         continue
        
#     eff_date = row['date'] - timedelta(days=1)
#     date_extension = 0
#     while not eff_date in prices.index and date_extension < MAX_DATE_EXTENSION:
#         eff_date -= timedelta(days=1)
#         date_extension += 1
#     if date_extension >= MAX_DATE_EXTENSION:
#         continue
    eff_date_close = prices.loc[eff_date][entry_time]
    if isnan(eff_date_close):
        continue
        
    # Find the entry date, which is one trading day before the adjusted effective date
    prev_date = eff_date - timedelta(days=1)
    date_extension = 0
    while not prev_date in prices.index and date_extension < MAX_DATE_EXTENSION:
        prev_date -= timedelta(days=1)
        date_extension += 1
    if date_extension >= MAX_DATE_EXTENSION:
        continue
    prev_close = prices.loc[prev_date][exit_time]
    if isnan(prev_close):
        continue

    price_change = eff_date_close / prev_close - 1 # percentage change in price between entry and exit
    
    if eff_date in rebal_dates:
        rebal_type = 'regular'
            
    else:
        rebal_type = 'ad_hoc'

    if row['variable'] == 'added_ticker':
        add_delete = 'add'
    else:
        add_delete = 'delete'

    # Calculate mean and all
    if not is_begin and (last_date != eff_date or last_add_delete != add_delete):
        count = len(last_changes)
        up = len(list(filter(lambda x: (x >= 0), last_changes))) 
        
        data = [up, count]

        df_output.loc[(df_output['eff_date']==last_date) & 
                 (df_output['rebal_type']==last_rebal_type) & 
                 (df_output['add_delete']==last_add_delete), 'up':] = data

        total = 0
        last_changes = []
    
    # Still in the same date
    else:
        if is_begin:
            total = 0
            is_begin = False
        else:
            total = last_row['total']
    
    total += price_change
    
    data = [{'eff_date': pd.to_datetime(eff_date), 'rebal_type': rebal_type, 'add_delete': add_delete, 
             'entry_date': entry_date, 'entry_time': entry_time, 
             'exit_date': exit_date, 'exit_time': exit_time, 
             'total': total, 'original_date': row['date']}]
            
    if not (df_output['eff_date']==eff_date).any():
        df_output = df_output.append(pd.DataFrame(data), ignore_index=True)
    else:
        df_output[df_output['eff_date']==eff_date]['total'] = total

    last_date = eff_date
    last_changes.append(price_change)
    last_add_delete = add_delete
    last_rebal_type = rebal_type
    last_row = df_output[(df_output['eff_date']==last_date) & 
                         (df_output['rebal_type']==rebal_type) & 
                         (df_output['add_delete']==add_delete)]


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[*********************100%***********************]  1 of 1 completed

1 Failed download:
- BRCM: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- PCP: Data doesn't exist for startDate = 1452528000, endDate = 1456070400
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- GMCR: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- ESV: No data found, s

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- WFM: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- Q: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
- SPLS: No data found for this date range, symbol may be delisted
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of

In [35]:
eff_date

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [39]:
prices.iloc[eff_date_index].name

Timestamp('2016-01-05 00:00:00')

In [10]:
def get_uniques(input_list):
    # insert the list to the set 
    unique_set = set(input_list) 
    # convert the set to the list 
    unique_list = (list(unique_set)) 
    return unique_list

In [16]:
'''
Calculate population statistics
'''

stats_df_columns = []
stats_df_columns.extend(strategy_attributes)
stats_df_columns.extend(['total', 'up', 'count', 'mean', 'std', 'max', 'min', 'sharpe', 'uppct'])

stats_df = pd.DataFrame(columns=stats_df_columns)

# get the unique values for each strategy attribute
strategy_attribute_unique_values = map(lambda strategy_attribute: get_uniques(df_output[strategy_attribute]), strategy_attributes)   

import itertools
# find strategies consisting of different combinations of unique values for each strategy attribute
strategies = list(itertools.product(*strategy_attribute_unique_values)) 

for strategy in strategies:
    # find the relevant rows for each strategy 
    relevant_rows = df_output.loc[(df_output[strategy_attributes]==strategy).all(axis=1), :]
    
    relevant_rows_sum = relevant_rows.loc[:, 'total':].sum(axis=0)
    mean = relevant_rows_sum['total'] / relevant_rows_sum['count']
    if relevant_rows_sum['count'] > 1:
        std = statistics.stdev(relevant_rows['total'])
    else:
        std = 0
    max_val = max(relevant_rows['total'])
    min_val = min(relevant_rows['total'])
    if std == 0:
        sharpe = mean / 0.00001 * np.sqrt(252)
    else:
        sharpe = mean / std * np.sqrt(252)
    uppct = relevant_rows_sum['up'] / relevant_rows_sum['count']
    strategy_dict = dict(zip(strategy_attributes, strategy)) 
    data = {'total': relevant_rows_sum['total'], 'up': relevant_rows_sum['up'], 'count': relevant_rows_sum['count'], 
            'mean': mean, 'std': std, 'max': max_val, 'min': min_val, 'sharpe': sharpe, 'uppct': uppct} 
    data.update(strategy_dict)
    stats_df = stats_df.append(pd.DataFrame([data]), ignore_index=True)


In [17]:
'''
Writing to excel
'''
writer = pd.ExcelWriter('sp500_analysis.xlsx', engine='xlsxwriter')
df_output.to_excel(writer, sheet_name='data')
stats_df.to_excel(writer, sheet_name='strategy_stats')
writer.save()