In [None]:
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
from datetime import datetime
import matplotlib.pyplot as plt
from scipy import stats, integrate
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
from pypfopt.efficient_frontier import EfficientFrontier
import bs4 as bs
import requests
import yfinance as yf
import seaborn as sns
import cvxpy as cp
from sklearn.cluster import AgglomerativeClustering

## Get Universe - Current S&P 500 stocks that existed in 2002

In [2]:
"""
resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
tickers = []
for row in table.findAll('tr')[1:]:
    ticker = row.findAll('td')[0].text
    tickers.append(ticker)

tickers = [s.replace('\n', '') for s in tickers]
start = datetime(2000,1,1)
end = datetime(2022,1,1)
data = yf.download(tickers, start=start, end=end)

data.index = pd.to_datetime(data.index)
data = data.sort_index()
SP_Close = data['Adj Close']
SP_Close=SP_Close.dropna(axis=0, how='all')
SP_Close=SP_Close.dropna(axis=1)
pd.set_option('display.max_rows', 20 )
SP_Close
"""

"\nresp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')\nsoup = bs.BeautifulSoup(resp.text, 'lxml')\ntable = soup.find('table', {'class': 'wikitable sortable'})\ntickers = []\nfor row in table.findAll('tr')[1:]:\n    ticker = row.findAll('td')[0].text\n    tickers.append(ticker)\n\ntickers = [s.replace('\n', '') for s in tickers]\nstart = datetime(2000,1,1)\nend = datetime(2022,1,1)\ndata = yf.download(tickers, start=start, end=end)\n\ndata.index = pd.to_datetime(data.index)\ndata = data.sort_index()\nSP_Close = data['Adj Close']\nSP_Close=SP_Close.dropna(axis=0, how='all')\nSP_Close=SP_Close.dropna(axis=1)\npd.set_option('display.max_rows', 20 )\nSP_Close\n"

In [None]:
# Read the previously saved stock price table from a local pickle file
# (only load pickle files you created or trust).
SP_Close = pd.read_pickle('SP_Close.pkl')

## Testing Initial Portfolio without Bayes

In [None]:
# Keep the first 1265 rows (5 years of data) as the initial sample.
five_year_data = SP_Close.iloc[:1265]
five_year_data.tail()

In [None]:
# Baseline portfolio on the five-year sample: max-Sharpe weights with the
# library's default weight bounds and no cluster (sector) constraints.
mu = mean_historical_return(five_year_data)  # expected-return estimate per ticker
S = CovarianceShrinkage(five_year_data).ledoit_wolf()  # Ledoit-Wolf shrunk covariance
ef = EfficientFrontier(mu, S)
# max_sharpe() must run before clean_weights()/portfolio_performance(),
# which both report on the weights it solved for.
weights = ef.max_sharpe()
cleaned_weights = ef.clean_weights()
#ef.save_weights_to_file("weights.txt")  # saves to file
print(ef.portfolio_performance(verbose=True))

## Look at some correlations and clustering to gather ideas

In [None]:
#corr = five_year_data.corr()
#plt.figure(figsize=(12,8))
#sns.clustermap(corr)

In [None]:
import scipy.cluster.hierarchy as shc
#plt.figure(figsize=(10, 7))  
#plt.title("Dendrograms")  
#dend = shc.dendrogram(shc.linkage(corr, method='ward'))
#plt.axhline(y=6, color='r', linestyle='--')

In [None]:
from sklearn.cluster import AgglomerativeClustering
#cluster = AgglomerativeClustering(n_clusters=None, affinity='euclidean', linkage='ward',distance_threshold=4)  
#kclusters = cluster.fit_predict(corr)
#stock_clusters=pd.DataFrame(SP_Close.columns.values)
#stock_clusters=stock_clusters.set_index(0)
#stock_clusters['cluster']=kclusters
#print(stock_clusters['cluster'].max())

In [None]:
#plt.hist(stock_clusters,bins=stock_clusters['cluster'].max())

In [None]:
#shrinkage
def calculate_posterior_mean(like_pop, prior_pop): #for returns use geo mean then multiply by 253 to annualize at end
    """Shrink the sample mean of ``like_pop`` toward the mean of ``prior_pop``.

    Both inputs only need ``.mean()`` and ``.std()`` (the callers in this
    notebook pass numpy arrays of periodic returns).  Each mean is first
    reduced by half of its own variance, then the two are blended with the
    weight ``B = var(like) / (var(like) + var(prior))`` placed on the prior.

    Returns the blended mean multiplied by 253 (annualisation factor).
    """
    prior_sd = prior_pop.std()
    like_sd = like_pop.std()

    # Variance-adjusted centres of the prior and of the observed sample.
    prior_centre = prior_pop.mean() - prior_sd**2/2
    like_centre = like_pop.mean() - like_sd**2/2

    # Shrinkage weight: the noisier the observed sample, the more the prior counts.
    shrink = like_sd**2/(like_sd**2+prior_sd**2)
    posterior = like_centre+shrink*(prior_centre-like_centre)

    return posterior * 253

In [None]:
def Portfolio_backtest(weight_vector) :
    """Backtest a table of portfolio weights against the global ``SP_Close`` prices.

    ``weight_vector`` is a DataFrame whose index selects rows of ``SP_Close``
    and whose columns are matched against the price columns.  Returns between
    consecutive selected rows are weighted with the *previous* row's weights,
    summed per row, and returned as a Series.  As a side effect the cumulative
    product of (1 + return) is plotted with the title 'Cumulative Return'.
    """
    prices = SP_Close.loc[weight_vector.index]
    period_returns = np.exp(np.log(prices).diff())-1
    # Lag the weights by one row so each period is earned with the weights held
    # going into it, not the ones found afterwards with hindsight.
    lagged_weights = weight_vector.shift(1)
    port_returns = np.sum(lagged_weights*period_returns, axis=1)
    (port_returns+1).cumprod().plot()
    plt.title('Cumulative Return')
    return port_returns

## Implement Clustering Influenced Dynamic Constraints

In [None]:
def Cluster_Constraints(clusters,u_coef,l_coef): # weight coef for how overweight you can go
    """Build per-cluster weight limits proportional to each cluster's size.

    Parameters
    ----------
    clusters : DataFrame indexed by ticker with a ``'cluster'`` column holding
        each ticker's cluster label.  Other columns are ignored.
    u_coef : multiplier applied to a cluster's share of all tickers to get its
        upper limit.
    l_coef : multiplier applied to the same share, then negated, to get its
        lower limit.

    Returns
    -------
    (cluster_map, cluster_upper_l, cluster_lower_l)
        ``cluster_map`` maps ticker -> cluster label; the other two map
        cluster label -> float limit.
    """
    labels = clusters['cluster']
    cluster_map = dict(zip(clusters.index, labels))

    # Count once, on the label column only.  Counting on the whole frame inside
    # the per-cluster loop repeated the work for every cluster and counted
    # distinct rows, which stops being "tickers per cluster" as soon as the
    # frame carries any additional column.
    shares = labels.value_counts() / len(clusters)

    cluster_upper_l = {label: float(share) * u_coef for label, share in shares.items()}
    cluster_lower_l = {label: -(l_coef * float(share)) for label, share in shares.items()}
    return cluster_map, cluster_upper_l, cluster_lower_l

In [None]:
#for testing preset clusters
def get_clusters(corr, cluster_number = 45, Thresh = None):
    """Cluster tickers with Ward agglomerative clustering on ``corr``.

    Parameters
    ----------
    corr : matrix passed straight to ``AgglomerativeClustering.fit_predict``
        (one row per ticker).  When it is a DataFrame its index supplies the
        ticker labels of the result; otherwise the columns of the global
        ``SP_Close`` are used, as before.
    cluster_number : number of clusters to form when ``Thresh`` is None.
    Thresh : optional distance threshold; when given, the number of clusters
        is left to the algorithm (``n_clusters=None``).

    Returns
    -------
    DataFrame indexed by ticker (index named 0) with one ``'cluster'`` column.
    """
    # 'euclidean' is the estimator's default distance (and the only one Ward
    # linkage accepts), so it is not passed explicitly: the keyword for it was
    # renamed from `affinity` to `metric` across scikit-learn releases and
    # naming either one breaks on some versions.
    if Thresh is None:
        model = AgglomerativeClustering(n_clusters=cluster_number, linkage='ward')
    else:
        model = AgglomerativeClustering(n_clusters=None, linkage='ward', distance_threshold=Thresh)
    kclusters = model.fit_predict(corr)

    # Label the result with the tickers that were actually clustered.
    tickers = corr.index if isinstance(corr, pd.DataFrame) else SP_Close.columns
    stock_clusters = pd.DataFrame({'cluster': kclusters}, index=pd.Index(tickers.values, name=0))
    return stock_clusters

In [None]:
class constrained_clusterings_precomputed:
    """Lookup of precomputed clusterings for one lookback length.

    The constructor loads ``'<lookback>.pkl'`` into ``self.df`` and
    ``'names.pkl'`` into ``self.names_df``.  ``self.df`` is indexed by date and
    each row carries three arrays under the keys ``'Clusters'``,
    ``'Numstocks'`` and ``'Corrs'``.
    NOTE(review): presumably row k of those arrays describes the clustering
    with k+1 clusters ('Clusters' = label per stock, 'Numstocks' = size per
    cluster, 'Corrs' = a correlation score per cluster) - confirm against the
    code that produced the pickles.
    """

    def __init__(self,lookback):
        """Load the tables for ``lookback`` (an integer-valued number in 1..5).

        Raises ValueError for any other lookback.
        """
        llist = [1,2,3,4,5]
        if (int(lookback) != lookback) or (int(lookback) not in llist):
            # Single formatted message (passing several arguments to ValueError
            # makes the message render as a tuple).
            raise ValueError("Only lookback periods of {} supported".format(llist))

        # pickle files execute code on load: only use files you created or trust.
        self.df = pd.read_pickle(str(int(lookback))+'.pkl')
        self.names_df = pd.read_pickle('names.pkl')

    def calcthresholds(self,corr_threshold,pct_threshold,date,DEBUG=False):
        """Pick a clustering for ``date`` and return it as a ticker table.

        The most recent row of ``self.df`` dated on or before ``date`` is used.
        For every row k of that entry's arrays, the entries of 'Corrs' below
        ``corr_threshold`` are weighted by ``Numstocks / sz`` (``sz`` is the
        number of rows of 'Clusters') and summed; the first row whose sum is
        below ``pct_threshold`` is selected.

        Returns
        -------
        DataFrame: ``names_df`` indexed by its column 0, with a ``'cluster'``
        column holding the selected row of 'Clusters' as integers; or None
        when no row satisfies the threshold.

        Raises
        ------
        ValueError if ``date`` is earlier than the first date in ``self.df``.
        """
        first_date = self.df.index.min()
        last_date = self.df.index.max()
        if date < first_date:
            raise ValueError("Date must be >= {}".format(first_date))
        if date > last_date:
            print("WARNING: USING",last_date,"FOR INPUT DATE",date)
        inddate = self.df.index[self.df.index <= date].max()
        if DEBUG:
            print("DEBUG: USING",inddate,"FOR INPUT DATE",date)

        snapshot = self.df.loc[inddate]
        clusters = snapshot['Clusters']
        numstocks = snapshot['Numstocks']
        internals = snapshot['Corrs']
        sz = clusters.shape[0]

        # Per row: weighted share of clusters scoring below corr_threshold;
        # a row qualifies when that share is under pct_threshold.
        tmp = np.sum((internals<corr_threshold)*(numstocks/sz),axis=1)<pct_threshold
        ret = (np.arange(0,sz)+1)[tmp]
        if ret.shape[0] <= 0:
            return None
        OPTIMAL_NUMBER_OF_CLUSTERS = ret[0]  # 1-based position of the first qualifying row
        if DEBUG:
            print("DEBUG: OPTIMAL NUMBER OF CLUSTERS:",OPTIMAL_NUMBER_OF_CLUSTERS)

        # set_index returns a new frame, so self.names_df is not modified.
        ret_df = self.names_df.set_index(0)
        ret_df['cluster'] = clusters[OPTIMAL_NUMBER_OF_CLUSTERS-1,:].astype(int)
        return ret_df

In [None]:
import os

# The five precomputed clustering tables and the ticker-name table are read
# from the working directory by constrained_clusterings_precomputed; check for
# all of them up front and report exactly which ones are absent.
filelist = ['1.pkl','2.pkl','3.pkl','4.pkl','5.pkl','names.pkl']
missing = [file for file in filelist if not os.path.exists(file)]
if missing:
    raise ValueError("All of {} must be present; missing: {}".format(filelist, missing))

# One lookup object per supported lookback; get_clusters_precomputed
# dispatches to these module-level names.
look1 = constrained_clusterings_precomputed(1)
look2 = constrained_clusterings_precomputed(2)
look3 = constrained_clusterings_precomputed(3)
look4 = constrained_clusterings_precomputed(4)
look5 = constrained_clusterings_precomputed(5)

In [None]:
def get_clusters_precomputed(lookback,corr_threshold,pct_threshold,date,DEBUG=False):
    """Return the precomputed clustering for ``date`` from the table for ``lookback``.

    ``lookback`` must be integer-valued (``5`` and ``5.0`` are both accepted)
    and one of 1..5; it selects the module-level object ``look1`` .. ``look5``.
    The remaining arguments are forwarded unchanged to that object's
    ``calcthresholds`` and its result is returned.

    Raises ValueError for an unsupported lookback.
    """
    llist = [1,2,3,4,5]
    if (int(lookback) != lookback) or (int(lookback) not in llist):
        # Single formatted message (passing several arguments to ValueError
        # makes the message render as a tuple).
        raise ValueError("Only lookback periods of {} supported".format(llist))

    # Dispatch table instead of a five-way if/elif chain; the validation above
    # guarantees the key exists, so no fallback branch is needed.
    precomputed = {1: look1, 2: look2, 3: look3, 4: look4, 5: look5}
    return precomputed[int(lookback)].calcthresholds(corr_threshold,pct_threshold,date,DEBUG)

In [None]:
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
def Markowits_Bayes_Cluster_Rebalance(securities_vector, rebalance_period=21,prior_period = 253*5, update_period=0,
                                      verbose=False,weight_bounds=(-.01,.1), cluster_u_coef=1.25, cluster_l_coef=.5,
                                      corr_thresh = .75,leave_out_pct = .1, clust_num = None):
    """Walk forward through a price table, solving a cluster-constrained max-Sharpe portfolio per window.

    For each start ``p = 0, rebalance_period, 2*rebalance_period, ...`` the
    rows ``[p, p + prior_period)`` form the estimation window.  Tickers are
    clustered, per-cluster weight limits are derived with
    ``Cluster_Constraints`` and passed to the optimiser as sector constraints.

    Parameters
    ----------
    securities_vector : price DataFrame (rows = dates, columns = tickers).
    rebalance_period : number of rows between successive windows.
    prior_period : number of rows in each estimation window.  When
        ``clust_num`` is None, ``prior_period / 253`` must be an integer in
        1..5 because it selects the precomputed clustering table.
    update_period : when non-zero, the last ``update_period`` rows of the
        window are used to replace each ticker's expected return with
        ``calculate_posterior_mean(ticker's recent returns, returns of all
        tickers in its cluster over the window)``.
    verbose : print progress for each window.
    weight_bounds : per-ticker (lower, upper) bounds given to the optimiser.
    cluster_u_coef, cluster_l_coef : forwarded to ``Cluster_Constraints``.
    corr_thresh, leave_out_pct : forwarded to ``get_clusters_precomputed``.
    clust_num : if given, clusters are computed with ``get_clusters`` on the
        window's correlation matrix using this many clusters instead of the
        precomputed tables.

    Returns
    -------
    (weight_df, cluster_ts) : both indexed by the last date of each window.
        ``weight_df`` holds the cleaned weights per ticker; ``cluster_ts`` has
        one column ``'Clusters'`` with the number of clusters used.

    Raises
    ------
    ValueError if the precomputed lookup returns no clustering for a window.
    """
    window_starts = range(0, len(securities_vector)-prior_period, rebalance_period)
    weights_list=[]
    dates=[]
    cluster_list=[]
    # `iteration` is a dedicated progress counter; it is deliberately not
    # reused as a loop variable anywhere inside the body.
    for iteration, p in enumerate(window_starts, start=1):
        prior_vector = securities_vector[p:prior_period+p]
        update_vector = securities_vector[prior_period+p-update_period:prior_period+p]
        if verbose:
            print("Iteration " + str(iteration) + " of " + str(len(window_starts)))
            print(str(prior_vector.index[0]) + " - " + str(prior_vector.index[-1]))

        if clust_num is None:
            clusters = get_clusters_precomputed(prior_period/253,corr_thresh, leave_out_pct, date=prior_vector.index[-1])
            if clusters is None:
                raise ValueError("No precomputed clustering satisfies corr_thresh={} / leave_out_pct={} at {}".format(
                    corr_thresh, leave_out_pct, prior_vector.index[-1]))
        else:
            clusters = get_clusters(prior_vector.corr(), cluster_number = clust_num)

        # Work from the labels that are actually present: this is correct
        # whether labels start at 0 or 1, and never skips the highest label.
        cluster_labels = clusters['cluster'].unique()
        # Record a cluster *count* (max() over a DataFrame yields a column name).
        cluster_list.append(len(cluster_labels) if clust_num is None else clust_num)

        c_map, c_upper, c_lower = Cluster_Constraints(clusters,cluster_u_coef,cluster_l_coef)
        mu = mean_historical_return(prior_vector)
        if update_period != 0:
            for label in cluster_labels:
                stocksNcluster = clusters.index[clusters['cluster']==label]
                # The cluster-wide prior sample is the same for every member,
                # so compute it once per cluster.
                cluster_prior = prior_vector[stocksNcluster].pct_change().dropna().to_numpy()
                for stock in stocksNcluster:
                    mu.loc[stock] = calculate_posterior_mean(
                        update_vector[stock].pct_change().dropna().to_numpy(), cluster_prior)

        S = CovarianceShrinkage(prior_vector).ledoit_wolf()
        ef = EfficientFrontier(mu, S,weight_bounds=weight_bounds)
        ef.add_sector_constraints(c_map, c_lower, c_upper)
        ef.max_sharpe()  # solves for the weights that clean_weights() reports
        weights_list.append(ef.clean_weights())
        dates.append(prior_vector.index[-1])

    cluster_ts=pd.DataFrame(cluster_list)
    cluster_ts.index=dates
    cluster_ts.columns = ['Clusters']
    weight_df=pd.DataFrame(weights_list)
    weight_df.index=dates

    return weight_df, cluster_ts

## Hyperparameter tuning

In [None]:
#cluster_constrained_markowitz_set_num, _ = Markowits_Bayes_Cluster_Rebalance(SP_Close,21,252*5,verbose=False,clust_num=40)


In [None]:
#Portfolio_backtest(cluster_constrained_markowitz_set_num)

In [None]:
import itertools
import random
import time
import copy
import pickle
def random_grid_search(securities_vec, number_of_samples = 50,
                       rebal_prd_list = [63,253],#[21,63,253],
                       prior_prd_list = [253,2*253,5*253],
                       update_prd_list = [63, 253],
                       low_stk_bound_list = [-.007, -.005, -.003],
                       up_stk_bound_list = [.01,.02,.05,.1],
                       cluster_u_coef_list = [2,3,5],
                       cluster_l_coef_list = [-.02, -.05, -.1],
                       corr_thesh_list = [.5,.6,.7,.8],
                       stock_tresh_list = [.05],
                       pickle_name = 'cc3_new.pickle'):
    """Run the next batch of queued hyperparameter tuples and save the results.

    The work queue is a pickled list of 9-tuples
    ``(rebalance_period, prior_period, update_period, lower_weight_bound,
    upper_weight_bound, cluster_u_coef, cluster_l_coef, corr_thresh,
    leave_out_pct)`` stored in ``pickle_name``.  Up to ``number_of_samples``
    tuples are taken from the front of the queue; each is run through
    ``Markowits_Bayes_Cluster_Rebalance`` and ``Portfolio_backtest`` and the
    resulting return series from "2005" onwards is recorded.  A tuple whose
    run raises is recorded with ``None``.  A KeyboardInterrupt stops the batch
    early; the interrupted tuple stays in the queue.

    Afterwards the results are written to ``'res_<unix time>.pickle'`` and the
    queue file is rewritten without the processed tuples.

    Parameters
    ----------
    securities_vec : price DataFrame handed to the rebalancing routine.
    number_of_samples : maximum number of queued tuples to process.
    *_list parameters : kept for backward compatibility only; they are not
        read (the grid is taken from the queue file).
    pickle_name : queue file to read and rewrite.  Its name must contain
        "new"; pass your own file name if it is not the default.

    Returns
    -------
    dict mapping each processed tuple to its return Series (or None).

    Raises
    ------
    ValueError if ``pickle_name`` does not contain "new".
    """
    # Guard against running on a stale queue file.  (Previously this was an
    # assert against a hard-coded name that could never satisfy it.)
    if "new" not in pickle_name:
        raise ValueError('pickle_name must contain "new" (got {!r})'.format(pickle_name))

    # NOTE: unpickling runs arbitrary code - only use queue files you trust.
    with open(pickle_name, 'rb') as handle:
        list_of_tuples_all = pickle.load(handle)

    tuple_returns_dict = {}
    processed = 0
    for tuuple in list_of_tuples_all[:max(number_of_samples, 0)]:
        print(tuuple)
        try:
            weights,_ = Markowits_Bayes_Cluster_Rebalance(securities_vec, rebalance_period=tuuple[0],prior_period = tuuple[1],update_period=tuuple[2],
                                      verbose=False, weight_bounds=(tuuple[3],tuuple[4]), cluster_u_coef=tuuple[5], cluster_l_coef=tuuple[6],
                                          corr_thresh = tuuple[7], leave_out_pct = tuuple[8], clust_num = None)
            performance_series = Portfolio_backtest(weights)
            performance_series = performance_series.loc["2005":]
        except KeyboardInterrupt:
            print('KeyboardInterrupt')
            break
        except Exception as err:
            # Best effort: an unsolvable configuration must not stop the batch.
            # The error is shown so real bugs are not mistaken for infeasibility.
            performance_series = None
            print('Infeasible')
            print(repr(err))

        tuple_returns_dict[tuuple] = performance_series
        processed += 1

    with open('res_'+str(int(time.time()))+'.pickle', 'wb') as handle:
        pickle.dump(tuple_returns_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(pickle_name, 'wb') as handle:
        pickle.dump(list_of_tuples_all[processed:], handle, protocol=pickle.HIGHEST_PROTOCOL)
    return tuple_returns_dict

In [None]:
import os

# Exactly one personal work-queue file must be present in the working directory.
listf = ["cc1_new.pickle","cc2_new.pickle","cc3_new.pickle"]
present = [name for name in listf if os.path.exists(name)]
numf = len(present)
if numf > 1:
    print("Please remove all \"cc__new.pickle\" files except the one you are running:\nDavid:\"cc1_new.pickle\"\nTainon:\"cc2_new.pickle\"\nEvan:\"cc3_new.pickle\"")
    raise RuntimeError("More than one work-queue file present: {}".format(present))
if numf < 1:
    print("Make sure you've downloaded the right \"cc__new.pickle\" file:\nDavid:\"cc1_new.pickle\"\nTainon:\"cc2_new.pickle\"\nEvan:\"cc3_new.pickle\"")
    raise FileNotFoundError("None of {} found".format(listf))
namef = present[0]
# NOTE(review): namef is only reported here; it is not passed to
# random_grid_search, which chooses its queue file itself - confirm the two agree.
print("Currently running on:\n"+namef)

# Process the queue in batches of 30.  An empty result means no parameter set
# was completed (queue exhausted, or interrupted before the first one
# finished), so stop instead of looping forever and writing empty result files.
while True:
    dictionary = random_grid_search(SP_Close[:'2018-01-01'], number_of_samples=30)
    if not dictionary:
        break

In [None]:
# Load and print one saved results file. The name follows the
# 'res_<unix time>.pickle' pattern written by random_grid_search.
# Unpickling runs arbitrary code: only open files you produced or trust.
with open('res_1652548211.pickle', 'rb') as handle:
    origg = pickle.load(handle)
print(origg)

In [None]:
# Load 'dd1.pickle' and list its contents: first the number of entries, then
# one entry per line.
# NOTE(review): this file is not produced anywhere in this notebook - presumably
# a collection of parameter tuples or results; confirm its origin.
with open('dd1.pickle', 'rb') as handle:
    origg = pickle.load(handle)
print(len(origg))
for i in origg:
    print(i)