In [1]:
import pandas as pd
import numpy as np
from pandas_datareader import data, wb
from datetime import datetime
import matplotlib.pyplot as plt
from scipy import stats, integrate
from pypfopt.expected_returns import mean_historical_return
from pypfopt.risk_models import CovarianceShrinkage
import bs4 as bs
import requests
import yfinance as yf
import seaborn as sns

import cvxpy as cp

# To use: download the 6 .pkl files from https://drive.google.com/drive/u/2/folders/1VecVVsvXqSng0BnA1w85H3TbwB5XZwVA into the working directory, then run the following 3 cells.

In [2]:
class constrained_clusterings_precomputed:
    """Query precomputed clusterings stored on disk for one lookback period.

    On construction the instance loads two pickles from the current working
    directory: '<lookback>.pkl' (stored as ``self.df``) and 'names.pkl'
    (stored as ``self.names_df``). ``calcthresholds`` then picks, for a given
    date, the first stored clustering that satisfies the requested
    correlation / percentage thresholds.
    """

    # Lookback periods for which a '<lookback>.pkl' file is supported.
    SUPPORTED_LOOKBACKS = (1, 2, 3, 4, 5)

    def __init__(self,lookback):
        """Load the precomputed data for ``lookback``.

        Parameters
        ----------
        lookback : int (or integer-valued float)
            One of ``SUPPORTED_LOOKBACKS``.

        Raises
        ------
        ValueError
            If ``lookback`` is not integer-valued or is not supported.
        """
        if (int(lookback) != lookback) or (int(lookback) not in self.SUPPORTED_LOOKBACKS):
            # Build one message string; passing several positional args to
            # ValueError would make the message render as a tuple.
            raise ValueError(
                "Only lookback periods of {} supported".format(list(self.SUPPORTED_LOOKBACKS))
            )

        # SECURITY NOTE: unpickling can execute arbitrary code. Only load
        # pickle files obtained from a source you trust.
        self.df = pd.read_pickle(str(int(lookback))+'.pkl')
        self.names_df = pd.read_pickle('names.pkl')

    def calcthresholds(self,corr_threshold,pct_threshold,date,DEBUG=False):
        """Return the first stored clustering that meets the thresholds.

        The row of ``self.df`` with the latest index value ``<= date`` is
        used. Its 'Clusters', 'Numstocks' and 'Corrs' entries are treated as
        2-D arrays with one row per candidate clustering; the DEBUG output
        reports the 1-based row position as the number of clusters.

        A candidate row qualifies when the sum of ``numstocks / sz`` over the
        entries whose 'Corrs' value is below ``corr_threshold`` is strictly
        less than ``pct_threshold``. The first qualifying row is selected.

        Parameters
        ----------
        corr_threshold : float
            Entries of 'Corrs' below this value count against a candidate.
        pct_threshold : float
            Upper bound (exclusive) on the weighted share of such entries.
        date : datetime-like
            Date to look up; must be comparable with ``self.df.index``.
        DEBUG : bool, default False
            When true, print the date actually used and the selected row.

        Returns
        -------
        pandas.DataFrame or None
            ``self.names_df`` re-indexed by its column ``0`` with an added
            integer 'cluster' column, or ``None`` if no candidate qualifies.

        Raises
        ------
        ValueError
            If ``date`` is earlier than the first date in ``self.df``.
        """
        first_date = self.df.index.min()
        last_date = self.df.index.max()
        if date < first_date:
            raise ValueError("Date must be >= {}".format(first_date))
        if date > last_date:
            print("WARNING: USING",last_date,"FOR INPUT DATE",date)

        # Most recent stored date that is not after the requested one.
        inddate = self.df.index[self.df.index <= date].max()
        if DEBUG:
            print("DEBUG: USING",inddate,"FOR INPUT DATE",date)

        snapshot = self.df.loc[inddate]
        clusters = snapshot['Clusters']
        numstocks = snapshot['Numstocks']
        internals = snapshot['Corrs']
        sz = clusters.shape[0]

        # NOTE(review): the weight divides by sz, the number of rows of
        # 'Clusters' (candidate clusterings), not clusters.shape[1]. This is
        # only a share of stocks if the two are equal -- TODO confirm.
        weak_share = np.sum((internals<corr_threshold)*(numstocks/sz),axis=1)
        qualifying = np.arange(1, sz+1)[weak_share<pct_threshold]
        if qualifying.shape[0] <= 0:
            return None
        OPTIMAL_NUMBER_OF_CLUSTERS = qualifying[0]
        if DEBUG:
            print("DEBUG: OPTIMAL NUMBER OF CLUSTERS:",OPTIMAL_NUMBER_OF_CLUSTERS)

        # set_index returns a new frame, so self.names_df is left untouched.
        ret_df = self.names_df.set_index(0)
        ret_df['cluster'] = clusters[OPTIMAL_NUMBER_OF_CLUSTERS-1,:].astype(int)
        return ret_df

In [3]:
import os

# Files that must sit in the working directory before the clusterings can be
# loaded: one pickle per lookback period plus the names pickle.
filelist = ['1.pkl', '2.pkl', '3.pkl', '4.pkl', '5.pkl', 'names.pkl']
missing_files = [path for path in filelist if not os.path.exists(path)]
if missing_files:
    # Single formatted message (multiple ValueError args would render as a
    # tuple) that also names the files that are actually absent.
    raise ValueError(
        "All of {} must be present; missing: {}".format(filelist, missing_files)
    )

# One loader per supported lookback period; later cells refer to these names.
look1, look2, look3, look4, look5 = [
    constrained_clusterings_precomputed(lookback) for lookback in (1, 2, 3, 4, 5)
]

In [4]:
def get_clusters_precomputed(lookback,corr_threshold,pct_threshold,date,DEBUG=False):
    """Return the thresholded clustering for ``lookback`` at ``date``.

    Validates ``lookback`` and forwards the remaining arguments to the
    ``calcthresholds`` method of the matching preloaded instance
    (``look1`` .. ``look5``).

    Parameters
    ----------
    lookback : int (or integer-valued float)
        Lookback period; must be one of 1, 2, 3, 4, 5.
    corr_threshold, pct_threshold, date, DEBUG
        Passed through unchanged to ``calcthresholds``.

    Returns
    -------
    Whatever ``calcthresholds`` returns for the selected instance.

    Raises
    ------
    ValueError
        If ``lookback`` is not integer-valued or is not supported.
    """
    llist = [1,2,3,4,5]
    if (int(lookback) != lookback) or (int(lookback) not in llist):
        # Build one message string; passing several positional args to
        # ValueError would make the message render as a tuple.
        raise ValueError("Only lookback periods of {} supported".format(llist))

    # Dispatch table instead of an if/elif chain; built after validation so
    # the module-level instances are only touched for valid input.
    instances = {1: look1, 2: look2, 3: look3, 4: look4, 5: look5}
    return instances[int(lookback)].calcthresholds(corr_threshold,pct_threshold,date,DEBUG)

# Then run get_clusters_precomputed with the desired parameters. The output is the optimal clustering DataFrame (or None if no clustering meets the thresholds). Examples below.

In [5]:
# Example: query every lookback period for the same date with DEBUG output on.
corr_threshold, pct_threshold = 0.7, 0.1
date = datetime.strptime('3-31-2010','%m-%d-%Y')

# Results are kept under c1..c5, one per lookback period, in call order.
c1, c2, c3, c4, c5 = [
    get_clusters_precomputed(lookback, corr_threshold, pct_threshold, date, DEBUG=True)
    for lookback in (1, 2, 3, 4, 5)
]

DEBUG: USING 2010-03-01 00:00:00 FOR INPUT DATE 2010-03-31 00:00:00
DEBUG: OPTIMAL NUMBER OF CLUSTERS: 1
DEBUG: USING 2010-03-01 00:00:00 FOR INPUT DATE 2010-03-31 00:00:00
DEBUG: OPTIMAL NUMBER OF CLUSTERS: 9
DEBUG: USING 2010-03-01 00:00:00 FOR INPUT DATE 2010-03-31 00:00:00
DEBUG: OPTIMAL NUMBER OF CLUSTERS: 21
DEBUG: USING 2010-03-01 00:00:00 FOR INPUT DATE 2010-03-31 00:00:00
DEBUG: OPTIMAL NUMBER OF CLUSTERS: 17
DEBUG: USING 2010-03-01 00:00:00 FOR INPUT DATE 2010-03-31 00:00:00
DEBUG: OPTIMAL NUMBER OF CLUSTERS: 7


In [6]:
# Example: same query as above without DEBUG output, then show each result.
corr_threshold, pct_threshold = 0.7, 0.1
date = datetime.strptime('3-31-2010','%m-%d-%Y')

# Results are kept under c1..c5, one per lookback period, in call order.
c1, c2, c3, c4, c5 = [
    get_clusters_precomputed(lookback, corr_threshold, pct_threshold, date)
    for lookback in (1, 2, 3, 4, 5)
]

# One print call emitting each clustering on its own line(s), in order.
print(c1, c2, c3, c4, c5, sep='\n')

      cluster
0            
A           0
AAPL        0
ABC         0
ABMD        0
ABT         0
...       ...
XOM         0
XRAY        0
YUM         0
ZBRA        0
ZION        0

[367 rows x 1 columns]
      cluster
0            
A           4
AAPL        1
ABC         1
ABMD        8
ABT         7
...       ...
XOM         8
XRAY        4
YUM         3
ZBRA        4
ZION        8

[367 rows x 1 columns]
      cluster
0            
A          18
AAPL        2
ABC         5
ABMD        1
ABT         1
...       ...
XOM         4
XRAY       12
YUM         2
ZBRA        9
ZION        8

[367 rows x 1 columns]
      cluster
0            
A           7
AAPL        4
ABC        10
ABMD       13
ABT         4
...       ...
XOM        11
XRAY       12
YUM         4
ZBRA        6
ZION        0

[367 rows x 1 columns]
      cluster
0            
A           1
AAPL        3
ABC         2
ABMD        2
ABT         3
...       ...
XOM         5
XRAY        5
YUM         3
ZBRA        0
ZION    