### Clustering analysis: 1 hour, single period

##### Hourly BTC chart

In [1]:
from algom.utils.data_object import dataObject
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np


<br>

#### Load OHLC input data

In [12]:
# Build the modelling frame from the hourly BTC-USD feature tables (2016-2019 suffixes).
#
# Query outline:
#   * `ticker_data` selects the ROR_n* columns, volume and OHLC (renamed open1/high1/
#     low1/close1) and ranks rows per ticker_time by etl_time descending (load_rank).
#   * The inner subquery keeps load_rank = 1 and adds the OHLC values from 1, 2 and 3
#     rows earlier (ordered by ticker_time) as the *2, *3 and *4 columns.
#   * The outer select adds open_{high,low,close}{1,2,3}: the natural log of each
#     candle's high/low/close divided by open3, rounded to 5 decimals.
#   * Rows where close3 is null (no candle two rows back) are dropped and the result
#     is ordered by ticker_time.
#
# NOTE(review): rank() assigns the same rank to ties, so two loads sharing an etl_time
# for one ticker_time would both survive the load_rank = 1 filter - confirm etl_time is
# unique per ticker_time, or consider row_number().
# NOTE(review): the *4 lag columns are computed but not used by the derived features in
# this query - confirm whether they are still needed.
# NOTE(review): dataObject is a project helper; presumably it runs the query and exposes
# the result as `data.df` - verify against algom.utils.data_object.
data = dataObject("""
with

ticker_data as (
  select
    ticker_time,
    ticker_time_sec,
    ticker,
    `interval` as ticker_interval,
    ROR_n6,
    ROR_n12,
    ROR_n24,
    ROR_n48,
    ROR_n72,
    ROR_n96,
    ROR_n120,
    ROR_n144,
    ROR_n168,
    volume_base,
    volume,
    open as open1,
    high as high1,
    low as low1,
    close as close1,
    rank() over (partition by ticker_time order by etl_time desc) as load_rank,
  from `algom-trading.cryptocompare.features_BTC_USD_hour_*`
  where
    _table_suffix in ('2016', '2017', '2018', '2019')
  )

select *,
  round(log(safe_divide(high1, open3)), 5)  as open_high1,
  round(log(safe_divide(low1, open3)), 5)   as open_low1,
  round(log(safe_divide(close1, open3)), 5) as open_close1,

  round(log(safe_divide(high2, open3)), 5)  as open_high2,
  round(log(safe_divide(low2, open3)), 5)   as open_low2,
  round(log(safe_divide(close2, open3)), 5) as open_close2,

  round(log(safe_divide(high3, open3)), 5)  as open_high3,
  round(log(safe_divide(low3, open3)), 5)   as open_low3,
  round(log(safe_divide(close3, open3)), 5) as open_close3,
from (
  select * except(load_rank),
  
  lag(open1) over (order by ticker_time) as open2,
  lag(high1) over (order by ticker_time)  as high2,
  lag(low1) over (order by ticker_time)  as low2,
  lag(close1) over (order by ticker_time)  as close2,

  lag(open1, 2) over (order by ticker_time) as open3,
  lag(high1, 2) over (order by ticker_time)  as high3,
  lag(low1, 2) over (order by ticker_time)  as low3,
  lag(close1, 2) over (order by ticker_time)  as close3,

  lag(open1, 3) over (order by ticker_time) as open4,
  lag(high1, 3) over (order by ticker_time)  as high4,
  lag(low1, 3) over (order by ticker_time)  as low4,
  lag(close1, 3) over (order by ticker_time)  as close4,
  from ticker_data
  where load_rank = 1
  )
where close3 is not null
order by ticker_time
""")

Downloading: 100%|██████████| 36279/36279 [00:17<00:00, 2064.18rows/s]


In [13]:
# Preview the last 10 rows of the query result.
data.df.tail(10)

Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,ROR_n6,ROR_n12,ROR_n24,ROR_n48,ROR_n72,ROR_n96,...,close4,open_high1,open_low1,open_close1,open_high2,open_low2,open_close2,open_high3,open_low3,open_close3
36269,2019-12-31 15:00:00+00:00,1577804400,BTC-USD,hour,-0.00262,-0.002193,-0.007632,-0.028716,-0.016252,-0.004971,...,7245.1,0.00075,-0.00812,-0.00521,0.011,-0.0014,-0.00079,0.0034,-0.00106,0.0012
36270,2019-12-31 16:00:00+00:00,1577808000,BTC-USD,hour,-0.004805,-0.003816,-0.00597,-0.02836,-0.016941,0.000908,...,7253.78,-0.00376,-0.00799,-0.00616,-0.00045,-0.00932,-0.00641,0.0098,-0.0026,-0.00199
36271,2019-12-31 17:00:00+00:00,1577811600,BTC-USD,hour,-0.009355,-0.01131,-0.011872,-0.036039,-0.024831,-0.004799,...,7239.38,-0.00209,-0.01258,-0.00902,-0.00178,-0.006,-0.00418,0.00154,-0.00733,-0.00442
36272,2019-12-31 18:00:00+00:00,1577815200,BTC-USD,hour,-0.010653,-0.010712,-0.010733,-0.037435,-0.023666,-0.007405,...,7207.45,-0.00452,-0.00913,-0.00544,0.00233,-0.00816,-0.0046,0.00265,-0.00158,0.00024
36273,2019-12-31 19:00:00+00:00,1577818800,BTC-USD,hour,-0.00893,-0.008037,-0.006834,-0.041736,-0.022532,-0.004552,...,7209.2,-0.00268,-0.00842,-0.00277,-0.00477,-0.00937,-0.00569,0.00208,-0.0084,-0.00484
36274,2019-12-31 20:00:00+00:00,1577822400,BTC-USD,hour,-0.009354,-0.008418,-0.013016,-0.033447,-0.02359,-0.008185,...,7174.37,0.00221,-0.00271,-0.00033,0.00216,-0.00357,0.00208,8e-05,-0.00453,-0.00084
36275,2019-12-31 21:00:00+00:00,1577826000,BTC-USD,hour,-0.005272,-0.007893,-0.012075,-0.032076,-0.02201,-0.008295,...,7168.33,0.0034,-0.00133,0.00017,0.00305,-0.00187,0.00051,0.00301,-0.00273,0.00292
36276,2019-12-31 22:00:00+00:00,1577829600,BTC-USD,hour,-0.003003,-0.007808,-0.011792,-0.030591,-0.018985,-0.008661,...,7189.29,0.00214,-0.00293,-0.00024,0.00048,-0.00425,-0.00275,0.00013,-0.00479,-0.00241
36277,2019-12-31 23:00:00+00:00,1577833200,BTC-USD,hour,0.001325,-0.00803,-0.007013,-0.029571,-0.018256,-0.009689,...,7171.98,0.00411,0.00024,0.00166,0.00455,-0.00052,0.00217,0.00289,-0.00184,-0.00034
36278,2020-01-01 00:00:00+00:00,1577836800,BTC-USD,hour,0.000845,-0.009808,-0.010285,-0.025168,-0.019785,-0.019223,...,7169.55,0.00408,0.00067,0.00067,0.00445,0.00058,0.002,0.00489,-0.00018,0.00251


<br><br>

### Calculate Elbow (i.e., inertia plot)

In [None]:
# Features for the elbow search: high/low/close log ratios of candles 1 and 2
feature_list = [
    'open_high1', 'open_low1', 'open_close1',
    'open_high2', 'open_low2', 'open_close2',
]
X = data.df[feature_list]
cluster_summary = []

# Fit one k-means model per cluster count (2..100) and record, for each,
# the cluster count, the inertia and the silhouette score.
for n in range(2, 101):
    print("RUNNING: Running cluster summary for k={}.".format(n))
    kmeans = KMeans(n_clusters=n, random_state=0).fit(X)
    labels = kmeans.labels_
    silhouette = silhouette_score(X, labels)
    cluster_summary.append(dict(
        n_clusters=kmeans.n_clusters,
        inertia=kmeans.inertia_,
        silhouette=silhouette,
    ))


RUNNING: Running cluster summary for k=2.
RUNNING: Running cluster summary for k=3.
RUNNING: Running cluster summary for k=4.
RUNNING: Running cluster summary for k=5.
RUNNING: Running cluster summary for k=6.
RUNNING: Running cluster summary for k=7.
RUNNING: Running cluster summary for k=8.
RUNNING: Running cluster summary for k=9.
RUNNING: Running cluster summary for k=10.
RUNNING: Running cluster summary for k=11.
RUNNING: Running cluster summary for k=12.
RUNNING: Running cluster summary for k=13.


In [None]:
# pd.DataFrame(cluster_summary).to_csv('n_cluster_2candle.csv')

In [None]:
# Output inertia plot: inertia and silhouette against the number of clusters
elbow_frame = pd.DataFrame(cluster_summary)
plot = elbow_frame.plot(x='n_clusters', y=['inertia', 'silhouette'])



<br><br>

## Produce clusters


_N_ is chosen based on the inertia plot above.

In [7]:
# Number of clusters for the final model
n = 16

# Get features: high/low/close log ratios (relative to open3) for candles 1-3.
# 'open_open3' is intentionally not listed: the load query never produces that
# column, so selecting it raised a KeyError.
feature_list = [
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2',
    'open_high3',
    'open_low3',
    'open_close3']
X = data.df[feature_list]
kmeans = KMeans(
    n_clusters=n,
    random_state=0,
)
kmeans = kmeans.fit(X)

# Attach the labels positionally (kmeans.labels_ has one entry per row of X).
# Dropping any 'labels' column left by a previous run keeps the cell re-runnable
# and avoids relying on data.df having a 0..n-1 index.
data.df = (
    data.df
    .drop(columns=['labels'], errors='ignore')
    .assign(labels=kmeans.labels_)
)
data.df.sample(5)


Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,volume_base,volume,open2,high2,low2,close2,...,high1,low1,close1,open_high1,open_low1,open_close1,open_high2,open_low2,open_close2,labels
20604,2020-01-22 14:00:00+00:00,1579701600,BTC-USD,hour,817.86,7075201.72,8633.89,8656.27,8626.48,8641.45,...,8654.93,8620.5,8633.89,0.00034,-0.00364,-0.00209,0.0005,-0.00295,-0.00121,7
7271,2019-02-07 23:00:00+00:00,1549580400,BTC-USD,hour,856.53,2888011.0,3378.27,3379.61,3371.64,3375.33,...,3380.95,3374.15,3378.27,0.00132,-0.00069,0.00053,0.00093,-0.00143,-0.00034,7
304,2020-08-14 04:00:00+00:00,1597377600,BTC-USD,hour,906.71,10648164.01,11743.19,11771.94,11720.19,11742.1,...,11764.61,11688.81,11743.19,0.00158,-0.00488,-0.00024,0.00221,-0.0022,-0.00033,7
17831,2020-09-03 08:00:00+00:00,1599120000,BTC-USD,hour,1887.93,21551230.24,11412.59,11469.2,11365.96,11400.54,...,11431.18,11263.63,11412.59,0.01222,-0.00254,0.0106,0.01554,0.0065,0.00954,5
18189,2018-09-27 09:00:00+00:00,1538038800,BTC-USD,hour,2016.34,13061449.45,6458.76,6477.44,6456.4,6459.7,...,6460.36,6434.76,6458.76,0.00045,-0.00352,0.00021,0.00309,-0.00016,0.00035,7


In [8]:
# Output features and labels
data.to_db(
    project_id='algom-trading',
    destination_table='clustering.2candle_cluster_n{}'.format(n),
    if_exists='replace'
)

1it [00:12, 12.75s/it]


<br><br>

## Variance analysis

Run k-means for several values of _k_ and calculate basic summary statistics, per cluster, for each feature and for the future-return (ROR) columns.


In [20]:
# Columns summarised per cluster: base volume, the nine candle log-ratio
# features (candles 1-3 x high/low/close) and the ROR_n* return columns.
metric_list = (
    ['volume_base']
    + ['open_{}{}'.format(part, candle)
       for candle in (1, 2, 3)
       for part in ('high', 'low', 'close')]
    + ['ROR_n{}'.format(horizon)
       for horizon in (6, 12, 24, 48, 72, 96, 120, 144, 168)]
)

In [21]:
def _mean_abs_deviation(values):
    """Return the mean absolute deviation of ``values`` around their mean."""
    return (values - values.mean()).abs().mean()


def get_cluster_summary(df, k, metric_list, feature_list=None):
    """Cluster ``df`` with k-means and summarise each metric per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing every column named in ``feature_list`` and
        ``metric_list``. It is not modified.
    k : int
        Number of clusters passed to ``KMeans``.
    metric_list : list of str
        Columns to summarise within each cluster.
    feature_list : list of str, optional
        Columns used to fit the clustering. Defaults to the nine
        ``open_{high,low,close}{1,2,3}`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (metric, cluster label) with the columns ``labels``,
        ``count``, ``min``, ``mean``, ``median``, ``max``, ``std``, ``var``,
        ``mad``, ``inertia``, ``variable`` and ``k``. An empty frame is
        returned when ``metric_list`` is empty.
    """
    # Get features
    feature_list = feature_list or [
        'open_high1',
        'open_low1',
        'open_close1',
        'open_high2',
        'open_low2',
        'open_close2',
        'open_high3',
        'open_low3',
        'open_close3'
    ]
    kmeans = KMeans(n_clusters=k, random_state=0).fit(df[feature_list])

    # Attach labels by position rather than by index join, so frames whose index
    # is not 0..n-1 are labelled correctly; any pre-existing 'labels' column
    # (e.g. from an earlier clustering run) is replaced.
    labelled = (
        df
        .drop(columns=['labels'], errors='ignore')
        .assign(labels=kmeans.labels_)
    )

    # Named aggregations keep the original output column names. 'mad' is
    # computed by a helper because the built-in 'mad' aggregation was removed
    # in pandas 2.0.
    calcs = {
        name: name
        for name in ['count', 'min', 'mean', 'median', 'max', 'std', 'var']
    }
    calcs['mad'] = _mean_abs_deviation

    # Output metric summary in dataframe: one frame per metric, concatenated
    # once at the end (DataFrame.append was removed in pandas 2.0).
    frames = []
    for metric in metric_list:
        stats = labelled.groupby(by=['labels'])[metric].agg(**calcs).reset_index()
        stats['inertia'] = kmeans.inertia_
        stats['variable'] = metric
        stats['k'] = k
        frames.append(stats)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [22]:
# Build one cluster summary per k (5, 10, ..., 100) and concatenate them once.
# Frames are collected in a list because DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop copies it on every iteration.
summary_frames = []
for k in range(5, 101, 5):
    print('RUNNING: k={}'.format(k))
    tmp = get_cluster_summary(
        df=data.df,
        k=k,
        metric_list=metric_list
    )
    tmp['type'] = '3 candle'
    summary_frames.append(tmp)
summary = pd.concat(summary_frames)


RUNNING: k=5
RUNNING: k=10
RUNNING: k=15
RUNNING: k=20
RUNNING: k=25
RUNNING: k=30
RUNNING: k=35
RUNNING: k=40
RUNNING: k=45
RUNNING: k=50
RUNNING: k=55
RUNNING: k=60
RUNNING: k=65
RUNNING: k=70
RUNNING: k=75
RUNNING: k=80
RUNNING: k=85
RUNNING: k=90
RUNNING: k=95
RUNNING: k=100


In [23]:
# Wrap the combined summary and write it to the summary table, replacing any
# existing table, then preview the first rows.
output = dataObject(summary)
output.to_db(
    if_exists='replace',
    destination_table='clustering_summary.3candle_cluster_summary',
    project_id='algom-trading',
)
summary.head()

1it [00:06,  6.57s/it]


Unnamed: 0,labels,count,min,mean,median,max,std,var,mad,inertia,variable,k,type
0,0,23822,0.004,1965.324118,1528.395,38338.02,1590.893786,2530943.0,1087.108196,15.764867,volume_base,5,3 candle
1,1,4391,0.1471,5204.326458,4022.97,44008.82,4175.599187,17435630.0,2918.054356,15.764867,volume_base,5,3 candle
2,2,984,0.5561,8234.629745,6462.455,61787.67,6235.978169,38887420.0,4273.156992,15.764867,volume_base,5,3 candle
3,3,6351,0.6445,4066.209735,3307.97,37619.92,3125.871241,9771071.0,2149.575167,15.764867,volume_base,5,3 candle
4,4,731,4.363,11580.74746,9590.66,60654.85,8088.118984,65417670.0,6093.170538,15.764867,volume_base,5,3 candle


In [24]:
# list(data.df)