### Clustering analysis: 1 hour, single period

##### Hourly BTC chart

In [32]:
from algom.utils.data_object import dataObject
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import pandas as pd
import numpy as np


<br>

#### Load OHLC input data

In [33]:
data = dataObject("""
with

ticker_data as (
  select
    ticker_time,
    ticker_time_sec,
    ticker,
    `interval` as ticker_interval,
    volume_base,
    volume,
    open as open2,
    high as high2,
    low as low2,
    close as close2,
    ROR_n6,
    ROR_n12,
    ROR_n24,
    ROR_n48,
    ROR_n72,
    ROR_n96,
    ROR_n120,
    ROR_n144,
    ROR_n168,
    rank() over (partition by ticker_time order by etl_time desc) as load_rank,
  from `algom-trading.cryptocompare.features_BTC_USD_hour_*`
  where
    _table_suffix in ('2018', '2019', '2020')
  )

select *,
  round(log(safe_divide(high1, open1)), 5)  as open_high1,
  round(log(safe_divide(low1, open1)), 5)   as open_low1,
  round(log(safe_divide(close1, open1)), 5) as open_close1,
  round(log(safe_divide(high2, open1)), 5)  as open_high2,
  round(log(safe_divide(low2, open1)), 5)   as open_low2,
  round(log(safe_divide(close2, open1)), 5) as open_close2,
from (
  select * except(load_rank),
  lag(open2) over (order by ticker_time) as open1,
  lag(high2) over (order by ticker_time)  as high1,
  lag(low2) over (order by ticker_time)  as low1,
  lag(close2) over (order by ticker_time)  as close1,
  from ticker_data
  where load_rank = 1
  )
where close1 is not null
""")

Downloading: 100%|██████████| 26152/26152 [00:10<00:00, 2566.48rows/s]


In [34]:
data.df.head()

Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,volume_base,volume,open2,high2,low2,close2,...,open1,high1,low1,close1,open_high1,open_low1,open_close1,open_high2,open_low2,open_close2
0,2019-05-17 08:00:00+00:00,1558080000,BTC-USD,hour,3352.17,24441362.73,7297.92,7319.1,7251.87,7301.51,...,7179.87,7357.64,7176.83,7297.92,0.02446,-0.00042,0.01631,0.01921,0.00998,0.0168
1,2020-02-09 13:00:00+00:00,1581253200,BTC-USD,hour,796.76,8054849.06,10123.89,10132.14,10087.48,10099.24,...,10099.09,10138.09,10099.09,10123.89,0.00385,0.0,0.00245,0.00327,-0.00115,1e-05
2,2018-12-03 00:00:00+00:00,1543795200,BTC-USD,hour,2660.38,10963432.52,4143.86,4159.01,4098.53,4123.11,...,4165.99,4187.18,4109.04,4143.86,0.00507,-0.01376,-0.00533,-0.00168,-0.01633,-0.01035
3,2019-10-21 15:00:00+00:00,1571670000,BTC-USD,hour,4382.41,36150730.91,8277.43,8359.52,8189.89,8213.58,...,8267.94,8277.43,8252.26,8277.43,0.00115,-0.0019,0.00115,0.01102,-0.00948,-0.0066
4,2019-08-13 21:00:00+00:00,1565730000,BTC-USD,hour,1719.04,18651948.3,10892.25,10895.46,10826.93,10869.77,...,10905.1,10949.32,10891.05,10892.25,0.00405,-0.00129,-0.00118,-0.00088,-0.00719,-0.00325


<br><br>

### Calculate Elbow (ie inertia plot)

In [None]:
# Get features
feature_list = [
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2'
]
X = data.df[feature_list]
cluster_summary = []

# Run kmeans
for n in range(2, 20, 1):
    print("RUNNING: Running cluster summary for k={}.".format(n))
    kmeans = KMeans(
        n_clusters=n,
        random_state=0,
    )
    kmeans = kmeans.fit(X)
    labels = kmeans.labels_
    silhouette = silhouette_score(X, labels)    
    cluster_summary.append({
        'n_clusters': kmeans.n_clusters,
        'inertia': kmeans.inertia_,
        'silhouette': silhouette,
    })


RUNNING: Running cluster summary for k=2.


In [None]:
# pd.DataFrame(cluster_summary).to_csv('n_cluster_2candle.csv')

In [None]:
# Oputput inertia plot
plot = pd.DataFrame(cluster_summary).plot(
    x='n_clusters', y=['inertia', 'silhouette'])



<br><br>

## Produce clusters


_N_ based on interia plot above

In [7]:
n = 16

# Get features
feature_list = [
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2'
]
X = data.df[feature_list]
kmeans = KMeans(
    n_clusters=n,
    random_state=0,
)
kmeans = kmeans.fit(X)
labels = pd.DataFrame(kmeans.labels_, columns=['labels'])
data.df = data.df.join(labels, lsuffix='drop')
data.df = data.df[[h for h in list(data.df) if 'drop' not in h]]
data.df.sample(5)


Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,volume_base,volume,open2,high2,low2,close2,...,high1,low1,close1,open_high1,open_low1,open_close1,open_high2,open_low2,open_close2,labels
20604,2020-01-22 14:00:00+00:00,1579701600,BTC-USD,hour,817.86,7075201.72,8633.89,8656.27,8626.48,8641.45,...,8654.93,8620.5,8633.89,0.00034,-0.00364,-0.00209,0.0005,-0.00295,-0.00121,7
7271,2019-02-07 23:00:00+00:00,1549580400,BTC-USD,hour,856.53,2888011.0,3378.27,3379.61,3371.64,3375.33,...,3380.95,3374.15,3378.27,0.00132,-0.00069,0.00053,0.00093,-0.00143,-0.00034,7
304,2020-08-14 04:00:00+00:00,1597377600,BTC-USD,hour,906.71,10648164.01,11743.19,11771.94,11720.19,11742.1,...,11764.61,11688.81,11743.19,0.00158,-0.00488,-0.00024,0.00221,-0.0022,-0.00033,7
17831,2020-09-03 08:00:00+00:00,1599120000,BTC-USD,hour,1887.93,21551230.24,11412.59,11469.2,11365.96,11400.54,...,11431.18,11263.63,11412.59,0.01222,-0.00254,0.0106,0.01554,0.0065,0.00954,5
18189,2018-09-27 09:00:00+00:00,1538038800,BTC-USD,hour,2016.34,13061449.45,6458.76,6477.44,6456.4,6459.7,...,6460.36,6434.76,6458.76,0.00045,-0.00352,0.00021,0.00309,-0.00016,0.00035,7


In [8]:
# Output features and labels
data.to_db(
    project_id='algom-trading',
    destination_table='clustering.2candle_cluster_n{}'.format(n),
    if_exists='replace'
)

1it [00:12, 12.75s/it]


<br><br>

## Variance analysis

Run several clusters at _k_ and calculate basic metrics across each feature and future returns.


In [19]:
metric_list = [
    'volume_base',
    'open2',
    'high2',
    'low2',
    'close2',
    'open1',
    'high1',
    'low1',
    'close1',
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2',
    'ROR_n6',
    'ROR_n12',
    'ROR_n24',
    'ROR_n48',
    'ROR_n72',
    'ROR_n96',
    'ROR_n120',
    'ROR_n144',
    'ROR_n168',
]

In [20]:

def get_cluster_summary(df, k, metric_list, feature_list=None):

    # Get features
    feature_list = feature_list or [
        'open_high1',
        'open_low1',
        'open_close1',
        'open_high2',
        'open_low2',
        'open_close2']
    X = df[feature_list]
    kmeans = KMeans(
        n_clusters=k,
        random_state=0)
    kmeans = kmeans.fit(X)
    labels = pd.DataFrame(kmeans.labels_, columns=['labels'])
    df = df.join(labels, lsuffix='drop')
    df = df[[h for h in list(df) if 'drop' not in h]]

    # Output metric summary in dataframe
    # Iterate by metric/feature
    calc_list = ['count', 'min', 'mean', 'median', 'max', 'std', 'var', 'mad']
    metric_summary = pd.DataFrame()
    for metric in metric_list:
        stats = df.groupby(by=['labels'])[metric].agg(calc_list).reset_index()
        stats['inertia'] = kmeans.inertia_
        stats['var'] = metric
        stats['k'] = k
        metric_summary = metric_summary.append(stats)
    return metric_summary



In [21]:
# len(metric_summary)

In [35]:

summary = pd.DataFrame()
for k in range(4, 100, 2):
    print('RUNNING: k={}'.format(k))
    tmp = get_cluster_summary(
        df=data.df, 
        k=k,
        metric_list=metric_list
    )
    tmp['type'] = '2 candle'
    summary = summary.append(tmp)


RUNNING: k=4
RUNNING: k=6
RUNNING: k=8
RUNNING: k=10
RUNNING: k=12
RUNNING: k=14
RUNNING: k=16
RUNNING: k=18
RUNNING: k=20
RUNNING: k=22
RUNNING: k=24
RUNNING: k=26
RUNNING: k=28
RUNNING: k=30
RUNNING: k=32
RUNNING: k=34
RUNNING: k=36
RUNNING: k=38
RUNNING: k=40
RUNNING: k=42
RUNNING: k=44
RUNNING: k=46
RUNNING: k=48
RUNNING: k=50
RUNNING: k=52
RUNNING: k=54
RUNNING: k=56
RUNNING: k=58
RUNNING: k=60
RUNNING: k=62
RUNNING: k=64
RUNNING: k=66
RUNNING: k=68
RUNNING: k=70
RUNNING: k=72
RUNNING: k=74
RUNNING: k=76
RUNNING: k=78
RUNNING: k=80
RUNNING: k=82
RUNNING: k=84
RUNNING: k=86
RUNNING: k=88
RUNNING: k=90
RUNNING: k=92
RUNNING: k=94
RUNNING: k=96
RUNNING: k=98


In [36]:
summary.tail()

Unnamed: 0,labels,count,min,mean,median,max,std,var,mad,inertia,k,type
93,93,6,-0.532239,-0.227899,-0.297323,0.501982,0.373161,ROR_n168,0.243294,0.969033,98,2 candle
94,94,44,-0.592853,-0.06094,-0.063529,0.517839,0.257804,ROR_n168,0.205528,0.969033,98,2 candle
95,95,214,-0.542043,-0.023569,-0.025981,0.402467,0.19341,ROR_n168,0.15935,0.969033,98,2 candle
96,96,12,-0.48645,0.001228,-0.068546,0.529207,0.353869,ROR_n168,0.30544,0.969033,98,2 candle
97,97,1062,-0.254912,0.007584,0.004846,0.341863,0.082585,ROR_n168,0.062022,0.969033,98,2 candle


In [31]:
# print(len(summary))
output = dataObject(summary)
output.to_db(
    project_id='algom-trading',
    destination_table='clustering_summary.2candle_cluster_summary',
    if_exists='replace'
)
summary.head()

1it [00:06,  6.17s/it]


Unnamed: 0,labels,count,min,mean,median,max,std,var,mad,inertia,k,type
0,0,4361,0.004,4044.03391,3221.42,40159.7,3163.58757,volume_base,2217.881017,6.882041,4,2 candle
1,1,1740,546.85,6905.125609,5642.78,61787.67,5016.794163,volume_base,3470.410036,6.882041,4,2 candle
2,2,586,931.35,11796.106126,9958.985,50801.58,7539.798869,volume_base,5691.509169,6.882041,4,2 candle
3,3,19465,0.1806,1820.22366,1375.07,17361.92,1492.965926,volume_base,1042.335383,6.882041,4,2 candle
0,0,4361,3196.94,8663.104139,8498.41,19614.54,2890.9753,open2,2206.416583,6.882041,4,2 candle
