### Clustering analysis: 1 hour, single period

##### Hourly BTC chart

In [None]:
from algom.utils.data_object import dataObject
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import pandas as pd
import numpy as np


<br>

#### Load OHLC input data

In [8]:
data = dataObject("""
with

ticker_data as (
  select
    ticker_time,
    ticker_time_sec,
    ticker,
    `interval` as ticker_interval,
    ROR_n6,
    ROR_n12,
    ROR_n24,
    ROR_n48,
    ROR_n72,
    ROR_n96,
    ROR_n120,
    ROR_n144,
    ROR_n168,
    volume_base,
    volume,
    open as open1,
    high as high1,
    low as low1,
    close as close1,
    rank() over (partition by ticker_time order by etl_time desc) as load_rank,
  from `algom-trading.cryptocompare.features_BTC_USD_hour_*`
  where
    _table_suffix in ('2016', '2017', '2018', '2019')
  )

select *,
  round(log(safe_divide(high1, open4)), 5)  as open_high1,
  round(log(safe_divide(low1, open4)), 5)   as open_low1,
  round(log(safe_divide(close1, open4)), 5) as open_close1,

  round(log(safe_divide(high2, open4)), 5)  as open_high2,
  round(log(safe_divide(low2, open4)), 5)   as open_low2,
  round(log(safe_divide(close2, open4)), 5) as open_close2,

  round(log(safe_divide(high3, open4)), 5)  as open_high3,
  round(log(safe_divide(low3, open4)), 5)   as open_low3,
  round(log(safe_divide(close3, open4)), 5) as open_close3,

  round(log(safe_divide(high4, open4)), 5)  as open_high4,
  round(log(safe_divide(low4, open4)), 5)   as open_low4,
  round(log(safe_divide(close4, open4)), 5) as open_close4,

from (
    select * except(load_rank),

    lag(open1) over (order by ticker_time) as open2,
    lag(high1) over (order by ticker_time)  as high2,
    lag(low1) over (order by ticker_time)  as low2,
    lag(close1) over (order by ticker_time)  as close2,

    lag(open1, 2) over (order by ticker_time) as open3,
    lag(high1, 2) over (order by ticker_time)  as high3,
    lag(low1, 2) over (order by ticker_time)  as low3,
    lag(close1, 2) over (order by ticker_time)  as close3,

    lag(open1, 3) over (order by ticker_time) as open4,
    lag(high1, 3) over (order by ticker_time)  as high4,
    lag(low1, 3) over (order by ticker_time)  as low4,
    lag(close1, 3) over (order by ticker_time)  as close4,
    from ticker_data
    where load_rank = 1
    )
where close4 is not null
order by ticker_time
""")

Downloading: 100%|██████████| 36278/36278 [00:16<00:00, 2175.89rows/s]


In [9]:
data.df.head()

Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,ROR_n6,ROR_n12,ROR_n24,ROR_n48,ROR_n72,ROR_n96,...,open_close1,open_high2,open_low2,open_close2,open_high3,open_low3,open_close3,open_high4,open_low4,open_close4
0,2015-11-11 11:00:00+00:00,1447239600,BTC-USD,hour,,,,,,,...,0.01663,0.05239,0.03167,0.0407,0.04842,0.01124,0.0384,0.0137,-0.00311,0.01121
1,2015-11-11 12:00:00+00:00,1447243200,BTC-USD,hour,,,,,,,...,0.01754,0.03237,0.00262,0.00542,0.04118,0.02046,0.02949,0.03721,3e-05,0.02719
2,2015-11-11 13:00:00+00:00,1447246800,BTC-USD,hour,,,,,,,...,0.00344,-0.00171,-0.02502,-0.00965,0.00518,-0.02458,-0.02178,0.01398,-0.00674,0.0023
3,2015-11-11 14:00:00+00:00,1447250400,BTC-USD,hour,,,,,,,...,-0.01138,0.01193,-0.01201,0.00115,-0.00401,-0.02732,-0.01195,0.00288,-0.02687,-0.02407
4,2015-11-11 15:00:00+00:00,1447254000,BTC-USD,hour,,,,,,,...,-0.01279,0.02448,0.00305,0.01269,0.03601,0.01206,0.02522,0.02007,-0.00325,0.01213


<br><br>

### Calculate Elbow (ie inertia plot)

In [None]:
# Get features
feature_list = [
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2'
]
X = data.df[feature_list]
cluster_summary = []

# Run kmeans
for n in range(2, 20, 1):
    print("RUNNING: Running cluster summary for k={}.".format(n))
    kmeans = KMeans(
        n_clusters=n,
        random_state=0,
    )
    kmeans = kmeans.fit(X)
    labels = kmeans.labels_
    silhouette = silhouette_score(X, labels)    
    cluster_summary.append({
        'n_clusters': kmeans.n_clusters,
        'inertia': kmeans.inertia_,
        'silhouette': silhouette,
    })


RUNNING: Running cluster summary for k=2.
RUNNING: Running cluster summary for k=3.
RUNNING: Running cluster summary for k=4.
RUNNING: Running cluster summary for k=5.
RUNNING: Running cluster summary for k=6.
RUNNING: Running cluster summary for k=7.
RUNNING: Running cluster summary for k=8.
RUNNING: Running cluster summary for k=9.
RUNNING: Running cluster summary for k=10.
RUNNING: Running cluster summary for k=11.
RUNNING: Running cluster summary for k=12.
RUNNING: Running cluster summary for k=13.


In [None]:
# pd.DataFrame(cluster_summary).to_csv('n_cluster_2candle.csv')

In [None]:
# Oputput inertia plot
plot = pd.DataFrame(cluster_summary).plot(
    x='n_clusters', y=['inertia', 'silhouette'])



<br><br>

## Produce clusters


_N_ based on interia plot above

In [7]:
n = 16

# Get features
feature_list = [
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2',
    'open3',
    'high3',
    'low3',
    'close3'
]
X = data.df[feature_list]
kmeans = KMeans(
    n_clusters=n,
    random_state=0,
)
kmeans = kmeans.fit(X)
labels = pd.DataFrame(kmeans.labels_, columns=['labels'])
data.df = data.df.join(labels, lsuffix='drop')
data.df = data.df[[h for h in list(data.df) if 'drop' not in h]]
data.df.sample(5)


Unnamed: 0,ticker_time,ticker_time_sec,ticker,ticker_interval,volume_base,volume,open2,high2,low2,close2,...,high1,low1,close1,open_high1,open_low1,open_close1,open_high2,open_low2,open_close2,labels
20604,2020-01-22 14:00:00+00:00,1579701600,BTC-USD,hour,817.86,7075201.72,8633.89,8656.27,8626.48,8641.45,...,8654.93,8620.5,8633.89,0.00034,-0.00364,-0.00209,0.0005,-0.00295,-0.00121,7
7271,2019-02-07 23:00:00+00:00,1549580400,BTC-USD,hour,856.53,2888011.0,3378.27,3379.61,3371.64,3375.33,...,3380.95,3374.15,3378.27,0.00132,-0.00069,0.00053,0.00093,-0.00143,-0.00034,7
304,2020-08-14 04:00:00+00:00,1597377600,BTC-USD,hour,906.71,10648164.01,11743.19,11771.94,11720.19,11742.1,...,11764.61,11688.81,11743.19,0.00158,-0.00488,-0.00024,0.00221,-0.0022,-0.00033,7
17831,2020-09-03 08:00:00+00:00,1599120000,BTC-USD,hour,1887.93,21551230.24,11412.59,11469.2,11365.96,11400.54,...,11431.18,11263.63,11412.59,0.01222,-0.00254,0.0106,0.01554,0.0065,0.00954,5
18189,2018-09-27 09:00:00+00:00,1538038800,BTC-USD,hour,2016.34,13061449.45,6458.76,6477.44,6456.4,6459.7,...,6460.36,6434.76,6458.76,0.00045,-0.00352,0.00021,0.00309,-0.00016,0.00035,7


In [8]:
# Output features and labels
data.to_db(
    project_id='algom-trading',
    destination_table='clustering.2candle_cluster_n{}'.format(n),
    if_exists='replace'
)

1it [00:12, 12.75s/it]


<br><br>

## Variance analysis

Run several clusters at _k_ and calculate basic metrics across each feature and future returns.


In [10]:
metric_list = [
    'volume_base',
    'open_high1',
    'open_low1',
    'open_close1',
    'open_high2',
    'open_low2',
    'open_close2',
    'open_high3',
    'open_low3',
    'open_close3',
    'open_high4',
    'open_low4',
    'open_close4',
    'ROR_n6',
    'ROR_n12',
    'ROR_n24',
    'ROR_n48',
    'ROR_n72',
    'ROR_n96',
    'ROR_n120',
    'ROR_n144',
    'ROR_n168',
]

In [11]:
def get_cluster_summary(df, k, metric_list, feature_list=None):

    # Get features
    feature_list = feature_list or [
        'open_high1',
        'open_low1',
        'open_close1',
        'open_high2',
        'open_low2',
        'open_close2',
        'open_high3',
        'open_low3',
        'open_close3',
        'open_high4',
        'open_low4',
        'open_close4',
    ]
    X = df[feature_list]
    kmeans = KMeans(
        n_clusters=k,
        random_state=0)
    kmeans = kmeans.fit(X)
    labels = pd.DataFrame(kmeans.labels_, columns=['labels'])
    df = df.join(labels, lsuffix='drop')
    df = df[[h for h in list(df) if 'drop' not in h]]

    # Output metric summary in dataframe
    # Iterate by metric/feature
    calc_list = ['count', 'min', 'mean', 'median', 'max', 'std', 'var', 'mad']
    metric_summary = pd.DataFrame()
    for metric in metric_list:
        stats = df.groupby(by=['labels'])[metric].agg(calc_list).reset_index()
        stats['inertia'] = kmeans.inertia_
        stats['variable'] = metric
        stats['k'] = k
        metric_summary = metric_summary.append(stats)
    return metric_summary


In [12]:
summary = pd.DataFrame()
for k in range(5, 101, 5):
    print('RUNNING: k={}'.format(k))
    tmp = get_cluster_summary(
        df=data.df, 
        k=k,
        metric_list=metric_list
    )
    tmp['type'] = '4 candle'
    summary = summary.append(tmp)


RUNNING: k=5
RUNNING: k=10
RUNNING: k=15
RUNNING: k=20
RUNNING: k=25
RUNNING: k=30
RUNNING: k=35
RUNNING: k=40
RUNNING: k=45
RUNNING: k=50
RUNNING: k=55
RUNNING: k=60
RUNNING: k=65
RUNNING: k=70
RUNNING: k=75
RUNNING: k=80
RUNNING: k=85
RUNNING: k=90
RUNNING: k=95
RUNNING: k=100


In [13]:
output = dataObject(summary)
output.to_db(
    project_id='algom-trading',
    destination_table='clustering_summary.4candle_cluster_summary',
    if_exists='replace'
)
summary.head()

1it [00:09,  9.19s/it]


Unnamed: 0,labels,count,min,mean,median,max,std,var,mad,inertia,variable,k,type
0,0,23785,0.1471,2046.069562,1545.34,44008.82,1823.239062,3324201.0,1176.267369,25.924193,volume_base,5,4 candle
1,1,688,865.53,11023.443372,8862.295,60654.85,8028.585664,64458190.0,5886.588702,25.924193,volume_base,5,4 candle
2,2,6460,0.6445,4019.600268,3231.87,38338.02,3196.032427,10214620.0,2173.011676,25.924193,volume_base,5,4 candle
3,3,4330,0.004,5144.626381,3889.3,49903.12,4386.799107,19244010.0,2998.111886,25.924193,volume_base,5,4 candle
4,4,1015,0.1806,7451.047966,5794.77,61787.67,5822.84599,33905540.0,3970.167255,25.924193,volume_base,5,4 candle


In [14]:
len(summary)

23100