In [3]:
import datetime
import pytz
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import matplotlib.ticker as mpticker
import pandas as pd
import numpy as np

from pathlib import Path
from mpl_finance import candlestick_ohlc

#### Download Cryptocurrency trading data

In [4]:
# Load the data into a Pandas DataFrame
df = pd.read_csv(
    Path('../Resources/BTCUSD_1h.csv'),
    index_col="Date")
df.tail()

Unnamed: 0_level_0,Open,Close,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-11-17 13:00:00,60514.231682,60207.0,60639.0,60207.0,476.771475
2021-11-17 14:00:00,60207.0,59724.666301,60388.0,59724.0,824.163709
2021-11-17 15:00:00,59724.0,59708.630836,59859.0,59483.0,500.389504
2021-11-17 16:00:00,59728.0,60418.0,60480.0,59491.0,398.163219
2021-11-17 17:00:00,60418.0,60069.0,60439.0,60031.0,109.23502


#### Find Optimal Cluster using K-Means

In [5]:
def get_optimum_clusters(df, saturation_point=0.05):
    '''
    :param df: dataframe
    :param saturation_point: The amount of difference we are willing to detect
    :return: clusters with optimum K centers
    This method uses elbow method to find the optimum number of K clusters
    We initialize different K-means with 1..10 centers and compare the inertias
    If the difference is no more than saturation_point, we choose that as K and move on
    '''

    wcss = []
    k_models = []

    size = min(11, len(df.index))
    for i in range(1, size):
        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(df)
        wcss.append(kmeans.inertia_)
        k_models.append(kmeans)

    # Compare differences in inertias until it's no more than saturation_point
    optimum_k = len(wcss)-1
    for i in range(0, len(wcss)-1):
        diff = abs(wcss[i+1] - wcss[i])
        if diff < saturation_point:
            optimum_k = i
            break

    print("Optimum K is " + str(optimum_k + 1))
    optimum_clusters = k_models[optimum_k]

    return optimum_clusters

In [6]:
lows = df['Low']
highs = df["High"]

low_clusters = get_optimum_clusters(lows)
low_centers = low_clusters.cluster_centers_
low_centers = numpy.sort(low_centers, axis=0)

high_clusters = get_optimum_clusters(highs)
high_centers = high_clusters.cluster_centers_
high_centers = numpy.sort(high_centers, axis=0)

ValueError: Expected 2D array, got 1D array instead:
array=[  965.8    962.6    965.37 ... 59483.   59491.   60031.  ].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.