In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
import holoviews as hv

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler

In [2]:
# Read the market data, indexed by coin id, and divide every value by 100
# so the percentage figures become decimal fractions.
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id").div(100)

# Preview the first ten rows
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.010839,0.076028,0.065751,0.076726,-0.032518,0.835184,0.375176
ethereum,0.002239,0.103813,0.048085,0.001317,-0.128889,1.867742,1.019602
tether,-0.002117,0.000494,6.4e-05,-0.000424,0.002804,-5.4e-05,0.000195
ripple,-0.003782,-0.006093,0.022498,0.002345,-0.175524,0.395389,-0.166019
bitcoin-cash,0.029059,0.170972,0.147533,0.15749,-0.137179,0.216604,0.144938
binancecoin,0.021042,0.128551,0.068069,0.000586,0.363349,1.556194,0.69692
chainlink,-0.002394,0.206946,0.09301,-0.112175,-0.436952,4.032292,3.251319
cardano,3.2e-05,0.13993,0.055548,0.101055,-0.228478,2.645142,1.560976
litecoin,-0.000634,0.066022,0.072893,0.012166,-0.172396,0.274992,-0.126641
bitcoin-cash-sv,0.009253,0.032964,-0.018666,0.028893,-0.248743,0.074256,0.937308


In [3]:
# Generate summary statistics (count, mean, std, min, quartiles, max)
# for every price-change column
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.002697,0.044971,0.001858,0.015457,-0.000941,2.365374,3.47668
std,0.026948,0.063752,0.083769,0.263442,0.473658,4.352253,12.478429
min,-0.135279,-0.060946,-0.181589,-0.347055,-0.448225,-0.003921,-0.175675
25%,-0.00609,0.000473,-0.050266,-0.104385,-0.25908,0.216604,0.004062
50%,-0.000634,0.032964,0.001097,-0.000424,-0.075445,0.839052,0.69692
75%,0.006121,0.076028,0.055107,0.045781,0.006573,2.161776,1.683725
max,0.048403,0.206946,0.242392,1.407957,2.230644,22.279278,78.520897


In [4]:
# Line plot of every column across the coins, for a first look at the data
df_market_data.hvplot.line(width=800, height=400, rot=90)

In [5]:
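# StandardScaler only centres each column and scales it to unit variance; it does not
# require normally distributed inputs, but skew and outliers distort its result,
# so we check the shape of each column with a KDE first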

def plot_kdes(df, cols=2):
    """Return a HoloViews layout with one KDE plot per column of ``df``.

    Each column is drawn with ``hvplot.kde`` and labelled with its column
    name; the resulting plots are arranged in a grid ``cols`` columns wide.
    """
    kde_panels = []
    for name in df.columns:
        kde_panels.append(df[name].hvplot.kde(label=name))
    return hv.Layout(kde_panels).cols(cols)

# KDE of every column before any scaler is applied
plot_kdes(df_market_data)


---

### Prepare the Data

In [6]:
# Standardise every column with scikit-learn's `StandardScaler`
s_scaler = StandardScaler()
s_scaled = s_scaler.fit_transform(df_market_data)

# Wrap the scaled array in a DataFrame that reuses the original
# column names and the coin_id index
s_scaled_df = pd.DataFrame(
    data=s_scaled,
    index=df_market_data.index,
    columns=df_market_data.columns,
)

# Same line plot as for the unscaled data, now on the standardised values
s_scaled_df.hvplot.line(width=800, height=400, rot=90)

In [7]:
# KDE of every column after StandardScaler, to compare with the unscaled KDEs
plot_kdes(s_scaled_df)

In [8]:
# Display a random sample of the standard-scaled data.
# The seed is fixed so that Restart & Run All shows the same rows every time.

s_scaled_df.sample(10, random_state=0)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
neo,0.286546,-0.326301,-1.21267,-0.903134,0.29097,-0.181187,-0.17555
iota,0.259097,0.249508,-0.478953,-0.218997,-0.735815,-0.329179,-0.28531
ethlend,-4.981042,-0.045178,-1.206956,-1.212126,0.047736,4.63238,6.088625
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317
wrapped-bitcoin,0.515453,0.461843,0.769975,0.224045,-0.074674,-0.355054,-0.251623
litecoin,0.077497,0.334297,0.85852,-0.012646,-0.366477,-0.486266,-0.292351
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
usd-coin,0.034352,-0.733026,-0.02314,-0.065775,0.002925,-0.550599,-0.282232
crypto-com-chain,0.33128,-1.614844,-1.054521,-0.729931,-0.350155,-0.022866,-0.03457
zcash,-0.127467,0.929119,0.677532,0.223834,-0.437068,-0.265163,-0.214829


In [9]:
# Because of the skew in some columns and the big outlier in the 1y change,
# also scale the data with scikit-learn's `RobustScaler`
r_scaler = RobustScaler()
r_scaled = r_scaler.fit_transform(df_market_data)

# Wrap the scaled array in a DataFrame with the original columns and coin_id index
r_scaled_df = pd.DataFrame(
    data=r_scaled,
    index=df_market_data.index,
    columns=df_market_data.columns,
)

plot_kdes(r_scaled_df)

# the results below are much better aligned on zero

---

### Find the Best Value for k Using the Original Data.

In [10]:
# Candidate cluster counts to evaluate: k = 1 .. 10
ks = [*range(1, 11)]

ks

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [11]:
# Inertia of the fitted model for each candidate k, filled in by the loop at the end of this cell

inertias = []

# The loop at the end of this cell does the following for each value of k:
# 1. Create a KMeans model (via `km_wrapper`) using the loop counter for the n_clusters
# 2. Fit the model to the robust-scaled data, `r_scaled_df`
# 3. Append the model.inertia_ to the inertia list

def km_wrapper(df, k, random_state=0, n_init=20):
    """Fit a k-means model with ``k`` clusters to ``df`` and return it.

    Parameters
    ----------
    df : DataFrame or array-like
        Data to cluster; passed unchanged to ``KMeans.fit``.
    k : int
        Number of clusters (``n_clusters``).
    random_state : int, default 0
        Seed passed to ``KMeans`` so repeated runs give the same result.
    n_init : int, default 20
        Number of k-means++ initialisations passed to ``KMeans``.

    Returns
    -------
    KMeans
        The fitted estimator; callers read ``labels_`` and ``inertia_`` from it.
    """
    model = KMeans(n_clusters=k, random_state=random_state, init='k-means++', n_init=n_init)
    model.fit(df)
    return model

# Fit one model per candidate k on the robust-scaled data and record its inertia
for k in ks:
    model = km_wrapper(r_scaled_df,k)
    inertias.append(model.inertia_)


In [12]:
# Tabulate inertia against k as the data source for the elbow curve
elbow_df = pd.DataFrame(dict(k=ks, inertia=inertias))

elbow_df

Unnamed: 0,k,inertia
0,1,2906.777008
1,2,627.016576
2,3,279.997525
3,4,203.196406
4,5,158.824605
5,6,130.779495
6,7,112.029269
7,8,94.239542
8,9,80.869809
9,10,67.593185


In [13]:
# Elbow curve: inertia against k, with a tick at every candidate k,
# to visually identify the optimal value for k.
elbow_plot = elbow_df.hvplot.line(
    x='k',
    y='inertia',
    title='Elbow curve of k-means on cryptocurrencies',
    xticks=ks,
)

elbow_plot

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** Ambiguous between 2 and 3. Moving from 2 to 3 clusters still lowers inertia (about 627 to 280), but that gain is small compared with the drop from 1 to 2 clusters (about 2907 to 627).

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [14]:
# Fit K-Means for both candidate values of k (2 and 3) on each scaling
# of the data: StandardScaler (s_*) and RobustScaler (r_*).

s_model_k2 = km_wrapper(s_scaled_df,2)
s_model_k3 = km_wrapper(s_scaled_df,3)
r_model_k2 = km_wrapper(r_scaled_df,2)
r_model_k3 = km_wrapper(r_scaled_df,3)

models = [
    s_model_k2,
    s_model_k3,
    r_model_k2,
    r_model_k3
]

# Print the cluster label assigned to each coin by each model.
# A plain loop is used rather than a list comprehension so that the cell
# does not also echo a throwaway list of `None` return values.
for fitted_model in models:
    print(fitted_model.labels_)

# note that both k=3 models using different standardisations pick out two individual coins as their own cluster

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]


[None, None, None, None]

In [15]:
# Create a copy of the robust-scaled DataFrame, so that columns added to
# `coins_df` do not modify `r_scaled_df`
coins_df = r_scaled_df.copy()

In [16]:
# Attach the cluster labels from the robust-scaled k=3 model
coins_df['label'] = r_model_k3.labels_

# Show ten rows ordered by label (highest first) for the 60d and 1y columns
(
    coins_df[['price_change_percentage_60d', 'price_change_percentage_1y', 'label']]
    .sort_values('label', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,price_change_percentage_60d,price_change_percentage_1y,label
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
celsius-degree-token,8.680849,11.550113,2
ethlend,0.364526,46.333079,1
bitcoin,0.161591,-0.191552,0
dash,-0.703176,-0.429556,0
huobi-token,0.155755,-0.268627,0
nem,3.403148,0.780073,0
binance-usd,0.287533,-0.414087,0
iota,-1.015401,-0.438643,0
vechain,-1.358129,0.792875,0
zcash,-0.489271,0.078546,0


In [17]:
# Scatter the 24h change against the 7d change, one colour per K-Means
# label; `coin_id` is passed in `hover_cols` so that each point can be
# identified by its cryptocurrency.
cluster_plot = coins_df.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="label",
    hover_cols=['coin_id'],
)

cluster_plot

---

### Optimise Clusters with Principal Component Analysis.

In [18]:
# Create a PCA model instance that keeps three principal components (`n_components=3`).

pca = PCA(n_components=3)

In [19]:
# Use the PCA model with `fit_transform` to reduce the robust-scaled
# features to three principal components.
pca_result = pca.fit_transform(r_scaled_df)

# Store each component as its own column alongside the scaled features
coins_df['PCA_1'] = pca_result[:, 0]
coins_df['PCA_2'] = pca_result[:, 1]
coins_df['PCA_3'] = pca_result[:, 2]

# View the first five rows. The columns are selected by name rather than by
# position, so the preview stays the same if more columns are later added
# to `coins_df` and this cell is re-run.
coins_df[[
    'price_change_percentage_200d',
    'price_change_percentage_1y',
    'label',
    'PCA_1',
    'PCA_2',
    'PCA_3',
]].head(5)

Unnamed: 0_level_0,price_change_percentage_200d,price_change_percentage_1y,label,PCA_1,PCA_2,PCA_3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bitcoin,-0.001989,-0.191552,0,-2.18867,0.52051,-0.741547
ethereum,0.528843,0.192112,0,-1.564105,-0.146419,-0.543574
tether,-0.431379,-0.4148,0,-2.264268,-0.342005,0.56713
ripple,-0.228084,-0.513757,0,-2.313358,-0.687349,0.506279
bitcoin-cash,-0.319996,-0.328626,0,-2.750554,1.217552,-2.570144


In [20]:
# Retrieve the explained variance ratio to determine how much information
# can be attributed to each principal component.

print(pca.explained_variance_ratio_)

# Total explained variance ratio of the three components together
sum(pca.explained_variance_ratio_)

[0.854359   0.0944445  0.02027866]


0.9690821550730448

#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 96.9%

In [21]:
# Create a new DataFrame holding only the three principal components; it
# keeps the coin_id index of `coins_df`.
# The columns are selected by name: a positional "last three columns" slice
# would silently pick up other columns (e.g. a label column) if `coins_df`
# has gained columns by the time this cell is re-run.
coins_pca_df = coins_df[['PCA_1', 'PCA_2', 'PCA_3']]

# Display a random sample; the seed is fixed so the same rows show on every run
coins_pca_df.sample(5, random_state=0)


Unnamed: 0_level_0,PCA_1,PCA_2,PCA_3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
wrapped-bitcoin,-2.191919,0.506026,-0.749852
ftx-token,-1.427115,-0.27491,-0.722448
neo,-1.419327,-0.679032,-0.081215
stellar,-2.001164,-1.509676,0.54502
ripple,-2.313358,-0.687349,0.506279


---

### Find the Best Value for k Using the PCA Data

In [22]:
# Reuse the list of k-values from 1 to 10 created earlier
ks

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [23]:
# Create an empty list to store the inertia values
pca_inertias = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model (via `km_wrapper`) using the loop counter for the n_clusters
# 2. Fit the model to the PCA data, `coins_pca_df`
# 3. Append the model.inertia_ to the inertia list

for k in ks:
    model = km_wrapper(coins_pca_df,k)
    pca_inertias.append(model.inertia_)


In [24]:
# Add the PCA inertias as a new column of the existing elbow DataFrame,
# next to the inertias already stored in its `inertia` column

elbow_df['pca_inertia'] = pca_inertias

elbow_df

Unnamed: 0,k,inertia,pca_inertia
0,1,2906.777008,2816.905727
1,2,627.016576,537.501537
2,3,279.997525,191.4837
3,4,203.196406,122.136845
4,5,158.824605,80.206163
5,6,130.779495,62.234674
6,7,112.029269,48.428755
7,8,94.239542,39.706361
8,9,80.869809,30.034357
9,10,67.593185,23.559862


In [25]:
# Elbow curve for the PCA data: PCA inertia against k, with a tick at every
# candidate k, to visually identify the optimal value for k.
pca_elbow_plot = elbow_df.hvplot.line(
    x='k',
    y='pca_inertia',
    title='Elbow curve of k-means on cryptocurrencies (PCA)',
    xticks=ks,
)

pca_elbow_plot

#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** Similarly, the answer is ambiguously 2 or 3


&nbsp;

* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** No

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [26]:
# Fit the K-Means model (k=3) using the PCA data
p_model_k3 = km_wrapper(coins_pca_df,3)

# Rebuild the list instead of appending to it, so that re-running this
# cell does not add a duplicate entry each time.
models = [s_model_k2, s_model_k3, r_model_k2, r_model_k3, p_model_k3]

# Print the labels of every model; a plain loop avoids echoing a list of `None`
for fitted_model in models:
    print(fitted_model.labels_)

# (same clusters as before)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]


[None, None, None, None, None]

In [27]:
# Cluster labels of the k=3 model fitted on the PCA data

# Display the resulting array of cluster values.

p_model_k3.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0],
      dtype=int32)

In [28]:
# Add a new column to the DataFrame with the clusters predicted from the PCA data
coins_df['PCA_label'] = p_model_k3.labels_

# Display the components and their labels, highest label first. The columns
# are selected by name so the view does not depend on the column order of `coins_df`.
coins_df[['PCA_1', 'PCA_2', 'PCA_3', 'PCA_label']].sort_values('PCA_label', ascending=False).head(10)


Unnamed: 0_level_0,PCA_1,PCA_2,PCA_3,PCA_label
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
celsius-degree-token,11.064407,13.781783,0.478997,2
ethlend,47.001114,-3.766282,-0.720656,1
bitcoin,-2.18867,0.52051,-0.741547,0
dash,-2.180401,-1.491193,0.457111,0
huobi-token,-2.047348,-0.547904,0.516148,0
nem,-0.589249,1.530066,1.817678,0
binance-usd,-2.282306,-0.311453,0.495543,0
iota,-2.342942,-1.024056,-0.56262,0
vechain,-1.107466,-1.410692,-0.998343,0
zcash,-1.621563,-0.346552,-0.030286,0


In [29]:
# Scatter the first principal component (`PCA_1`) against the second
# (`PCA_2`), one colour per K-Means label found on the PCA data;
# `coin_id` is passed in `hover_cols` so that each point can be
# identified by its cryptocurrency.
pca_cluster_plot = coins_df.hvplot.scatter(
    x="PCA_1",
    y="PCA_2",
    by="PCA_label",
    hover_cols=['coin_id'],
)

pca_cluster_plot

### Visualise and Compare the Results

In this section, you will visually analyse the cluster analysis results by contrasting the outcome with and without using the optimisation techniques.

In [30]:
# Composite plot to contrast the Elbow curves: `*` overlays both curves on one set of axes

elbow_plot * pca_elbow_plot

In [31]:
# Composite plot to contrast the clusters: `+` places the two scatter plots side by side

cluster_plot + pca_cluster_plot


In [34]:
# Comparison plot choosing more salient features (1y and 30d change)
# for the non-PCA clusters, shown next to the PCA cluster plot
cluster_plot2 = coins_df.hvplot.scatter(
    x="price_change_percentage_1y",
    y="price_change_percentage_30d",
    by="label",
    hover_cols=['coin_id'],
)

cluster_plot2 + pca_cluster_plot

#### Answer the following question: 

  * **Question:** After visually analysing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** PCA has had negligible impact on differentially clustering the data, but has enabled the clusters to be better visualised in two dimensions. 

In [32]:
# Summary: the PCA component matrix as a table, one row per original
# feature and one column per principal component

loadings = pca.components_
loadings_df = pd.DataFrame(
    data=loadings,
    index=[f"PCA_{n}" for n in range(1, loadings.shape[0] + 1)],
    columns=df_market_data.columns,
)
print(loadings_df.T)

                                 PCA_1     PCA_2     PCA_3
price_change_percentage_24h  -0.213953  0.362819 -0.803476
price_change_percentage_7d   -0.005452  0.012304 -0.399710
price_change_percentage_14d  -0.012040  0.172649 -0.101926
price_change_percentage_30d   0.006551  0.620477  0.137769
price_change_percentage_60d   0.051517  0.604800  0.331660
price_change_percentage_200d  0.253330  0.294302  0.070736
price_change_percentage_1y    0.941900 -0.031856 -0.224250


The first principal component is largely a proxy for the annual (1y) change, while the second component mainly tracks the 30d and 60d movements.