In [1]:
import os
# Limit OpenMP to one thread; this is set before scikit-learn is imported below.
# NOTE(review): presumably a workaround for scikit-learn's KMeans memory-leak
# warning on Windows/MKL — confirm it is still required.
os.environ['OMP_NUM_THREADS'] = '1'
import warnings
# Blanket filters: every UserWarning and FutureWarning raised after this point
# is hidden, including library deprecation notices.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the crypto market CSV, using the `coin_id` column as the row index
df_market_data = pd.read_csv(
    "Resources/crypto_market_data.csv",
    index_col="coin_id",
)

# Preview the first ten rows
df_market_data.head(n=10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each
# price-change column. The columns differ in spread by orders of magnitude
# (compare the `std` row), which is why the data is standardized before clustering.
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [4]:
# Line plot of every price-change column, one line per column, with the
# x-axis tick labels rotated by 90 degrees
df_market_data.hvplot.line(width=800, height=400, rot=90)

# Prepare the Data (Most Important Step)

In [5]:
# Standardize every price-change column with scikit-learn's StandardScaler:
# learn the per-column statistics first, then apply them to the same data
scaler = StandardScaler()
scaled_data = scaler.fit(df_market_data).transform(df_market_data)

# Wrap the scaled array in a DataFrame that reuses the original column names
df_market_data_scaled = pd.DataFrame(
    data=scaled_data,
    columns=df_market_data.columns,
)

# Preview the first five rows of the scaled data
df_market_data_scaled.head()


Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
0,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
1,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
2,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
3,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
4,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


In [6]:
# Rebuild the scaled DataFrame with the coin names of the original data as its
# index; the index keeps the name `coin_id` it has in `df_market_data`
df_market_data_scaled = pd.DataFrame(
    scaled_data,
    columns=df_market_data.columns,
    index=df_market_data.index,
)

# Preview the first five rows
df_market_data_scaled.head()


Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317


### Find the Best Value for k Using the Original Data

In [7]:
# Candidate cluster counts to evaluate: k = 1, 2, ..., 11
k_values = [k for k in range(1, 12)]  # the upper bound (12) is exclusive
k_values

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [8]:
# Inertia (within-cluster sum of squared distances) for every candidate k,
# computed on the scaled data and stored in the same order as `k_values`
inert_values = []

for k in k_values:
    # A fixed `random_state` makes the centroid initialisation, and therefore
    # the whole elbow curve, reproducible across kernel restarts
    kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1)

    # Fit the model to the scaled data in `df_market_data_scaled`
    kmeans_model.fit(df_market_data_scaled)

    # Record the inertia reached for this k
    inert_values.append(kmeans_model.inertia_)


In [9]:
# Pair each candidate k with the inertia measured for it on the scaled data
elbow_data = {"k_values": k_values, "inert_values": inert_values}

# Tabular form of the Elbow-curve data, one row per k
df_elbow_1 = pd.DataFrame(data=elbow_data)


In [10]:
# Elbow curve for the scaled data: inertia as a function of the number of clusters.
# The ticks come straight from the plotted k values so that every k, including
# the last one, gets a tick (range(1, len(df)) stopped one short).
df_elbow_1.hvplot.line(
    x='k_values',
    y='inert_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    xticks=df_elbow_1['k_values'].tolist(),
)



The elbow appears at k=3: beyond that point the inertia curve flattens out, so each additional cluster reduces inertia only marginally. This suggests that the best value for k in the k-means clustering algorithm for this dataset is 3. Choosing k=3 should provide a reasonable segmentation of the data into clusters without introducing unnecessary complexity or splitting the data into more groups than it supports.

### Cluster Cryptocurrencies with K-Means Using the Original Data

In [11]:
# Initialize the K-Means model using the best value for k (k=3).
# `n_init=10` matches the setting used for the elbow-curve models, and a fixed
# `random_state` makes the cluster labels reproducible across kernel restarts.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=1)

In [12]:
# Fit the K-Means model (n_clusters=3) on the scaled price-change data
kmeans_model.fit(df_market_data_scaled)


In [13]:
# Assign each cryptocurrency to a cluster using the fitted model; the result
# holds one integer label per row of `df_market_data_scaled`, in row order
cluster_labels = kmeans_model.predict(df_market_data_scaled)

# Print the resulting array of cluster values
print(cluster_labels)


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 2
 0 0 0 0]


In [14]:
# Work on a copy so that adding the cluster labels does not modify
# `df_market_data_scaled`, which is reused later as the input to PCA
df_copy = df_market_data_scaled.copy()


In [15]:
# Attach the predicted cluster label of each coin as a `Cluster` column
df_copy = df_copy.assign(Cluster=cluster_labels)

# Preview the first five rows
df_copy.head()


Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bitcoin,0.508529,0.493193,0.7722,0.23546,-0.067495,-0.355953,-0.251637,0
ethereum,0.185446,0.934445,0.558692,-0.054341,-0.273483,-0.115759,-0.199352,0
tether,0.021774,-0.706337,-0.02168,-0.06103,0.008005,-0.550247,-0.282061,0
ripple,-0.040764,-0.810928,0.249458,-0.050388,-0.373164,-0.458259,-0.295546,0
bitcoin-cash,1.193036,2.000959,1.76061,0.545842,-0.291203,-0.499848,-0.270317,0


In [16]:
# Scatter plot of the scaled 24h price change (x) against the scaled 7d price
# change (y). Points are grouped by their K-Means `Cluster` label and drawn
# with the `colors` palette; `coin_id` is added to the hover tooltip so each
# point can be traced back to its cryptocurrency.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # hex color codes for blue, orange, and green

df_copy.hvplot.scatter(
    x="price_change_percentage_24h", y="price_change_percentage_7d",
    by="Cluster",
    color=colors,
    hover_cols=["coin_id"],
)


### Optimize Clusters with Principal Component Analysis.

In [17]:
# PCA model that reduces the seven scaled price-change features to three
# principal components (`n_components=3`)
pca_model = PCA(n_components=3)


In [18]:
# Fit the PCA model on the scaled data and project it onto the three components
pca_data = pca_model.fit_transform(df_market_data_scaled)

# Tabular view of the projection, one column per principal component
df_pca = pd.DataFrame(
    data=pca_data,
    columns=['PC1', 'PC2', 'PC3'],
)

# Preview the first five rows
df_pca.head(n=5)


Unnamed: 0,PC1,PC2,PC3
0,-0.600667,0.84276,0.461595
1,-0.458261,0.458466,0.952877
2,-0.43307,-0.168126,-0.641752
3,-0.471835,-0.22266,-0.479053
4,-1.1578,2.041209,1.859715


In [19]:
# Share of the total variance captured by each principal component, and the
# combined share retained by all three components together
explained_variance = pca_model.explained_variance_ratio_
total_explained_variance = sum(explained_variance)

# Same total, expressed as a percentage
total_explained_variance_percent = 100 * total_explained_variance

print(f'Total Variance in percentage: {total_explained_variance_percent}%')
print(f'Total Variance in float: {total_explained_variance}')


Total Variance in percentage: 89.5031657030984%
Total Variance in float: 0.895031657030984


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

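**Answer:** Together, the three principal components explain about 89.5% of the total variance (89.5031657030984%, i.e. 0.895031657030984 as a fraction).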

In [20]:
# PCA data as a DataFrame indexed by the coin names of the original data;
# the index keeps the name `coin_id` it has in `df_market_data`
df_pca_data = pd.DataFrame(
    pca_data,
    columns=['PC1', 'PC2', 'PC3'],
    index=df_market_data.index,
)

# Preview the first five rows
df_pca_data.head()


Unnamed: 0_level_0,PC1,PC2,PC3
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin,-0.600667,0.84276,0.461595
ethereum,-0.458261,0.458466,0.952877
tether,-0.43307,-0.168126,-0.641752
ripple,-0.471835,-0.22266,-0.479053
bitcoin-cash,-1.1578,2.041209,1.859715


### Find the Best Value for k Using the PCA Data

In [21]:
# Candidate cluster counts to evaluate on the PCA data: k = 1, 2, ..., 11
k_values = [k for k in range(1, 12)]  # the upper bound (12) is exclusive
k_values

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

In [22]:
# Inertia (within-cluster sum of squared distances) for every candidate k,
# computed on the PCA data and stored in the same order as `k_values`
inert_values = []

for k in k_values:
    # A fixed `random_state` makes the centroid initialisation, and therefore
    # the whole elbow curve, reproducible across kernel restarts
    kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1)

    # Fit the model to the PCA data in `df_pca_data`
    kmeans_model.fit(df_pca_data)

    # Record the inertia reached for this k
    inert_values.append(kmeans_model.inertia_)


In [23]:
# Pair each candidate k with the inertia measured for it on the PCA data
elbow_data = {"k_values": k_values, "inert_values": inert_values}

# Tabular form of the Elbow-curve data, one row per k
df_elbow_2 = pd.DataFrame(data=elbow_data)


In [24]:
# Elbow curve for the PCA data: inertia as a function of the number of clusters.
# The ticks come straight from the plotted k values so that every k, including
# the last one, gets a tick (range(1, len(df)) stopped one short).
df_elbow_2.hvplot.line(
    x='k_values',
    y='inert_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
    title='Elbow Curve',
    xticks=df_elbow_2['k_values'].tolist(),
)


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** There is a slight bend in the curve around k=4. After that point, the rate of decrease in inertia (the within-cluster sum of squares, WCSS) slows down, so increasing the number of clusters beyond 4 yields diminishing returns. Therefore, k=4 can be considered the best value for k when using the PCA data.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** Yes, it differs. The best k value suggested by the Elbow Curve using the PCA data appears to be k=4, while the best k value identified using the original data was k=3. This discrepancy can occur because PCA reduces dimensionality by combining features, which can alter the distribution of data points and potentially reveal different clustering structures. As a result, the optimal number of clusters (k) might change when using PCA-transformed data compared to the original data.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [25]:
# Initialize the K-Means model using the best value for k on the PCA data (k=4).
# `n_init=10` matches the setting used for the elbow-curve models, and a fixed
# `random_state` makes the cluster labels reproducible across kernel restarts.
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=1)


In [26]:
# Fit the K-Means model (n_clusters=4) on the three principal components
kmeans_model.fit(df_pca_data)

In [27]:
# Assign each cryptocurrency to a cluster using the model fitted on the PCA
# data; the result holds one integer label per row of `df_pca_data`, in row order

cluster_labels_pca = kmeans_model.predict(df_pca_data)

# Print the resulting array of cluster values
print(cluster_labels_pca)


[2 2 1 1 2 2 2 2 2 1 1 1 1 2 1 2 1 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 3 2 1 1 0
 1 1 1 1]


In [28]:
# New DataFrame holding the PCA data plus the predicted cluster of each coin
# in a `Cluster` column; `df_pca_data` itself is left unchanged
df_pca_copy = df_pca_data.assign(Cluster=cluster_labels_pca)

# Preview the first five rows
df_pca_copy.head()


Unnamed: 0_level_0,PC1,PC2,PC3,Cluster
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bitcoin,-0.600667,0.84276,0.461595,2
ethereum,-0.458261,0.458466,0.952877,2
tether,-0.43307,-0.168126,-0.641752,1
ripple,-0.471835,-0.22266,-0.479053,1
bitcoin-cash,-1.1578,2.041209,1.859715,2


In [29]:
# Scatter plot of the first principal component (x) against the second (y).
# Points are grouped by their K-Means `Cluster` label, and `coin_id` is added
# to the hover tooltip so each point can be traced back to its cryptocurrency.
df_pca_copy.hvplot.scatter(
    x="PC1", y="PC2",
    by="Cluster",
    hover_cols=["coin_id"],
)


### Visualize and Compare the Results

In this section, we will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [34]:
import holoviews as hv

# Activate the Bokeh plotting backend for HoloViews
hv.extension('bokeh')

# Axis settings shared by both elbow curves
elbow_axes = dict(
    x='k_values',
    y='inert_values',
    xlabel='Number of Clusters (k)',
    ylabel='Inertia',
)

# Elbow curve 1: inertia values computed on the scaled original data,
# with one x tick per row of the elbow table
elbow_curve1 = df_elbow_1.hvplot.line(
    title='Elbow Curve for Original Data', **elbow_axes
).opts(xticks=list(range(1, len(df_elbow_1) + 1)))

# Elbow curve 2: inertia values computed on the PCA data
elbow_curve2 = df_elbow_2.hvplot.line(
    title='Elbow Curve for PCA Data', **elbow_axes
).opts(xticks=list(range(1, len(df_elbow_2) + 1)))

# Lay the two curves out in a single column (one above the other)
composite_plot = (elbow_curve1 + elbow_curve2).cols(1)

# Display the composite plot
composite_plot


In [35]:
# Palette shared by both scatter plots: blue, orange, green, and red (hex codes)
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

# Clusters found on the scaled original data, shown in the 24h / 7d plane
scatter_plot1 = df_copy.hvplot.scatter(
    x="price_change_percentage_24h", y="price_change_percentage_7d",
    by="Cluster", color=colors, hover_cols=["coin_id"],
    title="Original Data: 24h vs 7d Price Change",
)

# Clusters found on the PCA data, shown in the PC1 / PC2 plane
scatter_plot2 = df_pca_copy.hvplot.scatter(
    x="PC1", y="PC2",
    by="Cluster", color=colors, hover_cols=["coin_id"],
    title="PCA Data: PC1 vs PC2",
)

# Place the two scatter plots next to each other in a two-column layout
composite_plot = (scatter_plot1 + scatter_plot2).cols(2)

# Display the composite plot
composite_plot


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** The impact of using fewer features to cluster the data using K-Means, as a result of dimensionality reduction through PCA, can vary depending on the specific characteristics of the dataset. 

  - Simplifying Visualization: By focusing on just two or three main aspects, we can more easily see and understand patterns in the data when looking at it in a simple two-dimensional or three-dimensional view.

  - Getting Rid of Unnecessary Details: PCA can help remove background noise from the data by concentrating on the most significant aspects, which might make patterns clearer that were previously hidden by less important details.

  - Improving Groupings: The way we group data can get better or worse with PCA. It can improve by removing distractions, or it might miss crucial details that help distinguish between groups, making the groupings less accurate.

  - Seeing Data in a New Way: After using PCA, we might see the data's structure differently, possibly suggesting we need to group the data in a new way, as indicated by changes in the Elbow curves.

  - Speeding Up Analysis: Analyzing data with fewer details can be faster, especially with big datasets, making the process less complicated.

  - Understanding the Groups: Although PCA makes it easier to see data in fewer dimensions, the new main aspects are mixes of the original details and might not be as easy to explain. This can make it harder to understand what each group specifically represents based on the original data.
  