In [240]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
import matplotlib.pyplot as plt
import holoviews as hv
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [241]:
# Read the crypto market CSV, using each coin's id as the row label
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Preview the first ten coins
df_market_data.head(10)

Unnamed: 0_level_0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
coin_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bitcoin,1.08388,7.60278,6.57509,7.67258,-3.25185,83.5184,37.51761
ethereum,0.22392,10.38134,4.80849,0.13169,-12.8889,186.77418,101.96023
tether,-0.21173,0.04935,0.0064,-0.04237,0.28037,-0.00542,0.01954
ripple,-0.37819,-0.60926,2.24984,0.23455,-17.55245,39.53888,-16.60193
bitcoin-cash,2.90585,17.09717,14.75334,15.74903,-13.71793,21.66042,14.49384
binancecoin,2.10423,12.85511,6.80688,0.05865,36.33486,155.61937,69.69195
chainlink,-0.23935,20.69459,9.30098,-11.21747,-43.69522,403.22917,325.13186
cardano,0.00322,13.99302,5.55476,10.10553,-22.84776,264.51418,156.09756
litecoin,-0.06341,6.60221,7.28931,1.21662,-17.2396,27.49919,-12.66408
bitcoin-cash-sv,0.9253,3.29641,-1.86656,2.88926,-24.87434,7.42562,93.73082


In [242]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for every
# price-change column; the wide spread between columns is why the data is scaled later
df_market_data.describe()

Unnamed: 0,price_change_percentage_24h,price_change_percentage_7d,price_change_percentage_14d,price_change_percentage_30d,price_change_percentage_60d,price_change_percentage_200d,price_change_percentage_1y
count,41.0,41.0,41.0,41.0,41.0,41.0,41.0
mean,-0.269686,4.497147,0.185787,1.545693,-0.094119,236.537432,347.667956
std,2.694793,6.375218,8.376939,26.344218,47.365803,435.225304,1247.842884
min,-13.52786,-6.09456,-18.1589,-34.70548,-44.82248,-0.3921,-17.56753
25%,-0.60897,0.04726,-5.02662,-10.43847,-25.90799,21.66042,0.40617
50%,-0.06341,3.29641,0.10974,-0.04237,-7.54455,83.9052,69.69195
75%,0.61209,7.60278,5.51074,4.57813,0.65726,216.17761,168.37251
max,4.84033,20.69459,24.23919,140.7957,223.06437,2227.92782,7852.0897


In [243]:
# Line plot of every price-change column, one line per column, indexed by coin.
# Tick labels are rotated 90 degrees so the coin ids stay readable.
df_market_data.hvplot.line(rot=90, height=400, width=800)

---

### Prepare the Data

In [244]:
# Standardize every feature with scikit-learn's StandardScaler:
# `fit` learns the per-column statistics from the CSV data ...
scaler = StandardScaler().fit(df_market_data)

# ... and `transform` applies them, returning the normalized values
scaled_data = scaler.transform(df_market_data)

In [245]:
# Sanity check: list the feature columns that were handed to the scaler
print(df_market_data.columns)

Index(['price_change_percentage_24h', 'price_change_percentage_7d',
       'price_change_percentage_14d', 'price_change_percentage_30d',
       'price_change_percentage_60d', 'price_change_percentage_200d',
       'price_change_percentage_1y'],
      dtype='object')


In [246]:
# Wrap the scaled array in a DataFrame that reuses the original column names
# and, for now, the coin_id row labels
df_scaled = pd.DataFrame(
    scaled_data,
    columns=df_market_data.columns,
    index=df_market_data.index,
)

# Show the scaled values while they are still labelled by coin_id
print(df_scaled.head())

# Replace the coin_id labels with the default integer index. Row order is
# unchanged, so row i still corresponds to row i of df_market_data.
df_scaled = df_scaled.reset_index(drop=True)

# Show the same rows again, now with the integer index
print(df_scaled.head())

              price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                 
bitcoin                          0.508529                    0.493193   
ethereum                         0.185446                    0.934445   
tether                           0.021774                   -0.706337   
ripple                          -0.040764                   -0.810928   
bitcoin-cash                     1.193036                    2.000959   

              price_change_percentage_14d  price_change_percentage_30d  \
coin_id                                                                  
bitcoin                          0.772200                     0.235460   
ethereum                         0.558692                    -0.054341   
tether                          -0.021680                    -0.061030   
ripple                           0.249458                    -0.050388   
bitcoin-cash                     1.760610   

---

### Find the Best Value for k Using the Original Data.

In [247]:
# Candidate cluster counts to evaluate: k = 1 through 11 inclusive.
# This is a lazy `range` rather than a materialised list.
k_values = range(1, 12)

In [248]:
# Inertia (within-cluster sum of squares) for each candidate k on the scaled data.
# A comprehension keeps the throwaway models local to this cell, so no
# notebook-level `kmeans` / `k` is left behind pointing at the last (k=11) fit.
# NOTE(review): `n_init` is left at the library default, which has changed
# between scikit-learn releases; pin it if the exact inertia values matter.
inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(df_scaled).inertia_
    for k in k_values
]




In [249]:
# Pair each candidate k with the inertia measured for it
elbow_data = {
    "Number of Clusters (k)": k_values,
    "Inertia": inertia_values,
}

# Tabulate the pairs; the trailing expression renders the table
elbow_df = pd.DataFrame(data=elbow_data)
elbow_df

Unnamed: 0,Number of Clusters (k),Inertia
0,1,287.0
1,2,195.820218
2,3,123.190482
3,4,79.022435
4,5,65.220364
5,6,52.829819
6,7,43.786435
7,8,37.52774
8,9,33.070926
9,10,28.989073


In [250]:
# Elbow curve for the original scaled data.
# `xticks` takes tick positions (an int or a list), not a column name, so the
# k values themselves are passed to get one tick per candidate cluster count.
k_elbow = elbow_df.hvplot.line(
    x="Number of Clusters (k)",
    y="Inertia",
    title="Original Elbow Curve",
    xticks=elbow_df["Number of Clusters (k)"].tolist(),
    color='blue',
)
k_elbow

In [251]:
# Save the plot to 'resources' folder as a standalone HTML file
# NOTE(review): the CSV is read from "Resources/" (capital R) while plots are
# written to "resources/"; on a case-sensitive filesystem these are different
# directories - confirm the target folder exists before running.
k_elbow_html_path = "resources/k_elbow.html"
hv.save(k_elbow, k_elbow_html_path)

#### Answer the following question: 

**Question:** What is the best value for `k`?

**Answer:** The best value for `k`, determined using the elbow method above, is 3. This was identified by analyzing the inertia curve, where the inertia decreases rapidly as the number of clusters increases up to around 3, after which the rate of decrease slows down. Therefore, 3 clusters are sufficient to capture the structure of the data without introducing unnecessary complexity.

---

### Cluster Cryptocurrencies with K-means Using the Original Data

In [252]:
#Initialize the K-means model with the best value for k
# NOTE(review): in the recorded inertia table the drop from k=3 to k=4 is still
# large (123.19 -> 79.02) and only flattens afterwards (79.02 -> 65.22), which
# points at an elbow of k=4 rather than 3 - confirm the intended value.
best_k = 3
kmeans = KMeans(n_clusters=best_k, random_state=42)

In [253]:
#Fit the K-means model (k = best_k) using the original scaled DataFrame
kmeans.fit(df_scaled)



In [254]:
#Predict the clusters to group the cryptocurrencies using the original scaled DataFrame
# (one integer label per row, in the same row order as df_scaled)
clusters = kmeans.predict(df_scaled)

#Print the resulting array of cluster values.
# NOTE(review): in the recorded run 39 of the 41 coins fall in cluster 0, with a
# single coin in each of clusters 1 and 2.
print("Array of cluster values:", clusters)

Array of cluster values: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1
 0 0 0 0]


In [255]:
# Build a copy of the original (unscaled) data carrying the predicted label
# for each coin in a new 'Cluster' column; labels are matched by row position
df_clusters = df_market_data.assign(Cluster=clusters)

# Show the first rows of the labelled data
print("Sample data with added cluster column:")
print(df_clusters.head())

Sample data with added cluster column:
              price_change_percentage_24h  price_change_percentage_7d  \
coin_id                                                                 
bitcoin                           1.08388                     7.60278   
ethereum                          0.22392                    10.38134   
tether                           -0.21173                     0.04935   
ripple                           -0.37819                    -0.60926   
bitcoin-cash                      2.90585                    17.09717   

              price_change_percentage_14d  price_change_percentage_30d  \
coin_id                                                                  
bitcoin                           6.57509                      7.67258   
ethereum                          4.80849                      0.13169   
tether                            0.00640                     -0.04237   
ripple                            2.24984                      0.23455   
bitco

In [256]:
# Scatter of 24h vs 7d price change, one colour per K-means cluster;
# hovering a point reveals the coin id
k_scatter = df_clusters.hvplot.scatter(
    title='Cryptocurrency Clusters (K-means)',
    x='price_change_percentage_24h',
    xlabel='Price Change Percentage (24h)',
    y='price_change_percentage_7d',
    ylabel='Price Change Percentage (7d)',
    by='Cluster',
    hover_cols=['coin_id'],
    height=400,
    width=800,
)
k_scatter

In [257]:
# Save the original-data cluster scatter plot to the 'resources' folder as HTML
output_path = 'resources/k_scatter.html'
hv.save(k_scatter, output_path)

---

### Optimize Clusters with Principal Component Analysis.

In [258]:
#Perform PCA on the original scaled DataFrame and reduce the features to three principal components
pca = PCA(n_components=3)
# fit_transform returns one row per coin with three component scores
pca_data = pca.fit_transform(df_scaled)
# Peek at the first five rows of the transformed array
pca_data[0:5]

array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])

In [259]:
# Share of the total variance captured by each principal component
explained_variance = pca.explained_variance_ratio_
print("Explained Variance of each Principal Component:")
print(explained_variance)

# Combined share captured by the three components together
total_variance = sum(explained_variance)
print("\nTotal Explained Variance of the three Principal Components:", total_variance)

# Build the PCA DataFrame labelled by coin_id. Passing the original index to
# the constructor already sets (and names) the index, so no extra
# "add coin_id column + set_index" round trip is needed.
df_pca = pd.DataFrame(pca_data, columns=['PC1', 'PC2', 'PC3'], index=df_market_data.index)

# Display
print("\nNew DataFrame with PCA data:")
print(df_pca.head())


Explained Variance of each Principal Component:
[0.3719856  0.34700813 0.17603793]

Total Explained Variance of the three Principal Components: 0.8950316570309841

New DataFrame with PCA data:
                   PC1       PC2       PC3
coin_id                                   
bitcoin      -0.600667  0.842760  0.461595
ethereum     -0.458261  0.458466  0.952877
tether       -0.433070 -0.168126 -0.641752
ripple       -0.471835 -0.222660 -0.479053
bitcoin-cash -1.157800  2.041209  1.859715


#### Answer the following question: 

**Question:** What is the total explained variance of the three principal components?

**Answer:** 89.50%

---

### Find the Best Value for k Using the PCA Data

In [260]:
# Candidate cluster counts for the PCA data: k = 1 through 11 inclusive
# (a lazy `range`, rebinding the same name used for the original-data search)
k_values = range(1, 12)

In [261]:
# Inertia for each candidate k on the three PCA components.
# The throwaway models stay local to the comprehension; a loop variable named
# `kmeans` here would overwrite the notebook-level model already fitted on the
# original scaled data with an 11-cluster model fitted on the PCA data.
# NOTE(review): `n_init` is left at the library default, which has changed
# between scikit-learn releases; pin it if the exact inertia values matter.
inertia_values = [
    KMeans(n_clusters=k, random_state=42).fit(df_pca).inertia_
    for k in k_values
]




In [262]:
# Pair each candidate k with the inertia measured on the PCA data
elbow_data_pca = {
    "Number of Clusters (k)": k_values,
    "Inertia": inertia_values,
}

# Tabulate the pairs; the trailing expression renders the table
elbow_df_pca = pd.DataFrame(data=elbow_data_pca)
elbow_df_pca


Unnamed: 0,Number of Clusters (k),Inertia
0,1,256.874086
1,2,168.811896
2,3,93.774626
3,4,49.665497
4,5,37.878747
5,6,27.618972
6,7,21.182776
7,8,17.091637
8,9,13.667065
9,10,10.559358


In [263]:
# Elbow curve for the PCA data.
# `xticks` takes tick positions (an int or a list), not a column name, so the
# k values themselves are passed to get one tick per candidate cluster count.
k_elbow_pca = elbow_df_pca.hvplot.line(
    x="Number of Clusters (k)",
    y="Inertia",
    title="PCA Elbow Curve",
    xticks=elbow_df_pca["Number of Clusters (k)"].tolist(),
    color='green',
)
k_elbow_pca

In [264]:
# Save the PCA elbow curve to the 'resources' folder as a standalone HTML file
k_elbow_pca_html_path = "resources/k_elbow_pca.html"
hv.save(k_elbow_pca, k_elbow_pca_html_path)


#### Answer the following questions: 

* **Question:** What is the best value for `k` when using the PCA data?

  * **Answer:** Based on the elbow curve above, the point where the inertia starts to decrease at a slower rate is `k = 3`. Therefore, I believe this is the best value for `k` when using the PCA data.


* **Question:** Does it differ from the best k value found using the original data?

  * **Answer:** The best value for both seems to be 3, so they don't differ. PCA simplifies the data without losing much information, resulting in similar clustering results.

### Cluster Cryptocurrencies with K-means Using the PCA Data

In [265]:
#Initialize the K-means model with the best value for k obtained from the PCA data
# NOTE(review): in the recorded PCA inertia table the drop from k=3 to k=4 is
# still large (93.77 -> 49.67) and only flattens afterwards (49.67 -> 37.88),
# which points at an elbow of k=4 rather than 3 - confirm the intended value.
best_k_pca = 3
kmeans_pca = KMeans(n_clusters=best_k_pca, random_state=42)


In [266]:
#Fit the K-means model (k = best_k_pca) using the PCA data
kmeans_pca.fit(df_pca)




In [267]:
#Predict the clusters to group the cryptocurrencies using the PCA data
# (one integer label per row, in the same row order as df_pca)
k_clusters_pca = kmeans_pca.predict(df_pca)

#Print the resulting array of cluster values.
print("Resulting Array of Cluster Values:", k_clusters_pca)

Resulting Array of Cluster Values: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 1
 0 0 0 0]


In [268]:
# Build a copy of the PCA DataFrame carrying the predicted label for each coin
# in a new 'Cluster' column; labels are matched by row position
df_clusters_pca = df_pca.assign(Cluster=k_clusters_pca)

# Show the first rows of the labelled PCA data
print("Sample Data with Predicted Clusters:")
print(df_clusters_pca.head())


Sample Data with Predicted Clusters:
                   PC1       PC2       PC3  Cluster
coin_id                                            
bitcoin      -0.600667  0.842760  0.461595        0
ethereum     -0.458261  0.458466  0.952877        0
tether       -0.433070 -0.168126 -0.641752        0
ripple       -0.471835 -0.222660 -0.479053        0
bitcoin-cash -1.157800  2.041209  1.859715        0


In [269]:
# Scatter of the first two principal components, one colour per K-means
# cluster; hovering a point reveals the coin id
k_scatter_pca = df_clusters_pca.hvplot.scatter(
    title='Cryptocurrency Clusters (K-means with PCA Data)',
    x='PC1',
    xlabel='Principal Component 1 (PC1)',
    y='PC2',
    ylabel='Principal Component 2 (PC2)',
    by='Cluster',
    hover_cols=['coin_id'],
    height=400,
    width=800,
)
k_scatter_pca

In [270]:
# Save the PCA cluster scatter plot to the 'resources' folder as HTML
# (rebinds `output_path`, which previously held the original-data scatter path)
output_path = 'resources/k_scatter_pca.html'
hv.save(k_scatter_pca, output_path)

### Visualize and Compare the Results

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [271]:

# Elbow curve for the original data.
# `xticks` takes tick positions (an int or a list), not a column name, so each
# curve is given the list of k values it plots.
elbow_original = elbow_df.hvplot.line(
    x="Number of Clusters (k)",
    y="Inertia",
    title="Original Elbow Curve",
    xticks=elbow_df["Number of Clusters (k)"].tolist(),
    color='blue',
)

# Elbow curve for the PCA data
elbow_pca = elbow_df_pca.hvplot.line(
    x="Number of Clusters (k)",
    y="Inertia",
    title="PCA Elbow Curve",
    xticks=elbow_df_pca["Number of Clusters (k)"].tolist(),
    color='green',
)

# Lay the two curves out side by side in a two-column layout
composite_plot_elbow = (elbow_original + elbow_pca).cols(2)

# Display
composite_plot_elbow

In [272]:
# Define the filename for the HTML file (inside the 'resources' folder)
composite_plot_elbow_html_path = "resources/composite_plot_elbow.html"

# Save the side-by-side elbow layout as an HTML file
hv.save(composite_plot_elbow, composite_plot_elbow_html_path)


In [273]:
# Cluster scatter for the original data (24h vs 7d price change),
# coloured per cluster from the Category10 cycle
scatter_original = df_clusters.hvplot.scatter(
    title='Original Data Clusters',
    x='price_change_percentage_24h',
    xlabel='price_change_percentage_24h',
    y='price_change_percentage_7d',
    ylabel='price_change_percentage_7d',
    by='Cluster',
    color=hv.Cycle('Category10'),
    hover_cols=['coin_id'],
    height=400,
    width=400,
)

# Cluster scatter for the PCA data (PC1 vs PC2), same colour cycle
scatter_pca = df_clusters_pca.hvplot.scatter(
    title='PCA Data Clusters',
    x='PC1',
    xlabel='Principal Component 1 (PC1)',
    y='PC2',
    ylabel='Principal Component 2 (PC2)',
    by='Cluster',
    color=hv.Cycle('Category10'),
    hover_cols=['coin_id'],
    height=400,
    width=400,
)

# Lay the two scatters out side by side in a two-column layout
composite_plot_scatter = (scatter_original + scatter_pca).cols(2)

# Display the composite plot
composite_plot_scatter


In [274]:
# Define the filename for the HTML file (inside the 'resources' folder)
composite_plot_scatter_html_path = "resources/composite_plot_scatter.html"

# Save the side-by-side scatter layout as an HTML file
hv.save(composite_plot_scatter, composite_plot_scatter_html_path)


#### Answer the following question: 

  * **Question:** After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  * **Answer:** Using fewer features simplifies data analysis, making it easier to understand and interpret clusters. While it saves resources and time, it may sacrifice some accuracy by overlooking important information. Thus, the choice involves finding a balance between simplicity and accuracy in clustering.