# Clustering Crypto

In [1]:
!pip install -U altair

# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
#import hvplot.pandas
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from pathlib import Path

import altair as alt

Requirement already up-to-date: altair in c:\users\tsanp\.conda\envs\mlenv\lib\site-packages (4.1.0)


### Fetching Cryptocurrency Data

In [2]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url)
content = response.json()

In [3]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
crypto_df = pd.DataFrame.from_dict(content["Data"], orient="index")
crypto_df.columns

Index(['Id', 'Url', 'ImageUrl', 'ContentCreatedOn', 'Name', 'Symbol',
       'CoinName', 'FullName', 'Description', 'AssetTokenStatus', 'Algorithm',
       'ProofType', 'SortOrder', 'Sponsored', 'Taxonomy', 'Rating',
       'IsTrading', 'TotalCoinsMined', 'BlockNumber', 'NetHashesPerSecond',
       'BlockReward', 'BlockTime', 'AssetLaunchDate', 'MaxSupply',
       'MktCapPenalty', 'PlatformType', 'BuiltOn', 'SmartContractAddress',
       'DecimalPoints', 'Difficulty', 'IsUsedInDefi'],
      dtype='object')

In [4]:
# Alternatively, use the provided csv file:
# Using csv file, because API call doesn't return "TotalCoinSupply".
#file_path = Path("Resources/crypto_data.csv")
# Create a DataFrame
#crypto_df = pd.read_csv(file_path).rename(columns={"Unnamed: 0":"ID"})
#crypto_df = crypto_df.set_index(crypto_df["ID"]).drop(columns="ID")
#crypto_df.head()

### Data Preprocessing

In [5]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
#drop_columns = crypto_df.columns.drop(['CoinName','Algorithm','IsTrading','ProofType','MaxSupply','TotalCoinSupply'])
crypto_df = crypto_df.rename(columns={"MaxSupply":"TotalCoinSupply"})
drop_columns = crypto_df.columns.drop(['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'])
crypto_df = crypto_df.drop(columns=drop_columns)
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 5772


Unnamed: 0,CoinName,Algorithm,ProofType,IsTrading,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,True,0.0,0.0
300,300 token,,,True,300.0,300.0
365,365Coin,X11,PoW/PoS,True,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,True,0.0,0.0
433,433 Token,,,False,,


In [6]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df.IsTrading==True]
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 4650


Unnamed: 0,CoinName,Algorithm,ProofType,IsTrading,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,True,0.0,0.0
300,300 token,,,True,300.0,300.0
365,365Coin,X11,PoW/PoS,True,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,True,0.0,0.0
611,SixEleven,SHA-256,PoW,True,0.0,0.0


In [7]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df.Algorithm!="N/A"]
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 1473


Unnamed: 0,CoinName,Algorithm,ProofType,IsTrading,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,True,0.0,0.0
365,365Coin,X11,PoW/PoS,True,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,True,0.0,0.0
611,SixEleven,SHA-256,PoW,True,0.0,0.0
808,808,SHA-256,PoW/PoS,True,0.0,0.0


In [8]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop(columns=["IsTrading"])
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 1473


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,0.0,0.0
365,365Coin,X11,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,0.0,0.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0


In [9]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna()
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 162


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,0.0,0.0
365,365Coin,X11,PoW/PoS,0.0,0.0
404,404Coin,Scrypt,PoW/PoS,0.0,0.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0


In [10]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df[crypto_df.TotalCoinsMined>0]
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 107


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
NVC,NovaCoin,Scrypt,PoW/PoS,3075844.0,-1.0
XCP,CounterParty,SHA-256,PoW,2615225.0,-1.0
NSR,NuShares,PoS,PoS,5849637000.0,0.0
MONA,MonaCoin,Scrypt,PoW,79962240.0,-1.0
TRI,Triangles Coin,X13,PoW/PoS,166109.0,0.0


In [11]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.ProofType!="N/A"]
print(f"Total Rows: {len(crypto_df):0.0f}")
crypto_df.head()

Total Rows: 95


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
NVC,NovaCoin,Scrypt,PoW/PoS,3075844.0,-1.0
XCP,CounterParty,SHA-256,PoW,2615225.0,-1.0
NSR,NuShares,PoS,PoS,5849637000.0,0.0
MONA,MonaCoin,Scrypt,PoW,79962240.0,-1.0
TRI,Triangles Coin,X13,PoW/PoS,166109.0,0.0


In [12]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coinName_df = crypto_df.CoinName
coinName_df.head()

NVC           NovaCoin
XCP       CounterParty
NSR           NuShares
MONA          MonaCoin
TRI     Triangles Coin
Name: CoinName, dtype: object

In [13]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns=["CoinName"])
crypto_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
NVC,Scrypt,PoW/PoS,3075844.0,-1.0
XCP,SHA-256,PoW,2615225.0,-1.0
NSR,PoS,PoS,5849637000.0,0.0
MONA,Scrypt,PoW,79962240.0,-1.0
TRI,X13,PoW/PoS,166109.0,0.0


In [14]:
# Create dummy variables for text features
X = pd.get_dummies(crypto_df, columns=["Algorithm", "ProofType"])
X.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_BEP2 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2b,Algorithm_C31,Algorithm_CryptoNight,Algorithm_CryptoNight-Heavy,Algorithm_CryptoNight-Lite,...,ProofType_PoS/LPoS,ProofType_PoSA,ProofType_PoW,ProofType_PoW/PoS,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_Proof of Authority,ProofType_SPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
NVC,3075844.0,-1.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
XCP,2615225.0,-1.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
NSR,5849637000.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MONA,79962240.0,-1.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
TRI,166109.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
# Standardize data
std_scaler = StandardScaler()
std_scaler.fit(X)
X_scaled = pd.DataFrame(std_scaler.transform(X))
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,52,53,54,55,56,57,58,59,60,61
0,-0.162711,-0.139858,-0.103142,-0.146647,-0.103142,-0.146647,-0.146647,-0.146647,-0.103142,-0.103142,...,-0.103142,-0.103142,-0.968904,1.877181,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142
1,-0.162715,-0.139858,-0.103142,-0.146647,-0.103142,-0.146647,-0.146647,-0.146647,-0.103142,-0.103142,...,-0.103142,-0.103142,1.032094,-0.532714,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142
2,-0.106183,-0.139858,-0.103142,-0.146647,-0.103142,-0.146647,-0.146647,-0.146647,-0.103142,-0.103142,...,-0.103142,-0.103142,-0.968904,-0.532714,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142
3,-0.161968,-0.139858,-0.103142,-0.146647,-0.103142,-0.146647,-0.146647,-0.146647,-0.103142,-0.103142,...,-0.103142,-0.103142,1.032094,-0.532714,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142
4,-0.162739,-0.139858,-0.103142,-0.146647,-0.103142,-0.146647,-0.146647,-0.146647,-0.103142,-0.103142,...,-0.103142,-0.103142,-0.968904,1.877181,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142,-0.103142


### Reducing Dimensions Using PCA

In [16]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

In [17]:
# Create a DataFrame with the principal components data
pcs_df = pd.DataFrame(data=X_pca, 
                      index=crypto_df.index, 
                      columns=["PC1", "PC2", "PC3"])
pcs_df.head()

Unnamed: 0,PC1,PC2,PC3
NVC,-0.380114,-1.16769,-1.779028
XCP,-0.473714,1.610627,0.037106
NSR,-0.170034,-1.06645,1.010253
MONA,-0.453239,0.904984,-0.54226
TRI,-0.357015,-1.983805,-2.300397


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [18]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
#df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")
alt.Chart(df_elbow).mark_line().encode(
    x = "k",
    y = "inertia").properties(title = "Elbow Curve")

Running K-Means with `k=4`

In [19]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predictions = model.predict(pcs_df)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
cluster_df = pd.concat([crypto_df, pcs_df, coinName_df], axis="columns", join="inner")
cluster_df["Class"] = model.labels_
cluster_df.head()


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,CoinName,Class
NVC,Scrypt,PoW/PoS,3075844.0,-1.0,-0.380114,-1.16769,-1.779028,NovaCoin,0
XCP,SHA-256,PoW,2615225.0,-1.0,-0.473714,1.610627,0.037106,CounterParty,1
NSR,PoS,PoS,5849637000.0,0.0,-0.170034,-1.06645,1.010253,NuShares,2
MONA,Scrypt,PoW,79962240.0,-1.0,-0.453239,0.904984,-0.54226,MonaCoin,1
TRI,X13,PoW/PoS,166109.0,0.0,-0.357015,-1.983805,-2.300397,Triangles Coin,0


### Visualizing Results

#### 3D-Scatter with Clusters

In [20]:
# Create a 3D-Scatter with the PCA data and the clusters
alt.Chart(cluster_df).mark_circle().encode(
    x = "PC1",
    y = "PC2",
    tooltip = ["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).properties(title = "PC1 PC2 Clusters").interactive()

#### Table of Tradable Cryptocurrencies

In [21]:
# Table with tradable cryptos
# ["CoinName", "Algorithm", "ProofType", "TotalCoinsMined", "TotalCoinSupply"]
base = alt.Chart(cluster_df).mark_text().encode(
    y=alt.Y('row_number:O',axis=None)
).transform_window(
    row_number='row_number()'
)

# Data Tables
coin_name = base.encode(text="CoinName").properties(title="CoinName", width = 100)
algo = base.encode(text="Algorithm").properties(title="Algorithm", width = 100)
proof = base.encode(text="ProofType").properties(title="ProofType", width = 100)
mined = base.encode(text="TotalCoinsMined").properties(title="TotalCoinsMined", width = 100)
supply = base.encode(text="TotalCoinSupply").properties(title="TotalCoinSupply", width = 100)
text = alt.hconcat(coin_name, algo, proof, mined, supply) # Combine data tables

text.display()

In [22]:
# Print the total number of tradable cryptocurrencies
print(f"There are total of {len(cluster_df)} tradable cryptocurrencies.")

There are total of 95 tradable cryptocurrencies.


#### Scatter Plot with Tradable Cryptocurrencies

In [23]:
# Scale data to create the scatter plot
#cluster_df["TotalCoinSupply"] = cluster_df.apply(lambda x: int(x["TotalCoinSupply"]))
#cluster_df = cluster_df[cluster_df.TotalCoinSupply != 0]
#type(cluster_df.iloc[0]["TotalCoinSupply"])
cluster_df.loc["BTT"]

Algorithm                TRC10
ProofType                 DPoS
TotalCoinsMined    9.89989e+11
TotalCoinSupply        9.9e+11
PC1                    16.5525
PC2                     1.1872
PC3                  -0.873798
CoinName            BitTorrent
Class                        3
Name: BTT, dtype: object

In [24]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(cluster_df).mark_circle().encode(
    x = "TotalCoinsMined",
    y = "TotalCoinSupply",
    tooltip = ["CoinName", "Algorithm", "TotalCoinsMined", "TotalCoinSupply"]
).interactive()