 # Clustering Crypto

 ## Installing External Libraries

In [1]:
# Install the altair plotting library: https://altair-viz.github.io/
!pip install -U altair

You should consider upgrading via the '/Users/cyb/miniconda3/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# Initial imports
import requests
import pandas as pd
import altair as alt
from pathlib import Path
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [3]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"
response = requests.get(url).json()

In [4]:
# Create a DataFrame
crypto_df = pd.DataFrame(response['Data']).T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,AlgorithmType,Difficulty,BuiltOn,SmartContractAddress,DecimalPoints
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,42.0,0.0,0.0,0.0,blockchain,scrypt,0.504232,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,,,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,-1.0,0.0,0.0,0.0,blockchain,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [5]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
# crypto_df = pd.read_csv(file_path, index_col=0)
# crypto_df.head(10)

 ### Data Preprocessing

In [6]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df=crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','MaxSupply']]
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
433,433 Token,,False,,,
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0


In [7]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df["IsTrading"] == True]
print(crypto_df.shape)
crypto_df.head(10)

(6922, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
300,300 token,,True,,300.0,300.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
888,Octocoin,,True,PoW,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,True,PoW/PoS,,


In [8]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df["Algorithm"] != "N/A"]
print(crypto_df.shape)
crypto_df.head(10)

(1644, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.999952,42.0
365,365Coin,X11,True,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,True,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,True,PoW,0.0,0.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,0.0,0.0
2015,2015 coin,X11,True,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,True,PoW/PoS,,
XPY,PayCoin,SHA-256,True,PoS,,
PRC,ProsperCoin,Scrypt,True,PoW,,


In [9]:
# Remove the "IsTrading" column
crypto_df.drop("IsTrading", axis=1, inplace=True)
print(crypto_df.shape)
crypto_df.head(10)

(1644, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42.0
365,365Coin,X11,PoW/PoS,0.0,-1.0
404,404Coin,Scrypt,PoW/PoS,0.0,-1.0
611,SixEleven,SHA-256,PoW,0.0,0.0
808,808,SHA-256,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,PoW/PoS,0.0,0.0
2015,2015 coin,X11,PoW/PoS,0.0,0.0
XBS,Bitstake,X11,PoW/PoS,,
XPY,PayCoin,SHA-256,PoS,,
PRC,ProsperCoin,Scrypt,PoW,,


In [10]:
# Remove rows with at least 1 null value
crypto_df = crypto_df.dropna(axis=0, how="any")
print(crypto_df.shape)
crypto_df.head(10)

(710, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
365,365Coin,X11,PoW/PoS,0.0,-1
404,404Coin,Scrypt,PoW/PoS,0.0,-1
611,SixEleven,SHA-256,PoW,0.0,0
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,0.0,0
2015,2015 coin,X11,PoW/PoS,0.0,0
XPD,PetroDollar,SHA-256D,,0.0,-1
XMY,MyriadCoin,Multiple,PoW,0.0,2000000000
SXC,SexCoin,Scrypt,PoW,0.0,0


In [11]:
# Remove rows with cryptocurrencies withouhaving no coins mined
crypto_df = crypto_df[crypto_df["TotalCoinsMined"] > 0]
print(crypto_df.shape)
crypto_df.head(10)

(312, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6178782525.8373,0
TRI,Triangles Coin,X13,PoW/PoS,199294.064798,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
PURA,Pura,X11,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
FOIN,Foin,SHA-256,,92631000.8161,100000000
NVL,Nevula,NEP-5,,40000000000.0,40000000000


In [12]:
# Drop rows where there are 'N/A' text values
crypto_df = crypto_df[crypto_df.iloc[:] != 'N/A'].dropna()
crypto_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,42 Coin,Scrypt,PoW/PoS,41.999952,42
NSR,NuShares,PoS,PoS,6178782525.8373,0
TRI,Triangles Coin,X13,PoW/PoS,199294.064798,0
CMTC,CometCoin,Scrypt,PoW,872830.0,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,-1
PURA,Pura,X11,PoW,188358976.839698,-1
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
VEIL,VEIL,X16RT,PoW/PoS,119516479.714871,300000000
RVC,Ravencoin Classic,X16R,PoW,10501536386.860544,21000000000


In [13]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = pd.DataFrame(crypto_df["CoinName"], index=crypto_df.index)
print(coins_name.shape)
coins_name.head()

(140, 1)


Unnamed: 0,CoinName
42,42 Coin
NSR,NuShares
TRI,Triangles Coin
CMTC,CometCoin
CHAT,OpenChat


In [14]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop("CoinName", axis=1)
print(crypto_df.shape)
crypto_df.head(10)

(140, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply
42,Scrypt,PoW/PoS,41.999952,42
NSR,PoS,PoS,6178782525.8373,0
TRI,X13,PoW/PoS,199294.064798,0
CMTC,Scrypt,PoW,872830.0,0
CHAT,Scrypt,PoW/PoS,1000000000.0,-1
PURA,X11,PoW,188358976.839698,-1
ADK,IMesh,PoW,25000000.0,0
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000
VEIL,X16RT,PoW/PoS,119516479.714871,300000000
RVC,X16R,PoW,10501536386.860544,21000000000


In [15]:
# Create dummy variables for text features
X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
print(X.shape)
X.head(10)

(140, 83)


Unnamed: 0,TotalCoinsMined,MaxSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-20 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,Algorithm_Blake2b,Algorithm_C31,...,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_ProgPoW/PoS,ProofType_Proof of Authority,ProofType_Proof-of-Work,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
42,41.999952,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NSR,6178782525.8373,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,199294.064798,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CMTC,872830.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHAT,1000000000.0,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PURA,188358976.839698,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ADK,25000000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DAPS,62319462900.0,70000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VEIL,119516479.714871,300000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RVC,10501536386.860544,21000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Standardize data
X = StandardScaler().fit_transform(X)
X[:5]

array([[-0.08660438, -0.09087225, -0.08481889, -0.08481889, -0.08481889,
        -0.12038585, -0.08481889, -0.08481889, -0.12038585, -0.12038585,
        -0.14797909, -0.08481889, -0.08481889, -0.08481889, -0.24618298,
        -0.12038585, -0.08481889, -0.08481889, -0.08481889, -0.29201253,
        -0.08481889, -0.08481889, -0.24618298, -0.08481889, -0.08481889,
        -0.12038585, -0.08481889, -0.08481889, -0.08481889, -0.08481889,
        -0.08481889, -0.08481889, -0.14797909, -0.08481889, -0.08481889,
        -0.12038585, -0.19245009, -0.08481889, -0.08481889, -0.14797909,
        -0.12038585, -0.29201253, -0.12038585, -0.08481889, -0.08481889,
        -0.08481889,  2.19848433, -0.08481889, -0.08481889, -0.08481889,
        -0.08481889, -0.08481889, -0.21160368, -0.08481889, -0.19245009,
        -0.12038585, -0.08481889, -0.08481889, -0.08481889, -0.08481889,
        -0.08481889, -0.26211122, -0.08481889, -0.08481889, -0.12038585,
        -0.12038585, -0.08481889, -0.31994094, -0.0

 ### Reducing Dimensions Using PCA

In [17]:
# Use PCA to reduce dimension to 3 principal components
n_comp = 3
pca = PCA(n_components=n_comp)
principal_components = pca.fit_transform(X)
principal_components

array([[ 2.22848785e-01, -1.32606183e+00, -1.34365811e+00],
       [ 6.96262339e-01, -1.16929985e+00, -3.12068027e-01],
       [ 6.54791911e-01, -1.97456592e+00, -1.62691027e+00],
       [-8.53522827e-01,  4.47471463e-01, -3.64626266e-01],
       [ 2.22853570e-01, -1.32605797e+00, -1.34365834e+00],
       [-5.55574962e-01,  1.16552388e-01, -3.38081156e-01],
       [-9.29555832e-01,  8.89836627e-01,  2.89141460e-01],
       [ 8.61758472e-01, -1.95192307e+00,  6.34026044e+00],
       [ 6.19078495e-01, -1.98033461e+00, -1.71363992e+00],
       [-1.21994362e+00,  1.24793343e+00,  2.26503417e-01],
       [ 6.19098671e-01, -1.98036517e+00, -1.71364551e+00],
       [-1.29969826e+00,  1.33905475e+00,  1.71861716e-01],
       [ 8.83591721e-01, -1.41029875e+00, -4.01970685e-01],
       [-1.01092116e+00,  9.05266812e-01,  8.33258045e-02],
       [ 6.25508531e-01, -2.00421431e+00, -1.73851162e+00],
       [-1.26320025e+00,  1.28048827e+00,  1.62596984e-01],
       [-1.10355713e+00,  1.04594313e+00

In [18]:
# Create a DataFrame with the principal components data
col_names = [f"PC {i}" for i in range(1, n_comp + 1)]
pcs_df = pd.DataFrame(principal_components, columns=col_names, index=crypto_df.index)
print(pcs_df.shape)
pcs_df.head(10)

(140, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,0.222849,-1.326062,-1.343658
NSR,0.696262,-1.1693,-0.312068
TRI,0.654792,-1.974566,-1.62691
CMTC,-0.853523,0.447471,-0.364626
CHAT,0.222854,-1.326058,-1.343658
PURA,-0.555575,0.116552,-0.338081
ADK,-0.929556,0.889837,0.289141
DAPS,0.861758,-1.951923,6.34026
VEIL,0.619078,-1.980335,-1.71364
RVC,-1.219944,1.247933,0.226503


 ### Clustering Crytocurrencies Using K-Means

 #### Finde the Best Value for `k` Using the Elbow Curve

In [19]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
alt.Chart(df_elbow).mark_line().encode(x="k", y="inertia")


 Running K-Means with `k=4`

In [20]:
# Initialize the K-Means model
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
clustered_df = pd.concat([crypto_df, pcs_df], axis=1, sort=False)
clustered_df["CoinName"] = coins_name["CoinName"]
clustered_df["Class"] = model.labels_
print(clustered_df.shape)
clustered_df.head(10)


(140, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,MaxSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.999952,42,0.222849,-1.326062,-1.343658,42 Coin,0
NSR,PoS,PoS,6178782525.8373,0,0.696262,-1.1693,-0.312068,NuShares,0
TRI,X13,PoW/PoS,199294.064798,0,0.654792,-1.974566,-1.62691,Triangles Coin,0
CMTC,Scrypt,PoW,872830.0,0,-0.853523,0.447471,-0.364626,CometCoin,2
CHAT,Scrypt,PoW/PoS,1000000000.0,-1,0.222854,-1.326058,-1.343658,OpenChat,0
PURA,X11,PoW,188358976.839698,-1,-0.555575,0.116552,-0.338081,Pura,2
ADK,IMesh,PoW,25000000.0,0,-0.929556,0.889837,0.289141,Aidos Kuneen,2
DAPS,Dagger,PoW/PoS/PoA,62319462900.0,70000000000,0.861758,-1.951923,6.34026,DAPS Coin,3
VEIL,X16RT,PoW/PoS,119516479.714871,300000000,0.619078,-1.980335,-1.71364,VEIL,0
RVC,X16R,PoW,10501536386.860544,21000000000,-1.219944,1.247933,0.226503,Ravencoin Classic,2


 ### Visualizing Results

 #### Scatter Plot for Clusters

In [21]:
# Scatter plot to visualize clusters using two principal components
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x="PC 1",
    y="PC 2",
    color=alt.Color(
        "Class",
        scale=alt.Scale(domain=[0, 1, 2, 3], range=["red", "green", "blue", "orange"]),
    ),
    tooltip=["CoinName", "Algorithm", "TotalCoinsMined", "MaxSupply"],
).interactive()


 #### Scatter Plot with Tradable Cryptocurrencies

In [22]:
# Scale data to create the scatter plot
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(
    clustered_df[["MaxSupply", "TotalCoinsMined"]]
)
plot_df = pd.DataFrame(
    plot_data, columns=["MaxSupply", "TotalCoinsMined"], index=clustered_df.index
)
plot_df["CoinName"] = clustered_df["CoinName"]
plot_df["Class"] = clustered_df["Class"]
plot_df.head()



Unnamed: 0,MaxSupply,TotalCoinsMined,CoinName,Class
42,2.047619e-12,0.0,42 Coin,0
NSR,4.761905e-14,6.241194e-06,NuShares,0
TRI,4.761905e-14,2.012647e-10,Triangles Coin,0
CMTC,4.761905e-14,8.81604e-10,CometCoin,2
CHAT,0.0,1.010101e-06,OpenChat,0


In [23]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
alt.Chart(plot_df).mark_circle(size=60).encode(
    x="TotalCoinsMined",
    y="MaxSupply",
    color=alt.Color(
        "Class",
        scale=alt.Scale(domain=[0, 1, 2, 3], range=["red", "green", "blue", "orange"]),
    ),
    tooltip=["CoinName", "TotalCoinsMined", "MaxSupply"],
).interactive()


 #### Table of Tradable Cryptocurrencies

In [24]:
# Table with tradable cryptos
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    display(
        clustered_df[
            [
                "CoinName",
                "Algorithm",
                "ProofType",
                "MaxSupply",
                "TotalCoinsMined",
                "Class",
            ]
        ]
    )



Unnamed: 0,CoinName,Algorithm,ProofType,MaxSupply,TotalCoinsMined,Class
42,42 Coin,Scrypt,PoW/PoS,42.0,41.999952,0
NSR,NuShares,PoS,PoS,0.0,6178782525.8373,0
TRI,Triangles Coin,X13,PoW/PoS,0.0,199294.064798,0
CMTC,CometCoin,Scrypt,PoW,0.0,872830.0,2
CHAT,OpenChat,Scrypt,PoW/PoS,-1.0,1000000000.0,0
PURA,Pura,X11,PoW,-1.0,188358976.839698,2
ADK,Aidos Kuneen,IMesh,PoW,0.0,25000000.0,2
DAPS,DAPS Coin,Dagger,PoW/PoS/PoA,70000000000.0,62319462900.0,3
VEIL,VEIL,X16RT,PoW/PoS,300000000.0,119516479.714871,0
RVC,Ravencoin Classic,X16R,PoW,21000000000.0,10501536386.860544,2
