# Clustering Crypto

In [4]:
# Initial imports
import requests
import pandas as pd
import matplotlib.pyplot as plt
#import hvplot.pandas
#import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [5]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [6]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.
response_data = requests.get(url)

In [7]:
data = response_data.json()

In [8]:
df = pd.DataFrame(data['Data']).transpose()

In [9]:
df.reset_index(inplace = True)
df

Unnamed: 0,index,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Algorithm,...,SortOrder,Sponsored,Taxonomy,Rating,IsTrading,TotalCoinsMined,BlockNumber,NetHashesPerSecond,BlockReward,BlockTime
0,42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Scrypt,...,34,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,42,194639,0,0,0
1,300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),,...,2212,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,300,0,0,0,0
2,365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),X11,...,916,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,,,,,
3,404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),Scrypt,...,602,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,1.12364e+06,42082,0,11.087,60
4,433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),,...,3505,False,"{'Access': '', 'FCA': '', 'FINMA': '', 'Indust...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,1.12423e+08,10776236,221634832754715,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5228,DOT,935731,/coins/dot/overview,/media/37072130/dot.png,1593791455,DOT,DOT,Polkadot,Polkadot (DOT),,...,5535,False,"{'Access': 'Permissioned', 'FCA': 'Utility', '...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",True,9.8745e+08,0,0,0,0
5229,VIDYA,936339,/coins/vidya/overview,/media/37305405/vidya.png,1598974844,VIDYA,VIDYA,Vidya,Vidya (VIDYA),,...,5724,False,"{'Access': 'Permissionless', 'FCA': 'Utility',...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,0,0,0,4,0
5230,QQQF,936342,/coins/qqqf/overview,/media/37305406/qqqf.png,1598975290,QQQF,QQQF,Standard Crypto Fund,Standard Crypto Fund (QQQF),,...,5726,False,"{'Access': 'Permissionless', 'FCA': 'Utility',...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,50000000,0,0,0,0
5231,LIBFX,936344,/coins/libfx/overview,/media/37305407/libfx.png,1598975577,LIBFX,LIBFX,Libfx,Libfx (LIBFX),,...,5727,False,"{'Access': 'Permissionless', 'FCA': 'Utility',...","{'Weiss': {'Rating': '', 'TechnologyAdoptionRa...",False,35000000,0,0,0,0


In [10]:
# Alternatively, use the provided csv file:
from pathlib import Path
file_path = Path("Resources/crypto_data.csv")

# Create a DataFrame
crypto_df = pd.read_csv(file_path)
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


### Data Preprocessing

In [11]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_df = crypto_df[['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply']]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1247,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [12]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df[crypto_df['IsTrading']]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [13]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df[crypto_df['Algorithm'] != 'N/A']
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [14]:
# Remove the "IsTrading" column
crypto_df = crypto_df.drop(['IsTrading'], axis = 1)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [15]:
# Remove rows with at least 1 null value
crypto_df.dropna(inplace = True)
crypto_df.isnull().sum()

CoinName           0
Algorithm          0
ProofType          0
TotalCoinsMined    0
TotalCoinSupply    0
dtype: int64

In [16]:
crypto_df.dtypes

CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [17]:
# Remove rows with cryptocurrencies having no coins mined
#coins_df['TotalCoinsMined'] = coins_df['TotalCoinsMined'].astype(float)
crypto_df.drop(crypto_df[crypto_df['TotalCoinsMined'] == 0].index, 
              inplace = True)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [18]:
# Drop rows where there are 'N/A' text values
crypto_df.drop(crypto_df[crypto_df.values == 'N/A'].index, inplace = True)
crypto_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
7,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
8,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
1238,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
1242,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [19]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
coins_name = crypto_df['CoinName'].to_frame()
coins_name

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [20]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df.drop('CoinName', axis = 1, inplace = True)
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [21]:
# Create dummy variables for text features
X = pd.get_dummies(crypto_df, 
                columns = ['Algorithm', 'ProofType'])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,4.199995e+01,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,2.000000e+09,2000000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1242,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1245,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1246,7.296538e+06,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Standardize data
coins_scaled = StandardScaler().fit_transform(X)
coins_scaled

array([[-0.11674788, -0.15286468, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.09358885, -0.14499604, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [ 0.52587231,  4.4937636 , -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       ...,
       [-0.09523411, -0.13215444, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11658774, -0.15255408, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ],
       [-0.11674507, -0.15284989, -0.0433555 , ..., -0.0433555 ,
        -0.0433555 , -0.0433555 ]])

### Reducing Dimensions Using PCA

In [23]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components = 3)
pca

PCA(n_components=3)

In [24]:
# Create a DataFrame with the principal components data
coins_pca = pca.fit_transform(coins_scaled)
coins_pca

array([[-0.32285945,  1.03346837, -0.56987899],
       [-0.30623388,  1.03376921, -0.57033169],
       [ 2.29618078,  1.64126589, -0.68384559],
       ...,
       [ 0.3216167 , -2.32075714,  0.42592452],
       [-0.19861539, -1.98155194,  0.35226124],
       [-0.27650507,  0.78154898, -0.28154078]])

In [25]:
pcs_df = pd.DataFrame(
                data = coins_pca,
                columns = ['PC 1',
                           'PC 2',
                           'PC 3'],
                index = crypto_df.index.to_list())
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
0,-0.322859,1.033468,-0.569879
2,-0.306234,1.033769,-0.570332
5,2.296181,1.641266,-0.683846
7,-0.151044,-1.309224,0.220048
8,-0.139112,-2.090298,0.362069
...,...,...,...
1238,2.509894,0.623410,0.274921
1242,-0.320910,1.033374,-0.569912
1245,0.321617,-2.320757,0.425925
1246,-0.198615,-1.981552,0.352261


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [26]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters = i, random_state = 0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df

Unnamed: 0,k,inertia
0,1,3644.560778
1,2,2481.295141
2,3,1509.371912
3,4,515.128114
4,5,314.663636
5,6,251.027274
6,7,212.80132
7,8,172.301384
8,9,143.115811
9,10,114.906243


In [27]:
#!pip install -U altair
import altair as alt

In [28]:
alt.Chart(elbow_df).mark_line().encode(
    x='k',
    y='inertia',
).properties(title='Elbow Curve')

Running K-Means with `k=<your best value for k here>`

In [29]:
# Initialize the K-Means model
model = KMeans(n_clusters = 4, random_state = 0)
# Fit the model
model.fit(pcs_df)
# Predict clusters
predications = model.predict(pcs_df)
# Create a new DataFrame including predicted clusters and cryptocurrencies features
pcs_df['Class'] = model.labels_
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,Class
0,-0.322859,1.033468,-0.569879,0
2,-0.306234,1.033769,-0.570332,0
5,2.296181,1.641266,-0.683846,0
7,-0.151044,-1.309224,0.220048,1
8,-0.139112,-2.090298,0.362069,1
...,...,...,...,...
1238,2.509894,0.623410,0.274921,0
1242,-0.320910,1.033374,-0.569912,0
1245,0.321617,-2.320757,0.425925,1
1246,-0.198615,-1.981552,0.352261,1


In [30]:
clustered_df = pd.concat([crypto_df, pcs_df, coins_name],
                        join = 'inner', axis = 1)
clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,Class,CoinName
0,Scrypt,PoW/PoS,4.199995e+01,42,-0.322859,1.033468,-0.569879,0,42 Coin
2,Scrypt,PoW/PoS,1.055185e+09,532000000,-0.306234,1.033769,-0.570332,0,404Coin
5,X13,PoW/PoS,2.927942e+10,314159265359,2.296181,1.641266,-0.683846,0,EliteCoin
7,SHA-256,PoW,1.792718e+07,21000000,-0.151044,-1.309224,0.220048,1,Bitcoin
8,Ethash,PoW,1.076842e+08,0,-0.139112,-2.090298,0.362069,1,Ethereum
...,...,...,...,...,...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000,2.509894,0.623410,0.274921,0,ZEPHYR
1242,Scrypt,PoW/PoS,1.493105e+07,250000000,-0.320910,1.033374,-0.569912,0,Gapcoin
1245,CryptoNight,PoW,9.802226e+08,1400222610,0.321617,-2.320757,0.425925,1,Beldex
1246,Equihash,PoW,7.296538e+06,21000000,-0.198615,-1.981552,0.352261,1,Horizen


### Visualizing Results

#### 3D-Scatter with Clusters

In [32]:
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='Class',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply']
).interactive()

#### Table of Tradable Cryptocurrencies

In [33]:
# Table with tradable cryptos
df = clustered_df[['CoinName', 'Algorithm', 'ProofType','TotalCoinSupply','TotalCoinsMined', 'Class']]
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  
    display(df)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,Class
0,42 Coin,Scrypt,PoW/PoS,42.0,41.99995,0
2,404Coin,Scrypt,PoW/PoS,532000000.0,1055185000.0,0
5,EliteCoin,X13,PoW/PoS,314159265359.0,29279420000.0,0
7,Bitcoin,SHA-256,PoW,21000000.0,17927180.0,1
8,Ethereum,Ethash,PoW,0.0,107684200.0,1
9,Litecoin,Scrypt,PoW,84000000.0,63039240.0,1
10,Dash,X11,PoW/PoS,22000000.0,9031294.0,0
11,Monero,CryptoNight-V7,PoW,0.0,17201140.0,1
12,Ethereum Classic,Ethash,PoW,210000000.0,113359700.0,1
13,ZCash,Equihash,PoW,21000000.0,7383056.0,1


In [34]:
# Print the total number of tradable cryptocurrencies
print(f"The total number of tradeble cryptocurrencies: \
      {clustered_df['TotalCoinSupply'].astype('float').sum()}")

The total number of tradeble cryptocurrencies:       5508664273302.22


#### Scatter Plot with Tradable Cryptocurrencies

In [35]:
# Scale data to create the scatter plot
clustered_df['TotalCoinsMined_s'] = StandardScaler().fit_transform(clustered_df[['TotalCoinsMined']])
clustered_df['TotalCoinSupply_s'] = StandardScaler().fit_transform(clustered_df[['TotalCoinSupply']])


In [36]:
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='TotalCoinsMined_s',
    y='TotalCoinSupply_s',
    tooltip=['CoinName', 'TotalCoinsMined', 'TotalCoinSupply'],
    color=alt.Color('Class',
                   scale=alt.Scale(range=['red', 'green']))
).interactive()