# Clustering Crypto

In [1]:
# Initial imports
#!pip install altair
import requests
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
#import altair as alt

### Fetching Cryptocurrency Data

In [2]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [3]:
#Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

resp = requests.get(url)
response_content = resp.content
data = resp.json()

In [4]:
# save crypto data as coin_data
coin_data = data['Data']

In [5]:
# Create a Datafrane and then Transpose datframe (Columns to Rows)
crypto_df = pd.DataFrame(coin_data)
crypto_df = crypto_df.T
crypto_df.head()

Unnamed: 0,Id,Url,ImageUrl,ContentCreatedOn,Name,Symbol,CoinName,FullName,Description,AssetTokenStatus,...,MaxSupply,MktCapPenalty,IsUsedInDefi,IsUsedInNft,PlatformType,BuiltOn,SmartContractAddress,DecimalPoints,Difficulty,AlgorithmType
42,4321,/coins/42/overview,/media/35650717/42.jpg,1427211129,42,42,42 Coin,42 Coin (42),Everything about 42 coin is 42 - apart from th...,,...,0.0,0.0,0.0,0.0,,,,,,
300,749869,/coins/300/overview,/media/27010595/300.png,1517935016,300,300,300 token,300 token (300),300 token is an ERC20 token. This Token was cr...,,...,300.0,0.0,0.0,0.0,token,ETH,0xaec98a708810414878c3bcdf46aad31ded4a4557,18.0,,
365,33639,/coins/365/overview,/media/352070/365.png,1480032918,365,365,365Coin,365Coin (365),365Coin is a Proof of Work and Proof of Stake ...,,...,0.0,0.0,0.0,0.0,,,,,,
404,21227,/coins/404/overview,/media/35650851/404-300x300.jpg,1466100361,404,404,404Coin,404Coin (404),404 is a PoW/PoS hybrid cryptocurrency that al...,,...,0.0,0.0,0.0,0.0,,,,,,
433,926547,/coins/433/overview,/media/34836095/433.png,1541597321,433,433,433 Token,433 Token (433),433 Token is a decentralised soccer platform t...,Finished,...,,,,,,,,,,


In [6]:
# Select desired columns from above dataframe and save it as 'crypto_df'
crypto_df = crypto_df[['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 
                         'TotalCoinsMined', 'CirculatingSupply']]
crypto_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,CirculatingSupply
42,42 Coin,Scrypt,True,PoW/PoS,0,0
300,300 token,,True,,300,0
365,365Coin,X11,True,PoW/PoS,0,0
404,404Coin,Scrypt,True,PoW/PoS,0,0
433,433 Token,,False,,,
...,...,...,...,...,...,...
MSA,My Shiba Academia,,True,,3978024469.284408,0
JPAW,Jpaw Inu,,True,,1000000000000,0
L,L inu,,True,,1000000000000,0
SUNOLD,Sun Token,TRC-20,True,,19900730,0


### Data Preprocessing

In [7]:
# check the shape (rows and columns) of dataframe
crypto_df.shape

(7281, 6)

In [8]:
# Keep only cryptocurrencies that are trading
crypto_df = crypto_df.loc[crypto_df['IsTrading'] == True].copy()
crypto_df.shape

(5775, 6)

In [9]:
# Replace Algorithm column empty cells with numpy nan 
crypto_df['Algorithm'].replace('', np.nan, inplace=True)

In [10]:
# drop na from column 'Algorithm'
crypto_df.dropna(subset=['Algorithm'], inplace=True)

In [11]:
# Keep only cryptocurrencies with a working algorithm
crypto_df = crypto_df.loc[crypto_df['Algorithm'] != 'N/A' ].copy()
crypto_df.shape

(1612, 6)

In [12]:
# Remove the "IsTrading" column
crypto_df.drop(columns=['IsTrading'], inplace = True)

In [13]:
# check the shape (rows and columns) of dataframe
crypto_df.shape

(1612, 5)

In [14]:
# check the count for null values by columns
crypto_df.isnull().sum()

CoinName                0
Algorithm               0
ProofType               0
TotalCoinsMined      1201
CirculatingSupply    1199
dtype: int64

In [15]:
# Remove rows with cryptocurrencies having no coins mined
crypto_df = crypto_df.loc[crypto_df.TotalCoinsMined > 0].copy()
crypto_df.shape

(266, 5)

In [16]:
# check for any duplicates
print(f"Duplicate entries: {crypto_df.duplicated().sum()}")

Duplicate entries: 0


In [17]:
# check the count for null values by columns
crypto_df.isnull().sum()

CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined      0
CirculatingSupply    0
dtype: int64

In [18]:
# Store the 'CoinName' column in its own DataFrame prior to dropping it from crypto_df. Plus save dataframe 
# in new dataframe named 'shoppings_df' for later usage
coin_df = pd.DataFrame(crypto_df['CoinName'], index=crypto_df.index)
shoppings_df = crypto_df
shoppings_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
NSR,NuShares,PoS,PoS,6166805595.8311,0
TRI,Triangles Coin,X13,PoW/PoS,189106.707058,0
CMTC,CometCoin,Scrypt,PoW,872830,0
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000,0
QRL,Quantum Resistant Ledger,RandomX,PoW,75177900.387417,0
...,...,...,...,...,...
AURORAC,Auroracoin,Scrypt,PoW/PoS,99865603.452017,0
AUR,AUREO,BEP-20 Token,,19560972.492959,0
XEC,eCash,SHA-256,PoW,18881433396789.378906,18881433396789.378906
SUNOLD,Sun Token,TRC-20,,19900730,0


In [19]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
crypto_df = crypto_df.drop(columns=['CoinName'])

In [20]:
crypto_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply
NSR,PoS,PoS,6166805595.8311,0
TRI,X13,PoW/PoS,189106.707058,0
CMTC,Scrypt,PoW,872830,0
CHAT,Scrypt,PoW/PoS,1000000000,0
QRL,RandomX,PoW,75177900.387417,0
...,...,...,...,...
AURORAC,Scrypt,PoW/PoS,99865603.452017,0
AUR,BEP-20 Token,,19560972.492959,0
XEC,SHA-256,PoW,18881433396789.378906,18881433396789.378906
SUNOLD,TRC-20,,19900730,0


In [21]:
# Create dummy variables for text features only using 'Algorithm' and 'ProofType' columns
crypto_df = pd.get_dummies(crypto_df, columns=['Algorithm', 'ProofType'])
crypto_df

Unnamed: 0,TotalCoinsMined,CirculatingSupply,Algorithm_Autolykos,Algorithm_BEP-2,Algorithm_BEP-2 Token,Algorithm_BEP-20 Token,Algorithm_BEP2 Token,Algorithm_BLAKE256,Algorithm_BMW512 / Echo512,Algorithm_Blake2B + SHA3,...,ProofType_PoW,ProofType_PoW/PoS,ProofType_PoW/PoSe,ProofType_PoW/nPoS,ProofType_Proof of Authority,ProofType_SPoS,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW,ProofType_dPoW/PoW
NSR,6166805595.8311,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TRI,189106.707058,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
CMTC,872830,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
CHAT,1000000000,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
QRL,75177900.387417,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AURORAC,99865603.452017,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
AUR,19560972.492959,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
XEC,18881433396789.378906,18881433396789.378906,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
SUNOLD,19900730,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Standardize data
crypto_scaled = StandardScaler().fit_transform(crypto_df)
print(crypto_scaled[0:3])

[[-0.06142951 -0.06267677 -0.06142951 -0.10680283 -0.06142951 -0.85280287
  -0.06142951 -0.08703883 -0.06142951 -0.06142951 -0.10680283 -0.08703883
  -0.08703883 -0.06142951 -0.06142951 -0.2076951  -0.08703883 -0.06142951
  -0.06142951 -0.06142951 -0.2076951  -0.06142951 -0.06142951 -0.16439899
  -0.06142951 -0.06142951 -0.06142951 -0.06142951 -0.08703883 -0.06142951
  -0.06142951 -0.06142951 -0.06142951 -0.06142951 -0.06142951 -0.13840913
  -0.08703883 -0.06142951 -0.06142951 -0.06142951 -0.08703883  7.22495675
  -0.06142951 -0.10680283 -0.08703883 -0.22667907 -0.08703883 -0.06142951
  -0.06142951 -0.06142951 -0.3002731  -0.06142951 -0.13840913 -0.06142951
  -0.08703883 -0.12356041 -0.06142951 -0.15191091 -0.08703883 -0.06142951
  -0.06142951 -0.06142951 -0.06142951 -0.06142951 -0.06142951 -0.17609018
  -0.06142951 -0.06142951 -1.07012209 -0.08703883 -0.06142951 -0.06142951
   5.05964426 -0.06142951 -0.06142951 -0.06142951 -0.5454915  -0.31491833
  -0.06142951 -0.06142951 -0.06142951 

### Reducing Dimensions Using PCA

In [23]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_scaled)

In [24]:
# Create a DataFrame with the principal components data
df_crypto_pca = pd.DataFrame(
    data=crypto_pca, columns=["PC 1", "PC 2", "PC 3"], index=crypto_df.index
)
df_crypto_pca

Unnamed: 0,PC 1,PC 2,PC 3
NSR,1.350895,0.669157,-0.104060
TRI,1.523606,0.288847,-0.054754
CMTC,2.058974,-0.839132,0.085619
CHAT,1.785087,-0.127333,-0.003379
QRL,2.108758,-1.168459,0.128725
...,...,...,...
AURORAC,1.785087,-0.127333,-0.003379
AUR,-1.744890,-0.114581,0.013975
XEC,3.098308,-2.296771,0.254700
SUNOLD,-1.075573,-0.034451,0.003939


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [25]:
# Find the best value for k using Elbow Curve
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(crypto_df)
    inertia.append(km.inertia_)


# Create the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

  
  


In [26]:
# Create Elbow chart using altair
alt.Chart(df_elbow).mark_line().encode(
    x='k',
    y='inertia'
).interactive() 

NameError: name 'alt' is not defined

In [27]:
# Initialize the K-Means model
model = KMeans(n_clusters= 4, random_state=0)

# Fit the model
model.fit(df_crypto_pca)

# Predict clusters
predictions = model.predict(df_crypto_pca)

# Add the predicted class columns
df_crypto_pca["class"] = model.labels_
df_crypto_pca.head()

Unnamed: 0,PC 1,PC 2,PC 3,class
NSR,1.350895,0.669157,-0.10406,1
TRI,1.523606,0.288847,-0.054754,1
CMTC,2.058974,-0.839132,0.085619,1
CHAT,1.785087,-0.127333,-0.003379,1
QRL,2.108758,-1.168459,0.128725,1


In [28]:
# combined crypto data as clustered_df, drop index column, and set index 'Symbol'
clustered_df = pd.concat([shoppings_df, df_crypto_pca], axis=1)
clustered_df.head(10)

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,CirculatingSupply,PC 1,PC 2,PC 3,class
NSR,NuShares,PoS,PoS,6166805595.8311,0,1.350895,0.669157,-0.10406,1
TRI,Triangles Coin,X13,PoW/PoS,189106.707058,0,1.523606,0.288847,-0.054754,1
CMTC,CometCoin,Scrypt,PoW,872830.0,0,2.058974,-0.839132,0.085619,1
CHAT,OpenChat,Scrypt,PoW/PoS,1000000000.0,0,1.785087,-0.127333,-0.003379,1
QRL,Quantum Resistant Ledger,RandomX,PoW,75177900.387417,0,2.108758,-1.168459,0.128725,1
AMB,Amber,Dagger,PoA,819401369.0,0,1.49848,8.342785,9.892108,2
BTCZ,BitcoinZ,Equihash,PoW,10488896877.131413,0,2.106142,-1.088856,0.118785,1
PURA,Pura,X11,PoW,188358976.839698,0,1.984316,-0.646093,0.063369,1
BTCP,Bitcoin Private,Equihash,PoW,3818878.387802,0,2.106142,-1.088856,0.118785,1
ADK,Aidos Kuneen,IMesh,PoW,25000000.0,0,2.0193,-0.978931,0.108173,1


### Visualizing Results

#### 2D-Scatter with Clusters

In [None]:
# Create a 2D-Scatter with the PCA data and the clusters
alt.Chart(clustered_df).mark_circle(size=60).encode(
    x='PC 1',
    y='PC 2',
    color='class',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'CirculatingSupply']
).interactive()

#### Table of Tradable Cryptocurrencies


In [29]:
# for MinMaxscaler usage, we will save totalcoinsmined and CirculatingSupply columns in new dataframe
clustered_df_temp = clustered_df[['TotalCoinsMined', 'CirculatingSupply']]
clustered_df_temp

Unnamed: 0,TotalCoinsMined,CirculatingSupply
NSR,6166805595.8311,0
TRI,189106.707058,0
CMTC,872830,0
CHAT,1000000000,0
QRL,75177900.387417,0
...,...,...
AURORAC,99865603.452017,0
AUR,19560972.492959,0
XEC,18881433396789.378906,18881433396789.378906
SUNOLD,19900730,0


In [30]:
# Table with tradable cryptos without principal components, TotalcoinsMined, CirculatingSupply
clustered_df.drop(columns=['TotalCoinsMined', 'CirculatingSupply', 'PC 1', 'PC 2', 'PC 3'], inplace = True)
clustered_df

Unnamed: 0,CoinName,Algorithm,ProofType,class
NSR,NuShares,PoS,PoS,1
TRI,Triangles Coin,X13,PoW/PoS,1
CMTC,CometCoin,Scrypt,PoW,1
CHAT,OpenChat,Scrypt,PoW/PoS,1
QRL,Quantum Resistant Ledger,RandomX,PoW,1
...,...,...,...,...
AURORAC,Auroracoin,Scrypt,PoW/PoS,1
AUR,AUREO,BEP-20 Token,,0
XEC,eCash,SHA-256,PoW,1
SUNOLD,Sun Token,TRC-20,,0


#### Scatter Plot with Tradable Cryptocurrencies

In [31]:
# Use MinMaxScaler on clusterd_df_temp we previously created 
scaler = MinMaxScaler()
tradable_crypto_scaled = scaler.fit_transform(clustered_df_temp)
print(tradable_crypto_scaled[0:10])

[[6.22264609e-17 0.00000000e+00]
 [1.88265687e-21 0.00000000e+00]
 [8.78180385e-21 0.00000000e+00]
 [1.00905287e-17 0.00000000e+00]
 [7.58561148e-19 0.00000000e+00]
 [8.26818844e-18 0.00000000e+00]
 [1.05838758e-16 0.00000000e+00]
 [1.90062094e-18 0.00000000e+00]
 [3.85090650e-20 0.00000000e+00]
 [2.52238322e-19 0.00000000e+00]]


In [32]:
# Create dataframe using tradable_crypto_scaled, Rename columns for scatter plot usage
crypto_scaled_df = pd.DataFrame(tradable_crypto_scaled, index=clustered_df.index)
crypto_scaled_df.rename(columns={0:'TotalCoinsMined', 1: 'TotalCoinSupply'}, inplace = True)
crypto_scaled_df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply
NSR,6.222646e-17,0.0
TRI,1.882657e-21,0.0
CMTC,8.781804e-21,0.0
CHAT,1.009053e-17,0.0
QRL,7.585611e-19,0.0
...,...,...
AURORAC,1.007674e-18,0.0
AUR,1.973555e-19,0.0
XEC,1.905241e-13,1.0
SUNOLD,2.007839e-19,0.0


In [33]:
# combine scaled dataframe with clustered dataframe using index
tradable_crypto_df = pd.concat([crypto_scaled_df, clustered_df], axis = 1)
tradable_crypto_df

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,CoinName,Algorithm,ProofType,class
NSR,6.222646e-17,0.0,NuShares,PoS,PoS,1
TRI,1.882657e-21,0.0,Triangles Coin,X13,PoW/PoS,1
CMTC,8.781804e-21,0.0,CometCoin,Scrypt,PoW,1
CHAT,1.009053e-17,0.0,OpenChat,Scrypt,PoW/PoS,1
QRL,7.585611e-19,0.0,Quantum Resistant Ledger,RandomX,PoW,1
...,...,...,...,...,...,...
AURORAC,1.007674e-18,0.0,Auroracoin,Scrypt,PoW/PoS,1
AUR,1.973555e-19,0.0,AUREO,BEP-20 Token,,0
XEC,1.905241e-13,1.0,eCash,SHA-256,PoW,1
SUNOLD,2.007839e-19,0.0,Sun Token,TRC-20,,0


In [None]:
# Create a 2D-Scatter 
alt.Chart(tradable_crypto_df).mark_circle(size=60).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class',
    tooltip=['CoinName', 'Algorithm']
    
).interactive()