In [55]:
 # Initial imports
import pandas as pd
import hvplot.pandas
from pathlib import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [56]:
# Loade the cryptocurrencies data
file_path = Path("../Desktop/crypto_data.csv")
df = pd.read_csv(file_path, index_col=0)
df.head(10)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42.0
365,365Coin,X11,True,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,True,PoW,,611000.0
808,808,SHA-256,True,PoW/PoS,0.0,0.0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159000000.0
2015,2015 coin,X11,True,PoW/PoS,,0.0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000.0
ETH,Ethereum,Ethash,True,PoW,107684200.0,0.0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000.0


## Data Preprocessing

In [57]:

df['IsTrading'] = df['IsTrading'].astype(str)

bool_list = ['True']

df = df[df.IsTrading.isin(bool_list)]

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   IsTrading        1144 non-null   object 
 3   ProofType        1144 non-null   object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  1144 non-null   object 
dtypes: float64(1), object(5)
memory usage: 62.6+ KB


In [58]:
df.head(5)

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [59]:
df.drop(columns=['IsTrading'],inplace=True)

In [60]:
df['TotalCoinSupply'] =  pd.to_numeric(df['TotalCoinSupply'],errors='coerce')

In [61]:
df['TotalCoinsMined'] =  pd.to_numeric(df['TotalCoinsMined'],errors='coerce')

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1141 non-null   float64
dtypes: float64(2), object(3)
memory usage: 53.6+ KB


In [63]:
df.dropna(axis=0,how='any',inplace=True)

In [64]:
df.replace(0,'one',inplace=True)

In [65]:
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42,42
404,404Coin,Scrypt,PoW/PoS,1.05518e+09,5.32e+08
808,808,SHA-256,PoW/PoS,one,one
1337,EliteCoin,X13,PoW/PoS,2.92794e+10,3.14159e+11
BTC,Bitcoin,SHA-256,PoW,1.79272e+07,2.1e+07


In [66]:
df2 = df[df != 'one']

df2.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42.0
404,404Coin,Scrypt,PoW/PoS,1055180000.0,532000000.0
808,808,SHA-256,PoW/PoS,,
1337,EliteCoin,X13,PoW/PoS,29279400000.0,314159000000.0
BTC,Bitcoin,SHA-256,PoW,17927200.0,21000000.0


In [67]:
df2 = df2.dropna(axis=0, how='any')

df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   CoinName         498 non-null    object
 1   Algorithm        498 non-null    object
 2   ProofType        498 non-null    object
 3   TotalCoinsMined  498 non-null    object
 4   TotalCoinSupply  498 non-null    object
dtypes: object(5)
memory usage: 23.3+ KB


In [68]:
df2.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,42.0,42.0
404,404Coin,Scrypt,PoW/PoS,1055180000.0,532000000.0
1337,EliteCoin,X13,PoW/PoS,29279400000.0,314159000000.0
BTC,Bitcoin,SHA-256,PoW,17927200.0,21000000.0
LTC,Litecoin,Scrypt,PoW,63039200.0,84000000.0


In [69]:

crypto_df = df2

copy_df = crypto_df.copy()

In [70]:
 # Fetch the cryptocurrencies names prior to drop them from crypto_df
coins_name = pd.DataFrame(index = copy_df['CoinName'])



y = coins_name

y.head()

42 Coin
404Coin
EliteCoin
Bitcoin
Litecoin


In [71]:
new_df = copy_df.drop(columns=['CoinName'])
new_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,42.0,42.0
404,Scrypt,PoW/PoS,1055180000.0,532000000.0
1337,X13,PoW/PoS,29279400000.0,314159000000.0
BTC,SHA-256,PoW,17927200.0,21000000.0
LTC,Scrypt,PoW,63039200.0,84000000.0


In [72]:
#Removig the cryptocurrency name since it's not going to be used on the clustering algorithm


from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [73]:
new_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,42.0,42.0
404,Scrypt,PoW/PoS,1055180000.0,532000000.0
1337,X13,PoW/PoS,29279400000.0,314159000000.0
BTC,SHA-256,PoW,17927200.0,21000000.0
LTC,Scrypt,PoW,63039200.0,84000000.0


In [74]:
# Creating dummies variables for text features
X = pd.get_dummies(new_df,columns=['Algorithm', 'ProofType'])

In [75]:

X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,42,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.05518e+09,5.32e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.92794e+10,3.14159e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.79272e+07,2.1e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,6.30392e+07,8.4e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2e+09,2e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.4931e+07,2.5e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.80223e+08,1.40022e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.29654e+06,2.1e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
adjusted = X.iloc[:,2:]
adjusted.head()

Unnamed: 0,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,Algorithm_CryptoNight,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# Standardizing data

#select data to scale 

data_to_scale = X.iloc[:,:2]

data_to_scale.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply
42,42.0,42.0
404,1055180000.0,532000000.0
1337,29279400000.0,314159000000.0
BTC,17927200.0,21000000.0
LTC,63039200.0,84000000.0


In [78]:
scaler = StandardScaler()

scaler.fit(data_to_scale)

StandardScaler()

In [79]:
scaled_data = scaler.transform(data_to_scale)

In [80]:
features_scaled = pd.DataFrame(scaled_data, columns = X.iloc[:,:2].columns)
features_scaled.index = X.index
fs_df = pd.concat([features_scaled,adjusted],axis=1)
fs_df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,-0.110782,-0.158275,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,-0.088176,-0.150663,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,0.516504,4.336889,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,-0.110398,-0.157975,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,-0.109431,-0.157073,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Reducing Dimensions Using PCA

In [81]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px


# Using PCA to reduce dimension to 3 principal components
pca = PCA(n_components=3)

crypto_pca = pca.fit_transform(fs_df)

In [82]:
# Creating a DataFrame with the principal components data

df_pca = pd.DataFrame(
    data=crypto_pca, columns=["principal component 1", "principal component 2","principal component 3"]
)
df_pca.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.208086,-0.793526,0.223477
1,-0.186751,-0.795939,0.213344
2,3.418905,-0.320347,2.77538
3,-0.181891,0.663655,-0.204139
4,-0.180293,0.571174,-0.04835


In [83]:
pca.explained_variance_ratio_

array([0.48883485, 0.11285156, 0.0824152 ])

In [84]:
df_pca.index = fs_df.index
df_pca.columns = ['PC1','PC2', 'PC3']
df_pca.head()

Unnamed: 0,PC1,PC2,PC3
42,-0.208086,-0.793526,0.223477
404,-0.186751,-0.795939,0.213344
1337,3.418905,-0.320347,2.77538
BTC,-0.181891,0.663655,-0.204139
LTC,-0.180293,0.571174,-0.04835


## Clustering Crytocurrencies Using K-Means

#### Find the Best Value for k Using the Elbow Curve

In [85]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range ok k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot(x = "k", y = "inertia", xticks = k, title = "Elbow Curve")

In [86]:

def get_clusters(k,data):
# Initialize the K-Means model
    model = KMeans(n_clusters=k,random_state=0)
# Fit the model
    model.fit(data)
# Predict clusters
    predictions = model.predict(data)
# Creating a new DataFrame including predicted clusters and cryptocurrencies features
    data['class'] = model.labels_
    
    return data

clusters_df = get_clusters(4,df_pca)

In [87]:
print(clusters_df.columns)

Index(['PC1', 'PC2', 'PC3', 'class'], dtype='object')


In [88]:
clustered_df = pd.concat([crypto_df, clusters_df],axis=1)

clustered_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC1,PC2,PC3,class
42,42 Coin,Scrypt,PoW/PoS,42.0,42.0,-0.208086,-0.793526,0.223477,3
404,404Coin,Scrypt,PoW/PoS,1055180000.0,532000000.0,-0.186751,-0.795939,0.213344,3
1337,EliteCoin,X13,PoW/PoS,29279400000.0,314159000000.0,3.418905,-0.320347,2.77538,3
BTC,Bitcoin,SHA-256,PoW,17927200.0,21000000.0,-0.181891,0.663655,-0.204139,0
LTC,Litecoin,Scrypt,PoW,63039200.0,84000000.0,-0.180293,0.571174,-0.04835,0


In [89]:
clusters_df.head()

Unnamed: 0,PC1,PC2,PC3,class
42,-0.208086,-0.793526,0.223477,3
404,-0.186751,-0.795939,0.213344,3
1337,3.418905,-0.320347,2.77538,3
BTC,-0.181891,0.663655,-0.204139,0
LTC,-0.180293,0.571174,-0.04835,0


In [90]:
clustered_df = clustered_df[['Algorithm','ProofType', 'TotalCoinSupply', 'TotalCoinsMined','PC1','PC2', 'PC3', 'CoinName', 'class']]

In [91]:
clustered_df.rename(columns={'class':"Class"},inplace=True)

In [92]:
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,42.0,42.0,-0.208086,-0.793526,0.223477,42 Coin,3
404,Scrypt,PoW/PoS,532000000.0,1055180000.0,-0.186751,-0.795939,0.213344,404Coin,3
1337,X13,PoW/PoS,314159000000.0,29279400000.0,3.418905,-0.320347,2.77538,EliteCoin,3
BTC,SHA-256,PoW,21000000.0,17927200.0,-0.181891,0.663655,-0.204139,Bitcoin,0
LTC,Scrypt,PoW,84000000.0,63039200.0,-0.180293,0.571174,-0.04835,Litecoin,0


##  Visualizing Results

#### 3D-Scatter with Clusters

In [93]:
# Creating a 3D-Scatter with the PCA data and the clusters

fig = px.scatter_3d(
    clustered_df,
    x ="PC1",
    y = "PC2",
    z="PC3",
    hover_data=['Algorithm'],
    hover_name = "CoinName",
    color="Class",
    symbol="Class",
    width=800,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

####  Table of Tradable Cryptocurrencies

In [94]:
clustered_df.hvplot.table(columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"])

#### Scatter Plot with Tradable Cryptocurrencies

In [95]:
# Scaling data to create the scatter plot
clustered_df[['TotalCoinsMined','TotalCoinSupply']]  = MinMaxScaler().fit_transform(clustered_df[['TotalCoinsMined','TotalCoinSupply']])
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinSupply,TotalCoinsMined,PC1,PC2,PC3,CoinName,Class
42,Scrypt,PoW/PoS,0.0,0.005942,-0.208086,-0.793526,0.223477,42 Coin,3
404,Scrypt,PoW/PoS,0.000532,0.007002,-0.186751,-0.795939,0.213344,404Coin,3
1337,X13,PoW/PoS,0.314159,0.035342,3.418905,-0.320347,2.77538,EliteCoin,3
BTC,SHA-256,PoW,2.1e-05,0.00596,-0.181891,0.663655,-0.204139,Bitcoin,0
LTC,Scrypt,PoW,8.4e-05,0.006006,-0.180293,0.571174,-0.04835,Litecoin,0


In [96]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
clustered_df.hvplot.scatter(x="TotalCoinsMined",y="TotalCoinSupply",hover_cols=["CoinName"])