# Clustering Crypto

In [183]:
# Initial imports
import pandas as pd
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas


### Deliverable 1: Preprocessing the Data for PCA

In [184]:
file_path = 'Resources/crypto_data.csv'
df_crypto = pd.read_csv(file_path, index_col=0)
df_crypto.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [185]:
# Keep all the cryptocurrencies that are being traded.
df_crypto = df_crypto.loc[(df_crypto['IsTrading'] == True)]
df_crypto

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
SERO,Super Zero,Ethash,True,PoW,,1000000000
UOS,UOS,SHA-256,True,DPoI,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [186]:
# Keep all the cryptocurrencies that have a working algorithm.
options = ['PoW/PoS', 'PoW'] 
df_crypto = df_crypto.loc[df_crypto['ProofType'].isin(options)] 
df_crypto

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
VPRC,VapersCoin,Scrypt,True,PoW,,42750000000
GAP,Gapcoin,Scrypt,True,PoW/PoS,1.493105e+07,250000000
SERO,Super Zero,Ethash,True,PoW,,1000000000
BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610


In [187]:
# Remove the "IsTrading" column. 
df_crypto = df_crypto.drop(axis=1, columns='IsTrading')
df_crypto

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
VPRC,VapersCoin,Scrypt,PoW,,42750000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
SERO,Super Zero,Ethash,PoW,,1000000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [188]:
# Remove rows that have at least 1 null value.
df_crypto = df_crypto.dropna(axis=0, how='any')
df_crypto

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
808,808,SHA-256,PoW/PoS,0.000000e+00,0
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...,...
RCC,Reality Clash,Ethash,PoW,2.448794e+07,24487944
ILT,iOlite,Ethash,PoW,0.000000e+00,1000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [189]:
df_crypto.dtypes

CoinName            object
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [190]:
# Keep the rows where coins are mined.
df_crypto = df_crypto.loc[(df_crypto['TotalCoinsMined'] > 0)]
df_crypto

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
VOLLAR,Vollar,Equihash+Scrypt,PoW,1.000000e+08,2100000000
RCC,Reality Clash,Ethash,PoW,2.448794e+07,24487944
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610


In [191]:
# Create a new DataFrame that holds only the cryptocurrencies names.
df_crypto_names = df_crypto[['CoinName']].copy()
df_crypto_names

Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
VOLLAR,Vollar
RCC,Reality Clash
GAP,Gapcoin
BDX,Beldex


In [192]:
df_crypto_names.index

Index(['42', '404', '1337', 'BTC', 'ETH', 'LTC', 'DASH', 'XMR', 'ETC', 'ZEC',
       ...
       'OK', 'XWC', 'FSC', 'J', 'TRI', 'VOLLAR', 'RCC', 'GAP', 'BDX', 'ZEN'],
      dtype='object', length=413)

In [193]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
df_crypto = df_crypto.drop(axis=1, columns='CoinName', )
df_crypto

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
VOLLAR,Equihash+Scrypt,PoW,1.000000e+08,2100000000
RCC,Ethash,PoW,2.448794e+07,24487944
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610


In [194]:
df_crypto.dtypes

Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [195]:
# Use get_dummies() to create variables for text features.
X = pd.get_dummies(df_crypto, columns=['Algorithm', 'ProofType'])
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,Algorithm_Tribus,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,ProofType_PoW,ProofType_PoW/PoS
42,4.199995e+01,42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
404,1.055185e+09,532000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1337,2.927942e+10,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
BTC,1.792718e+07,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ETH,1.076842e+08,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VOLLAR,1.000000e+08,2100000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
RCC,2.448794e+07,24487944,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
GAP,1.493105e+07,250000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
BDX,9.802226e+08,1400222610,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [196]:
# Standardize the data with StandardScaler().
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-0.21129146, -0.16014434, -0.04926646, ..., -0.08553989,
        -1.16042704,  1.16042704],
       [-0.14968272, -0.15111869, -0.04926646, ..., -0.08553989,
        -1.16042704,  1.16042704],
       [ 1.49823714,  5.1697257 , -0.04926646, ..., -0.08553989,
        -1.16042704,  1.16042704],
       ...,
       [-0.21041969, -0.15590296, -0.04926646, ..., -0.08553989,
        -1.16042704,  1.16042704],
       [-0.15405952, -0.13638886, -0.04926646, ..., -0.08553989,
         0.86175172, -0.86175172],
       [-0.21086545, -0.15978806, -0.04926646, ..., -0.08553989,
         0.86175172, -0.86175172]])

### Deliverable 2: Reducing Data Dimensions Using PCA

In [197]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[-1.62812811,  0.20507193, -1.41189656],
       [-1.62100065,  0.24625393, -1.40863277],
       [-1.47041409,  4.85873046,  1.77083765],
       ...,
       [-1.62754991,  0.20800696, -1.41147146],
       [ 2.02853085,  1.236117  ,  0.87843767],
       [ 1.81173537, -0.94400856,  0.38389055]])

In [198]:
pd.Index(df_crypto_names)

Index([         ('42 Coin',),          ('404Coin',),        ('EliteCoin',),
                ('Bitcoin',),         ('Ethereum',),         ('Litecoin',),
                   ('Dash',),           ('Monero',), ('Ethereum Classic',),
                  ('ZCash',),
       ...
                 ('OKCash',),        ('WhiteCoin',),   ('FriendshipCoin',),
               ('JoinCoin',),   ('Triangles Coin',),           ('Vollar',),
          ('Reality Clash',),          ('Gapcoin',),           ('Beldex',),
                ('Horizen',)],
      dtype='object', length=413)

In [199]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(
    data=X_pca, columns=["PC 1", "PC 2", "PC 3"], index=df_crypto_names.index
)
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-1.628128,0.205072,-1.411897
404,-1.621001,0.246254,-1.408633
1337,-1.470414,4.858730,1.770838
BTC,1.154874,-0.778584,0.759930
ETH,1.801246,-0.991404,0.370260
...,...,...,...
VOLLAR,1.783387,-0.947683,0.356620
RCC,1.800814,-0.994006,0.370106
GAP,-1.627550,0.208007,-1.411471
BDX,2.028531,1.236117,0.878438


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [200]:
# Create an elbow curve to find the best value for K.
inertia = []
k = list(range(1, 11))


for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)


  f"KMeans is known to have a memory leak on Windows "


Running K-Means with `k=4`

In [201]:
# Initialize the K-Means model.
model = KMeans(n_clusters=2, random_state=5)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)
print(predictions)

[1 1 1 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0
 0 0 1 1 0 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 1 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1 1 0 1
 1 1 0 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1
 0 1 1 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 0 0 1 1
 0 0 1 0 0 0 0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0
 1 0 1 0 0 1 1 1 1 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 0 0 0 0 0
 0 0 1 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1
 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0 1 0 1 1 0 0 1 1 1 0
 1 0 0 1 0 0]


In [202]:
df_crypto

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,4.199995e+01,42
404,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,X13,PoW/PoS,2.927942e+10,314159265359
BTC,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
VOLLAR,Equihash+Scrypt,PoW,1.000000e+08,2100000000
RCC,Ethash,PoW,2.448794e+07,24487944
GAP,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,CryptoNight,PoW,9.802226e+08,1400222610


In [212]:
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-1.628128,0.205072,-1.411897
404,-1.621001,0.246254,-1.408633
1337,-1.470414,4.858730,1.770838
BTC,1.154874,-0.778584,0.759930
ETH,1.801246,-0.991404,0.370260
...,...,...,...
VOLLAR,1.783387,-0.947683,0.356620
RCC,1.800814,-0.994006,0.370106
GAP,-1.627550,0.208007,-1.411471
BDX,2.028531,1.236117,0.878438


In [216]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([df_crypto, pcs_df], axis=1, join='inner')

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df = pd.concat([df_crypto_names, clustered_df], axis=1, join='inner')

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["class"] = model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(413, 9)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
42,42 Coin,Scrypt,PoW/PoS,41.99995,42,-1.628128,0.205072,-1.411897,1
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000,-1.621001,0.246254,-1.408633,1
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359,-1.470414,4.85873,1.770838,1
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000,1.154874,-0.778584,0.75993,0
ETH,Ethereum,Ethash,PoW,107684200.0,0,1.801246,-0.991404,0.37026,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000,0.843754,-0.164513,-1.595209,0
DASH,Dash,X11,PoW/PoS,9031294.0,22000000,-1.764628,-0.392244,1.380987,1
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0,1.789018,-1.007695,0.35622,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000,1.801692,-0.989173,0.370601,0
ZEC,ZCash,Equihash,PoW,7383056.0,21000000,1.811736,-0.944006,0.383891,0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [205]:
# Creating a 3D-Scatter with the PCA data and the clusters
# YOUR CODE HERE


In [206]:
# Create a table with tradable cryptocurrencies.
# YOUR CODE HERE

In [207]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

In [208]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

In [209]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
# YOUR CODE HERE

plot_df.head(10)

NameError: name 'plot_df' is not defined

In [None]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
# YOUR CODE HERE
