# Clustering Crypto

In [363]:
# Initial imports
!pip install plotly
import pandas as pd
import numpy as np
import hvplot.pandas
from path import Path
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans




### Deliverable 1: Preprocessing the Data for PCA

In [15]:
# Load the crypto_data.csv dataset.
# YOUR CODE HERE

In [364]:
# Step 1: load the raw cryptocurrency table from the Resources folder into crypto_df.
crypto_csv_path = "Resources/crypto_data.csv"
crypto_df = pd.read_csv(crypto_csv_path)
# Preview the first ten rows to confirm the file parsed.
crypto_df.head(10)

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0
5,1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
6,2015,2015 coin,X11,True,PoW/PoS,,0
7,BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,True,PoW,107684200.0,0
9,LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [365]:
# Data-quality overview: null count per column, column dtypes, and non-null counts.
null_counts = crypto_df.isnull().sum()
print(null_counts)
print('--- data types ----')
print(crypto_df.dtypes)
print()
print(crypto_df.count())

Unnamed: 0           0
CoinName             0
Algorithm            0
IsTrading            0
ProofType            0
TotalCoinsMined    508
TotalCoinSupply      0
dtype: int64
--- data types ----
Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

Unnamed: 0         1252
CoinName           1252
Algorithm          1252
IsTrading          1252
ProofType          1252
TotalCoinsMined     744
TotalCoinSupply    1252
dtype: int64


In [3]:
# Keep all the cryptocurrencies that are being traded.
# YOUR CODE HERE

(1144, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [366]:
# Step 2: retain only the rows whose IsTrading flag is True.
is_trading_mask = crypto_df['IsTrading'] == True
crypto_df = crypto_df.loc[is_trading_mask]
print(crypto_df.shape)
# Six columns are expected, so move "Unnamed: 0" into the index and
# clear the axis names so the index is displayed without a label.
crypto_df = crypto_df.set_index("Unnamed: 0").rename_axis(index=None, columns=None)
print()
print(crypto_df.shape)
crypto_df.head(10)

(1144, 7)

(1144, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [4]:
# Keep all the cryptocurrencies that have a working algorithm.
# YOUR CODE HERE

(1144, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [367]:
# Keep only the cryptocurrencies that have a working algorithm, i.e. a
# non-null Algorithm value. The null check earlier in the notebook reports no
# missing Algorithm values, so the shape is unchanged for this dataset.
working_crypto_df = crypto_df.loc[crypto_df['Algorithm'].notnull()]
# Apply the filter to crypto_df as well: previously the filtered frame was
# stored in working_crypto_df only and never used, so the step had no effect
# on the data the later cells work with. An independent copy is used so that
# later edits of crypto_df do not touch working_crypto_df.
crypto_df = working_crypto_df.copy()
print(working_crypto_df.shape)
working_crypto_df.head(5)

(1144, 6)


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0


In [5]:
# Remove the "IsTrading" column. 
# YOUR CODE HERE

(1144, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,PoW/PoS,,0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [368]:
# Step 3: IsTrading is no longer needed once the frame holds only traded coins,
# so remove that column and rebind crypto_df to the result.
crypto_df = crypto_df.drop(columns='IsTrading')
print(crypto_df.shape)
crypto_df.head(5)

(1144, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0


In [6]:
# Remove rows that have at least 1 null value.
# YOUR CODE HERE

(685, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000


In [369]:
# Step 4: show how many nulls each column holds, then discard every row
# that contains at least one null value.
print(crypto_df.isnull().sum())
crypto_df = crypto_df.dropna(axis=0, how='any')
print(crypto_df.shape)
crypto_df.head()

CoinName             0
Algorithm            0
ProofType            0
TotalCoinsMined    459
TotalCoinSupply      0
dtype: int64
(685, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [7]:
# Keep the rows where coins are mined.
# YOUR CODE HERE

(532, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [370]:
# Step 5: keep only the rows with a strictly positive TotalCoinsMined value.
mined_mask = crypto_df['TotalCoinsMined'].gt(0)
crypto_df = crypto_df.loc[mined_mask]
print(crypto_df.shape)
crypto_df.head(10)

(532, 5)


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000
DASH,Dash,X11,PoW/PoS,9031294.0,22000000
XMR,Monero,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethereum Classic,Ethash,PoW,113359700.0,210000000
ZEC,ZCash,Equihash,PoW,7383056.0,21000000


In [8]:
# Create a new DataFrame that holds only the cryptocurrencies names.
# YOUR CODE HERE

(532, 1)


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [371]:
# Step 6: build a one-column DataFrame of coin names that shares the
# crypto_df index, so the names can be re-attached after clustering.
crypto_names = crypto_df.loc[:, ['CoinName']]
print(crypto_names.shape)
crypto_names.head(5)

(532, 1)


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [9]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm.
# YOUR CODE HERE

(532, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0
LTC,Scrypt,PoW,63039240.0,84000000
DASH,X11,PoW/PoS,9031294.0,22000000
XMR,CryptoNight-V7,PoW,17201140.0,0
ETC,Ethash,PoW,113359700.0,210000000
ZEC,Equihash,PoW,7383056.0,21000000


In [372]:
# Step 7: CoinName is not used by the clustering algorithm, so remove it
# from crypto_df (the names were saved separately in the previous step).
crypto_df = crypto_df.drop(columns='CoinName')
print(crypto_df.shape)
crypto_df.head(5)

(532, 4)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [10]:
# Use get_dummies() to create variables for text features.
# YOUR CODE HERE

(532, 98)


Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [373]:
# Step 8: one-hot encode the two text features (Algorithm and ProofType)
# and store the encoded table in X.
text_features = ["Algorithm", "ProofType"]
X = pd.get_dummies(crypto_df, columns=text_features)
print(X.dtypes)
# TotalCoinSupply is reported as object dtype above; cast it to float so the
# scaler receives numeric input.
X['TotalCoinSupply'] = X['TotalCoinSupply'].astype(float)
print(X.dtypes)
# Raise the display limit so all encoded columns are shown in the preview.
pd.set_option("display.max_columns", 99)
print(X.shape)
X.head(10)

TotalCoinsMined                     float64
TotalCoinSupply                      object
Algorithm_1GB AES Pattern Search      uint8
Algorithm_536                         uint8
Algorithm_Argon2d                     uint8
                                     ...   
ProofType_Proof of Authority          uint8
ProofType_Proof of Trust              uint8
ProofType_TPoS                        uint8
ProofType_Zero-Knowledge Proof        uint8
ProofType_dPoW/PoW                    uint8
Length: 98, dtype: object
TotalCoinsMined                     float64
TotalCoinSupply                     float64
Algorithm_1GB AES Pattern Search      uint8
Algorithm_536                         uint8
Algorithm_Argon2d                     uint8
                                     ...   
ProofType_Proof of Authority          uint8
ProofType_Proof of Trust              uint8
ProofType_TPoS                        uint8
ProofType_Zero-Knowledge Proof        uint8
ProofType_dPoW/PoW                    uint8
Length

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,Algorithm_Counterparty,Algorithm_CryptoNight,Algorithm_CryptoNight Heavy,Algorithm_CryptoNight-V7,Algorithm_Cryptonight-GPU,Algorithm_DPoS,Algorithm_Dagger,Algorithm_Dagger-Hashimoto,Algorithm_ECC 256K1,Algorithm_Equihash,Algorithm_Equihash+Scrypt,Algorithm_Ethash,Algorithm_Exosis,Algorithm_Green Protocol,Algorithm_Groestl,Algorithm_HMQ1725,Algorithm_HybridScryptHash256,Algorithm_IMesh,Algorithm_Jump Consistent Hash,Algorithm_Keccak,Algorithm_Leased POS,Algorithm_Lyra2RE,Algorithm_Lyra2REv2,Algorithm_Lyra2Z,Algorithm_M7 POW,Algorithm_Multiple,Algorithm_NIST5,Algorithm_NeoScrypt,Algorithm_Ouroboros,Algorithm_PHI1612,Algorithm_POS 2.0,Algorithm_POS 3.0,Algorithm_PoS,Algorithm_Proof-of-Authority,Algorithm_Proof-of-BibleHash,Algorithm_QUAIT,Algorithm_QuBit,Algorithm_Quark,Algorithm_SHA-256,Algorithm_SHA-256 + Hive,Algorithm_SHA-256D,Algorithm_SHA-512,Algorithm_SHA3,Algorithm_Scrypt,Algorithm_Semux BFT consensus,Algorithm_Shabal256,Algorithm_Skein,Algorithm_SkunkHash,Algorithm_SkunkHash v2 Raptor,Algorithm_Stanford Folding,Algorithm_TRC10,Algorithm_Time Travel,Algorithm_Tribus,Algorithm_VBFT,Algorithm_VeChainThor Authority,Algorithm_X11,Algorithm_X11GOST,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,ProofType_DPOS,ProofType_DPoS,ProofType_HPoW,ProofType_LPoS,ProofType_POBh,ProofType_PoA,ProofType_PoC,ProofType_PoS,ProofType_PoS/LPoS,ProofType_PoS/PoW,ProofType_PoS/PoW/PoT,ProofType_PoST,ProofType_PoW,ProofType_PoW + Hive,ProofType_PoW and PoS,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,41.99995,42.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
LTC,63039240.0,84000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
DASH,9031294.0,22000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
XMR,17201140.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
ETC,113359700.0,210000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
ZEC,7383056.0,21000000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Standardize the data with StandardScaler().
# YOUR CODE HERE

array([[-0.11710817, -0.1528703 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.07530656, -0.0433963 , -0.06142951, -0.06142951,
        -0.0433963 , -0.0433963 , -0.19245009, -0.06142951, -0.09740465,
        -0.0433963 , -0.11547005, -0.07530656, -0.0433963 , -0.0433963 ,
        -0.15191091, -0.0433963 , -0.13118084, -0.0433963 , -0.0433963 ,
        -0.08703883, -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.0433963 , -0.08703883, -0.08703883, -0.08703883,
        -0.0433963 , -0.13118084, -0.13840913, -0.13840913, -0.0433963 ,
        -0.06142951, -0.0433963 , -0.07530656, -0.18168574, -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.07530656, -0.15826614, -0.31491833,
        -0.0433963 , -0.08703883, -0.07530656, -0.06142951,  1.38675049,
        -0.0433963 , -0.0433963 , -0.06142951, -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.39879994, -0.0433963 , -0.1

In [374]:
# Step 9: standardize every feature in X with StandardScaler.fit_transform().
data_scaler = StandardScaler()
crypto_data_scaled = data_scaler.fit_transform(X)
# Sanity check on the first feature: its mean should be ~0 and its std 1.
first_feature = crypto_data_scaled[:, 0]
print(np.mean(first_feature))
print(np.std(first_feature))
crypto_data_scaled[:5]

2.0034099692484027e-17
1.0


array([[-0.11710817, -0.1528703 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.07530656, -0.0433963 , -0.06142951, -0.06142951,
        -0.0433963 , -0.0433963 , -0.19245009, -0.06142951, -0.09740465,
        -0.0433963 , -0.11547005, -0.07530656, -0.0433963 , -0.0433963 ,
        -0.15191091, -0.0433963 , -0.13118084, -0.0433963 , -0.0433963 ,
        -0.08703883, -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.0433963 , -0.08703883, -0.08703883, -0.08703883,
        -0.0433963 , -0.13118084, -0.13840913, -0.13840913, -0.0433963 ,
        -0.06142951, -0.0433963 , -0.07530656, -0.18168574, -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.07530656, -0.15826614, -0.31491833,
        -0.0433963 , -0.08703883, -0.07530656, -0.06142951,  1.38675049,
        -0.0433963 , -0.0433963 , -0.06142951, -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.39879994, -0.0433963 , -0.1

### Deliverable 2: Reducing Data Dimensions Using PCA

In [12]:
# Using PCA to reduce dimension to three principal components.
# YOUR CODE HERE

array([[-0.33285464,  1.0383583 , -0.56494435],
       [-0.31620149,  1.03851511, -0.56537078],
       [ 2.30004045,  1.64353216, -0.570651  ],
       ...,
       [ 0.3268763 , -2.33198921,  0.46344651],
       [-0.17901176, -2.02470009,  0.4332558 ],
       [-0.2828024 ,  0.82179669, -0.25676861]])

In [375]:
# Steps 1-2: apply PCA to reduce the standardized features to three
# principal components.
# random_state is fixed because, with the default svd_solver='auto',
# scikit-learn may select the randomized SVD solver for a matrix of this size,
# in which case the components vary from run to run unless a seed is given.
# The seed 0 matches the random_state used by the KMeans cells in this notebook.
pca = PCA(n_components=3, random_state=0)
crypto_pca = pca.fit_transform(crypto_data_scaled)
print(crypto_pca.shape)
crypto_pca

(532, 3)


array([[-0.33465343,  1.10386529, -0.59957652],
       [-0.3179803 ,  1.10392616, -0.59992056],
       [ 2.28861989,  1.64090342, -0.65766122],
       ...,
       [ 0.30186957, -2.25905128,  0.43289774],
       [-0.14223989, -2.15503634,  0.49832155],
       [-0.28844976,  0.90933568, -0.32497794]])

In [13]:
# Create a DataFrame with the three principal components.
# YOUR CODE HERE

(532, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,-0.332855,1.038358,-0.564944
404,-0.316201,1.038515,-0.565371
1337,2.30004,1.643532,-0.570651
BTC,-0.149023,-1.309646,0.18262
ETH,-0.162646,-2.019908,0.380155
LTC,-0.159391,-1.123165,-0.021041
DASH,-0.410793,1.224033,-0.517184
XMR,-0.148242,-2.196597,0.375973
ETC,-0.161087,-2.02001,0.380143
ZEC,-0.179011,-2.0247,0.433256


In [376]:
# Step 3: wrap the three principal components in a DataFrame named pcs_df,
# with columns PC 1 / PC 2 / PC 3 and the same index as crypto_df.
pc_labels = ['PC 1', 'PC 2', 'PC 3']
pcs_df = pd.DataFrame(data=crypto_pca, index=crypto_df.index, columns=pc_labels)
print(pcs_df.shape)
pcs_df.head(10)

(532, 3)


Unnamed: 0,PC 1,PC 2,PC 3
42,-0.334653,1.103865,-0.599577
404,-0.31798,1.103926,-0.599921
1337,2.28862,1.640903,-0.657661
BTC,-0.150216,-1.317557,0.22419
ETH,-0.156872,-1.976461,0.333588
LTC,-0.168049,-1.016785,-0.038784
DASH,-0.391885,1.074263,-0.47722
XMR,-0.142864,-2.304885,0.439133
ETC,-0.155316,-1.976557,0.333572
ZEC,-0.142239,-2.155036,0.498322


### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [14]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE


In [377]:
# Steps 1-2: plot an elbow curve from pcs_df with hvPlot to choose K.
# Candidate cluster counts: 1 through 10.
k = list(range(1, 11))

# Fit one K-means model per candidate K and record its inertia.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=0).fit(pcs_df).inertia_
    for n_clusters in k
]

# Assemble the (k, inertia) pairs and draw the curve.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [15]:
# Initialize the K-Means model.
# YOUR CODE HERE

# Fit the model
# YOUR CODE HERE

# Predict clusters
# YOUR CODE HERE

array([0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,

In [378]:
# Step 3: run K-means on pcs_df with K=4 and predict a cluster for every
# cryptocurrency.
#
# Initialize the model and fit it in one chained call (fit returns the model).
model = KMeans(n_clusters=4, random_state=0).fit(pcs_df)

# Assign each row of pcs_df to its cluster.
predictions = model.predict(pcs_df)
predictions

array([0, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2,
       2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0,
       2, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2,
       2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0,
       0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2,
       0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0,
       2, 0, 2, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0,
       0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0,
       0, 2, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2,
       0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 0,

In [16]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatenate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE

# Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies.
# YOUR CODE HERE

# Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

# Print the shape of the clustered_df
# NOTE(review): in the visible notebook, clustered_df is first assigned in a
# later cell, so on a fresh kernel these two lines raise NameError when the
# cells run top to bottom. This looks like leftover template scaffolding.
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.332855,1.038358,-0.564944,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.316201,1.038515,-0.565371,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.30004,1.643532,-0.570651,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.149023,-1.309646,0.18262,Bitcoin,1
ETH,Ethash,PoW,107684200.0,0,-0.162646,-2.019908,0.380155,Ethereum,1
LTC,Scrypt,PoW,63039240.0,84000000,-0.159391,-1.123165,-0.021041,Litecoin,1
DASH,X11,PoW/PoS,9031294.0,22000000,-0.410793,1.224033,-0.517184,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.148242,-2.196597,0.375973,Monero,1
ETC,Ethash,PoW,113359700.0,210000000,-0.161087,-2.02001,0.380143,Ethereum Classic,1
ZEC,Equihash,PoW,7383056.0,21000000,-0.179011,-2.0247,0.433256,ZCash,1


In [379]:
# Step 4: create clustered_df by placing the crypto_df features and the
# pcs_df principal components side by side; both frames carry the crypto_df
# index.
clustered_df = pd.concat((crypto_df, pcs_df), axis="columns")
clustered_df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3
42,Scrypt,PoW/PoS,41.99995,42,-0.334653,1.103865,-0.599577
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.31798,1.103926,-0.599921
1337,X13,PoW/PoS,29279420000.0,314159265359,2.28862,1.640903,-0.657661
BTC,SHA-256,PoW,17927180.0,21000000,-0.150216,-1.317557,0.22419
ETH,Ethash,PoW,107684200.0,0,-0.156872,-1.976461,0.333588


In [380]:
# Step 5: append the CoinName column (the crypto_names DataFrame created in
# Step 6 of Deliverable 1) to clustered_df.
clustered_df = pd.concat((clustered_df, crypto_names), axis="columns")
clustered_df.head(8)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName
42,Scrypt,PoW/PoS,41.99995,42,-0.334653,1.103865,-0.599577,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.31798,1.103926,-0.599921,404Coin
1337,X13,PoW/PoS,29279420000.0,314159265359,2.28862,1.640903,-0.657661,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000,-0.150216,-1.317557,0.22419,Bitcoin
ETH,Ethash,PoW,107684200.0,0,-0.156872,-1.976461,0.333588,Ethereum
LTC,Scrypt,PoW,63039240.0,84000000,-0.168049,-1.016785,-0.038784,Litecoin
DASH,X11,PoW/PoS,9031294.0,22000000,-0.391885,1.074263,-0.47722,Dash
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.142864,-2.304885,0.439133,Monero


In [381]:
# Step 6: add a Class column to clustered_df holding the cluster predictions
# produced by the K-means model in Step 3.
clustered_df = clustered_df.assign(Class=predictions)
print(clustered_df.shape)
clustered_df.head(8)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,41.99995,42,-0.334653,1.103865,-0.599577,42 Coin,0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.31798,1.103926,-0.599921,404Coin,0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.28862,1.640903,-0.657661,EliteCoin,0
BTC,SHA-256,PoW,17927180.0,21000000,-0.150216,-1.317557,0.22419,Bitcoin,2
ETH,Ethash,PoW,107684200.0,0,-0.156872,-1.976461,0.333588,Ethereum,2
LTC,Scrypt,PoW,63039240.0,84000000,-0.168049,-1.016785,-0.038784,Litecoin,2
DASH,X11,PoW/PoS,9031294.0,22000000,-0.391885,1.074263,-0.47722,Dash,0
XMR,CryptoNight-V7,PoW,17201140.0,0,-0.142864,-2.304885,0.439133,Monero,2


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [389]:
# Steps 1-3: 3D scatter of the three principal components, colored by cluster.
# CoinName is the hover title and Algorithm is shown as extra hover data, so
# every point displays both when hovered.
scatter_kwargs = dict(
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    hover_name="CoinName",
    hover_data=["Algorithm"],
    width=800,
)
fig = px.scatter_3d(clustered_df, **scatter_kwargs)
# Position the legend at the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [383]:
# Step 4: interactive hvplot table of the tradable cryptocurrencies, limited
# to the descriptive columns and the assigned cluster.
table_columns = ['CoinName', 'Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'Class']
clustered_df.hvplot.table(columns=table_columns, sortable=True, selectable=True)

In [19]:
# Print the total number of tradable cryptocurrencies.
# YOUR CODE HERE

There are 532 tradable cryptocurrencies.


In [384]:
# Step 5: report how many tradable cryptocurrencies clustered_df contains
# (one row per cryptocurrency).
tradable_count = clustered_df.shape[0]
print(f'There are {tradable_count} tradable cryptocurrencies.')

There are 532 tradable cryptocurrencies.


In [20]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
# YOUR CODE HERE

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [385]:
# Step 6: rescale TotalCoinSupply and TotalCoinsMined with
# MinMaxScaler().fit_transform (default output range: zero to one).
# Note: despite its name, scaled_df is the NumPy array returned by the scaler.
scaler = MinMaxScaler()
supply_and_mined = clustered_df[['TotalCoinSupply', 'TotalCoinsMined']].to_numpy()
scaled_df = scaler.fit_transform(supply_and_mined)
scaled_df

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [21]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
# YOUR CODE HERE

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame.
# YOUR CODE HERE

# NOTE(review): in the visible notebook, plot_df is first assigned in a later
# cell, so on a fresh kernel this line raises NameError when the cells run top
# to bottom. This looks like leftover template scaffolding.
plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,1
ETH,0.0,0.000109,Ethereum,1
LTC,8.4e-05,6.4e-05,Litecoin,1
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,1
ETC,0.00021,0.000115,Ethereum Classic,1
ZEC,2.1e-05,7e-06,ZCash,1


In [386]:
# 7. Create plot_df from the MinMax-scaled array produced in Step 6, reusing
# the clustered_df index. The column labels follow the order in which the two
# columns were passed to the scaler.
plot_df = pd.DataFrame(
    scaled_df,
    columns=['TotalCoinSupply', 'TotalCoinsMined'],
    index=clustered_df.index,
)
# 8. Add the CoinName column from clustered_df. A Series (single brackets) is
# assigned rather than a one-column DataFrame, which is the idiomatic way to
# set a single column.
plot_df['CoinName'] = clustered_df['CoinName']
# 9. Add the Class column (the predicted cluster) from clustered_df.
plot_df['Class'] = clustered_df['Class']
plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
42,4.2e-11,0.0,42 Coin,0
404,0.000532,0.001066,404Coin,0
1337,0.3141593,0.029576,EliteCoin,0
BTC,2.1e-05,1.8e-05,Bitcoin,2
ETH,0.0,0.000109,Ethereum,2
LTC,8.4e-05,6.4e-05,Litecoin,2
DASH,2.2e-05,9e-06,Dash,0
XMR,0.0,1.7e-05,Monero,2
ETC,0.00021,0.000115,Ethereum Classic,2
ZEC,2.1e-05,7e-06,ZCash,2


In [387]:
# Step 10: hvplot scatter of the scaled TotalCoinsMined (x) against the scaled
# TotalCoinSupply (y), grouped by Class; CoinName is shown when hovering
# over a data point.
scatter_options = dict(
    x="TotalCoinsMined",
    y="TotalCoinSupply",
    by="Class",
    hover_cols=["CoinName"],
)
plot_df.hvplot.scatter(**scatter_options)