# Clustering Crypto

In [131]:
# Initial imports
import pandas as pd
import hvplot.pandas
from path import Path
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Deliverable 1: Preprocessing the Data for PCA

In [63]:
# Load the crypto_data.csv dataset with pandas' read_csv and preview the first rows.
file_path = "Data/crypto_data.csv"
crypto_df = pd.read_csv(file_path)
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [64]:
# Use the "Unnamed: 0" column as the row index of crypto_df.
crypto_df = crypto_df.set_index('Unnamed: 0')
crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [65]:
# Show the (rows, columns) shape of crypto_df.
crypto_df.shape

(1252, 6)

In [66]:
# Summarize the column dtypes and non-null counts of crypto_df.
crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1252 entries, 42 to PUNK
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1252 non-null   object 
 1   Algorithm        1252 non-null   object 
 2   IsTrading        1252 non-null   bool   
 3   ProofType        1252 non-null   object 
 4   TotalCoinsMined  744 non-null    float64
 5   TotalCoinSupply  1252 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 59.9+ KB


In [67]:
# List the column labels of crypto_df.
crypto_df.columns

Index(['CoinName', 'Algorithm', 'IsTrading', 'ProofType', 'TotalCoinsMined',
       'TotalCoinSupply'],
      dtype='object')

### Keep all the cryptocurrencies that are being traded.

In [68]:
# Count how many rows have each "IsTrading" value to see how many coins are not being traded.
crypto_df["IsTrading"].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [69]:
# Preview the boolean mask used with .loc: True where "IsTrading" equals True.
crypto_df["IsTrading"].eq(True)

Unnamed: 0
42       True
365      True
404      True
611      True
808      True
        ...  
XBC      True
DVTC    False
GIOT    False
OPSC    False
PUNK    False
Name: IsTrading, Length: 1252, dtype: bool

In [70]:
# Keep only the rows whose "IsTrading" value is True.
is_trading_mask = crypto_df["IsTrading"].eq(True)
crypto_trading_df = crypto_df.loc[is_trading_mask]
crypto_trading_df.shape

(1144, 6)

In [71]:
# Summarize the column dtypes and non-null counts of the traded-only frame.
crypto_trading_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   IsTrading        1144 non-null   bool   
 3   ProofType        1144 non-null   object 
 4   TotalCoinsMined  685 non-null    float64
 5   TotalCoinSupply  1144 non-null   object 
dtypes: bool(1), float64(1), object(4)
memory usage: 54.7+ KB


In [72]:
# Count the "IsTrading" values that remain in crypto_trading_df.
crypto_trading_df["IsTrading"].value_counts()

True    1144
Name: IsTrading, dtype: int64

In [73]:
# Preview the first ten rows of crypto_trading_df.
crypto_trading_df.head(10)

Unnamed: 0_level_0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
365,365Coin,X11,True,PoW/PoS,,2300000000
404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,True,PoW,,611000
808,808,SHA-256,True,PoW/PoS,0.0,0
1337,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,True,PoW/PoS,,0
BTC,Bitcoin,SHA-256,True,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,True,PoW,107684200.0,0
LTC,Litecoin,Scrypt,True,PoW,63039240.0,84000000


In [74]:
# Drop the "IsTrading" column from crypto_trading_df.
crypto_trading_df = crypto_trading_df.drop('IsTrading', axis='columns')

In [75]:
# Preview the first ten rows of crypto_trading_df.
crypto_trading_df.head(10)

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,41.99995,42
365,365Coin,X11,PoW/PoS,,2300000000
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
611,SixEleven,SHA-256,PoW,,611000
808,808,SHA-256,PoW/PoS,0.0,0
1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
2015,2015 coin,X11,PoW/PoS,,0
BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
ETH,Ethereum,Ethash,PoW,107684200.0,0
LTC,Litecoin,Scrypt,PoW,63039240.0,84000000


In [76]:
# Inspect how often each "Algorithm" value occurs.
# This cell only counts values; it does not filter any rows.
crypto_trading_df["Algorithm"].value_counts()

Scrypt                   394
X11                      182
SHA-256                  121
X13                       54
PoS                       42
                        ... 
VeChainThor Authority      1
Ouroboros                  1
POS 2.0                    1
Proof-of-BibleHash         1
TRC10                      1
Name: Algorithm, Length: 89, dtype: int64

In [77]:
# Summarize the column dtypes and non-null counts of crypto_trading_df.
crypto_trading_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1144 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         1144 non-null   object 
 1   Algorithm        1144 non-null   object 
 2   ProofType        1144 non-null   object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  1144 non-null   object 
dtypes: float64(1), object(4)
memory usage: 53.6+ KB


In [78]:
# Count how many "TotalCoinsMined" entries are null (True) vs. present (False).
# This cell only counts; no rows are removed here.
crypto_trading_df["TotalCoinsMined"].isnull().value_counts()

False    685
True     459
Name: TotalCoinsMined, dtype: int64

In [79]:
# Drop every row that has a null in any column, then summarize what is left.
# dropna's defaults (row-wise, any null) are the behavior wanted here.
clean_crypto_trading_df = crypto_trading_df.dropna()
clean_crypto_trading_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 685 entries, 42 to XBC
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CoinName         685 non-null    object 
 1   Algorithm        685 non-null    object 
 2   ProofType        685 non-null    object 
 3   TotalCoinsMined  685 non-null    float64
 4   TotalCoinSupply  685 non-null    object 
dtypes: float64(1), object(4)
memory usage: 32.1+ KB


In [80]:
# Show the (rows, columns) shape of clean_crypto_trading_df.
clean_crypto_trading_df.shape

(685, 5)

In [81]:
# Preview the boolean mask that is True where "TotalCoinsMined" is greater than zero.
clean_crypto_trading_df["TotalCoinsMined"].gt(0)

Unnamed: 0
42       True
404      True
808     False
1337     True
BTC      True
        ...  
ZEPH     True
GAP      True
BDX      True
ZEN      True
XBC      True
Name: TotalCoinsMined, Length: 685, dtype: bool

In [82]:
# Keep only the rows whose "TotalCoinsMined" is greater than zero.
has_mined_coins = clean_crypto_trading_df["TotalCoinsMined"].gt(0)
clean_crypto_df = clean_crypto_trading_df.loc[has_mined_coins]
clean_crypto_df

Unnamed: 0_level_0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
42,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
404,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
1337,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
BTC,Bitcoin,SHA-256,PoW,1.792718e+07,21000000
ETH,Ethereum,Ethash,PoW,1.076842e+08,0
...,...,...,...,...,...
ZEPH,ZEPHYR,SHA-256,DPoS,2.000000e+09,2000000000
GAP,Gapcoin,Scrypt,PoW/PoS,1.493105e+07,250000000
BDX,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
ZEN,Horizen,Equihash,PoW,7.296538e+06,21000000


In [83]:
# Show the (rows, columns) shape of clean_crypto_df.
clean_crypto_df.shape

(532, 5)

In [84]:
# Build a one-column DataFrame with the coin names, indexed like clean_crypto_df.
# Double brackets select a DataFrame (not a Series); .copy() detaches it from the source.
crypto_names_df = clean_crypto_df[['CoinName']].copy()
crypto_names_df.shape

(532, 1)

In [85]:
# Preview the first rows of crypto_names_df.
crypto_names_df.head()

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum


In [86]:
# Write crypto_names_df to disk for reuse; the index is written by default.
output_file_path = "Data/crypto_names.csv"
crypto_names_df.to_csv(output_file_path)

In [87]:
# 'CoinName' is not used by the clustering algorithm, so remove that column.
clean_crypto_df = clean_crypto_df.drop('CoinName', axis='columns')
clean_crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [88]:
# Check column dtypes and non-null counts before applying get_dummies.
clean_crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    object 
dtypes: float64(1), object(3)
memory usage: 20.8+ KB


In [89]:
# Recast TotalCoinSupply from text to a numeric column with pd.to_numeric.
# errors='coerce' turns any unparsable entry into NaN instead of raising, so
# check right away that no value was silently lost; otherwise the NaN would
# only surface later, when the features are scaled and passed to PCA.
supply_numeric = pd.to_numeric(clean_crypto_df["TotalCoinSupply"], errors='coerce')
unparsable_supply = supply_numeric.isna() & clean_crypto_df["TotalCoinSupply"].notna()
if unparsable_supply.any():
    raise ValueError(
        "TotalCoinSupply contains non-numeric values for: "
        f"{list(clean_crypto_df.index[unparsable_supply])}"
    )

# .assign builds a new frame instead of writing a column into the existing one.
clean_crypto_df = clean_crypto_df.assign(TotalCoinSupply=supply_numeric)
clean_crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [90]:
# Check the column dtypes and non-null counts of clean_crypto_df.
clean_crypto_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Algorithm        532 non-null    object 
 1   ProofType        532 non-null    object 
 2   TotalCoinsMined  532 non-null    float64
 3   TotalCoinSupply  532 non-null    float64
dtypes: float64(2), object(2)
memory usage: 20.8+ KB


In [91]:
# Persist clean_crypto_df as clean_crypto.csv; the index is written by default.
output_file_path = "Data/clean_crypto.csv"
clean_crypto_df.to_csv(output_file_path)

In [92]:
# Show the (rows, columns) shape of clean_crypto_df.
clean_crypto_df.shape

(532, 4)

In [93]:
# Count how often each 'Algorithm' value occurs in clean_crypto_df.
clean_crypto_df['Algorithm'].value_counts()

Scrypt                   182
X11                       73
SHA-256                   48
CryptoNight               19
X13                       17
                        ... 
Time Travel                1
SkunkHash v2 Raptor        1
VeChainThor Authority      1
Ouroboros                  1
TRC10                      1
Name: Algorithm, Length: 71, dtype: int64

In [94]:
# Number of distinct 'ProofType' values (nulls are not counted).
clean_crypto_df['ProofType'].nunique()

25

In [95]:
# Use get_dummies() to one-hot encode the text features 'Algorithm' and 'ProofType'.
# NOTE(review): the rendered output further down shows two encoded columns that both
# display as "ProofType_PoW/PoS"; possibly a whitespace variant in the raw data - TODO confirm.
X_encoded = pd.get_dummies(clean_crypto_df, columns=['Algorithm', 'ProofType'])
X_encoded.shape

(532, 98)

In [96]:
# Check the dtypes and non-null counts of the encoded columns.
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 532 entries, 42 to XBC
Data columns (total 98 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   TotalCoinsMined                   532 non-null    float64
 1   TotalCoinSupply                   532 non-null    float64
 2   Algorithm_1GB AES Pattern Search  532 non-null    uint8  
 3   Algorithm_536                     532 non-null    uint8  
 4   Algorithm_Argon2d                 532 non-null    uint8  
 5   Algorithm_BLAKE256                532 non-null    uint8  
 6   Algorithm_Blake                   532 non-null    uint8  
 7   Algorithm_Blake2S                 532 non-null    uint8  
 8   Algorithm_Blake2b                 532 non-null    uint8  
 9   Algorithm_C11                     532 non-null    uint8  
 10  Algorithm_Cloverhash              532 non-null    uint8  
 11  Algorithm_Counterparty            532 non-null    uint8  
 12  Algorithm_Cr

In [97]:
# Why standardize? Show summary statistics of every column to compare their value ranges.
X_encoded.describe()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
count,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,...,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0,532.0
mean,5340456000.0,10345230000.0,0.00188,0.00188,0.00188,0.003759,0.005639,0.00188,0.003759,0.003759,...,0.330827,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188,0.00188
std,45645680000.0,67736970000.0,0.043355,0.043355,0.043355,0.061256,0.074952,0.043355,0.061256,0.061256,...,0.470954,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355,0.043355
min,41.99995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8359849.0,21000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,24743970.0,78417600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,186725000.0,500000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,989988700000.0,1000000000000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [98]:
# Standardize every column of X_encoded with StandardScaler.
scaler = StandardScaler()

# Learn the scaling parameters from X_encoded first, then apply them to the same data.
scaler.fit(X_encoded)
X_scaled = scaler.transform(X_encoded)

# Preview the first scaled row.
X_scaled[:1]

array([[-0.11710817, -0.1528703 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.07530656, -0.0433963 , -0.06142951, -0.06142951,
        -0.0433963 , -0.0433963 , -0.19245009, -0.06142951, -0.09740465,
        -0.0433963 , -0.11547005, -0.07530656, -0.0433963 , -0.0433963 ,
        -0.15191091, -0.0433963 , -0.13118084, -0.0433963 , -0.0433963 ,
        -0.08703883, -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.06142951, -0.0433963 , -0.08703883, -0.08703883, -0.08703883,
        -0.0433963 , -0.13118084, -0.13840913, -0.13840913, -0.0433963 ,
        -0.06142951, -0.0433963 , -0.07530656, -0.18168574, -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.07530656, -0.15826614, -0.31491833,
        -0.0433963 , -0.08703883, -0.07530656, -0.06142951,  1.38675049,
        -0.0433963 , -0.0433963 , -0.06142951, -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 , -0.0433963 ,
        -0.0433963 , -0.39879994, -0.0433963 , -0.1

In [99]:
# Preview the first rows of X_encoded (nothing is written to disk in this cell).
X_encoded.head()

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,41.99995,42.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1055185000.0,532000000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,29279420000.0,314159300000.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,17927180.0,21000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,107684200.0,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Wrap X_encoded in a DataFrame under the name X_encoded_df and display it.
# X_encoded is already the DataFrame returned by pd.get_dummies, so this adds a second name for the same data.
X_encoded_df = pd.DataFrame(X_encoded)
X_encoded_df

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
# Write X_encoded_df to disk for reuse; the index is written by default.
output_file_path = "Data/X_encoded.csv"
X_encoded_df.to_csv(output_file_path)

### Deliverable 2: Reducing Data Dimensions Using PCA

In [102]:
# Use PCA to reduce the scaled feature matrix to three principal components.

# Initialize the PCA model. A fixed random_state is passed because scikit-learn
# may choose the randomized SVD solver for an input of this size, which would
# otherwise let the components vary slightly between runs; 0 matches the seed
# this notebook uses for KMeans.
pca = PCA(n_components=3, random_state=0)

# Fit on X_scaled and project each row onto the three components.
X_pca = pca.fit_transform(X_scaled)

X_pca

array([[-0.31595235,  0.9519562 , -0.53727473],
       [-0.29929407,  0.95190941, -0.53748242],
       [ 2.29690032,  1.62084517, -0.52264384],
       ...,
       [ 0.32201418, -2.27632182,  0.35578336],
       [-0.12935829, -2.1260035 ,  0.2665415 ],
       [-0.27014944,  0.71581468, -0.22214806]])

In [103]:
# Put the three principal components into a DataFrame that shares X_encoded's index.
pc_column_names = [f"PC {component}" for component in range(1, 4)]
X_pca_df = pd.DataFrame(X_pca, index=X_encoded.index, columns=pc_column_names)

In [104]:
# Display the principal-component DataFrame.
X_pca_df

Unnamed: 0_level_0,PC 1,PC 2,PC 3
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42,-0.315952,0.951956,-0.537275
404,-0.299294,0.951909,-0.537482
1337,2.296900,1.620845,-0.522644
BTC,-0.152905,-1.242052,0.179431
ETH,-0.194476,-1.986549,0.367089
...,...,...,...
ZEPH,2.468787,1.012638,-0.279331
GAP,-0.313997,0.951813,-0.537274
BDX,0.322014,-2.276322,0.355783
ZEN,-0.129358,-2.126004,0.266542


In [105]:
# Examine the fraction of total variance explained by each of the three components.
pca.explained_variance_ratio_

array([0.02792952, 0.02138175, 0.02047171])

### Deliverable 3: Clustering Cryptocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [106]:
# Create an elbow curve to find the best value for K.

# Cluster on the principal-component columns only. Another cell of this notebook
# adds a "class" column to X_pca_df, so selecting the features explicitly keeps
# this cell correct when it is re-run after that column exists.
pc_features = X_pca_df[["PC 1", "PC 2", "PC 3"]]

inertia = []
k = list(range(1, 11))

# Fit one K-Means model per candidate k and record its inertia.
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pc_features)
    inertia.append(km.inertia_)

# Plot inertia against k.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



Running K-Means with `k=4`

In [107]:
# Cluster on the three principal components only, so that re-running this cell
# after the "class" column has been added does not feed the previous labels
# back in as a fourth feature.
pc_features = X_pca_df[["PC 1", "PC 2", "PC 3"]]

# Initialize the K-Means model.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model and get the cluster label of every row in one step.
predictions = model.fit_predict(pc_features)

# Add the predicted class column.
X_pca_df["class"] = predictions
X_pca_df.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,-0.315952,0.951956,-0.537275,1
404,-0.299294,0.951909,-0.537482,1
1337,2.2969,1.620845,-0.522644,1
BTC,-0.152905,-1.242052,0.179431,0
ETH,-0.194476,-1.986549,0.367089,0


In [108]:
# Count how many rows were assigned to each cluster label.
X_pca_df['class'].value_counts()

1    289
0    239
3      3
2      1
Name: class, dtype: int64

In [109]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatenate the crypto_df and pcs_df DataFrames on the same columns.
# YOUR CODE HERE

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
# YOUR CODE HERE

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
# YOUR CODE HERE

# Print the shape of the clustered_df
# print(clustered_df.shape)
# clustered_df.head(10)

### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [110]:
# Read the saved clean_crypto.csv back in to build the table of tradable cryptocurrencies.
# Note: this rebinds the name crypto_df, which until now held the raw dataset.
crypto_df = pd.read_csv("Data/clean_crypto.csv")
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,Scrypt,PoW/PoS,41.99995,42.0
1,404,Scrypt,PoW/PoS,1055185000.0,532000000.0
2,1337,X13,PoW/PoS,29279420000.0,314159300000.0
3,BTC,SHA-256,PoW,17927180.0,21000000.0
4,ETH,Ethash,PoW,107684200.0,0.0


In [111]:
# Use the "Unnamed: 0" column as the row index of the reloaded frame.
crypto_df = crypto_df.set_index("Unnamed: 0")
crypto_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [112]:
# Concatenate crypto_df and X_pca_df side by side; rows are aligned on the index.
frames_to_join = [crypto_df, X_pca_df]
clustered_df = pd.concat(frames_to_join, axis="columns")
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-0.315952,0.951956,-0.537275,1
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.299294,0.951909,-0.537482,1
1337,X13,PoW/PoS,29279420000.0,314159300000.0,2.2969,1.620845,-0.522644,1
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.152905,-1.242052,0.179431,0
ETH,Ethash,PoW,107684200.0,0.0,-0.194476,-1.986549,0.367089,0


In [113]:
# Read the saved coin names back in from crypto_names.csv and preview them.
crypto_names = pd.read_csv("Data/crypto_names.csv")
crypto_names.head()

Unnamed: 0.1,Unnamed: 0,CoinName
0,42,42 Coin
1,404,404Coin
2,1337,EliteCoin
3,BTC,Bitcoin
4,ETH,Ethereum


In [114]:
# Use the "Unnamed: 0" column as the row index of crypto_names.
crypto_names = crypto_names.set_index("Unnamed: 0")
crypto_names

Unnamed: 0_level_0,CoinName
Unnamed: 0,Unnamed: 1_level_1
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [115]:
# Attach the coin names to clustered_df; values are matched on the index.
clustered_df = clustered_df.assign(CoinName=crypto_names["CoinName"])
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-0.315952,0.951956,-0.537275,1,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.299294,0.951909,-0.537482,1,404Coin
1337,X13,PoW/PoS,29279420000.0,314159300000.0,2.2969,1.620845,-0.522644,1,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.152905,-1.242052,0.179431,0,Bitcoin
ETH,Ethash,PoW,107684200.0,0.0,-0.194476,-1.986549,0.367089,0,Ethereum


In [116]:
# Report how many rows (cryptocurrencies) clustered_df holds.
print(f"The total number of tradeable cryptocurrenices: {len(clustered_df)}")

The total number of tradeable cryptocurrenices: 532


In [117]:
# 3D scatter of the three principal components, colored and marked by cluster.
fig = px.scatter_3d(
    data_frame=clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="class",
    symbol="class",
    width=800,
)
# Place the legend at the top-left corner of the figure.
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [61]:
# Save clustered_df as a CSV file (no image is written by this cell).
clustered_df.to_csv("Data/crypto_clustered_.csv")

In [118]:
# Preview the first rows of clustered_df.
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,class,CoinName
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42.0,-0.315952,0.951956,-0.537275,1,42 Coin
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.299294,0.951909,-0.537482,1,404Coin
1337,X13,PoW/PoS,29279420000.0,314159300000.0,2.2969,1.620845,-0.522644,1,EliteCoin
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.152905,-1.242052,0.179431,0,Bitcoin
ETH,Ethash,PoW,107684200.0,0.0,-0.194476,-1.986549,0.367089,0,Ethereum


In [119]:
# Rescale the two supply columns to the [0, 1] range with MinMaxScaler so they
# can be shown together in a scatter plot of tradeable cryptocurrencies.
supply_columns = ["TotalCoinsMined", "TotalCoinSupply"]
mm_scaler = MinMaxScaler()
plot_data = mm_scaler.fit_transform(clustered_df[supply_columns])

# Preview the first five scaled rows.
plot_data[:5]

array([[0.00000000e+00, 4.20000000e-11],
       [1.06585544e-03, 5.32000000e-04],
       [2.95755135e-02, 3.14159265e-01],
       [1.81084216e-05, 2.10000000e-05],
       [1.08773140e-04, 0.00000000e+00]])

In [120]:
# Wrap the scaled values in a DataFrame that shares clustered_df's index.
plot_df = pd.DataFrame(
    data=plot_data,
    index=clustered_df.index,
    columns=["TotalCoinsMined", "TotalCoinSupply"],
)

plot_df.head(10)

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
42,0.0,4.2e-11
404,0.001066,0.000532
1337,0.029576,0.3141593
BTC,1.8e-05,2.1e-05
ETH,0.000109,0.0
LTC,6.4e-05,8.4e-05
DASH,9e-06,2.2e-05
XMR,1.7e-05,0.0
ETC,0.000115,0.00021
ZEC,7e-06,2.1e-05


In [123]:
# Add the "CoinName" column from clustered_df to plot_df (values are matched on the index).
plot_df["CoinName"] = clustered_df["CoinName"]

In [125]:
# Copy clustered_df's "class" column into plot_df under the name "Class", then display the frame.
plot_df["Class"] = clustered_df["class"]
plot_df

Unnamed: 0_level_0,TotalCoinsMined,TotalCoinSupply,CoinName,Class
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,0.000000e+00,4.200000e-11,42 Coin,1
404,1.065855e-03,5.320000e-04,404Coin,1
1337,2.957551e-02,3.141593e-01,EliteCoin,1
BTC,1.810842e-05,2.100000e-05,Bitcoin,0
ETH,1.087731e-04,0.000000e+00,Ethereum,0
...,...,...,...,...
ZEPH,2.020225e-03,2.000000e-03,ZEPHYR,1
GAP,1.508199e-05,2.500000e-04,Gapcoin,1
BDX,9.901351e-04,1.400223e-03,Beldex,0
ZEN,7.370282e-06,2.100000e-05,Horizen,0


In [143]:
# Scatter plot of scaled mined coins vs. scaled total supply, one series per
# cluster, with the coin name shown on hover.
scatter_options = {
    "x": "TotalCoinsMined",
    "y": "TotalCoinSupply",
    "hover_cols": ["CoinName"],
    "by": "Class",
}
plot = plot_df.hvplot.scatter(**scatter_options)
plot

In [151]:
# Render the selected clustered_df columns as an hvplot table.
table_columns = [
    "CoinName",
    "Algorithm",
    "ProofType",
    "TotalCoinSupply",
    "TotalCoinsMined",
    "class",
]
table = clustered_df[table_columns].hvplot.table()

table