In [276]:
# setting up dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [277]:
# loading crypto csv
file_path = "./Resources/crypto_data.csv"
crypto_df = pd.read_csv(file_path, index_col=0, encoding = "ISO-8859-1")
crypto_df.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
XBC,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000
DVTC,DivotyCoin,Scrypt,False,PoW/PoS,21491210.0,100000000
GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000
PUNK,SteamPunk,PoS,False,PoS,,40000000


In [278]:
# checking data types of dataframe
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [279]:
# Strip all the space and . 
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace(" ","")
crypto_df['TotalCoinSupply'] = crypto_df['TotalCoinSupply'].str.replace(".","")

In [280]:
# converting totalcoinsupply to float 
crypto_df["TotalCoinSupply"] = crypto_df.TotalCoinSupply.astype(float)

In [281]:
# checking data types of dataframe
crypto_df.dtypes

CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply    float64
dtype: object

In [282]:
# Remove all cryptocurrencies that aren’t trading.
drop1 = crypto_df[crypto_df.IsTrading == True]
print(len(crypto_df))
print(len(drop1))

1252
1144


In [283]:
# Remove all cryptocurrencies that don’t have an algorithm defined.
drop2 = drop1.dropna(subset=['Algorithm'])
len(drop2)

1144

In [284]:
# Remove the IsTrading column
drop3 = drop2.drop(columns=['IsTrading'])
drop3.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,42 Coin,Scrypt,PoW/PoS,41.99995,42.0
365,365Coin,X11,PoW/PoS,,2300000000.0
404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000.0
611,SixEleven,SHA-256,PoW,,611000.0
808,808,SHA-256,PoW/PoS,0.0,0.0


In [285]:
# Remove all cryptocurrencies with at least one null value.
drop4 = drop3.dropna()
len(drop4)

685

In [286]:
# Remove all cryptocurrencies without coins mined.
drop5 = drop4[drop4.TotalCoinsMined > 0] 
len(drop5)


532

In [287]:
# Store the names of all cryptocurrencies on a DataFramed named coins_name, and use the crypto_df.index as the index for this new DataFrame.
coins_name = drop5[['CoinName']]
coins_name 


Unnamed: 0,CoinName
42,42 Coin
404,404Coin
1337,EliteCoin
BTC,Bitcoin
ETH,Ethereum
...,...
ZEPH,ZEPHYR
GAP,Gapcoin
BDX,Beldex
ZEN,Horizen


In [288]:
# Remove the CoinName column.
drop6 = drop5.drop(columns='CoinName')
drop6.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
42,Scrypt,PoW/PoS,41.99995,42.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0
1337,X13,PoW/PoS,29279420000.0,314159300000.0
BTC,SHA-256,PoW,17927180.0,21000000.0
ETH,Ethash,PoW,107684200.0,0.0


In [289]:
# Create dummies variables for all of the text features, and store the resulting data on a DataFrame named X.
# Creating dummy variables for categorical datatypes
X = pd.get_dummies(drop6, columns=['Algorithm', 'ProofType'],drop_first = True)
X

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,Algorithm_Cloverhash,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
42,4.199995e+01,4.200000e+01,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
404,1.055185e+09,5.320000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1337,2.927942e+10,3.141593e+11,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BTC,1.792718e+07,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETH,1.076842e+08,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZEPH,2.000000e+09,2.000000e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GAP,1.493105e+07,2.500000e+08,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
BDX,9.802226e+08,1.400223e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZEN,7.296538e+06,2.100000e+07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [290]:
# standardize all of the data from the X DataFrame.
X_scaled = StandardScaler().fit_transform(X)
X_scaled


array([[-0.11710817, -0.05621025, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.05620339, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561, -0.0521619 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.05619221, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.05620998, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.05621024, -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

In [291]:
# Use PCA to reduce the dimensions of the X DataFrame down to three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# pca dataframe
pcs_df = pd.DataFrame(
    data = X_pca,
    columns = ["PC 1", "PC 2", "PC 3"]).set_index(X.index , drop=False)

pcs_df

Unnamed: 0,PC 1,PC 2,PC 3
42,-0.263302,1.127869,-0.448401
404,-0.250825,1.126523,-0.449045
1337,0.187072,1.847430,-0.565503
BTC,-0.196846,-1.306302,0.114742
ETH,-0.230504,-2.090144,0.344843
...,...,...,...
ZEPH,3.933674,0.099343,-0.152449
GAP,-0.263126,1.127850,-0.448411
BDX,-0.122556,-2.337314,0.281659
ZEN,-0.226583,-2.063945,0.365965


In [292]:
# Create an elbow curve to find the best value for K, and use the pcs_df DataFrame.
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pcs_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")


In [301]:
# run the K-means algorithm to predict the K clusters for the cryptocurrencies’ data. Use the pcs_df to run the K-means algorithm.
# Initialize the K-means model
model = KMeans(n_clusters=5, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(pcs_df)

# Add the predicted class columns
pcs_df["Class"] = model.labels_
pcs_df

Unnamed: 0,PC 1,PC 2,PC 3,Class
42,-0.263302,1.127869,-0.448401,0
404,-0.250825,1.126523,-0.449045,0
1337,0.187072,1.847430,-0.565503,0
BTC,-0.196846,-1.306302,0.114742,1
ETH,-0.230504,-2.090144,0.344843,1
...,...,...,...,...
ZEPH,3.933674,0.099343,-0.152449,3
GAP,-0.263126,1.127850,-0.448411,0
BDX,-0.122556,-2.337314,0.281659,1
ZEN,-0.226583,-2.063945,0.365965,1


In [302]:
# Create a new DataFrame named “clustered_df,” that includes the following columns: Algorithm, ProofType, TotalCoinsMined, TotalCoinSupply, PC 1, PC 2, PC 3, CoinName, and Class. 
# merging 2 at a time
clustered_df = drop6.merge(pcs_df, left_index=True, right_index=True)  

# adding coins_name column
clustered_df['CoinName'] = coins_name

# rearrange columns 
clustered_df = clustered_df[['Algorithm', 'ProofType', 'TotalCoinsMined', 'TotalCoinSupply', 'PC 1',
       'PC 2', 'PC 3', 'CoinName', 'Class' ]]

clustered_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
42,Scrypt,PoW/PoS,4.199995e+01,4.200000e+01,-0.263302,1.127869,-0.448401,42 Coin,0
404,Scrypt,PoW/PoS,1.055185e+09,5.320000e+08,-0.250825,1.126523,-0.449045,404Coin,0
1337,X13,PoW/PoS,2.927942e+10,3.141593e+11,0.187072,1.847430,-0.565503,EliteCoin,0
BTC,SHA-256,PoW,1.792718e+07,2.100000e+07,-0.196846,-1.306302,0.114742,Bitcoin,1
ETH,Ethash,PoW,1.076842e+08,0.000000e+00,-0.230504,-2.090144,0.344843,Ethereum,1
...,...,...,...,...,...,...,...,...,...
ZEPH,SHA-256,DPoS,2.000000e+09,2.000000e+09,3.933674,0.099343,-0.152449,ZEPHYR,3
GAP,Scrypt,PoW/PoS,1.493105e+07,2.500000e+08,-0.263126,1.127850,-0.448411,Gapcoin,0
BDX,CryptoNight,PoW,9.802226e+08,1.400223e+09,-0.122556,-2.337314,0.281659,Beldex,1
ZEN,Equihash,PoW,7.296538e+06,2.100000e+07,-0.226583,-2.063945,0.365965,Horizen,1


In [303]:
# Create a 3D scatter plot 

fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="CoinName",
    hover_data=["Algorithm"]
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [304]:
# Use hvplot.table to create a data table with all the current tradable cryptocurrencies. 
# The table should have the following columns: CoinName, Algorithm, ProofType, TotalCoinSupply, TotalCoinsMined, and Class.
clustered_df.hvplot.table(
    columns=["CoinName", "Algorithm", "ProofType", "TotalCoinSupply", "TotalCoinsMined", "Class"],
                             width=1000, height=600,)

In [305]:
clustered_df.sort_values(by=['TotalCoinsMined'],ascending=False).head(15)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
BTT,TRC10,DPoS,989988700000.0,990000000000.0,28.110462,-1.991982,-1.520695,BitTorrent,4
BCN,CryptoNight,PoW,184066800000.0,184467400000.0,2.042428,-2.570693,0.170009,ByteCoin,1
GCN,Scrypt,PoW,163055100000.0,200000000000.0,1.6393,-1.291247,-0.087032,gCn Coin,1
NYC,Scrypt,PoW,143006700000.0,0.0,1.402231,-1.265805,-0.074797,NewYorkCoin,1
QWC,CryptoNight Heavy,PoW,99553110000.0,184470000000.0,1.368873,-2.400421,0.296467,Qwertycoin,1
EMB,X13,PoW/PoS,92192820000.0,850000000.0,0.931022,1.766997,-0.60385,EmberCoin,0
IFC,Scrypt,PoW,90595750000.0,90600000000.0,0.782475,-1.198907,-0.042843,Infinite Coin,1
LYNX,Scrypt,HPoW,77872060000.0,92000000000.0,1.517401,0.197232,-0.158161,Lynx,0
VET,VeChainThor Authority,Proof of Authority,55454730000.0,86712630000.0,5.601964,1.427897,6.661254,Vechain,3
TRTL,CryptoNight,PoW,53139840000.0,1000000000000.0,0.494216,-2.403205,0.249804,TurtleCoin,1


In [306]:
clustered_df.sort_values(by=['TotalCoinSupply'],ascending=False).head(15)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
DFT,Scrypt,PoS,18663300.0,1740589000000000.0,-0.156398,1.974822,-0.246377,Draftcoin,0
XEN,X11,PoW/PoS,3853327.0,385332700000000.0,-0.260592,1.538708,-0.507562,XenixCoin,0
ONION,X13,PoW/PoS,21917020.0,188981900000000.0,-0.161196,2.003718,-0.557096,DeepOnion,0
TRTL,CryptoNight,PoW,53139840000.0,1000000000000.0,0.494216,-2.403205,0.249804,TurtleCoin,1
BTT,TRC10,DPoS,989988700000.0,990000000000.0,28.110462,-1.991982,-1.520695,BitTorrent,4
MOON,Scrypt,PoW,88.0,384000000000.0,-0.288818,-1.083183,0.012385,MoonCoin,1
1337,X13,PoW/PoS,29279420000.0,314159300000.0,0.187072,1.84743,-0.565503,EliteCoin,0
GCN,Scrypt,PoW,163055100000.0,200000000000.0,1.6393,-1.291247,-0.087032,gCn Coin,1
QWC,CryptoNight Heavy,PoW,99553110000.0,184470000000.0,1.368873,-2.400421,0.296467,Qwertycoin,1
BCN,CryptoNight,PoW,184066800000.0,184467400000.0,2.042428,-2.570693,0.170009,ByteCoin,1


In [307]:
# Create a scatter plot using hvplot.scatter to present the clustered data about cryptocurrencies having x="TotalCoinsMined"
# and y="TotalCoinSupply" to contrast the number of available coins versus the total number of mined coins. Use the 
# hover_cols=["CoinName"] parameter to include the cryptocurrency name on each data point.

clustered_df.hvplot.scatter(
    x="TotalCoinsMined", 
    y="TotalCoinSupply", 
    xlim=(0, 10e+11),
    ylim=(0, 1.940589e+15),
    width=1200, height=600,
    by="Class",
    hover_cols=["CoinName"],
    title = "Total Coins Supplied Vs Mined (All)"
)

In [308]:
# if we ignore some of the extreme outliers and graph the above by setting lower axis limits
clustered_df.hvplot.scatter(
    x="TotalCoinsMined", 
    y="TotalCoinSupply", 
    xlim=(0, 2e+11),
    ylim=(0, 3.9e+11),
    width=1200, height=600,
    by="Class",
    hover_cols=["CoinName"],
    title = "Total Coins Supplied Vs Mined (AXIS LIMITED!)"
)