# Clustering Crypto

In [68]:
# Initial imports
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler =StandardScaler ()
mm = MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### Fetching Cryptocurrency Data

In [69]:
# Use the following endpoint to fetch json data
url = "https://min-api.cryptocompare.com/data/all/coinlist"

In [70]:
# Create a DataFrame 
# HINT: You will need to use the 'Data' key from the json response, then transpose the DataFrame.

In [71]:
# Alternatively, use the provided csv file:
file_path = pd.read_csv ("C:/Users/pvolc/rice_fintech/AWS-Lex/Instructions/Starter_Files/Resources/crypto_data.csv")

# Create a DataFrame
crypto_df = pd.DataFrame(file_path, index=None) 
crypto_df.tail()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1247,XBC,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,21491210.0,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000
1251,PUNK,SteamPunk,PoS,False,PoS,,40000000


### Data Preprocessing

In [72]:
# Keep only necessary columns:
# 'CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'
crypto_col = pd.DataFrame (crypto_df,columns = ['CoinName','Algorithm','IsTrading','ProofType','TotalCoinsMined','TotalCoinSupply'])
crypto_col.tail()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
1247,BitcoinPlus,Scrypt,True,PoS,128327.0,1000000
1248,DivotyCoin,Scrypt,False,PoW/PoS,21491210.0,100000000
1249,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000
1251,SteamPunk,PoS,False,PoS,,40000000


In [73]:
# Keep only cryptocurrencies that are trading
crypto_col.iloc[:, -1]
df = crypto_col.loc [crypto_col["IsTrading"] ==True]
df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [74]:
# Keep only cryptocurrencies with a working algorithm
df_pow = df.loc [df["ProofType"] =="PoW/PoS"]
df_pow.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,True,PoW/PoS,0.0,0
5,EliteCoin,X13,True,PoW/PoS,29279420000.0,314159265359


In [75]:
# Remove the "IsTrading" column
df_clean = df_pow.drop(["IsTrading"], axis = 1)
df_clean.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359


In [76]:
# Find null values
for column in df_clean.columns:
    print(f"Column {column} has {df_clean[column].isnull().sum()} null values")

Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 197 null values
Column TotalCoinSupply has 0 null values


In [77]:
# Remove rows with at least 1 null value
df_notnull = df_clean.dropna()
df_notnull.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
10,Dash,X11,PoW/PoS,9031294.0,22000000


In [78]:
# Remove rows with cryptocurrencies having no coins mined
df_mined = df_notnull.loc [df_notnull["TotalCoinsMined"] > 0]
df_mined.dropna()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
10,Dash,X11,PoW/PoS,9.031294e+06,22000000
16,BitcoinDark,SHA-256,PoW/PoS,1.288862e+06,22000000
...,...,...,...,...,...
1193,OKCash,SHA-256,PoW/PoS,7.467606e+07,105000000
1196,WhiteCoin,Scrypt,PoW/PoS,2.520056e+08,300000000
1198,FriendshipCoin,NeoScrypt,PoW/PoS,1.120385e+06,60168145
1202,Triangles Coin,X13,PoW/PoS,1.407778e+05,120000


In [79]:
# Drop rows where there are 'N/A' text values
df_mined.dropna()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
10,Dash,X11,PoW/PoS,9.031294e+06,22000000
16,BitcoinDark,SHA-256,PoW/PoS,1.288862e+06,22000000
...,...,...,...,...,...
1193,OKCash,SHA-256,PoW/PoS,7.467606e+07,105000000
1196,WhiteCoin,Scrypt,PoW/PoS,2.520056e+08,300000000
1198,FriendshipCoin,NeoScrypt,PoW/PoS,1.120385e+06,60168145
1202,Triangles Coin,X13,PoW/PoS,1.407778e+05,120000


In [80]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from crypto_df
df_cn = pd.DataFrame(file_path, columns = ["CoinName"]) 
df_cn.head()

Unnamed: 0,CoinName
0,42 Coin
1,365Coin
2,404Coin
3,SixEleven
4,808


In [81]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df_mined = df_mined.drop(["CoinName"], axis = 1)
df_mined.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
10,X11,PoW/PoS,9031294.0,22000000
16,SHA-256,PoW/PoS,1288862.0,22000000


In [82]:
# Create dummy variables for text features
df_dumm = pd.get_dummies (df_mined, columns = ["Algorithm", "ProofType"])
df_dumm.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_536,Algorithm_BLAKE256,Algorithm_Blake2S,Algorithm_C11,Algorithm_Groestl,Algorithm_Keccak,Algorithm_Lyra2REv2,Algorithm_NIST5,...,Algorithm_SHA3,Algorithm_Scrypt,Algorithm_Skein,Algorithm_X11,Algorithm_X13,Algorithm_X14,Algorithm_X15,Algorithm_X16R,Algorithm_XEVAN,ProofType_PoW/PoS
0,41.99995,42,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
10,9031294.0,22000000,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
16,1288862.0,22000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [83]:
# Standardize data
data_scaler = StandardScaler()

In [84]:
# Fitting the scaler
data_scaler.fit (df_dumm)

StandardScaler()

In [85]:
# Transforming the data
df_dumm_scaled = data_scaler.transform (df_dumm)
df_dumm_scaled [:5]

array([[-0.25570783, -0.18518161, -0.07559289, -0.07559289, -0.07559289,
        -0.07559289, -0.10721125, -0.07559289, -0.07559289, -0.20351933,
        -0.13168538, -0.07559289, -0.07559289, -0.07559289, -0.17099639,
        -0.30523385, -0.15249857, -0.07559289,  1.1740436 , -0.10721125,
        -0.4982238 , -0.27050089, -0.07559289, -0.13168538, -0.07559289,
        -0.10721125,  0.        ],
       [-0.14765455, -0.16483251, -0.07559289, -0.07559289, -0.07559289,
        -0.07559289, -0.10721125, -0.07559289, -0.07559289, -0.20351933,
        -0.13168538, -0.07559289, -0.07559289, -0.07559289, -0.17099639,
        -0.30523385, -0.15249857, -0.07559289,  1.1740436 , -0.10721125,
        -0.4982238 , -0.27050089, -0.07559289, -0.13168538, -0.07559289,
        -0.10721125,  0.        ],
       [ 2.74257034, 11.83147436, -0.07559289, -0.07559289, -0.07559289,
        -0.07559289, -0.10721125, -0.07559289, -0.07559289, -0.20351933,
        -0.13168538, -0.07559289, -0.07559289, -0.0755

### Reducing Dimensions Using PCA

In [86]:
# Use PCA to reduce dimensions to 3 principal components
pca = PCA(n_components=3)

In [87]:
# Get two principal components for the iris data.
df_pca = pca.fit_transform(df_dumm_scaled)

In [88]:
# Create a DataFrame with the principal components data
df_pca = pd.DataFrame (data = df_pca, columns=["Coin1", "Coin2", "Coin3"])
df_pca.head()

Unnamed: 0,Coin1,Coin2,Coin3
0,-0.377249,1.385846,-0.199472
1,-0.302068,1.388717,-0.204339
2,9.971825,-0.509966,-0.928588
3,-0.662798,-1.492953,-1.53723
4,-0.437772,-0.892335,2.286034


### Clustering Crytocurrencies Using K-Means

#### Find the Best Value for `k` Using the Elbow Curve

In [89]:
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df_pca)
    inertia.append(km.inertia_)

# Create the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

Running K-Means with `k= 7, inertia = 27.8

In [90]:
# Initialize the K-Means model with k = 7
model = KMeans(n_clusters=7, random_state=0)

# Fit the model
model.fit(df_pca)

# Predict clusters
predictions = model.predict(df_pca)

# Create a new DataFrame including predicted clusters and cryptocurrencies features
df_pca["Class"] = model.labels_
df_pca.head()

Unnamed: 0,Coin1,Coin2,Coin3,Class
0,-0.377249,1.385846,-0.199472,2
1,-0.302068,1.388717,-0.204339,2
2,9.971825,-0.509966,-0.928588,1
3,-0.662798,-1.492953,-1.53723,3
4,-0.437772,-0.892335,2.286034,6


### Visualizing Results

#### 3D-Scatter with Clusters

In [91]:
# Create a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    df_pca,
    x="Coin3",
    y="Coin2",
    z="Coin1",
    color="Class",
    symbol="Class",
    width=700,
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

#### Table of Tradable Cryptocurrencies

In [92]:
# Table with tradable cryptos
df_cn = pd.DataFrame(file_path, columns = ["CoinName", "TotalCoinsMined", "TotalCoinSupply"]) 
df_cn.tail()

Unnamed: 0,CoinName,TotalCoinsMined,TotalCoinSupply
1247,BitcoinPlus,128327.0,1000000
1248,DivotyCoin,21491210.0,100000000
1249,Giotto Coin,,233100000
1250,OpenSourceCoin,,21000000
1251,SteamPunk,,40000000


In [93]:
# Print the total number of tradable cryptocurrencies
df_cn.dropna()

Unnamed: 0,CoinName,TotalCoinsMined,TotalCoinSupply
0,42 Coin,4.199995e+01,42
2,404Coin,1.055185e+09,532000000
4,808,0.000000e+00,0
5,EliteCoin,2.927942e+10,314159265359
7,Bitcoin,1.792718e+07,21000000
...,...,...,...
1242,Gapcoin,1.493105e+07,250000000
1245,Beldex,9.802226e+08,1400222610
1246,Horizen,7.296538e+06,21000000
1247,BitcoinPlus,1.283270e+05,1000000


In [94]:
df_cn = df_notnull.loc [df_notnull["TotalCoinsMined"] > 0]
df_cn.dropna()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
10,Dash,X11,PoW/PoS,9.031294e+06,22000000
16,BitcoinDark,SHA-256,PoW/PoS,1.288862e+06,22000000
...,...,...,...,...,...
1193,OKCash,SHA-256,PoW/PoS,7.467606e+07,105000000
1196,WhiteCoin,Scrypt,PoW/PoS,2.520056e+08,300000000
1198,FriendshipCoin,NeoScrypt,PoW/PoS,1.120385e+06,60168145
1202,Triangles Coin,X13,PoW/PoS,1.407778e+05,120000


#### Scatter Plot with Tradable Cryptocurrencies

In [114]:
# Scale data to create the scatter plot
#df_cn_scaled = scaler.fit_transform(df_cn[["TotalCoinsMined","TotalCoinSupply"]])

In [117]:
# Plot the scatter with x="TotalCoinsMined" and y="TotalCoinSupply"
