In [57]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE



In [2]:
fp = Path("crypto_data.csv")
df = pd.read_csv(fp)
df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [3]:
crypto_istrading = df.loc[df["IsTrading"] == True]
crypto_istrading

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1243,SERO,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,UOS,SHA-256,True,DPoI,,1000000000
1245,BDX,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,ZEN,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [4]:
crypto_df = crypto_istrading.drop(columns=['IsTrading'], axis=1)

In [5]:
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")


Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [6]:
no_null = crypto_df.dropna(how='any')

In [7]:
for column in no_null.columns:
    print(f"Column {column} has {no_null[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [8]:
# Find duplicate entries
print(f"Duplicate entries: {no_null.duplicated().sum()}")

Duplicate entries: 0


In [33]:
mined_coins_df = no_null.loc[no_null['TotalCoinsMined'] > 0]
mined_coins_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,1337,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,BTC,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,ETH,Ethereum,Ethash,PoW,107684200.0,0


In [49]:
ml_df = mined_coins_df.drop(columns=["CoinName", "Unnamed: 0"])
# ml_df = drop_name.rename(columns={"Unnamed: 0": "crypto_symbol"})
ml_df

# df.rename(columns={"A": "a", "B": "c"})

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
8,Ethash,PoW,1.076842e+08,0
...,...,...,...,...
1238,SHA-256,DPoS,2.000000e+09,2000000000
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [50]:
cryp_dummies = pd.get_dummies(ml_df)

In [51]:
cryp_dummies.shape

(532, 377)

In [52]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(cryp_dummies)
crypto_scaled = scaler.transform(cryp_dummies)
crypto_scaled

array([[-0.11710817, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.09396955, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [ 0.52494561, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       ...,
       [-0.09561336, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11694817, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ],
       [-0.11710536, -0.0433963 , -0.0433963 , ..., -0.0433963 ,
        -0.0433963 , -0.0433963 ]])

In [53]:
# Applying PCA to reduce dimensions from 4 to 2

# Initialize PCA model
pca = PCA(n_components = .9, svd_solver='full')

# Get principal components for the crypto data.
crypto_pca = pca.fit_transform(crypto_scaled)

crypto_pca

array([[-2.67092248e-01, -1.16293969e-01, -4.07308196e-15, ...,
        -2.23119616e+00,  8.51753867e-01, -8.58553139e-01],
       [-2.49924699e-01, -1.17272732e-01, -3.24189932e-15, ...,
         5.38325025e-01, -8.55610849e-01, -1.97673497e-02],
       [ 3.14746498e-01, -2.86187718e-03, -1.21232852e-14, ...,
         9.85168494e-01,  4.74366202e-01,  8.77559983e-01],
       ...,
       [-6.67192566e-02, -3.11211774e-01,  3.94946004e-15, ...,
        -9.78312586e-01,  3.56291295e-01, -9.25294434e-01],
       [-2.90507995e-01, -2.45950491e-01,  3.90732370e-15, ...,
        -1.49684155e-15,  3.04374468e-15, -1.96241433e-15],
       [-1.99343294e-01, -6.23600551e-02, -2.59407208e-15, ...,
        -1.56486523e-12,  8.75468954e-13, -5.77350307e-13]])

In [54]:
crypto_pca.shape

(532, 274)

In [55]:
pca.explained_variance_ratio_

array([0.00839215, 0.00799006, 0.00797255, 0.00797255, 0.00794421,
       0.00741891, 0.00696609, 0.00687449, 0.00656659, 0.0061131 ,
       0.00588744, 0.00575784, 0.00569   , 0.00555906, 0.00549437,
       0.00532247, 0.00532026, 0.00531503, 0.00531503, 0.00531503,
       0.00531503, 0.00531503, 0.00531503, 0.00531488, 0.00531089,
       0.00527432, 0.00523105, 0.00521782, 0.00508932, 0.00501107,
       0.00498244, 0.00496689, 0.0049616 , 0.00491607, 0.00487546,
       0.00485005, 0.00483744, 0.00482475, 0.00478222, 0.00475574,
       0.00466862, 0.00465538, 0.00462004, 0.00460902, 0.0045821 ,
       0.0045637 , 0.00456113, 0.00454578, 0.00454094, 0.00453147,
       0.00448855, 0.00447791, 0.00446873, 0.00440764, 0.00439661,
       0.00427716, 0.00424332, 0.00418783, 0.00405474, 0.00401174,
       0.00393489, 0.00380768, 0.00379098, 0.00377029, 0.00374979,
       0.00364987, 0.00358469, 0.00355848, 0.00349388, 0.00344452,
       0.00340763, 0.00316666, 0.00298865, 0.00276281, 0.00270

In [58]:
# Initialize t-SNE model
tsne = TSNE(learning_rate=35)

In [59]:
# Reduce dimensions
tsne_features = tsne.fit_transform(crypto_pca)

In [60]:
tsne_features.shape

(532, 2)

In [66]:
tsne_features[:,1]

array([-6.74193115e+01, -8.14752769e+00, -5.04759169e+00, -4.17325306e+00,
       -5.67099571e+00,  1.71364002e+01,  3.19473863e+00,  1.91067469e+00,
        5.34787416e+00,  4.26773500e+00,  3.86616287e+01,  1.81420269e+01,
        3.18794131e+00, -1.70411739e+01, -4.08826083e-01, -1.88056049e+01,
       -5.40246058e+00,  2.12034550e+01, -5.66870356e+00, -2.69688511e+00,
        2.20347958e+01,  4.96831656e-01, -3.19719458e+00,  3.88485503e+00,
        8.14600945e-01, -3.31723595e+00,  9.66320574e-01,  1.57474623e+01,
        2.74241519e+00, -4.96858692e+00,  1.05457029e+01, -1.75010548e+01,
        8.86244118e-01, -1.31530380e+01,  3.05016875e+00,  7.83593369e+00,
        1.20834131e+01,  1.81536961e+01,  3.73403096e+00,  1.05456676e+01,
       -1.37984312e+00,  2.41912842e+00, -1.35740967e+01, -3.27886820e+00,
        3.76103640e+00,  1.31275558e+01,  5.23320532e+00,  1.32395658e+01,
        4.93072701e+00, -1.82634506e+01, -2.60430008e-01, -2.14827681e+00,
        3.19473863e+00, -

In [61]:
# Prepare to plot the dataset

# The first column of transformed features
crypto_pca['x'] = tsne_features[:,0]

# The second column of transformed features
crypto_pca['y'] = tsne_features[:,1]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices