In [31]:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import pandas as pd
import numpy as np
from pathlib import Path

In [32]:
# Data loading
file_path = Path("crypto_data.csv")
df_crypto = pd.read_csv(file_path)
df_crypto.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [33]:
# Find duplicate entries
print(f"Duplicate entries: {df_crypto.duplicated().sum()}")

Duplicate entries: 0


In [34]:
# Find null values
for column in df_crypto.columns:
    print(f"Column {column} has {df_crypto[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column IsTrading has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 508 null values
Column TotalCoinSupply has 0 null values


In [35]:
# Remove not traded and null
df_crypto[df_crypto.IsTrading != 'False']
df_crypto = df_crypto.dropna()
df_crypto=df_crypto.drop(columns=['IsTrading','Unnamed: 0','CoinName'])
df_crypto

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
4,SHA-256,PoW/PoS,0.000000e+00,0
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...
1242,Scrypt,PoW/PoS,1.493105e+07,250000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000
1247,Scrypt,PoS,1.283270e+05,1000000


In [36]:
# Find null values
for column in df_crypto.columns:
    print(f"Column {column} has {df_crypto[column].isnull().sum()} null values")

Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 0 null values
Column TotalCoinSupply has 0 null values


In [37]:
df_crypto.head(108)

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
2,Scrypt,PoW/PoS,1.055185e+09,532000000
4,SHA-256,PoW/PoS,0.000000e+00,0
5,X13,PoW/PoS,2.927942e+10,314159265359
7,SHA-256,PoW,1.792718e+07,21000000
...,...,...,...,...
162,Scrypt,PoW,2.421518e+07,250000000
165,X15,PoW/PoS,0.000000e+00,1000000
166,Scrypt,PoW/PoS,0.000000e+00,1000000
168,Scrypt,PoW,2.559374e+07,51200000


In [38]:
# to convert the remaining features with text values
dummies = pd.get_dummies(data=df_crypto, columns=['Algorithm', 'ProofType'])
dummies.head()
dummies.shape

# Standardizing data
df_scaled = StandardScaler().fit_transform(dummies)
df_scaled

array([[-0.10047714, -0.03668886, -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644],
       [-0.07317373, -0.0366887 , -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644],
       [-0.10047714, -0.03668886, -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644],
       ...,
       [-0.10028834, -0.03668885, -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644],
       [-0.10047382, -0.03668886, -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644],
       [-0.09992104, -0.03668883, -0.03668644, ..., -0.03668644,
        -0.03668644, -0.03668644]])

In [24]:
# Initialize PCA model
pca = PCA(n_components=.9)
# Transform PCA data to a DataFrame
c_pca = pca.fit_transform(df_scaled)

# Fetch the explained variance
pca.explained_variance_ratio_

array([0.01971841, 0.01856098, 0.01778158, 0.01772294, 0.01772294,
       0.01772294, 0.01772294, 0.01772294, 0.01768814, 0.01554653,
       0.01513182, 0.01416038, 0.01398333, 0.01280794, 0.01273195,
       0.01240069, 0.01170192, 0.01090831, 0.01059317, 0.01015801,
       0.00957237, 0.00910587, 0.00907383, 0.00904121, 0.00899759,
       0.00897878, 0.00896468, 0.00893976, 0.00892175, 0.00892095,
       0.00891664, 0.00891493, 0.00890889, 0.00890425, 0.00889998,
       0.00889739, 0.00889691, 0.00889266, 0.00888714, 0.00888539,
       0.00888539, 0.00888286, 0.00887929, 0.00887799, 0.00887698,
       0.00887603, 0.00887382, 0.00887341, 0.00887341, 0.00887341,
       0.00887267, 0.0088702 , 0.00886719, 0.00886147, 0.00886147,
       0.00886147, 0.00886147, 0.00886147, 0.00886147, 0.00886147,
       0.00886147, 0.00886147, 0.00886147, 0.00886147, 0.00886147,
       0.00886147, 0.00886147, 0.00886147, 0.00886147, 0.00886147,
       0.00886147, 0.00886147, 0.00886147, 0.00886147, 0.00886

In [26]:
# Initialize t-SNE model
from sklearn.manifold import TSNE
tsne = TSNE(learning_rate=100,random_state=13)

# Reduce dimensions
tsne_features = tsne.fit_transform(df_scaled)

tsne_features.shape

(744, 2)

In [30]:
tsne_features

array([[ 15.936759 , -12.456148 ],
       [ 17.812193 ,  -7.5477014],
       [-26.371235 ,  12.973386 ],
       ...,
       [-12.352564 , -27.336775 ],
       [ -4.151967 ,  12.869517 ],
       [ 19.535711 , -12.055344 ]], dtype=float32)