In [125]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [126]:
# Loading the dataset from the resources folder
crypto_df = pd.read_csv(Path('Resources/crypto_data.csv'))

# Exploratory Data Analysis (EDA):

1. Drop the unnamed column as it does not contribute to clustering or add value to input data
2. List the DataFrame's data types to ensure they're aligned to the type of data stored on each column.
3. Is there any column whose data type need to be changed? If so, make the corresponding adjustments.
4. Is there any unnecessary column that needs to be dropped? If so, make the corresponding adjustments.
5. Check for duplicates.
6. In order to use unsupervised learning algorithms, all the features should be numeric, and also, on similar scales.
7. Rename the column if needed.

In [127]:
crypto_df

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...,...
1247,XBC,BitcoinPlus,Scrypt,True,PoS,1.283270e+05,1000000
1248,DVTC,DivotyCoin,Scrypt,False,PoW/PoS,2.149121e+07,100000000
1249,GIOT,Giotto Coin,Scrypt,False,PoW/PoS,,233100000
1250,OPSC,OpenSourceCoin,SHA-256,False,PoW/PoS,,21000000


In [128]:
crypto_df.shape

(1252, 7)

In [129]:
crypto_df['IsTrading'].unique()

array([ True, False])

In [130]:
crypto_df['IsTrading'].value_counts()

True     1144
False     108
Name: IsTrading, dtype: int64

In [131]:
crypto_df.dtypes

Unnamed: 0          object
CoinName            object
Algorithm           object
IsTrading             bool
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [132]:
# Find null values
for column in crypto_df.columns:
    print(f"Column {column} has {crypto_df[column].isnull().sum()} null values")


Column Unnamed: 0 has 0 null values
Column CoinName has 0 null values
Column Algorithm has 0 null values
Column IsTrading has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 508 null values
Column TotalCoinSupply has 0 null values


In [133]:
# Find duplicate entries
print(f"Duplicate entries: {crypto_df.duplicated().sum()}")


Duplicate entries: 0


In [134]:
# A list of the columns from the original DataFrame
crypto_df.columns

Index(['Unnamed: 0', 'CoinName', 'Algorithm', 'IsTrading', 'ProofType',
       'TotalCoinsMined', 'TotalCoinSupply'],
      dtype='object')

## Data Preparation:

In [135]:
# Drop the unnamed column as it does not contribute to clustering or add value to input data
crypto_df = crypto_df.loc[:, ~crypto_df.columns.str.contains('^Unnamed')]
crypto_df.head()

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.0,0


In [136]:
# Discard all cryptocurrencies that are not being traded. In other words, filter for currencies 
# that are currently being traded. Once you have done this, drop the IsTrading column from the dataframe.

# selecting rows based on condition 
# df[(df['C']==True)

trading_df = crypto_df[(crypto_df['IsTrading'] == True)]
trading_df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,4.199995e+01,42
1,365Coin,X11,True,PoW/PoS,,2300000000
2,404Coin,Scrypt,True,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,True,PoW,,611000
4,808,SHA-256,True,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...,...
1243,Super Zero,Ethash,True,PoW,,1000000000
1244,UOS,SHA-256,True,DPoI,,1000000000
1245,Beldex,CryptoNight,True,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,True,PoW,7.296538e+06,21000000


In [137]:
# Drop 'IsTrading' column from the dataframe

trading_df = trading_df.drop(['IsTrading'], axis=1)
trading_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.000000e+00,0
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [138]:
# Drop the null columns where all values are null
trading_df = trading_df.dropna(axis='columns', how='all')
trading_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [139]:
# Remove all rows that have at least one null value.
trading_df = trading_df.dropna(how = 'all')
trading_df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [140]:
trading_df.shape

(1144, 5)

In [141]:
# Find null values
for column in trading_df.columns:
    print(f"Column {column} has {trading_df[column].isnull().sum()} null values")


Column CoinName has 0 null values
Column Algorithm has 0 null values
Column ProofType has 0 null values
Column TotalCoinsMined has 459 null values
Column TotalCoinSupply has 0 null values


In [142]:
# Filter for cryptocurrencies that have been mined. That is, the total coins mined should be greater than zero.
# Filter the dataframe with non zero values in column 'TotalCoinsMined'
trading_df = trading_df.loc[~(trading_df['TotalCoinsMined'] == 0)]
trading_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,,611000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,,1000000000
1244,UOS,SHA-256,DPoI,,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [143]:
trading_df = trading_df.fillna(0)
trading_df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,4.199995e+01,42
1,365Coin,X11,PoW/PoS,0.000000e+00,2300000000
2,404Coin,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SixEleven,SHA-256,PoW,0.000000e+00,611000
5,EliteCoin,X13,PoW/PoS,2.927942e+10,314159265359
...,...,...,...,...,...
1243,Super Zero,Ethash,PoW,0.000000e+00,1000000000
1244,UOS,SHA-256,DPoI,0.000000e+00,1000000000
1245,Beldex,CryptoNight,PoW,9.802226e+08,1400222610
1246,Horizen,Equihash,PoW,7.296538e+06,21000000


In [144]:
# In order for your dataset to be comprehensible to a machine learning algorithm, 
# its data should be numeric. Since the coin names do not contribute to the analysis of the data, 
# delete the CoinName from the original dataframe.

# Drop 'CoinName' column from the dataframe

trading_df = trading_df.drop(['CoinName'], axis=1)
trading_df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,4.199995e+01,42
1,X11,PoW/PoS,0.000000e+00,2300000000
2,Scrypt,PoW/PoS,1.055185e+09,532000000
3,SHA-256,PoW,0.000000e+00,611000
5,X13,PoW/PoS,2.927942e+10,314159265359
...,...,...,...,...
1243,Ethash,PoW,0.000000e+00,1000000000
1244,SHA-256,DPoI,0.000000e+00,1000000000
1245,CryptoNight,PoW,9.802226e+08,1400222610
1246,Equihash,PoW,7.296538e+06,21000000


In [145]:
trading_df = trading_df.reset_index()

In [146]:
# convert the remaining features with text values, Algorithm and ProofType, into numerical data
X_dummies = pd.get_dummies(trading_df)
X = X_dummies
print (X.columns)
X

Index(['index', 'TotalCoinsMined', 'Algorithm_1GB AES Pattern Search',
       'Algorithm_536', 'Algorithm_Argon2', 'Algorithm_Argon2d',
       'Algorithm_BLAKE256', 'Algorithm_Blake', 'Algorithm_Blake2S',
       'Algorithm_Blake2b',
       ...
       'TotalCoinSupply_98000000', 'TotalCoinSupply_98100000000',
       'TotalCoinSupply_987600000', 'TotalCoinSupply_989800',
       'TotalCoinSupply_99000000', 'TotalCoinSupply_9900000000',
       'TotalCoinSupply_990000000000', 'TotalCoinSupply_99792000',
       'TotalCoinSupply_999481516', 'TotalCoinSupply_9999999'],
      dtype='object', length=584)


Unnamed: 0,index,TotalCoinsMined,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,...,TotalCoinSupply_98000000,TotalCoinSupply_98100000000,TotalCoinSupply_987600000,TotalCoinSupply_989800,TotalCoinSupply_99000000,TotalCoinSupply_9900000000,TotalCoinSupply_990000000000,TotalCoinSupply_99792000,TotalCoinSupply_999481516,TotalCoinSupply_9999999
0,0,4.199995e+01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1.055185e+09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,2.927942e+10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,1243,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
988,1244,0.000000e+00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
989,1245,9.802226e+08,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
990,1246,7.296538e+06,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
trading_df.dtypes

index                int64
Algorithm           object
ProofType           object
TotalCoinsMined    float64
TotalCoinSupply     object
dtype: object

In [148]:
# Standardize your dataset so that columns that contain larger values do not unduly influence the outcome.
# Standarize data with StandarScaler

crypto_scaled = StandardScaler().fit_transform(X)
print(crypto_scaled)

[[-1.72099969 -0.08530836 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]
 [-1.71816303 -0.08530836 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]
 [-1.71532637 -0.05381294 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]
 ...
 [ 1.81064039 -0.05605043 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]
 [ 1.81347705 -0.08509057 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]
 [ 1.8163137  -0.08530453 -0.03176605 ... -0.03176605 -0.03176605
  -0.03176605]]


## Dimensionality Reduction:

Creating dummy variables above dramatically increased the number of features in your dataset. Perform dimensionality reduction with PCA. Rather than specify the number of principal components when you instantiate the PCA model, it is possible to state the desired explained variance. For example, say that a dataset has 100 features. Using PCA(n_components=0.99) creates a model that will preserve approximately 99% of the explained variance, whether that means reducing the dataset to 80 principal components or 3. For this project, preserve 90% of the explained variance in dimensionality reduction. How did the number of the features change?

### Speeding up ML algorithms by bringing down the number of features using PCA:

In [165]:
# Applying PCA to reduce dimensions from 583 columns

# Initialize PCA model
pca = PCA(n_components=0.99)

# Get two principal components for the crypto data.
crypto_pca = pca.fit_transform(crypto_scaled)
crypto_pca

array([[-6.93074718e-01, -2.61076088e-01,  2.42005520e-05, ...,
        -5.79808652e-02, -2.17181299e-01,  1.26568096e-01],
       [-5.45273066e-01, -1.56429228e-01, -7.06047566e-05, ...,
         4.54424803e-01, -3.70675942e-01, -1.28251660e-01],
       [-6.70423520e-01, -2.62603268e-01,  2.52850458e-05, ...,
        -5.83195724e-02, -2.17097891e-01,  1.26335512e-01],
       ...,
       [ 4.39895348e-01,  7.19295126e-02,  4.59127018e-04, ...,
        -1.75676121e-02, -1.45100631e-01, -1.10454995e-01],
       [ 1.66865610e-01,  6.57222977e-02,  3.70090681e-04, ...,
         6.41496376e-01,  8.43498423e-02, -1.51395941e-01],
       [-6.97745047e-04,  1.53469443e-02,  2.10750861e-04, ...,
        -1.77294010e-01, -2.00145478e-01, -9.39261460e-01]])

In [166]:
# Transform PCA data to a DataFrame
# df_crypto_pca = pd.DataFrame(
#     data=crypto_pca, columns=["principal component 1", "principal component 2"]
# )
# df_crypto_pca.head()

In [167]:
# Fetch the explained variance
# pca.explained_variance_ratio_

### Sample Analysis:

### Further reduce the dataset dimensions with t-SNE and visually inspect the results:

Next, further reduce the dataset dimensions with t-SNE and visually inspect the results. In order to accomplish this task, run t-SNE on the principal components: the output of the PCA transformation. Then create a scatter plot of the t-SNE output. Observe whether there are distinct clusters or not.

In [168]:
# # Run t-SNE
# tsne = TSNE(learning_rate=250)
# tsne_features = tsne.fit_transform(scaled_features)

In [169]:
# # Check that dimensions have been reduced to two
# tsne_features.shape

In [None]:
# # Results colored by target value
# plt.scatter(tsne_features[:,0], tsne_features[:,1], c=labels)
# plt.show()

## Cluster Analysis with k-Means:

Create an elbow plot to identify the best number of clusters. Use a for-loop to determine the inertia for each k between 1 through 10. Determine, if possible, where the elbow of the plot is, and at which value of k it appears.

In [None]:
# # Initialize the K-Means model
# model = KMeans(n_clusters=3, random_state=0)

# # Fit the model
# model.fit(df_crypto_pca)

# # Predict clusters
# predictions = model.predict(df_crypto_pca)

# # Add the predicted class columns
# df_crypto_pca["class"] = model.labels_
# df_crypto_pca.head()

In [None]:
# BONUS: plot the 3 principal components
# import plotly.express as px
# fig = px.scatter_3d(
#     df_crypto_pca,
#     x="principal component 3",
#     y="principal component 2",
#     z="principal component 1",
#     color="class",
#     symbol="class",
#     width=800,
# )
# fig.update_layout(legend=dict(x=0, y=1))
# fig.show()

## Recommendation:

Based on your findings, make a brief (1-2 sentences) recommendation to your clients. Can the cryptocurrencies be clustered together? If so, into how many clusters?