In [2]:
pip install -U altair

Note: you may need to restart the kernel to use updated packages.


In [51]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [4]:
import sagemaker 
import sagemaker.amazon.common as smac 
from sagemaker.predictor import csv_serializer , json_deserializer
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri
import boto3
import io
import numpy as np
import os


In [5]:
# Alternatively, use the provided csv file:
# file_path = Path("Resources/crypto_data.csv")
crypto_df = pd.read_csv('Data/crypto_data (1).csv')
# Create a DataFrame
crypto_df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [6]:
crypto_df = crypto_df[crypto_df.IsTrading == True]

In [7]:
crypto_df = crypto_df.drop(['IsTrading'] , axis = 1)

In [8]:
crypto_df = crypto_df.dropna()

In [9]:
crypto_df = crypto_df.drop(crypto_df[crypto_df['TotalCoinsMined'] == 0].index)

In [10]:
unamed = pd.DataFrame(crypto_df['Unnamed: 0'])


In [11]:
coin_name_df = pd.DataFrame(crypto_df['CoinName'])

In [12]:
coin_name_df.set_index(crypto_df.index)

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum
...,...
1238,ZEPHYR
1242,Gapcoin
1245,Beldex
1246,Horizen


In [13]:
crypto_df = crypto_df.drop(['CoinName', 'Unnamed: 0'] , axis = 1)

In [14]:
X = pd.get_dummies(data = crypto_df , columns = ['Algorithm' , 'ProofType'])

In [15]:
scaler = StandardScaler()
scaler.fit(X)
crypto_scaled = scaler.transform(X)

In [16]:
pca = PCA(n_components=3)
crypto_pca = pca.fit_transform(crypto_scaled)

In [17]:
pca_df = pd.DataFrame(data = crypto_pca , columns = ['pca1' , 'pca2' , 'pca3'])

In [18]:
bucket = 'sagemaker-20210323-1830'

prefix = 'crypto-kmeans-classifier'

role = get_execution_role()

In [19]:
buf = io.BytesIO()
vectors = np.array(pca_df).astype('float32')
smac.write_numpy_to_dense_tensor(buf , vectors)
buf.seek(0)


0

In [20]:
key = 'non_linear.data'
boto3.resource('s3').Bucket(bucket).Object(
    os.path.join(prefix , 'train' , key) 
).upload_fileobj(buf)
s3_train_data = 's3://{}/{}/train/{}'.format(bucket , prefix , key)
print('Training data uploaded to {}'.format(s3_train_data))

Training data uploaded to s3://sagemaker-20210323-1830/crypto-kmeans-classifier/train/non_linear.data


In [21]:
from sagemaker import KMeans


In [22]:
inertia = []
g = list(range(2, 10))



# Calculate the inertia for the range of k values
for i in g: 
    Kmodel = KMeans( role = role, k = i , train_instance_count=1,
                train_instance_type='ml.c4.xlarge',)
    Kmodel.fit(Kmodel.record_set(vectors))
    inertia.append(Kmodel.inertia_)
    

# Create the Elbow Curve using hvPlot
elbow_data = {'k' :k , 'inertia': inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.altair(x= 'k' , y= 'inertia', xticks = g , title = 'Elbow Curve')

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-03-24 01:40:37 Starting - Starting the training job.

KeyboardInterrupt: 

In [24]:
sess = sagemaker.Session()

In [25]:
model = KMeans(role = role, k = 4 , train_instance_count=1,
                train_instance_type='ml.c4.xlarge',output_path='s3://{}/{}/output'.format(bucket , prefix),sagemaker_session = sess)               

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [26]:
model.fit(model.record_set(vectors))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-03-24 01:41:20 Starting - Starting the training job...
2021-03-24 01:41:43 Starting - Launching requested ML instancesProfilerReport-1616550080: InProgress
......
2021-03-24 01:42:43 Starting - Preparing the instances for training.........
2021-03-24 01:44:16 Downloading - Downloading input data...
2021-03-24 01:44:49 Training - Training image download completed. Training in progress.
2021-03-24 01:44:49 Uploading - Uploading generated training model.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[03/24/2021 01:44:47 INFO 140176667305344] Reading default configuration from /opt/amazon/lib/python3.6/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', '

In [28]:
predictor = model.deploy(initial_instance_count=1, 
                                 instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-------------------!

In [29]:
result=  predictor.predict(vectors)

In [30]:
cluster_labels = [r.label['closest_cluster'].float32_tensor.values[0] for r in result]

In [35]:
pca_df['class'] = cluster_labels

In [36]:
pca_df.head()

Unnamed: 0,pca1,pca2,pca3,class
0,-0.328107,0.963895,-0.410571,0.0
1,-0.311388,0.964038,-0.411076,0.0
2,2.318312,1.673575,-0.482046,2.0
3,-0.146341,-1.268227,0.169577,1.0
4,-0.157106,-1.997624,0.296731,1.0


In [37]:
crypto_df['ye'] = unamed

In [38]:
crypto_df = crypto_df.set_index('ye')

In [39]:
clustered_df = pd.DataFrame(crypto_df)

In [40]:
clustered_df.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
ye,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Scrypt,PoW/PoS,41.99995,42
404,Scrypt,PoW/PoS,1055185000.0,532000000
1337,X13,PoW/PoS,29279420000.0,314159265359
BTC,SHA-256,PoW,17927180.0,21000000
ETH,Ethash,PoW,107684200.0,0


In [41]:
req_index = clustered_df.index

In [42]:
clustered_df = clustered_df.reset_index()

In [45]:
coin_name_df = coin_name_df.reset_index()

In [46]:
df_4 = pd.merge(clustered_df, pca_df.iloc[:,0:3], left_index = True , right_index = True)
df_4.head()

Unnamed: 0,ye,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pca1,pca2,pca3
0,42,Scrypt,PoW/PoS,41.99995,42,-0.328107,0.963895,-0.410571
1,404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.311388,0.964038,-0.411076
2,1337,X13,PoW/PoS,29279420000.0,314159265359,2.318312,1.673575,-0.482046
3,BTC,SHA-256,PoW,17927180.0,21000000,-0.146341,-1.268227,0.169577
4,ETH,Ethash,PoW,107684200.0,0,-0.157106,-1.997624,0.296731


In [47]:
df_5 = pd.merge( df_4 , coin_name_df ,left_index= True, right_index= True)
df_5.head()

Unnamed: 0,ye,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pca1,pca2,pca3,index,CoinName
0,42,Scrypt,PoW/PoS,41.99995,42,-0.328107,0.963895,-0.410571,0,42 Coin
1,404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.311388,0.964038,-0.411076,2,404Coin
2,1337,X13,PoW/PoS,29279420000.0,314159265359,2.318312,1.673575,-0.482046,5,EliteCoin
3,BTC,SHA-256,PoW,17927180.0,21000000,-0.146341,-1.268227,0.169577,7,Bitcoin
4,ETH,Ethash,PoW,107684200.0,0,-0.157106,-1.997624,0.296731,8,Ethereum


In [48]:
clustered_df_final = pd.merge(df_5 , pca_df.iloc[:,3] , left_index = True , right_index = True)
clustered_df_final.head()
clustered_df_final = clustered_df_final.set_index('ye')
clustered_df_final = clustered_df_final.drop(['index'] , axis = 1)
clustered_df_final.head()

Unnamed: 0_level_0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,pca1,pca2,pca3,CoinName,class
ye,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
42,Scrypt,PoW/PoS,41.99995,42,-0.328107,0.963895,-0.410571,42 Coin,0.0
404,Scrypt,PoW/PoS,1055185000.0,532000000,-0.311388,0.964038,-0.411076,404Coin,0.0
1337,X13,PoW/PoS,29279420000.0,314159265359,2.318312,1.673575,-0.482046,EliteCoin,2.0
BTC,SHA-256,PoW,17927180.0,21000000,-0.146341,-1.268227,0.169577,Bitcoin,1.0
ETH,Ethash,PoW,107684200.0,0,-0.157106,-1.997624,0.296731,Ethereum,1.0


In [52]:
alt.Chart(clustered_df_final).mark_circle(size=60).encode(
    x='pca1',
    y='pca2',
    color='class',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply']
).interactive()

In [53]:
from sklearn.preprocessing  import MinMaxScaler
min_max_scaler = MinMaxScaler()
clustered_df_final[['TotalCoinsMined' , 'TotalCoinSupply']] = min_max_scaler.fit_transform(clustered_df_final[['TotalCoinsMined' , 'TotalCoinSupply']])

In [54]:
alt.Chart(clustered_df_final).mark_circle(size=60).encode(
    x='TotalCoinsMined',
    y='TotalCoinSupply',
    color='class',
    tooltip=['CoinName', 'Algorithm', 'TotalCoinsMined', 'TotalCoinSupply']
).interactive()