## Cryptocurrency clusters


In [72]:
import pandas as pd
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.decomposition import PCA

In [61]:
# Read crypto_data.csv into Pandas. 
df = pd.read_csv('crypto_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42,42 Coin,Scrypt,True,PoW/PoS,41.99995,42
1,365,365Coin,X11,True,PoW/PoS,,2300000000
2,404,404Coin,Scrypt,True,PoW/PoS,1055185000.0,532000000
3,611,SixEleven,SHA-256,True,PoW,,611000
4,808,808,SHA-256,True,PoW/PoS,0.0,0


In [62]:
# Keep only cryptocurrencies that are trading
df = df.loc[df['IsTrading'] == True]

In [63]:
# drop the IsTrading column from the dataframe.
df.drop('IsTrading', axis=1, inplace=True)
# drop the CoinName column from the dataframe
# df.drop('CoinName', axis=1, inplace=True)
df.drop('Unnamed: 0', axis=1, inplace = True)
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
1,365Coin,X11,PoW/PoS,,2300000000
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
3,SixEleven,SHA-256,PoW,,611000
4,808,SHA-256,PoW/PoS,0.0,0


In [64]:
# Remove all rows that have at least one null value.
df = df.dropna(how='any',axis=0)
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
4,808,SHA-256,PoW/PoS,0.0,0
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000


In [65]:
# Filter for cryptocurrencies that have been mined. That is, the total coins mined should be greater than zero.
# mined_df = df.query('TotalCoinsMined > 0')
df = df.loc[df.TotalCoinsMined > 0]
df.head()

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,41.99995,42
2,404Coin,Scrypt,PoW/PoS,1055185000.0,532000000
5,EliteCoin,X13,PoW/PoS,29279420000.0,314159265359
7,Bitcoin,SHA-256,PoW,17927180.0,21000000
8,Ethereum,Ethash,PoW,107684200.0,0


In [66]:
# Store the 'CoinName'column in its own DataFrame prior to dropping it from df 
coinname_df = pd.DataFrame(
    data = df, columns = ['CoinName'])
coinname_df.head()

Unnamed: 0,CoinName
0,42 Coin
2,404Coin
5,EliteCoin
7,Bitcoin
8,Ethereum


In [67]:
# Drop the 'CoinName' column since it's not going to be used on the clustering algorithm
df = df.drop(columns = 'CoinName')
df.head()

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,Scrypt,PoW/PoS,41.99995,42
2,Scrypt,PoW/PoS,1055185000.0,532000000
5,X13,PoW/PoS,29279420000.0,314159265359
7,SHA-256,PoW,17927180.0,21000000
8,Ethash,PoW,107684200.0,0


In [68]:
# convert the remaining features with text values, Algorithm and ProofType, into numerical data. 
category_columns = ['Algorithm','ProofType']
df = pd.get_dummies(df, columns = category_columns)
df.head()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
0,41.99995,42,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1055185000.0,532000000,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,29279420000.0,314159265359,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,17927180.0,21000000,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,107684200.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
# Standardize your dataset so that columns that contain larger values do not unduly influence the outcome.
# Scale the data

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
print(scaled_data)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


In [70]:
scaled_data.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [37]:
print(df.shape)

(577, 101)


In [38]:
# print(df.describe())

In [39]:
# df.hist()

In [40]:
pyplot.show()

# Dimensionality Reduction

In [75]:
# Perform dimensionality reduction with PCA
pca = PCA(n_components=3)
pca_data = pca.fit_transform(scaled_data)

In [42]:
# reduce the dataset dimensions with t-SNE and visually inspect the results.

# To put into my da class notes notebook


In [76]:
# remove all null values 
for column in df.columns:
    print(f"column {column} has {df[column].isnull().sum()} null values")

column TotalCoinsMined has 0 null values
column TotalCoinSupply has 0 null values
column Algorithm_1GB AES Pattern Search has 0 null values
column Algorithm_536 has 0 null values
column Algorithm_Argon2d has 0 null values
column Algorithm_BLAKE256 has 0 null values
column Algorithm_Blake has 0 null values
column Algorithm_Blake2S has 0 null values
column Algorithm_Blake2b has 0 null values
column Algorithm_C11 has 0 null values
column Algorithm_Cloverhash has 0 null values
column Algorithm_Counterparty has 0 null values
column Algorithm_CryptoNight has 0 null values
column Algorithm_CryptoNight Heavy has 0 null values
column Algorithm_CryptoNight-V7 has 0 null values
column Algorithm_Cryptonight-GPU has 0 null values
column Algorithm_DPoS has 0 null values
column Algorithm_Dagger has 0 null values
column Algorithm_Dagger-Hashimoto has 0 null values
column Algorithm_ECC 256K1 has 0 null values
column Algorithm_Equihash has 0 null values
column Algorithm_Equihash+Scrypt has 0 null values

In [78]:
# remove duplicate entries
print(f"duplicate entries: {df.duplicated().sum()}")

duplicate entries: 1


In [79]:
# change column data from yes / no to 1/0
def changestatus(status):
    if status == "Yes":
        return 1
    else
        return 0
    

SyntaxError: invalid syntax (<ipython-input-79-4557f22dc846>, line 5)

In [80]:
df_shopping["Previous Shopper"] = df_shopping["Previous Shopper"].apply(changestatus)
df_shopping.head()

NameError: name 'df_shopping' is not defined

In [82]:
yes_no = ("Yes":1, "No":0)

SyntaxError: invalid syntax (<ipython-input-82-0ca428676184>, line 1)

In [81]:
# use map 
df_shopping["Previous Shopper"] = df_shopping["Previous Shopper"].map(yes_no)

NameError: name 'df_shopping' is not defined

In [83]:
# rename columns in data frame
new_df = new_df.rename(columns = {'Spending score (1-100)': 'Spending_Score'})
new_df.head()

NameError: name 'new_df' is not defined

In [None]:
# save the cleaned data to a .csv 
new_df.to_csv(file_path, index=False)