In [1]:
# pandas does the loading and cleaning; print the installed version alongside the import
import pandas as pd
from pandas.api.types import is_numeric_dtype
print("pandas: %s" % pd.__version__)

# NOTE(review): StandardScaler is not used in the cells visible here — confirm whether it is still needed
import sklearn
from sklearn.preprocessing import StandardScaler
print("sklearn: %s" % sklearn.__version__)

pandas: 1.0.3
sklearn: 0.23.2


In [2]:
# Lift the column cap so that wide dataframes are rendered with every column visible
pd.options.display.max_columns = None

In [3]:
# Read `Resources/crypto_raw_data.csv` into a pandas DataFrame.
df = pd.read_csv("Resources/crypto_raw_data.csv")

# Print the summary statistics as plain text, then show the frame itself as the cell output
print(df.describe())
df

       TotalCoinsMined  TotalCoinSupply
count     2.579000e+03     2.580000e+03
mean      4.196324e+19     1.048836e+11
std       1.864065e+21     3.708391e+12
min      -3.768142e+14    -1.000000e+00
25%       3.960000e-03    -1.000000e+00
50%       9.850000e+07    -1.000000e+00
75%       1.000000e+09     0.000000e+00
max       9.354691e+22     1.829060e+14


Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,0.000000e+00,0.0
1,300 token,,True,,3.000000e+02,300.0
2,365Coin,X11,True,PoW/PoS,0.000000e+00,0.0
3,404Coin,Scrypt,True,PoW/PoS,0.000000e+00,0.0
4,433 Token,,False,,,
...,...,...,...,...,...,...
6990,Aion,"Equihash210,9",True,PoW/PoS,4.953610e+08,-1.0
6991,Aeternity,,True,PoW/PoS,3.887295e+08,-1.0
6992,Achain,DPoS,True,DPoS,1.000000e+09,0.0
6993,Bitcicoin,,True,Proof-of-Authority (PoA),1.000000e+11,-1.0


In [4]:
def normalize_decimal_separator(line):
    """
    Clean a number written as text so that it can be parsed as a float.

    Spaces and underscores are always treated as thousands separators
    and removed. A period or comma that occurs more than once is
    likewise a thousands separator and is removed. When both a period
    and a comma are present, the one that appears last is taken to be
    the decimal separator: the other one is removed and the decimal
    separator is written as a period.

    A lone comma (e.g. "1,000") is ambiguous and is left untouched.

    Non-string values (e.g. NaN or None for missing data) are returned
    unchanged.

    returns a string (or the original object if it was not a string)
    """
    # missing values arrive as float NaN / None; there is nothing to clean
    if not isinstance(line, str):
        return line

    # Strip these unconditionally instead of only when float() fails:
    # float() itself accepts "1_000", so gating on it lets underscores through
    cleaned = line.replace(" ", "").replace("_", "")

    last_period = cleaned.rfind(".")
    last_comma = cleaned.rfind(",")
    if last_period != -1 and last_comma != -1:
        # both symbols are present: the right-most one is the decimal separator
        if last_period > last_comma:
            thousands, decimal = ",", "."
        else:
            thousands, decimal = ".", ","
        cleaned = cleaned.replace(thousands, "")
        if cleaned.count(decimal) > 1:
            # a repeated symbol cannot be a decimal separator after all
            cleaned = cleaned.replace(decimal, "")
        else:
            cleaned = cleaned.replace(decimal, ".")
        return cleaned

    # only one kind of symbol (or none): repeated means thousands separator
    for separator in (".", ","):
        if cleaned.count(separator) > 1:
            cleaned = cleaned.replace(separator, "")
    return cleaned

In [5]:
# `TotalCoinSupply` can come in as text instead of numbers; only in that case
# clean every entry, convert the column to numeric and print its summary
if not is_numeric_dtype(df["TotalCoinSupply"]):
    cleaned_supply = df["TotalCoinSupply"].apply(normalize_decimal_separator)
    df["TotalCoinSupply"] = pd.to_numeric(cleaned_supply)
    print(df["TotalCoinSupply"].describe())
df

Unnamed: 0,CoinName,Algorithm,IsTrading,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,True,PoW/PoS,0.000000e+00,0.0
1,300 token,,True,,3.000000e+02,300.0
2,365Coin,X11,True,PoW/PoS,0.000000e+00,0.0
3,404Coin,Scrypt,True,PoW/PoS,0.000000e+00,0.0
4,433 Token,,False,,,
...,...,...,...,...,...,...
6990,Aion,"Equihash210,9",True,PoW/PoS,4.953610e+08,-1.0
6991,Aeternity,,True,PoW/PoS,3.887295e+08,-1.0
6992,Achain,DPoS,True,DPoS,1.000000e+09,0.0
6993,Bitcicoin,,True,Proof-of-Authority (PoA),1.000000e+11,-1.0


In [6]:
# Keep only the cryptocurrencies that are flagged as being traded
df = df.loc[df["IsTrading"].eq(True)]
df["IsTrading"].value_counts()

True    5582
Name: IsTrading, dtype: int64

In [7]:
# Remove the `IsTrading` column from the dataframe
df = df.drop("IsTrading", axis="columns")
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,0.000000e+00,0.0
1,300 token,,,3.000000e+02,300.0
2,365Coin,X11,PoW/PoS,0.000000e+00,0.0
3,404Coin,Scrypt,PoW/PoS,0.000000e+00,0.0
5,SixEleven,SHA-256,PoW,0.000000e+00,0.0
...,...,...,...,...,...
6990,Aion,"Equihash210,9",PoW/PoS,4.953610e+08,-1.0
6991,Aeternity,,PoW/PoS,3.887295e+08,-1.0
6992,Achain,DPoS,DPoS,1.000000e+09,0.0
6993,Bitcicoin,,Proof-of-Authority (PoA),1.000000e+11,-1.0


In [8]:
# Report the number of nulls per column, then discard every row holding at least one null
print(df.isna().sum())
df = df.dropna(axis="index", how="any")
df

CoinName              0
Algorithm          4037
ProofType          3949
TotalCoinsMined    3251
TotalCoinSupply    3250
dtype: int64


Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
0,42 Coin,Scrypt,PoW/PoS,0.000000e+00,0.000000e+00
2,365Coin,X11,PoW/PoS,0.000000e+00,0.000000e+00
3,404Coin,Scrypt,PoW/PoS,0.000000e+00,0.000000e+00
5,SixEleven,SHA-256,PoW,0.000000e+00,0.000000e+00
6,808,SHA-256,PoW/PoS,0.000000e+00,0.000000e+00
...,...,...,...,...,...
6984,Bitcoin,SHA-256,PoW,1.880552e+07,2.100000e+07
6985,Binance Coin,BEP-2,PoSA,1.681370e+08,-1.000000e+00
6987,ARK,DPoS,DPoS,1.596877e+08,-1.000000e+00
6990,Aion,"Equihash210,9",PoW/PoS,4.953610e+08,-1.000000e+00


In [9]:
# Keep only the cryptocurrencies with a positive number of mined coins
df = df.loc[df["TotalCoinsMined"].gt(0)]
df

Unnamed: 0,CoinName,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
133,NovaCoin,Scrypt,PoW/PoS,3.383067e+06,-1.000000e+00
197,NuShares,PoS,PoS,6.164543e+09,0.000000e+00
198,MonaCoin,Scrypt,PoW,8.294686e+07,-1.000000e+00
231,Triangles Coin,X13,PoW/PoS,1.857394e+05,0.000000e+00
448,SafeExchangeCoin,Scrypt,PoC,2.147484e+09,-1.000000e+00
...,...,...,...,...,...
6984,Bitcoin,SHA-256,PoW,1.880552e+07,2.100000e+07
6985,Binance Coin,BEP-2,PoSA,1.681370e+08,-1.000000e+00
6987,ARK,DPoS,DPoS,1.596877e+08,-1.000000e+00
6990,Aion,"Equihash210,9",PoW/PoS,4.953610e+08,-1.000000e+00


In [10]:
# Drop `CoinName`, together with the `Unnamed: 0` column when the dataframe has one
unwanted = ["Unnamed: 0", "CoinName"] if "Unnamed: 0" in df.columns else ["CoinName"]
df = df.drop(columns=unwanted)
df

Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply
133,Scrypt,PoW/PoS,3.383067e+06,-1.000000e+00
197,PoS,PoS,6.164543e+09,0.000000e+00
198,Scrypt,PoW,8.294686e+07,-1.000000e+00
231,X13,PoW/PoS,1.857394e+05,0.000000e+00
448,Scrypt,PoC,2.147484e+09,-1.000000e+00
...,...,...,...,...
6984,SHA-256,PoW,1.880552e+07,2.100000e+07
6985,BEP-2,PoSA,1.681370e+08,-1.000000e+00
6987,DPoS,DPoS,1.596877e+08,-1.000000e+00
6990,"Equihash210,9",PoW/PoS,4.953610e+08,-1.000000e+00


In [11]:
# Show summary statistics for the dataframe as the cell output
df.describe()

Unnamed: 0,TotalCoinsMined,TotalCoinSupply
count,123.0,123.0
mean,22072340000.0,12220580000.0
std,127685900000.0,91969640000.0
min,1982.992,-1.0
25%,27943000.0,-1.0
50%,128175900.0,20000000.0
75%,1000000000.0,146624100.0
max,1000017000000.0,1000017000000.0


In [12]:
# Write the dataframe to CSV; index=False leaves the row index out of the file
df.to_csv("Resources/crypto_clean_data.csv", index=False)