# Dataset Generation


In [None]:
import random
import string

# Function to generate Bitcoin addresses with data augmentation
def generate_btc_address():
    prefixes = ['1', '3', 'bc1']
    prefix = random.choice(prefixes)

    # Generate a random length between 27 and 34 for the body
    body_length = random.randint(27, 34)

    if prefix == 'bc1':
        body = ''.join(random.choices(string.ascii_lowercase + string.digits, k=body_length))
    else:
        body = ''.join(random.choices(string.ascii_letters + string.digits, k=body_length))

    # Apply data augmentation techniques
    augmented_address = list(body)
    current_length = len(augmented_address)

    for _ in range(random.randint(1, 5)):  # Randomly substitute characters
        index = random.randint(0, current_length - 1)
        augmented_address[index] = random.choice(string.ascii_letters + string.digits)

    if current_length > 27:  # Truncate if length exceeds 27
        augmented_address = augmented_address[:27]
    elif current_length < 34:  # Pad if length is less than 34
        augmented_address += random.choices(string.ascii_letters + string.digits, k=34 - current_length)

    current_length = len(augmented_address)  # Update the current length after truncation/padding

    for _ in range(random.randint(1, 3)):  # Randomly transpose characters
        index1 = random.randint(0, current_length - 1)
        index2 = random.randint(0, current_length - 1)
        augmented_address[index1], augmented_address[index2] = augmented_address[index2], augmented_address[index1]

    for _ in range(random.randint(1, 5)):  # Randomly add noise
        index = random.randint(0, current_length)
        augmented_address.insert(index, random.choice(string.ascii_letters + string.digits))
        current_length += 1  # Increment the length after adding noise

    return prefix + ''.join(augmented_address)

# Function to generate Ethereum addresses with data augmentation
def generate_eth_address():
    body = ''.join(random.choices(string.hexdigits, k=40))  # Include both upper and lower case

    # Apply data augmentation techniques
    for _ in range(random.randint(1, 5)):
        index = random.randint(0, 40)
        body = body[:index] + random.choice(string.hexdigits) + body[index:]

    return '0x' + body

# Function to generate Litecoin addresses with data augmentation
def generate_ltc_address():
    prefixes = ['L', 'M', '3', 'ltc1']
    prefix = random.choice(prefixes)

    # Generate a random length between 33 and 39 for the body
    body_length = random.randint(33, 39)

    if prefix == 'ltc1':
        body = ''.join(random.choices(string.ascii_lowercase + string.digits, k=body_length))
    else:
        body = ''.join(random.choices(string.ascii_letters + string.digits, k=body_length))

    # Apply data augmentation techniques
    for _ in range(random.randint(1, 5)):
        index = random.randint(0, body_length)
        body = body[:index] + random.choice(string.ascii_letters + string.digits) + body[index:]

    return prefix + body

# Function to generate Bitcoin Cash addresses with data augmentation
def generate_bch_address():
    prefixes = ['1', '3', 'bitcoincash:']
    prefix = random.choice(prefixes)

    # Generate a random length between 42 and 49 for the body
    body_length = random.randint(42, 49)

    if prefix == 'bitcoincash:':
        body = ''.join(random.choices(string.ascii_lowercase + string.digits, k=body_length))
    else:
        body = ''.join(random.choices(string.ascii_letters + string.digits, k=body_length))

    # Apply data augmentation techniques
    for _ in range(random.randint(1, 5)):
        index = random.randint(0, body_length)
        body = body[:index] + random.choice(string.ascii_letters + string.digits) + body[index:]

    return prefix + body


In [None]:
import pandas as pd
# Generating a synthetic dataset
# Seed Python's RNG first: every generate_*_address() helper draws from the
# `random` module, so without a fixed seed each run writes a different CSV and
# the metrics reported further down cannot be reproduced.
random.seed(42)

data = []

for _ in range(5000):
    # One sample per currency per iteration keeps the four classes balanced
    # (5000 rows each, 20000 rows in total).
    data.append((generate_btc_address(), 'Bitcoin'))
    data.append((generate_eth_address(), 'Ethereum'))
    data.append((generate_ltc_address(), 'Litecoin'))
    data.append((generate_bch_address(), 'Bitcoin Cash'))

df = pd.DataFrame(data, columns=['address', 'crypto_type'])

# Save to CSV for future use
df.to_csv('synthetic_crypto_addresses.csv', index=False)

print("Synthetic dataset created and saved to 'synthetic_crypto_addresses.csv'.")

Synthetic dataset created and saved to 'synthetic_crypto_addresses.csv'.


In [None]:
# Plain-text preview of the first ten rows of the generated dataset.
# NOTE(review): the recorded output below also shows a `features` column,
# which is only added to `df` by the later preprocessing cell. That suggests
# this cell was last executed out of order; re-run top to bottom to confirm.
print(df.head(10))

                                             address   crypto_type  \
0                   1AWqLhphSRLuEwgbJ9U3g5WmXMuvGFuL       Bitcoin   
1     0xeda246eDAECfF433A38cff9Ac08D96d148E5edAe1Ffd      Ethereum   
2          LJilNq3WYTEtOxNrcC2wK7bekMT7H1gpglFHGjVwA      Litecoin   
3  3U2S1zDB7bGwhtRvJDaEFQeq1pObRPTJFxys3YbidhBiMB...  Bitcoin Cash   
4                    1inwXvMNM1GEgvHOy1jFQF06McLzocy       Bitcoin   
5    0xD1d7Ab99ec5FEdcDA57fACaEDa868aB5EedB9DAecc5f5      Ethereum   
6            MzSFs5nXHU2Vt3MnRAFtnWGsfzJaInmGCi4EbVH      Litecoin   
7  bitcoincash:55zmqneqvQocsdxc8b3mlkmvljk42lgn5g...  Bitcoin Cash   
8                   3vgkZyp7hC9EJQce2Ky5AB7X5UcXtka5       Bitcoin   
9      0x4D541bd5aAc60160B547e25ca684aE2FEF7ae074Ae0      Ethereum   

                                            features  
0  [32, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,...  
1  [46, 0x, 2, 2, 1, 3, 3, 1, 2, 0, 3, 2, 1, 0, 2...  
2  [41, L, 0, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 1, 1,...  
3  [50, 3

# Loading and preprocessing the dataset for feature extraction

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle

def extract_features(address):
    """Turn an address string into a flat feature vector.

    Args:
        address (str): the address to describe. An empty string is accepted
            and yields an empty prefix and all-zero counts.

    Returns:
        list: 64 entries laid out as
            ``[len(address), prefix, count('0'), ..., count('9'),
            count('a'), ..., count('z'), count('A'), ..., count('Z')]``.
            ``prefix`` is one of 'bc1', '0x', 'ltc1', 'bitcoincash:' when the
            address starts with it, otherwise the first character (or '' for
            an empty address). Characters outside the 62-symbol alphabet,
            such as ':', contribute to the length but not to the counts.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Checked in this order; none of these prefixes is a prefix of another.
    known_prefixes = ('bc1', '0x', 'ltc1', 'bitcoincash:')

    # address[:1] instead of address[0] so an empty address does not raise
    # IndexError.
    prefix = next(
        (candidate for candidate in known_prefixes if address.startswith(candidate)),
        address[:1],
    )

    return [len(address), prefix] + [address.count(char) for char in alphabet]


# Apply feature extraction to the dataset
df['features'] = df['address'].apply(extract_features)

# Create a DataFrame from the feature lists
# (column 0 = length, column 1 = prefix string, columns 2.. = character counts)
df_features = pd.DataFrame(df['features'].tolist())

# Separate prefix and other features
prefixes = df_features[1]
features = df_features.drop(columns=[1])

# Encode prefixes
label_encoder = LabelEncoder()
encoded_prefixes = label_encoder.fit_transform(prefixes)
# Persist the fitted encoder for the prediction cell. The `with` block makes
# sure the file is flushed and closed instead of leaking the handle.
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

# Combine encoded prefixes with other features
# Resulting column order: encoded prefix, length, then the 62 character counts.
X = np.column_stack((encoded_prefixes, features))

# Labels
y = df['crypto_type']


In [None]:
# First ten feature rows and their labels, separated by one blank line.
print(X[:10], end="\n\n")
print(y.head(10))

[[ 1 32  0  1  0  1  0  1  0  0  0  1  0  1  0  0  0  0  2  2  0  0  0  0
   1  0  0  1  1  0  0  0  3  1  1  0  0  0  1  0  0  0  1  1  1  0  0  1
   0  3  1  0  0  0  0  1  1  0  1  0  2  1  0  0]
 [ 0 46  2  2  1  3  3  1  2  0  3  2  1  0  2  4  4  4  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  1  0  0  4  0  1  2  2  2  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 3 41  0  1  1  1  0  0  0  2  0  0  0  1  1  0  1  0  2  0  1  1  1  2
   0  0  0  1  1  1  0  1  0  0  2  1  0  0  1  0  1  0  1  1  1  2  0  1
   1  1  1  2  1  0  0  0  0  2  0  1  1  0  1  0]
 [ 2 50  0  2  1  2  1  0  0  1  0  0  1  3  1  1  1  1  0  2  2  0  0  0
   0  0  0  1  1  0  1  2  0  1  1  1  1  1  0  3  0  2  1  2  1  0  0  2
   0  0  1  0  1  1  1  2  1  1  1  0  0  0  1  0]
 [ 1 31  1  3  0  0  0  0  1  0  0  0  0  0  2  0  0  0  1  0  1  1  0  0
   0  1  1  0  0  0  0  0  0  2  1  0  2  1  0  0  0  0  1  2  1  1  0  0
   0  1  3  1  1  0  1  0  0  0  0  0  0  1  0  0]
 [ 0 

# Training and evaluating different ML models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Function to train and evaluate a model
def train_and_evaluate(model, model_name):
    """Fit ``model`` on the notebook-level training split and print test metrics.

    Reads the global ``X_train``, ``y_train``, ``X_test`` and ``y_test``, so
    the result depends on whichever split was bound to those names last.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        model_name (str): label used in the printed header.

    Returns:
        None. The classification report, the accuracy and a separator line
        are printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"Results for {model_name}:")
    print(classification_report(y_test, predictions))

    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")

    separator = '-' * 60
    print(separator)

# Imported here because only this block needs them.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic Regression
# The feature matrix mixes an address length with small per-character counts
# and is not scaled; the recorded run hit the solver's iteration limit even
# with max_iter=1000. Standardising inside a pipeline lets the solver converge
# and ensures the scaler is fitted on the training split only.
log_reg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
train_and_evaluate(log_reg, "Logistic Regression")

# Decision Tree
# random_state is fixed on the randomised estimators so that repeated runs
# report the same metrics.
dec_tree = DecisionTreeClassifier(random_state=42)
train_and_evaluate(dec_tree, "Decision Tree")

# Random Forest
rand_forest = RandomForestClassifier(random_state=42)
train_and_evaluate(rand_forest, "Random Forest")

# Gradient Boosting
# This fitted model is reused by the sample-prediction cell further down.
grad_boost = GradientBoostingClassifier(random_state=42)
train_and_evaluate(grad_boost, "Gradient Boosting")

# Support Vector Machine
svm = SVC()
train_and_evaluate(svm, "Support Vector Machine")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results for Logistic Regression:
              precision    recall  f1-score   support

     Bitcoin       0.95      0.93      0.94       978
Bitcoin Cash       0.99      0.99      0.99      1008
    Ethereum       1.00      1.00      1.00       998
    Litecoin       0.93      0.94      0.93      1016

    accuracy                           0.97      4000
   macro avg       0.97      0.97      0.97      4000
weighted avg       0.97      0.97      0.97      4000

Accuracy: 0.97
------------------------------------------------------------
Results for Decision Tree:
              precision    recall  f1-score   support

     Bitcoin       0.96      0.96      0.96       978
Bitcoin Cash       0.99      0.99      0.99      1008
    Ethereum       1.00      1.00      1.00       998
    Litecoin       0.95      0.96      0.95      1016

    accuracy                           0.98      4000
   macro avg       0.98      0.98      0.98      4000
weighted avg       0.98      0.98      0.98      

# Sample prediction after extracting features

In [None]:
def extract_feature(address):
    """Turn a single address string into the feature vector used for prediction.

    The layout must stay identical to what ``extract_features`` produces at
    training time, otherwise the fitted models receive misaligned columns.

    Args:
        address (str): the address to describe. An empty string is accepted
            and yields an empty prefix and all-zero counts.

    Returns:
        list: 64 entries laid out as
            ``[len(address), prefix, count('0'), ..., count('9'),
            count('a'), ..., count('z'), count('A'), ..., count('Z')]``.
            ``prefix`` is one of 'bc1', '0x', 'ltc1', 'bitcoincash:' when the
            address starts with it, otherwise the first character (or '' for
            an empty address). Characters outside the 62-symbol alphabet
            contribute to the length but not to the counts.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Checked in this order; none of these prefixes is a prefix of another.
    known_prefixes = ('bc1', '0x', 'ltc1', 'bitcoincash:')

    # address[:1] instead of address[0] so an empty address does not raise
    # IndexError.
    prefix = next(
        (candidate for candidate in known_prefixes if address.startswith(candidate)),
        address[:1],
    )

    return [len(address), prefix] + [address.count(char) for char in alphabet]

# Load the saved label encoder
# NOTE: pickle.load can execute arbitrary code; this is acceptable only
# because 'label_encoder.pkl' is written by this notebook's preprocessing
# cell. Do not point it at files from an untrusted source.
# The `with` block closes the file handle instead of leaking it.
with open('label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

features = extract_feature("LgSZymhGV3DZbU1iMCLkhjobTqtiAsNsvJ")

# Extract and encode the prefix
# (transform raises ValueError for a prefix that was not seen during fitting)
prefix = features[1]
encoded_prefix = label_encoder.transform([prefix])[0]

# Combine encoded prefix with other features
# Same column order as the training matrix: encoded prefix, length, counts.
combined_features = [encoded_prefix, features[0]] + features[2:]

# Convert features to numpy array and reshape
# into a single-row 2-D array, as predict() expects.
features_np = np.array(combined_features).reshape(1, -1)

grad_boost.predict(features_np)


array(['Litecoin'], dtype=object)

# Deep Neural Network

In [None]:
from sklearn.preprocessing import LabelBinarizer

# One-hot encode the labels: learn the class list from y, then map every
# label to its indicator row. The fitted encoder is kept so predictions can
# be mapped back to class names later.
encoder = LabelBinarizer().fit(y)
y_encoded = encoder.transform(y)

In [None]:
# Show the first five one-hot rows followed by the labels they encode.
print(y_encoded[:5], y.head(5), sep="\n")

[[1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 1 0 0]
 [1 0 0 0]]
0         Bitcoin
1        Ethereum
2        Litecoin
3    Bitcoin Cash
4         Bitcoin
Name: crypto_type, dtype: object


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Split the data
# NOTE: this rebinds X_train / X_test / y_train / y_test, now with one-hot
# labels. train_and_evaluate() reads those globals, so calling it after this
# cell would train on the one-hot targets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Define the model as an ordered list of layers:
# a 256-unit input block with batch normalisation, two ReLU hidden blocks
# with 30% dropout each, and a softmax output with one unit per class.
model = Sequential([
    Dense(256, input_dim=X.shape[1], activation='relu'),
    BatchNormalization(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(y_encoded.shape[1], activation='softmax'),
])

# Compile the model (categorical cross-entropy matches the one-hot targets)
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Summary of the model
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               16640     
                                                                 
 batch_normalization (Batch  (None, 256)               1024      
 Normalization)                                                  
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

In [None]:
# Train the model for 20 epochs in mini-batches of 32; 20% of the training
# split is set aside for validation. The returned History object is kept in
# `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
from sklearn.metrics import classification_report

# Evaluate the model on the held-out split and report its accuracy.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {accuracy:.2f}")

# Map the one-hot ground truth and the predicted class probabilities back to
# class names, then print the per-class report.
y_test_classes = encoder.inverse_transform(y_test)
y_pred = model.predict(X_test)
y_pred_classes = encoder.inverse_transform(y_pred)

print(classification_report(y_test_classes, y_pred_classes))

Test Accuracy: 0.97
              precision    recall  f1-score   support

     Bitcoin       0.97      0.95      0.96       978
Bitcoin Cash       0.98      0.99      0.99      1008
    Ethereum       1.00      1.00      1.00       998
    Litecoin       0.94      0.96      0.95      1016

    accuracy                           0.97      4000
   macro avg       0.97      0.97      0.97      4000
weighted avg       0.97      0.97      0.97      4000



In [None]:
from tensorflow.keras.models import save_model

# Persist the trained Keras model to "final.h5" in the working directory.
# NOTE(review): the recorded output shows a warning raised from
# saving_api.save_model, presumably the legacy-HDF5 format notice. Confirm
# that before changing the file format, since consumers may load "final.h5".
model.save("final.h5")

  saving_api.save_model(
