## 1. Format raw data
Data is obtained from: https://www.kaggle.com/xblock/bitcoin-partial-transaction-dataset

In [None]:
from data_util import parse_data

In [None]:
# Parse the data downloaded from Kaggle into an easier-to-use format.
# Creates a "raw data" file that can be loaded into Transaction objects.
parse_data('archive/dataset3_2016_1_1500000')

## 2. Run The Clustering Algorithms

In [1]:
import cluster
from data_util import load_data
import heuristics

import pickle
import matplotlib.pyplot as plt
import numpy as np
import time
from collections import defaultdict

In [2]:
# Load the pickled dataset into the module-level name `data`, which
# cluster_and_pickle reads directly instead of taking it as a parameter.
# Presumably this .pkl is the file produced by parse_data in section 1 -- TODO confirm.
data = load_data('dataset3_2016_1_1500000.pkl')

In [4]:
def cluster_and_pickle(heuristic):
    """Cluster the loaded dataset with ``heuristic`` and pickle the result.

    The module-level ``data`` is passed, together with ``heuristic``, to
    ``cluster.cluster``. Whatever that call returns is written with
    ``pickle.dump`` to ``dataset3_<heuristic.__name__>.pkl`` (a relative
    path, opened in ``'wb'`` mode, so an existing file is overwritten) and
    is then returned to the caller.
    """
    # Obtain the clustering produced by the heuristic
    result = cluster.cluster(data, heuristic)

    # Persist the clustering under a name derived from the heuristic
    output_path = 'dataset3_{}.pkl'.format(heuristic.__name__)
    with open(output_path, 'wb') as out_file:
        pickle.dump(result, out_file)

    return result

def load_cluster(file_name):
    """Return the object unpickled from the file at ``file_name``.

    The file is opened in binary mode and read with ``pickle.load``.
    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced locally (e.g. by ``cluster_and_pickle``) or are otherwise
    trusted.
    """
    with open(file_name, mode='rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded
    
def plot_and_analyze(clusters_list, title):
    """Print summary statistics for a clustering and plot its size histogram.

    Args:
        clusters_list: Sized collection of clusters. Each cluster must
            support ``len()``; that length is used as its number of
            addresses.
        title: Title given to the bar chart.

    Prints the number of clusters, the number of single-address clusters,
    the largest cluster size, and the average cluster size with and without
    single-address clusters. Then draws, via ``plt``, a bar chart with a
    log-scaled y axis showing how many clusters have each size strictly
    between 1 and 100.

    Edge cases:
        * An empty ``clusters_list`` prints only the cluster count and
          returns without plotting.
        * If no cluster has more than one address, the non-single average
          is reported as ``nan``.
        * If no size falls in the plotted range, the axes are still titled
          and labelled but no bars are drawn.
    """
    num_clusters = len(clusters_list)
    if num_clusters == 0:
        # Nothing to average or plot; the divisions below would fail.
        print('Total number of clusters:           ', num_clusters)
        return

    # Count the number of clusters at a given size.
    # The loop variable is deliberately not called `cluster`, which is the
    # name of a module imported by this notebook.
    cluster_sizes = defaultdict(int)
    total_addr = 0
    for members in clusters_list:
        size = len(members)
        total_addr += size
        cluster_sizes[size] += 1

    # (size, count) pairs for clusters holding more than one address
    multi = [(size, count) for size, count in cluster_sizes.items() if size > 1]
    multi_clusters = sum(count for _, count in multi)
    multi_addr = sum(size * count for size, count in multi)

    # Generate some summary statistics
    avg_size = total_addr / num_clusters
    if multi_clusters:
        avg_non_single_size = multi_addr / multi_clusters
    else:
        # Every cluster has at most one address: the average is undefined.
        avg_non_single_size = float('nan')

    print('Total number of clusters:           ', num_clusters)
    # .get() avoids inserting a size-1 entry into the defaultdict
    print('Number of single-address clusters:  ', cluster_sizes.get(1, 0))
    print('Size of largest cluster:            ', max(cluster_sizes))
    print('Average cluster size:               ', avg_size)
    print('Average cluster size (excluding single-address clusters): ', avg_non_single_size)

    # Get cluster size counts in particular range (1 < size < 100)
    in_range = [(size, count) for size, count in multi if size < 100]

    # Plot cluster sizes as a bar graph
    plt.title(title)
    if in_range:
        # zip(*...) cannot be unpacked into two names when the list is empty
        x, y = zip(*in_range)
        plt.bar(x, y)
    plt.ylabel('Number of Clusters')
    plt.xlabel('Number of addresses in cluster')
    plt.yscale('log')

In [4]:
# Cluster with heuristics.multi_input. cluster_and_pickle also pickles the
# result to 'dataset3_' + heuristics.multi_input.__name__ + '.pkl'.
multi_input_clusters = cluster_and_pickle(heuristics.multi_input)

Percent Complete: 0                 Minutes Elapsed 0.0
Percent Complete: 5                 Minutes Elapsed 0.019871159394582113
Percent Complete: 10                 Minutes Elapsed 0.0585299293200175
Percent Complete: 15                 Minutes Elapsed 0.15045073827107747
Percent Complete: 20                 Minutes Elapsed 0.26105910539627075
Percent Complete: 25                 Minutes Elapsed 0.4204951922098796
Percent Complete: 30                 Minutes Elapsed 0.61038818359375
Percent Complete: 35                 Minutes Elapsed 0.8524929960568746
Percent Complete: 40                 Minutes Elapsed 1.1642302552858987
Percent Complete: 45                 Minutes Elapsed 1.5084414879480998
Percent Complete: 50                 Minutes Elapsed 1.884193221728007
Percent Complete: 55                 Minutes Elapsed 2.296970001856486
Percent Complete: 60                 Minutes Elapsed 2.789648203055064
Percent Complete: 65                 Minutes Elapsed 3.5998483141263327
Percent Co

In [4]:
# Cluster with heuristics.optimal_change. cluster_and_pickle also pickles the
# result to 'dataset3_' + heuristics.optimal_change.__name__ + '.pkl'.
optimal_change_clusters = cluster_and_pickle(heuristics.optimal_change)

Percent Complete: 0                 Minutes Elapsed 0.0
Percent Complete: 5                 Minutes Elapsed 0.012486167748769124
Percent Complete: 10                 Minutes Elapsed 0.02828974723815918
Percent Complete: 15                 Minutes Elapsed 0.08555270433425903
Percent Complete: 20                 Minutes Elapsed 0.10715759595235189
Percent Complete: 25                 Minutes Elapsed 0.13246346314748128
Percent Complete: 30                 Minutes Elapsed 0.16310359239578248
Percent Complete: 35                 Minutes Elapsed 0.24948980808258056
Percent Complete: 40                 Minutes Elapsed 0.30230191151301067
Percent Complete: 45                 Minutes Elapsed 0.36336559454600015
Percent Complete: 50                 Minutes Elapsed 0.44096649885177613
Percent Complete: 55                 Minutes Elapsed 0.5178672393163045
Percent Complete: 60                 Minutes Elapsed 0.6214573661486308
Percent Complete: 65                 Minutes Elapsed 0.807966244220733

In [4]:
# Cluster with heuristics.multi_input_optimal_change; the result is also
# pickled by cluster_and_pickle.
# NOTE: the recorded output below shows about 375 minutes elapsed at 65%, so
# expect a long run.
# NOTE(review): "opimal" in the variable name is a typo for "optimal"; it is
# left as-is because later cells may reference this name.
multi_input_opimal_change_clusters = cluster_and_pickle(heuristics.multi_input_optimal_change)

Percent Complete: 0                 Minutes Elapsed 0.0
Percent Complete: 5                 Minutes Elapsed 0.026739386717478435
Percent Complete: 10                 Minutes Elapsed 0.4372990846633911
Percent Complete: 15                 Minutes Elapsed 3.3994531591733295
Percent Complete: 20                 Minutes Elapsed 11.607161136468251
Percent Complete: 25                 Minutes Elapsed 25.290275502204896
Percent Complete: 30                 Minutes Elapsed 45.55417519013087
Percent Complete: 35                 Minutes Elapsed 80.35993365844091
Percent Complete: 40                 Minutes Elapsed 115.87575832207997
Percent Complete: 45                 Minutes Elapsed 146.51456751823426
Percent Complete: 50                 Minutes Elapsed 191.90222622156142
Percent Complete: 55                 Minutes Elapsed 232.2477931777636
Percent Complete: 60                 Minutes Elapsed 296.9123491247495
Percent Complete: 65                 Minutes Elapsed 374.86166259050367
Percent Com

In [7]:
# Cluster with heuristics.shadow; the result is also pickled by
# cluster_and_pickle.
# NOTE: the recorded output below shows more than 1700 minutes elapsed at
# 80%, so expect this run to take over a day.
shadow_clusters = cluster_and_pickle(heuristics.shadow)

Percent Complete: 0 		Minutes Elapsed: 0.0
Percent Complete: 5 		Minutes Elapsed: 1.2233937621116637
Percent Complete: 10 		Minutes Elapsed: 9.993245673179626
Percent Complete: 15 		Minutes Elapsed: 29.443925416469575
Percent Complete: 20 		Minutes Elapsed: 61.19755054314931
Percent Complete: 25 		Minutes Elapsed: 103.85583473841349
Percent Complete: 30 		Minutes Elapsed: 157.96338029305142
Percent Complete: 35 		Minutes Elapsed: 229.29513103961943
Percent Complete: 40 		Minutes Elapsed: 318.7076974034309
Percent Complete: 45 		Minutes Elapsed: 417.9994752327601
Percent Complete: 50 		Minutes Elapsed: 539.7690722107887
Percent Complete: 55 		Minutes Elapsed: 653.8277641057969
Percent Complete: 60 		Minutes Elapsed: 795.6694319009781
Percent Complete: 65 		Minutes Elapsed: 958.2277228514354
Percent Complete: 70 		Minutes Elapsed: 1305.4008401433628
Percent Complete: 75 		Minutes Elapsed: 1502.1993415157
Percent Complete: 80 		Minutes Elapsed: 1729.45293400685
Percent Complete: 85 		Minu