In [19]:
import pandas
from sklearn.preprocessing import StandardScaler

# CSV locations of the NFT collections analyzed in this document
KOIN_path = "./data/KOIN.csv"
BAYC_path = "./data/BAYC.csv"
DOODLES_path = "./data/DOODLES.csv"
MEKA_path = "./data/MEKA.csv"

# One record per collection: the CSV path and the collection's display name.
# The order here fixes the order of every per-collection list built from it.
constants = [
    {'path': KOIN_path, 'name': "KOIN"},
    {'path': BAYC_path, 'name': "BAYC"},
    {'path': DOODLES_path, 'name': "DOODLES"},
    {'path': MEKA_path, 'name': "MEKA"},
]

# Load a single CSV into a data frame
def retrieve_data_from_csv(path):
    """Read the CSV at ``path`` and return it as a DataFrame.

    The ``token_id`` column of the file becomes the frame's index.
    """
    return pandas.read_csv(path, index_col='token_id')

# Load the data frame for every entry in constants
def retrieve_data():
    """Return one DataFrame per entry of ``constants``, in the same order.

    Each frame is loaded from the entry's ``'path'`` via
    ``retrieve_data_from_csv``.
    """
    return [retrieve_data_from_csv(entry['path']) for entry in constants]

# Scale data using StandardScaler
def scale_data(data_frames=None):
    """Standardize every data frame with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    data_frames : iterable of DataFrame, optional
        Frames to scale. When omitted, the module-level ``NFT_dfs`` is used,
        which is what the original zero-argument call did.

    Returns
    -------
    list
        One ``StandardScaler().fit_transform(df)`` result per input frame,
        in input order. A new scaler is fitted for each frame, so frames are
        scaled independently of one another.
    """
    if data_frames is None:
        # Backward-compatible default: fall back to the notebook-level frames.
        data_frames = NFT_dfs

    return [StandardScaler().fit_transform(df) for df in data_frames]

# Load every collection listed in `constants`, then standardize each frame.
# Both lists follow the order of `constants`.
NFT_dfs = retrieve_data()
scaled_dfs = scale_data()

In [20]:
# Some functions necessary to analyze the clusters

# Groups the rows of a data frame by their cluster label
def get_clusters_dict(labels, data_frame):
    """Bucket the rows of ``data_frame`` by cluster label.

    ``labels[i]`` is the label of the i-th row of ``data_frame``.

    Returns a dict mapping each label to a list of per-row dicts, in row
    order, with keys:
      * ``"token_id"``          - the row's index value,
      * ``"rank"``              - the row's 1-based position in the frame,
      * ``"token_information"`` - the row's values (``.iloc[i].values``).
    """
    grouped = {}

    for position, token_id in enumerate(data_frame.index):
        # Look up the label assigned to this row, creating its bucket on
        # first sight, and record the row under it.
        grouped.setdefault(labels[position], []).append({
            "token_id": token_id,
            "rank": position + 1,
            "token_information": data_frame.iloc[position].values
        })

    return grouped

# Generates per-cluster averages of rank and score, plus each cluster's tokens sorted by rank
def analyze_clusters(clusters_dict):
    """Summarize the clusters produced by ``get_clusters_dict``.

    For every cluster (in dict order) this computes:
      * the mean ``rank`` of its tokens,
      * the mean of ``sum(token_information)`` over its tokens,
      * the full list of ``(rank, token_id)`` pairs sorted by rank.

    Returns a dict with:
      * ``"analysis"`` - list of ``{"cluster_#": label + 1, "cluster": pairs}``
        (a label of -1 is therefore reported as cluster 0),
      * ``"avg_df"``   - DataFrame with columns ``Rank Avg`` and
        ``Rank Sum Avg``, one row per cluster.
    """
    analysis = []
    mean_ranks = []
    mean_score_sums = []

    for label, members in clusters_dict.items():
        size = len(members)

        # Totals used for the two averages.
        total_rank = sum(member['rank'] for member in members)
        total_score = sum(sum(member['token_information']) for member in members)

        # Every token of the cluster as (rank, token_id), lowest rank first.
        pairs = [(member['rank'], member['token_id']) for member in members]
        pairs.sort(key=lambda pair: pair[0])

        analysis.append({
            "cluster_#": label + 1,
            "cluster": pairs
        })
        mean_ranks.append(total_rank / size)
        mean_score_sums.append(total_score / size)

    # Load avgs into pandas df
    avg_df = pandas.DataFrame(
        {"Rank Avg": mean_ranks, "Rank Sum Avg": mean_score_sums},
        columns=['Rank Avg', 'Rank Sum Avg'],
    )

    return {"analysis": analysis, "avg_df": avg_df}

# Creates analysis for all data sets
def get_analysis(clusters, data_frames=None):
    """Build the cluster analysis for every data set.

    Parameters
    ----------
    clusters : sequence
        ``clusters[i]`` holds the cluster labels for the i-th data frame.
    data_frames : sequence of DataFrame, optional
        Frames the labels refer to. When omitted, the module-level
        ``NFT_dfs`` is used, which is what the original one-argument call did.

    Returns
    -------
    list of dict
        One ``analyze_clusters`` result per data frame, in frame order.
    """
    if data_frames is None:
        # Backward-compatible default: fall back to the notebook-level frames.
        data_frames = NFT_dfs

    # Index `clusters` by position (rather than zip) so that a labels list
    # shorter than the frames list still raises instead of being truncated.
    return [
        analyze_clusters(get_clusters_dict(clusters[i], data_frame))
        for i, data_frame in enumerate(data_frames)
    ]

### DBSCAN

In [21]:
from sklearn.cluster import DBSCAN

def get_DBSCAN_clusters(embeddings):
    """Cluster each embedding with DBSCAN and return the label arrays.

    A fresh ``DBSCAN(eps=.5)`` (all other parameters left at their defaults)
    is fitted per embedding; the result list follows the input order.
    """
    return [DBSCAN(eps=.5).fit_predict(points) for points in embeddings]

# Cluster every standardized collection with DBSCAN, then summarize the
# resulting clusters per collection.
DBSCAN_clusters = get_DBSCAN_clusters(scaled_dfs)
DBSCAN_analysis = get_analysis(DBSCAN_clusters)

### KMeans

In [22]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Shared KMeans settings for every k tried below.
# `random_state` is pinned because `init="random"` draws the starting
# centroids at random; without a seed the chosen labels (and even the
# selected k) can differ on every run of the notebook.
kmeans_kwargs = {
    "init": "random",
    "n_init": 50,
    "max_iter": 500,
    "random_state": 42,
}

def get_KMeans_labels_via_silhouette(embeddings):
    """Pick KMeans labels per embedding using the silhouette score.

    For each embedding, KMeans is fitted for every k in 10..19 (with the
    shared ``kmeans_kwargs``) and the labels of the fit with the highest
    silhouette score are kept; ties keep the smaller k. Returns one label
    array per embedding, in input order.
    """
    kmeans_labels = []

    for embedding in embeddings:
        # -1 is the sentinel every real score has to beat; 0 is the
        # placeholder kept if no fit ever scores above it.
        top_score, top_labels = -1, 0

        for n_clusters in range(10, 20):
            model = KMeans(n_clusters=n_clusters, **kmeans_kwargs)
            model.fit(embedding)
            candidate = silhouette_score(embedding, model.labels_)

            if candidate > top_score:
                top_score, top_labels = candidate, model.labels_

        kmeans_labels.append(top_labels)

    return kmeans_labels

# Cluster every standardized collection with KMeans (k chosen by silhouette
# score), then summarize the resulting clusters per collection.
KMean_clusters = get_KMeans_labels_via_silhouette(scaled_dfs)
KMean_analysis = get_analysis(KMean_clusters)

### Agglomerative

In [None]:
from sklearn.cluster import AgglomerativeClustering

def get_Agglomerative_labels_via_silhouette(embeddings):
    """Pick agglomerative-clustering labels per embedding by silhouette score.

    For each embedding, ward-linkage ``AgglomerativeClustering`` is fitted
    for every k in 10..19 and the labels of the fit with the highest
    silhouette score are kept; ties keep the smaller k. Returns one label
    array per embedding, in input order.
    """
    agglomerative_labels = []

    for embedding in embeddings:
        # -1 is the sentinel every real score has to beat; 0 is the
        # placeholder kept if no fit ever scores above it.
        best_score = -1
        labels = 0

        for k in range(10, 20):
            # The distance keyword is deliberately not passed: ward linkage
            # only works with Euclidean distances, which is the estimator's
            # default, and the old `affinity=` keyword was deprecated in
            # scikit-learn 1.2 and removed in 1.4 (renamed to `metric`).
            # Omitting it gives the same clustering on every version.
            agglomerative_model = AgglomerativeClustering(n_clusters=k, linkage='ward')
            agglomerative_model.fit(embedding)
            score = silhouette_score(embedding, agglomerative_model.labels_)

            if score > best_score:
                best_score = score
                labels = agglomerative_model.labels_

        agglomerative_labels.append(labels)

    return agglomerative_labels

# Cluster every standardized collection with agglomerative clustering
# (k chosen by silhouette score), then summarize the clusters per collection.
Agglomerative_clusters = get_Agglomerative_labels_via_silhouette(scaled_dfs)
Agglomerative_analysis = get_analysis(Agglomerative_clusters)

In [24]:
def print_clusters(analysis):
    """Print a cluster analysis list in a readable form.

    ``analysis`` is the ``"analysis"`` list returned by ``analyze_clusters``:
    the number of clusters is printed first, then for each cluster its
    ``cluster_#`` followed by one line per ``(rank, token_id)`` pair.
    """
    print("Length: {}".format(len(analysis)))
    for entry in analysis:
        print("Cluster {}".format(entry['cluster_#']))
        for pair in entry['cluster']:
            print("\t Rank: {}\t TokenId: {}".format(pair[0], pair[1]))

In [34]:
# Position of the collection to inspect, in `constants` order
# (3 is the fourth entry, "MEKA").
data_to_analyze = 3
print_clusters(DBSCAN_analysis[data_to_analyze]['analysis'])

Length: 81
Cluster 0
	 Rank: 1	 TokenId: 8597
	 Rank: 2	 TokenId: 6273
	 Rank: 3	 TokenId: 6242
	 Rank: 4	 TokenId: 1922
	 Rank: 5	 TokenId: 1784
	 Rank: 6	 TokenId: 2423
	 Rank: 7	 TokenId: 4370
	 Rank: 8	 TokenId: 3139
	 Rank: 9	 TokenId: 7823
	 Rank: 10	 TokenId: 396
	 Rank: 11	 TokenId: 1924
	 Rank: 12	 TokenId: 3581
	 Rank: 13	 TokenId: 5162
	 Rank: 14	 TokenId: 945
	 Rank: 15	 TokenId: 848
	 Rank: 16	 TokenId: 1564
	 Rank: 17	 TokenId: 1754
	 Rank: 18	 TokenId: 2742
	 Rank: 19	 TokenId: 7075
	 Rank: 20	 TokenId: 2764
	 Rank: 21	 TokenId: 6599
	 Rank: 22	 TokenId: 5435
	 Rank: 23	 TokenId: 6674
	 Rank: 24	 TokenId: 2232
	 Rank: 25	 TokenId: 3699
	 Rank: 26	 TokenId: 7313
	 Rank: 27	 TokenId: 7616
	 Rank: 28	 TokenId: 1633
	 Rank: 29	 TokenId: 8395
	 Rank: 30	 TokenId: 7709
	 Rank: 31	 TokenId: 610
	 Rank: 32	 TokenId: 1492
	 Rank: 33	 TokenId: 6430
	 Rank: 34	 TokenId: 5503
	 Rank: 35	 TokenId: 5661
	 Rank: 36	 TokenId: 2194
	 Rank: 37	 TokenId: 3995
	 Rank: 38	 TokenId: 2009
	 Ra

In [35]:
# KMeans clusters for the collection selected by `data_to_analyze`.
print_clusters(KMean_analysis[data_to_analyze]['analysis'])

Length: 17
Cluster 4
	 Rank: 1	 TokenId: 8597
	 Rank: 2	 TokenId: 6273
	 Rank: 3	 TokenId: 6242
	 Rank: 4	 TokenId: 1922
Cluster 1
	 Rank: 5	 TokenId: 1784
	 Rank: 6	 TokenId: 2423
	 Rank: 7	 TokenId: 4370
	 Rank: 8	 TokenId: 3139
	 Rank: 9	 TokenId: 7823
	 Rank: 12	 TokenId: 3581
	 Rank: 18	 TokenId: 2742
	 Rank: 19	 TokenId: 7075
	 Rank: 20	 TokenId: 2764
	 Rank: 24	 TokenId: 2232
	 Rank: 35	 TokenId: 5661
	 Rank: 39	 TokenId: 1175
	 Rank: 42	 TokenId: 3503
	 Rank: 52	 TokenId: 519
	 Rank: 56	 TokenId: 1360
	 Rank: 58	 TokenId: 6840
	 Rank: 59	 TokenId: 7922
	 Rank: 66	 TokenId: 3636
	 Rank: 68	 TokenId: 7577
	 Rank: 75	 TokenId: 5930
	 Rank: 77	 TokenId: 8060
	 Rank: 84	 TokenId: 1435
	 Rank: 89	 TokenId: 3489
	 Rank: 91	 TokenId: 8108
	 Rank: 101	 TokenId: 1467
	 Rank: 102	 TokenId: 3644
	 Rank: 103	 TokenId: 740
	 Rank: 154	 TokenId: 6786
	 Rank: 185	 TokenId: 4471
	 Rank: 186	 TokenId: 6774
	 Rank: 192	 TokenId: 4355
Cluster 12
	 Rank: 10	 TokenId: 396
	 Rank: 11	 TokenId: 1924
	

In [36]:
# Agglomerative clusters for the collection selected by `data_to_analyze`.
print_clusters(Agglomerative_analysis[data_to_analyze]['analysis'])

Length: 16
Cluster 16
	 Rank: 1	 TokenId: 8597
	 Rank: 2	 TokenId: 6273
	 Rank: 3	 TokenId: 6242
	 Rank: 4	 TokenId: 1922
Cluster 7
	 Rank: 5	 TokenId: 1784
	 Rank: 6	 TokenId: 2423
	 Rank: 7	 TokenId: 4370
	 Rank: 8	 TokenId: 3139
	 Rank: 9	 TokenId: 7823
	 Rank: 12	 TokenId: 3581
	 Rank: 18	 TokenId: 2742
	 Rank: 19	 TokenId: 7075
	 Rank: 20	 TokenId: 2764
	 Rank: 24	 TokenId: 2232
	 Rank: 35	 TokenId: 5661
	 Rank: 39	 TokenId: 1175
	 Rank: 42	 TokenId: 3503
	 Rank: 52	 TokenId: 519
	 Rank: 56	 TokenId: 1360
	 Rank: 58	 TokenId: 6840
	 Rank: 59	 TokenId: 7922
	 Rank: 66	 TokenId: 3636
	 Rank: 68	 TokenId: 7577
	 Rank: 75	 TokenId: 5930
	 Rank: 77	 TokenId: 8060
	 Rank: 84	 TokenId: 1435
	 Rank: 89	 TokenId: 3489
	 Rank: 91	 TokenId: 8108
	 Rank: 101	 TokenId: 1467
	 Rank: 102	 TokenId: 3644
	 Rank: 103	 TokenId: 740
	 Rank: 154	 TokenId: 6786
	 Rank: 185	 TokenId: 4471
	 Rank: 186	 TokenId: 6774
	 Rank: 192	 TokenId: 4355
Cluster 15
	 Rank: 10	 TokenId: 396
	 Rank: 11	 TokenId: 1924
