In [15]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from scipy.sparse import csr_matrix, save_npz
import pickle

## Get each NFT collections

In [16]:
'''
The data is imported from the date folders in the etherscan folder.
'''

dates = ['220518', '220712', '220722', '220731', '220812', '220819', '220828', '220905', '220920']
collections = ['BAYC', 'MEEBITS', 'DOODLES', 'AZUKI', 'COOLCATS']
data_list = []

start_index = 0
for collection in collections:

    # combine data for all dates
    df_collection = pd.concat([pd.read_csv(f'data/etherscan/{date}/{collection}.csv').drop(columns=['confirmations']) for date in dates], ignore_index=True).drop_duplicates()
    print(f'{collection}: ', len(df_collection), 'transactions, ',
            df_collection['tokenID'].nunique(), 'unique tokenIDs')
    
    # map tokenIDs to indices
    tokenID_to_index = {tokenID: index for index, tokenID in enumerate(df_collection['tokenID'].unique())}
    df_collection['tokenID_idx'] = df_collection['tokenID'].map(tokenID_to_index)
    
    # map user address to indices
    user_to_index = {user: index for index, user in enumerate(df_collection['to'].unique())}
    df_collection['user_idx'] = df_collection['to'].map(user_to_index)

    print(df_collection['user_idx'].nunique(), 'unique users')
    print(df_collection['tokenID_idx'].nunique(), 'unique items')
    print(df_collection.shape[0], 'transactions')
    print('')

    # create a sparse matrix 
    # format: csr_matrix((data, (row_idx, col_idx)), [shape=(M, N)])
    # where [row_idx[k], col_idx[k]] is the coordinate of the kth nonzero element
    rows = df_collection['user_idx'].values
    cols = df_collection['tokenID_idx'].values
    ratings = np.array([1]*len(rows))
    sparse_matrix = csr_matrix((ratings, (rows, cols)),
                                    shape=(len(np.unique(rows)), len(np.unique(cols))))
    
    # save saprse matrix
    save_npz(f'data/etherscan/sparse_matrix/sparse_matrix_{collection}.npz', sparse_matrix)
    
    # save tokenID_to_index as pickle file
    with open(f'data/etherscan/pickle/tokenID_to_index_{collection}.pkl', 'wb') as f:
        pickle.dump(tokenID_to_index, f)
    # save user_to_index as pickle file
    with open(f'data/etherscan/pickle/user_to_index_{collection}.pkl', 'wb') as f:
        pickle.dump(user_to_index, f)


BAYC:  22246 transactions,  4329 unique tokenIDs
6358 unique users
4329 unique items
22246 transactions

MEEBITS:  19320 transactions,  5213 unique tokenIDs
6461 unique users
5213 unique items
19320 transactions

DOODLES:  20788 transactions,  5112 unique tokenIDs
7967 unique users
5112 unique items
20788 transactions

AZUKI:  20789 transactions,  4910 unique tokenIDs
6946 unique users
4910 unique items
20789 transactions

COOLCATS:  14514 transactions,  4745 unique tokenIDs
8296 unique users
4745 unique items
14514 transactions



## Combine each NFT collections

In [17]:
dates = ['220518', '220712', '220722', '220731', '220812', '220819', '220828', '220905', '220920']
collections = ['BAYC', 'MEEBITS', 'DOODLES', 'AZUKI', 'COOLCATS']
data_list = []

start_index = 0
for collection in collections:

    # combine data for all dates
    df_collection = pd.concat([pd.read_csv(f'data/etherscan/{date}/{collection}.csv').drop(columns=['confirmations']) for date in dates], ignore_index=True).drop_duplicates()
    print(f'{collection}: ', len(df_collection), 'transactions, ',
            df_collection['tokenID'].nunique(), 'unique tokenIDs')

    # map tokenIDs to indices
    tokenID_to_index = {tokenID: index for index, tokenID in enumerate(df_collection['tokenID'].unique())}
    df_collection['tokenID_idx'] = df_collection['tokenID'].map(tokenID_to_index) + start_index
    start_index += len(df_collection['tokenID'].unique())

    # save data
    data_list.append(df_collection)

df_combined = pd.concat(data_list)

BAYC:  22246 transactions,  4329 unique tokenIDs
MEEBITS:  19320 transactions,  5213 unique tokenIDs
DOODLES:  20788 transactions,  5112 unique tokenIDs
AZUKI:  20789 transactions,  4910 unique tokenIDs
COOLCATS:  14514 transactions,  4745 unique tokenIDs


In [18]:
df_combined

Unnamed: 0,blockNumber,timeStamp,hash,nonce,blockHash,from,contractAddress,to,tokenID,tokenName,tokenSymbol,tokenDecimal,transactionIndex,gas,gasPrice,gasUsed,cumulativeGasUsed,input,tokenID_idx
0,14796711,1652847797,0xd53ac790ef514be48efd810b5aa8aed9148d0acb8c8a...,88,0xfe96291e142addf3f1a7c52269bd2654784edc7eefb6...,0x2a59d2927541d17c0fa19140703ecb4b697b765d,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,0x710f6e3bfc837f451efb2955feedaf8a601bbfb6,321,BoredApeYachtClub,BAYC,0,55,124599,27839268413,114999,3967099,deprecated,0
1,14796711,1652847797,0x95abb416954e9dab82609fc695f47494c615102c80e3...,51,0xfe96291e142addf3f1a7c52269bd2654784edc7eefb6...,0xa282017d04148a8f5623461838aef2dc58054a72,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,0xa8340d926c0ea053b5d8a0cf53aacf4ee4120f86,4785,BoredApeYachtClub,BAYC,0,45,124599,27839268413,110199,3353838,deprecated,1
2,14796615,1652846370,0x3d4ecde65e80f1a876e443fc6c294459b7200ad17ae0...,44,0x3b469831dc7ccc7dfc6c2f0549e2bfbdcf82d251d235...,0x9d11f86874198590c6e168e02000c9da8ffd9baf,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,0xa282017d04148a8f5623461838aef2dc58054a72,4785,BoredApeYachtClub,BAYC,0,50,124587,25101927609,114987,3923802,deprecated,1
3,14796555,1652845681,0x1125d8dc789b62b26a79c8550a64fb5ca3537119ccfd...,334,0xc5d93c96cdef5f69d508d86604efbaa9f429a7af6808...,0x13d8faf4a690f5ae52e2d2c52938d1167057b9af,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,0xa742203fe554fff075077b749f5402c62fc01bba,9355,BoredApeYachtClub,BAYC,0,106,820762,30482663141,508585,5090765,deprecated,2
4,14796536,1652845309,0x5a57f01de114d6c2e10683aa5b608a3e9a50882f97c5...,464,0x855e912a10415285e858ad4d82a588920a08520c5d13...,0x6449bbb550f1049236d803f0e43759999b9a2bee,0xbc4ca0eda7647a8ab7c2061c2e118a18a936f13d,0x13d8faf4a690f5ae52e2d2c52938d1167057b9af,9355,BoredApeYachtClub,BAYC,0,448,1526704,24456836748,1003215,27532255,deprecated,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80500,15478025,1662382811,0xf3253a254c7bb1c75beb1af8016be326ce9fd9d5ce49...,140,0xbe61c2fb33ca98ec9634048b96eaa511a5b789614105...,0x4b139e0e7f57a3400346c6be7bb2d53775a82107,0x1a92f7381b9f03921564a437210bb9396471050c,0x114f1388fab456c4ba31b1850b244eedcd024136,7231,Cool Cats,COOL,0,388,691066,10238310780,568590,27513679,deprecated,24181
80501,15477572,1662376866,0x6e688205ccb684db83b963bb920386291d2d8ad984e2...,73,0x7ee2a3f32b8d482f54372b84eb983f3d36bb1a6c8a37...,0x2a8b0e30093d8dfa7512d7ed2ab30e32fd8a79dc,0x1a92f7381b9f03921564a437210bb9396471050c,0x1e589429fe9f4e5971fb9621ab2dd198bf0c770e,9495,Cool Cats,COOL,0,112,219964,4515992436,205393,13036539,deprecated,23931
80502,15477442,1662375359,0x0c0be056ee9212b90e63ce7caad6df3cdbc4ca2456df...,377,0x3e85df6ce31bdbc0b521bd7fe6c76ae218e7d85bca8f...,0x2f3db2a40c01d1fdd6fd4ac6040684292b8807e8,0x1a92f7381b9f03921564a437210bb9396471050c,0x5d15989394195207534a9ecbf582d712a2d2ebe8,5456,Cool Cats,COOL,0,171,371094,8278941046,329389,15961956,deprecated,22414
80503,15477063,1662370526,0x9381b993ab31901cc6308590d2d029446301b5e0ae9e...,358,0x68f4576fc112080e50ba889e6422b3875d730f4158b7...,0xcbc5cf551ba75496d10727e62dd660efc6a5b347,0x1a92f7381b9f03921564a437210bb9396471050c,0xdf550c82681220762a0d2d5bf6b651a6df6c6480,2224,Cool Cats,COOL,0,56,222273,8131166050,207702,11272712,deprecated,20658


In [None]:
df_combined.to_csv('data/etherscan/combined.csv', index=False)

# Create a sparse user-item matrix

In [19]:
# map user address to indices
user_to_index = {user: index for index, user in enumerate(df_combined['to'].unique())}
df_combined['user_idx'] = df_combined['to'].map(user_to_index)

print(df_combined['user_idx'].nunique(), 'unique users')
print(df_combined['tokenID_idx'].nunique(), 'unique items')
print(df_combined.shape[0], 'transactions')

30714 unique users
24309 unique items
97657 transactions


In [20]:
# create a sparse matrix 
# format: csr_matrix((data, (row_idx, col_idx)), [shape=(M, N)])
# where [row_idx[k], col_idx[k]] is the coordinate of the kth nonzero element

rows = df_combined['user_idx'].values
cols = df_combined['tokenID_idx'].values
ratings = np.array([1]*len(rows))

sparse_matrix = csr_matrix((ratings, (rows, cols)),
                                   shape=(len(np.unique(rows)), len(np.unique(cols))))
sparse_matrix

<30714x24309 sparse matrix of type '<class 'numpy.intc'>'
	with 73729 stored elements in Compressed Sparse Row format>

In [21]:
# save saprse matrix
save_npz('data/etherscan/sparse_matrix/sparse_matrix.npz', sparse_matrix)

# Make it into Pandas DataFrame

In [24]:
def get_user_item_dict(df, name):
    """
    generates a user-item dictionary for one collection
    
    INPUT
        df: (pandas DataFrame) csv file
        name: (str) collection name
    OUTPUT
        user_item: (dictionary)
    """
    user_item = defaultdict(list)
    user_i = []

    items = df["tokenID"].unique()
    for i in tqdm(items):
        user_i = []
        df_i = df[df["tokenID"] == i]
        user_i.extend(df_i["from"].unique())
        user_i.extend(df_i["to"].unique())
        asset_name = name + "/" + str(i)
        user_item[asset_name] = user_i
    return user_item


In [29]:
collections = data_list # [df_bayc, df_meebits, df_doodles, df_azuki, df_coolcats]
coll_name = ["Bayc", "Meebits", "Doodles", "Azuki", "Coolcats"]

user_item_dict = defaultdict(list)

for nft, name in zip(collections, coll_name):
    user_item_dict.update(get_user_item_dict(nft, name))


100%|██████████| 4329/4329 [00:01<00:00, 3233.94it/s]
100%|██████████| 5213/5213 [00:01<00:00, 3274.93it/s]
100%|██████████| 5112/5112 [00:01<00:00, 3236.62it/s]
100%|██████████| 4910/4910 [00:01<00:00, 3253.02it/s]
100%|██████████| 4745/4745 [00:01<00:00, 3329.15it/s]


In [30]:
row = set()
col = list(user_item_dict.keys())
row.update(*user_item_dict.values())
row = list(row)

df_user_item = pd.DataFrame(index=row, columns=col)

for item in tqdm(user_item_dict.keys()): # ta
    users = user_item_dict[item]
    for u in users:
        df_user_item[item][u] = 1

df_user_item = df_user_item.fillna(0)
df_user_item


100%|██████████| 24309/24309 [00:25<00:00, 969.57it/s] 


Unnamed: 0,Bayc/321,Bayc/4785,Bayc/9355,Bayc/7048,Bayc/4194,Bayc/956,Bayc/4263,Bayc/3428,Bayc/6380,Bayc/3168,...,Coolcats/7880,Coolcats/6831,Coolcats/6156,Coolcats/8743,Coolcats/1535,Coolcats/5920,Coolcats/2481,Coolcats/1431,Coolcats/3923,Coolcats/1101
0xc8bbed5b568b2f08f3b775948f047f9dab5cb965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x8dc00a9eed1e0d28cac01e96d050f703e408eb3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x79c0d3a1200cb3cebef74f3ee10aa53407a966cf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xdad6c364209c821b459c3bd337bcb8a542f561ba,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xe9b4a11f3447b908f0680b97982d616af3066ead,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0x8d0731fa064132fb60261158c0ec13865f1dacc5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xcedae06271b8335bd7d96fdb756de2e893a9ba3d,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x943a9e521cb588f3bcf6b21c7b06ba087791b00c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x391c8f2d68c38d1fc60869800e65e8bcdf21cfc8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
