# Create dataset
This code preprocesses the raw data of transaction interactions, item features, and user features that we collected from four NFT collections into a format that can be inputted into the model. The preprocessed data has been uploaded to Google Drive. The main points are as follows:
- 1. **User filtering**: Filter only users who have made at least 5 transactions.
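- 2. **Per-user random split**: Randomly sample 40% of each user's interactions as held-out data, which is then divided equally into the validation and test sets; the remaining interactions form the training set.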

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse
import os
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pickle

In [2]:
# Choose the NFT collection to preprocess.
# Supported names: 'bayc', 'coolcats', 'doodles', 'meebits'.
COLLECTION = 'bayc'

# Raw transaction log of the selected collection.
df_collection = pd.read_csv(f"dataset/transactions/{COLLECTION}.csv")

# Output folder for every artifact written below; created if it is missing.
save_path = f'dataset/collections/{COLLECTION}/'
os.makedirs(save_path, exist_ok=True)

In [3]:
# Data preprocessing (filtering): keep only the interactions whose item and
# whose buyer both have feature data.

print(f"Number of interactions before filtering: {len(df_collection)}")

# 1) (disabled) drop duplicated interactions, i.e. rows with identical Buyer and Token ID
# df_collection = df_collection.drop_duplicates(subset=['Buyer', 'Token ID'], keep='first')

# 2) Keep only items that are present in all four item-feature files.
#   For reference, 'price' and 'transaction' files include only items that were first traded before 2023.
image = pd.read_csv(f'dataset/features_item/{COLLECTION}_img.csv', index_col=0)
text = pd.read_csv(f'dataset/features_item/{COLLECTION}_txt.csv', index_col=0)
price = pd.read_csv(f'dataset/features_item/{COLLECTION}_prices.csv', index_col=0)
transaction = pd.read_csv(f'dataset/features_item/{COLLECTION}_txns.csv', index_col=0)
# Token IDs shared by every feature table
indices = set(image.index) & set(text.index) & set(price.index) & set(transaction.index)
has_item_features = df_collection['Token ID'].isin(indices)
df_collection = df_collection[has_item_features]

# 3) Keep only buyers that are listed in the user-features file.
df_feature = pd.read_csv('dataset/features_user/user_features.csv', index_col=0) #.drop(['Unnamed: 0'], axis=1)
known_buyers = df_feature['Buyer']
has_user_features = df_collection['Buyer'].isin(known_buyers)
df_collection = df_collection[has_user_features]

print(f"Number of interactions after filtering: {len(df_collection)}")

Number of interactions before filtering: 29972
Number of interactions after filtering: 29529


## interactions ('inter')
- input
    - NFT transactions data in 'transactions' folder, collected from Etherscan NFT tracker (https://etherscan.io/nfttracker)
- output
    - An .npy formatted interaction file (user, item, label)

In [4]:
# USER_CUT: keep only the users (buyers) that appear in at least CUT transactions.
CUT = 5

# Size of the interaction table before the cut
print(f"Number of transactions before USER CUT: {len(df_collection)}")

# Buyers with CUT or more transactions
user_count = df_collection['Buyer'].value_counts()
user_count = user_count[user_count >= CUT]
user_count = user_count.index.tolist()

# Drop every transaction whose buyer did not reach the cut
df_collection = df_collection[df_collection['Buyer'].isin(user_count)]

# Size of the interaction table after the cut
print(f"Number of transactions after USER CUT: {len(df_collection)}")

Number of transactions before USER CUT: 29529
Number of transactions before USER CUT: 13763


In [5]:
# Generate price labels, for later use in multi-objective training.

# Keep only rows whose 'Price' text contains a '$'. regex=False matches the
# literal character (the previous pattern "\$" relied on an invalid string
# escape) and na=False treats a missing price as "no dollar amount".
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of an earlier DataFrame.
has_dollar = df_collection['Price'].str.contains('$', regex=False, na=False)
df_collection = df_collection[has_dollar].copy()


def _parse_price(raw):
    """Return the digits of the third space-separated token of *raw*.

    The token loses its first two characters and its last character, then
    every ',' and '.' is removed.
    """
    # NOTE(review): presumably the token looks like '($1,234.56)', i.e. the
    # USD amount rather than the ETH value - confirm against the raw CSV.
    # Removing '.' only keeps amounts comparable if every amount has the same
    # number of decimals - confirm.
    return raw.split(' ')[2][2:-1].replace(',', '').replace('.', '')


df_collection['Price'] = df_collection['Price'].apply(_parse_price).astype(float)

# 'Price_diff' is 1 when the following row of the same 'Token ID' has a
# strictly higher price, otherwise 0 (also 0 for the last row of each token).
# NOTE(review): assumes the rows of each token are in chronological order, and
# "following row" means the next row that survived the earlier filters - confirm.
next_price = df_collection.groupby('Token ID')['Price'].shift(-1)
df_collection['Price_diff'] = (next_price > df_collection['Price']).astype('int64')

# Parallel arrays describing each interaction
user = df_collection['Buyer'].values
item = df_collection['Token ID'].values
labels = df_collection['Price_diff'].values
data = (user, item, labels)

# save as npy file
np.save(save_path + f'{COLLECTION}.npy', data)

# print user length and item length
num_user = len(set(user))
num_item = len(set(item))
print('user length: ', num_user)
print('item length: ', num_item)
print('inter length: ', len(labels))

# save user length and item length as a dictionary
# (named so that the builtin `dict` is not shadowed)
num_user_item = {'num_user': num_user, 'num_item': num_item}
np.save(save_path + 'num_user_item.npy', num_user_item)

user length:  1230
item length:  6726
inter length:  13737


In [201]:
# *For RecBole*
# Export the filtered transactions so that the baseline experiments in RecBole
# run on the same interactions.

# NOTE(review): machine-specific absolute path - adjust to the local RecBole checkout.
recbole_path = '/home/felab1/workspace/LEE/RecBole/dataset/transactions/'
# to_csv does not create missing folders, so make sure the target exists
os.makedirs(recbole_path, exist_ok=True)
df_collection.reset_index(drop=True).to_csv(recbole_path + f'{COLLECTION}.csv', index=False)

In [202]:
# Ensure that the indices for the user and item do not overlap with each other.
# We map indices using dicts where the key is the original id and the value is the new index.
#
# Item indices are mapped to the range [0, num_item).
# User indices are shifted by num_item.
# For example,
#     Before:
#         item: [5, 6, 8, 9, 10, 13, 15, 20, 21, 29]
#         user: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#     After:
#         item: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#         user: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

# 1) Map the item idx: start from 0 (sorted unique Token IDs).
item_unique = np.unique(item)
num_item = len(item_unique)  # computed once instead of on every loop iteration
mapping_i = {token_id: idx for idx, token_id in enumerate(item_unique)}

# 2) Map the user idx: start from num_item.
#   Sorted unique addresses are numbered from 0
#   (e.g., 0x9137a5d195f0ab57e428c5a2be9bc8c4620445cb -> 0), then shifted by num_item.
user_unique = np.unique(user)
mapping_u = {address: idx + num_item for idx, address in enumerate(user_unique)}

# 3) Create inter: one row per interaction, columns (user index, item index, label)
user_ = np.array([mapping_u[u] for u in user])
item_ = np.array([mapping_i[i] for i in item])
inter = np.column_stack((user_, item_, labels)).astype(np.int64)
print('num of interactions: ', inter.shape)
inter

num of interactions:  (15708, 3)


array([[7389,  831,    0],
       [6959, 1532,    1],
       [7677,  832,    1],
       ...,
       [6874, 4276,    0],
       [6957, 6224,    0],
       [7893, 3510,    0]])

## user features (user_feat.npy)
- input
    - User features data in 'features_user' folder, collected and preprocessed from transactions file
- output
    - An .npy formatted user features file ('# of transactions', 'Avg transaction price', 'holding period')

In [203]:
# read the user features file
df_feature = pd.read_csv('dataset/features_user/user_features.csv', index_col=0) #.drop(['Unnamed: 0'], axis=1)

# scale the numeric feature columns to [0, 1] with MinMaxScaler
# (fitted on every row of the file, i.e. before restricting to this collection's users)
scaler = MinMaxScaler()
names = ['# of transactions', 'Avg transaction price', 'holding period']
df_feature[names] = scaler.fit_transform(df_feature[names])

# translate the 'Buyer' address into the user index from mapping_u;
# addresses that are not in mapping_u become NaN
df_feature['Buyer'] = df_feature['Buyer'].map(mapping_u)
# dropna() removes the unmapped buyers, and also any row with a missing value
# in another column; .copy() keeps the assignment below off a derived frame
df_feature = df_feature.dropna().copy()
# convert column 'Buyer' to int
df_feature['Buyer'] = df_feature['Buyer'].astype(int)
print('num_user: ', len(df_feature))

# set 'Buyer' as index. np.save keeps only the values (the index is not stored),
# so row position is the only link back to a user: order the rows by ascending
# user index instead of leaving them in CSV order.
df_feature = df_feature.set_index('Buyer').sort_index(kind='mergesort')

# save df as npy file
np.save(save_path+'user_feat.npy', df_feature, allow_pickle=True)

num_user:  1647


## data split (train.npy, val.npy, test.npy)

In [205]:
# data split: hold out 40% of each user's interactions as validation and test data

# One seed for both random steps so the split is reproducible across runs.
SPLIT_SEED = 42
rng = np.random.default_rng(SPLIT_SEED)

# for each user, randomly pick 40% (rounded down) of that user's rows of `inter`
valid_and_test = []
random_idx_list = []
for u in np.unique(inter[:, 0]):
    user_rows = np.where(inter[:, 0] == u)[0]  # row positions of this user
    num_sample = int(len(user_rows) * 0.4)  # 40% of the number of transactions
    random_idx = rng.choice(user_rows, num_sample, replace=False)
    valid_and_test.extend(inter[random_idx])
    random_idx_list.extend(random_idx)
valid_and_test = np.array(valid_and_test)

# train: every row of inter that was not held out
train = np.delete(inter, random_idx_list, axis=0)
# row positions of the training rows, sorted so that train_idx_list[k] is the
# position in inter of train[k] (np.delete keeps the original row order)
train_idx_list = sorted(set(range(len(inter))) - set(random_idx_list))

# valid, test: split the held-out row positions 5:5
valid_idx_list, test_idx_list = train_test_split(random_idx_list, test_size=0.5, random_state=SPLIT_SEED)
valid = inter[valid_idx_list]
test = inter[test_idx_list]

# get ratio of train/inter, in percentage
print(f'Train ratio: {len(train)/len(inter)*100:.2f}%')
print(f'Valid and Test ratio: {len(valid_and_test)/len(inter)*100:.2f}%')

Train ratio: 63.48%
Valid and Test ratio: 36.52%


In [206]:
# *For RecBole*
# Store the row positions of the train, validation, and test sets so that the
# baseline experiments in RecBole can reuse exactly the same split.

# list of three lists: row positions of the train, validation, and test sets
indices = [train_idx_list, valid_idx_list, test_idx_list]

# save indices as pkl file
# NOTE(review): machine-specific absolute path - adjust to the local RecBole checkout.
recbole_path = f'/home/felab1/workspace/LEE/RecBole/dataset/collections/{COLLECTION}/'
# open() does not create missing folders, so make sure the target exists
os.makedirs(recbole_path, exist_ok=True)
with open(recbole_path + 'split_indices.pkl', 'wb') as f:
    pickle.dump(indices, f)

In [156]:
# preprocessing valid data:
# group the validation rows by user -> {user index: [item index, ...]}
valid_dict = {}
for user_id, item_id in valid[:, :2]:
    valid_dict.setdefault(user_id, []).append(item_id)

# preview the first five (user, items) pairs
list(valid_dict.items())[:5]

[(7075, [283, 174]),
 (7370, [113, 4191]),
 (6789, [4853, 2111]),
 (7306, [2220, 6324]),
 (6833, [3878, 6238, 416, 419])]

In [157]:
# Extract the validation item indices ordered from the most traded (popular) one.

# flatten every user's item list into one list
valid_list = [item_id for items in valid_dict.values() for item_id in items]

# occurrences per item, sorted in descending order
item_frequency = pd.Series(valid_list).value_counts()
value_counts = item_frequency.sort_values(ascending=False)

# the item indices in popularity order
indices = value_counts.index

# save indices as npy
np.save(save_path+'indices_valid.npy', indices, allow_pickle=True)

In [158]:
# Convert to the form required by the model: the user comes first, followed by the user's items
# e.g., 12656: [7314, 4820, 6304] -> list([12656, 7314, 4820, 6304])

# object array with one (variable-length) list per user
valid_array = np.empty(len(valid_dict), dtype=object)

# fill slot by slot so that each element stays a plain Python list
for row, (user_id, item_ids) in enumerate(valid_dict.items()):
    valid_array[row] = [user_id, *item_ids]

# preview the first five entries
valid_array[:5]

array([list([7075, 283, 174]), list([7370, 113, 4191]),
       list([6789, 4853, 2111]), list([7306, 2220, 6324]),
       list([6833, 3878, 6238, 416, 419])], dtype=object)

In [159]:
# preprocessing test data:
# group the test rows by user -> {user index: [item index, ...]}
test_dict = {}
for user_id, item_id in test[:, :2]:
    test_dict.setdefault(user_id, []).append(item_id)

# preview the first five (user, items) pairs
list(test_dict.items())[:5]

[(6967, [6227, 1284]),
 (7560, [479, 3184, 1000, 3181]),
 (6928, [3063, 909]),
 (7538, [6371, 2008]),
 (6737, [1982])]

In [160]:
# Extract the test item indices ordered from the most traded (popular) one.

# flatten every user's item list into one list
test_list = [item_id for items in test_dict.values() for item_id in items]

# occurrences per item, sorted in descending order
item_frequency = pd.Series(test_list).value_counts()
value_counts = item_frequency.sort_values(ascending=False)

# the item indices in popularity order
indices = value_counts.index

# save indices as npy
np.save(save_path+'indices_test.npy', indices, allow_pickle=True)

In [161]:
# Convert to the form required by the model: the user comes first, followed by the user's items
# e.g., 12656: [7314, 4820, 6304] -> list([12656, 7314, 4820, 6304])

# object array with one (variable-length) list per user
test_array = np.empty(len(test_dict), dtype=object)

# fill slot by slot so that each element stays a plain Python list
for row, (user_id, item_ids) in enumerate(test_dict.items()):
    test_array[row] = [user_id, *item_ids]

# preview the first five entries
test_array[:5]

array([list([6967, 6227, 1284]), list([7560, 479, 3184, 1000, 3181]),
       list([6928, 3063, 909]), list([7538, 6371, 2008]),
       list([6737, 1982])], dtype=object)

In [162]:
# Write the three splits next to the other artifacts in save_path:
# the training rows plus the per-user validation / test lists.
for split_file, split_data in (
    ('train.npy', train),
    ('val.npy', valid_array),
    ('test.npy', test_array),
):
    np.save(save_path+split_file, split_data, allow_pickle=True)

## adjacency matrix (adj_dict.npy)

In [129]:
# inter columns: 0 = user, 1 = item

# group the items by user -> {user: [item, ...]}
# NOTE(review): built from all rows of `inter`, which includes the rows held
# out for validation/test - confirm this is intended by the consumer.
adj_dict = {}
for user_id, item_id in inter[:, :2]:
    adj_dict.setdefault(user_id, []).append(item_id)

# preview the first five (user, items) pairs
print(list(adj_dict.items())[:5])

# save adj_dict as npy file (stored as a pickled object)
np.save(save_path+'adj_dict.npy', adj_dict, allow_pickle=True)

[(6710, [2395, 1652, 6064, 4108, 381]), (7019, [6597, 56, 3208, 314, 3477, 1547, 1087, 3086, 5349, 3239, 1332, 6461]), (7150, [2368, 1798, 6532, 3313, 337, 10, 1692, 1693, 1069, 2969, 6657, 3430, 4603, 5094, 151, 4050, 3562, 1261, 5994, 1789, 883, 3514, 4029, 5517, 2165, 2212, 689, 1310, 4129, 1600, 2722, 3157, 6122, 4244, 3752, 205, 5301, 6230, 3170, 4178, 437, 2509, 5390, 2230, 3391, 5943, 121, 2772, 4592, 5047, 841, 3636, 13, 3603, 4048, 4980, 4912, 527, 5655, 5682, 38, 1969, 5023, 3391, 6649, 4980, 4570, 4923, 4668, 6225, 1602, 1295, 5850, 3081, 5654, 6331, 5476, 1617, 4003, 1824, 1649, 6497, 3529, 3073, 2782, 5763, 6166, 3382, 5115, 1721, 2468, 6387, 4569, 789, 3553, 5031, 2758, 820, 2823, 4440, 1093, 3423, 2162, 4555, 6177, 248, 4351, 6050, 2433, 6554, 6531, 2093, 1508, 4404, 2538, 1708, 5415, 1860, 2713, 2664, 6255, 2399, 1571, 4101, 2977, 6069, 470, 855, 328, 2987, 2586, 3812, 2310, 4513, 6155, 6577, 6416, 1108, 5473, 1111, 6356, 5669, 3998, 2478, 971, 482, 5654, 4340, 5839, 37

## item features (feat.npy)
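Item feature rows are re-indexed with the same item mapping (`mapping_i`) that is used for the interactions, so the original Token IDs do not need to be matched against `inter` when the features are used.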

- input
    - Item features data in 'features_item' folder, collected and preprocessed from OpenSea
- output
    - An .npy formatted item features file (image, text, price, transaction)

In [130]:
# shapes of the item feature tables as loaded
print('Before')
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)
print('')

# Keep only the items that appear in the inter:
# restrict every feature table to the Token IDs found in `item`
item_unique = np.unique(item)
image, text, price, transaction = (
    feat.loc[feat.index.isin(item_unique)]
    for feat in (image, text, price, transaction)
)

# Change the item index to start from 0:
# relabel the rows from Token ID to item index through mapping_i
for feat in (image, text, price, transaction):
    feat.index = feat.index.map(mapping_i)

# shapes after filtering
print('After')
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)

Before
image shape:  (20000, 1024)
text shape:  (20000, 1800)
price shape:  (9682, 64)
transaction shape:  (9682, 64)

After
image shape:  (6693, 1024)
text shape:  (6693, 1800)
price shape:  (6693, 64)
transaction shape:  (6693, 64)


In [131]:
# np.save keeps only the values of a DataFrame (the index is not stored), so
# row position is the only link between a feature row and its item. Order all
# four tables by index first so that their rows line up with each other.
image = image.sort_index(kind='mergesort')
text = text.sort_index(kind='mergesort')
price = price.sort_index(kind='mergesort')
transaction = transaction.sort_index(kind='mergesort')

# every table must describe exactly the same items, in the same row order
for feat_name, feat in (('text', text), ('price', price), ('transaction', transaction)):
    if not np.array_equal(image.index.values, feat.index.values):
        raise ValueError(f"item indices of the '{feat_name}' features do not match the image features")

# save df as npy file
np.save(save_path+'image_feat.npy', image)
np.save(save_path+'text_feat.npy', text)
np.save(save_path+'price_feat.npy', price)
np.save(save_path+'transaction_feat.npy', transaction)