# Import packages

In [4]:
import pandas as pd
import numpy as np
import scipy.sparse
import os
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Name of the NFT collection to preprocess.
# Choose one of: 'azuki', 'bayc', 'coolcats', 'doodles', 'meebits'.
COLLECTION = 'azuki'

# Raw transaction table of the selected collection
transactions_csv = f"dataset/transactions/{COLLECTION}.csv"
df_azuki = pd.read_csv(transactions_csv)

In [70]:
"""
Data preprocessing (filtering)
"""

# 1) drop duplicated interactions (i.e., drop rows that Buyer and Token ID are identical)
df_azuki = df_azuki.drop_duplicates(subset=['Buyer', 'Token ID'], keep='first')

# 2) Exclude items that we do not have features data for.
#   That is, Only items that exist in the item features file will be left.
#   For reference, all items existing in the transaction data were first traded before September 2022.
image = pd.read_csv(f'dataset/item_features/{COLLECTION}_image.csv', index_col=0)
text = pd.read_csv(f'dataset/item_features/{COLLECTION}_text.csv', index_col=0)
price = pd.read_csv(f'dataset/item_features/{COLLECTION}_prices.csv', index_col=0)
transaction = pd.read_csv(f'dataset/item_features/{COLLECTION}_transactions.csv', index_col=0)
df_azuki = df_azuki[df_azuki['Token ID'].isin(image.index)]
df_azuki = df_azuki[df_azuki['Token ID'].isin(text.index)]
df_azuki = df_azuki[df_azuki['Token ID'].isin(price.index)]
df_azuki = df_azuki[df_azuki['Token ID'].isin(transaction.index)]

# Create interactions ('inter')
- input
    - NFT transactions data in 'transactions' folder, collected from Etherscan NFT tracker (https://etherscan.io/nfttracker)
- output
    - An .npy formatted interaction file (user, item, label)

In [71]:
# Directory that receives every file generated for this collection.
save_path = 'dataset/collections/'+COLLECTION+'/'
# exist_ok=True creates the directory only when it is missing, which makes the
# cell safe to re-run and removes the gap between "check" and "create".
os.makedirs(save_path, exist_ok=True)

In [43]:
# """
# ITEM_CUT: Only items that have been traded CUT times or more will be used.
# """
# CUT = 5

# # print len of df_azuki
# print(f"ITEM CUT 전 거래 개수: {len(df_azuki)}")

# # get the list of "Token ID" values that occur at least CUT times
# item_count = df_azuki['Token ID'].value_counts()
# item_count = item_count[item_count >= CUT]
# item_count = item_count.index.tolist()

# # drop rows whose "Token ID" is not in item_count
# df_azuki = df_azuki[df_azuki['Token ID'].isin(item_count)]

# # print len of df_azuki
# print(f"ITEM CUT 후 거래 개수: {len(df_azuki)}")

In [44]:
# """
# USER_CUT: Only users who appear as 'Buyer' CUT times or more will be used.
# """
# CUT = 5

# # print len of df_azuki
# print(f"USER CUT 전 거래 개수: {len(df_azuki)}")

# # get the list of "Buyer" values that occur at least CUT times
# user_count = df_azuki['Buyer'].value_counts()
# user_count = user_count[user_count >= CUT]
# user_count = user_count.index.tolist()

# # drop rows whose "Buyer" is not in user_count
# df_azuki = df_azuki[df_azuki['Buyer'].isin(user_count)]

# # print len of df_azuki
# print(f"USER CUT 후 거래 개수: {len(df_azuki)}")

In [72]:
"""
Generate price labels, for later use of multi-objectives training
"""

# drop rows where 'Price' does not contain '$'
df_azuki = df_azuki[df_azuki['Price'].str.contains("\$")]
# convert 'Price' to the value before 'ETH'
df_azuki['Price'] = df_azuki['Price'].apply(lambda x: x.split(' ')[2][2:-1].replace(',', '').replace('.', ''))
df_azuki['Price'] = df_azuki['Price'].astype(float)

# create a new variable 'Price_diff' which is the difference between the future price and the current price 
# get price differences from the same 'Token ID'
df_azuki['Price_diff'] = df_azuki.groupby('Token ID')['Price'].diff(-1)
# convert rows where 'Price_diff' is NaN into 0
df_azuki['Price_diff'] = df_azuki['Price_diff'].fillna(0)
# put minus to Price_diff
df_azuki['Price_diff'] = df_azuki['Price_diff'].apply(lambda x: -x)
# convert 'Price_diff' to 1 if the value is greater than 0, otherwise 0
df_azuki['Price_diff'] = df_azuki['Price_diff'].apply(lambda x: 1 if x > 0 else 0)

# create an np.array with 'Buyer'
user = df_azuki['Buyer'].values
item = df_azuki['Token ID'].values
labels = df_azuki['Price_diff'].values

data = (user, item, labels)

# save and read npy file
np.save(save_path + f'{COLLECTION}.npy', data)
azuki = np.load(save_path + COLLECTION+'.npy', allow_pickle=True)

In [73]:
# Rows of the loaded array are, in order: users, items, labels
user = azuki[0]
item = azuki[1]
labels = azuki[2]

# Count distinct users / items once and reuse the numbers below
n_unique_users = len(set(user))
n_unique_items = len(set(item))

# print user length and item length
print('user length: ', n_unique_users)
print('item length: ', n_unique_items)
print('inter length: ', len(labels))

# Save the counts as a dictionary.
# The variable is deliberately not called `dict`, which would shadow the
# builtin for the rest of the session.
num_user_item = {'num_user': n_unique_users, 'num_item': n_unique_items}
np.save(save_path + 'num_user_item.npy', num_user_item)

user length:  11811
item length:  8386
inter length:  22987


In [74]:
"""
Change the user addresses to integers starting from 0.
e.g., 0x9137a5d195f0ab57e428c5a2be9bc8c4620445cb -> 0
"""

# create a dict where keys are user and values are new indices starting from 0
user_unique = np.unique(user)
mapping_u = {}
for i in range(len(user_unique)):
    mapping_u[user_unique[i]] = i

# apply mapping to user
user = np.array([mapping_u[u] for u in user])

# create a 2D np.array where first columns are users and second column is items
inter = np.array([user, item, labels]).T
# convert inter type as int64
inter = inter.astype(np.int64)
inter

array([[5457, 1430,    0],
       [3360,  845,    1],
       [7595, 1431,    1],
       ...,
       [7166, 6322,    0],
       [1987, 3792,    0],
       [9862, 9251,    0]], dtype=int64)

In [76]:
"""
TEMPORAL
TEMPORAL
TEMPORAL
TEMPORAL
TEMPORAL

Split the data into train, validation, and test sets.
And get indices of train, validation, and test sets.
"""

inter_len = len(inter)
# get random indices
indices = np.random.permutation(inter_len)
# split indices into train, validation, and test sets
train_indices = indices[:int(inter_len*0.8)]
val_indices = indices[int(inter_len*0.8):int(inter_len*0.9)]
test_indices = indices[int(inter_len*0.9):]

# create a list of lists, where each list contains indices of train, validation, and test sets
indices = [list(train_indices), list(val_indices), list(test_indices)]

# save indices as pkl file
import pickle
with open(save_path + 'indices.pkl', 'wb') as f:
    pickle.dump(indices, f)

In [49]:
"""
Ensure that the indices for the user and item do not overlap with each other.
We add len(set(item)) to the user indices.
We map the item indices to the range of [0, len(set(item))).

For example,
    Before:
        user: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        item: [0, 3, 4, 7, 8, 9, 10, 20, 21, 22]
    After:
        user: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        item: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

"""

# 1) Change the user idx: start from num_item.

num_item = len(set(item))
user += num_item

inter = np.array([user, item, labels]).T
inter = inter.astype(np.int64)


# 2) Map the item idx: start from 0.

# create a dict where keys are item and values are new indices starting from 0
item_unique = np.unique(item)
mapping_i = {}
for i in range(len(item_unique)):
    mapping_i[item_unique[i]] = i
mapping_i

# convert the second column of inter to new indices using mapping
inter[:, 1] = [mapping_i[i] for i in inter[:, 1]]
inter

array([[13843,  1080,     0],
       [11746,   576,     1],
       [15981,  1081,     1],
       ...,
       [15552,  5292,     0],
       [10373,  3121,     0],
       [18248,  7837,     0]], dtype=int64)

# Create user features (user_feat.npy)
- input
    - User features data in 'user_features' folder, collected and preprocessed from transactions file
- output
    - An .npy formatted user features file ('# of transactions', 'Avg transaction price', 'avg holding period')

In [2]:
# Load the user feature table and discard its leftover 'Unnamed: 0' column
df_feature = pd.read_csv('dataset/user_features/user_features.csv', index_col=0)
df_feature = df_feature.drop(columns=['Unnamed: 0'])

# Min-max scale the three numeric feature columns
scaler = MinMaxScaler()
names = ['# of transactions', 'Avg transaction price', 'avg holding period']
df_feature[names] = scaler.fit_transform(df_feature[names])

# Translate each 'Buyer' address into its integer id from mapping_u;
# addresses without an id become NaN.
df_feature['Buyer'] = df_feature['Buyer'].apply(lambda address: mapping_u.get(address, np.nan))
# dropna() removes every row that has a missing value in any column, so it
# removes the unmapped buyers and also rows with a missing feature value.
df_feature = df_feature.dropna()
# The ids can be stored as integers again once the NaN rows are gone
df_feature['Buyer'] = df_feature['Buyer'].astype(int)
print('num_user: ', len(df_feature))

# Move 'Buyer' into the index so that only feature columns remain as values
df_feature = df_feature.set_index('Buyer')

# Save the feature values as an npy file.
# NOTE(review): rows are written in the frame's current order, not sorted by
# the 'Buyer' id -- confirm this is the order the consumer expects.
np.save(save_path+'user_feat.npy', df_feature, allow_pickle=True)

NameError: name 'mapping_u' is not defined

# Create train data (train.npy)

In [51]:
# Randomly hold out 20% of the interactions, then halve the hold-out into
# a validation part and a test part (same seed for both calls)
train, valid_and_test = train_test_split(inter, test_size=0.2, random_state=2023)
valid, test = train_test_split(valid_and_test, test_size=0.5, random_state=2023)

# Report the shape of each split
for shape_label, split_rows in (('train shape: ', train), ('valid shape: ', valid), ('test shape: ', test)):
    print(shape_label, split_rows.shape)

# Only the training split is written here
train_file = save_path+'train.npy'
np.save(train_file, train, allow_pickle=True)

train shape:  (18389, 3)
valid shape:  (2299, 3)
test shape:  (2299, 3)


# Create valid data (val.npy)

In [52]:
# Group the validation interactions by user: {user id: [item ids, in row order]}
valid_dict = {}
for user_id, item_id in valid[:, :2]:
    valid_dict.setdefault(user_id, []).append(item_id)

# show the first five items in valid_dict
list(valid_dict.items())[:5]

[(15556, [1608]),
 (10417, [7924, 2246]),
 (11908, [8218]),
 (20172, [2021]),
 (15446, [5431, 453, 5260, 4947, 5978, 5242])]

In [53]:
"""
Extract the item index in the order of the most traded (popular).
"""

# concat all values in valid_dict as a list
valid_list = []
for i in valid_dict.values():
    valid_list += i

# value count valid_list and sort values
value_counts = pd.Series(valid_list).value_counts().sort_values(ascending=False)

# extract indices of value_counts
indices = value_counts.index

# save indices as npy
np.save(save_path+'indices_valid.npy', indices, allow_pickle=True)

In [54]:
"""
Convert to the form required by the model
e.g., 12656: [7314, 4820, 6304] -> list([12656, 7314, 4820, 6304])
"""

# Create an empty numpy array with dtype 'object'
my_array = np.empty(len(valid_dict), dtype=object)

# Assign the lists directly to the elements of the array
for i, (key, val) in enumerate(valid_dict.items()):
    # include key in the list
    my_array[i] = [key] + val

# show the first five items in my_array
my_array[:5]

array([list([15556, 1608]), list([10417, 7924, 2246]),
       list([11908, 8218]), list([20172, 2021]),
       list([15446, 5431, 453, 5260, 4947, 5978, 5242])], dtype=object)

In [55]:
# Write the per-user validation lists (my_array) to val.npy
val_file = save_path+'val.npy'
np.save(val_file, my_array, allow_pickle=True)

# Create test data (test.npy)

In [56]:
# Group the test interactions by user: {user id: [item ids, in row order]}
test_dict = {}
for user_id, item_id in test[:, :2]:
    test_dict.setdefault(user_id, []).append(item_id)

# show the first five items in test_dict
list(test_dict.items())[:5]

[(10739, [7712]),
 (12897, [1753]),
 (13508, [1484, 6924]),
 (19022, [2484]),
 (14053, [4533])]

In [57]:
# Flatten every user's test items into one list
test_list = [item_id for item_ids in test_dict.values() for item_id in item_ids]

# Count how often each item occurs, then order the counts from high to low
item_frequency = pd.Series(test_list).value_counts()
value_counts = item_frequency.sort_values(ascending=False)

# The index of the sorted counts holds the item ids in that order
indices = value_counts.index

# Save the ordered item ids as npy
np.save(save_path+'indices_test.npy', indices, allow_pickle=True)

In [58]:
# Object array with one slot per user; each slot will hold a Python list
my_array = np.empty(len(test_dict), dtype=object)

# Fill slot by slot: the user id first, followed by that user's items
for slot, (user_id, item_ids) in enumerate(test_dict.items()):
    my_array[slot] = [user_id, *item_ids]

# show the first five items in my_array
my_array[:5]

array([list([10739, 7712]), list([12897, 1753]),
       list([13508, 1484, 6924]), list([19022, 2484]),
       list([14053, 4533])], dtype=object)

In [59]:
# This approach used to work, but it started raising an error, so it was abandoned

# # convert test_dict into a 1D np.array where each element is a list
# # a list where the first element is the key of test_dict and the value is the value of test_dict

# test_dict = np.array([[k]+v for k, v in test_dict.items()])
# test_dict

In [60]:
# Write the per-user test lists (my_array, built from test_dict) to test.npy
test_file = save_path+'test.npy'
np.save(test_file, my_array, allow_pickle=True)

# Create adjacency matrix (adj_dict.npy)

In [61]:
# inter columns: 0 = user, 1 = item
# Build {user id: [item ids, in row order]} over all interactions
adj_dict = {}
for user_id, item_id in inter[:, :2]:
    adj_dict.setdefault(user_id, []).append(item_id)

# show the first five items in adj_dict
list(adj_dict.items())[:5]

[(13843, [1080, 5354, 1189]),
 (11746, [576, 5046]),
 (15981, [1081, 2112, 2114, 5619, 3237, 1096, 1785, 85]),
 (15962, [1019]),
 (10689, [1943, 1947, 598, 1952])]

In [62]:
# Write the user -> items dictionary to adj_dict.npy
adj_file = save_path+'adj_dict.npy'
np.save(adj_file, adj_dict, allow_pickle=True)

In [63]:
# Share of users in adj_dict that have more than one item
count = sum(1 for item_ids in adj_dict.values() if len(item_ids) > 1)
print(count/len(adj_dict))

0.33815934298535266


# Create item features (feat.npy)
When using features, there is no need for tokenID to match inter because the index is used in features.

- input
    - Item features data in 'item_features' folder, collected and preprocessed from OpenSea
- output
    - An .npy formatted item features file (image, text, price, transaction)

In [67]:
"""
Only keep items that appear in inter.
"""

# print image, text, price shape
print('Before')
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)
print('')

item_unique = np.unique(item)

# for dataset image, text, price, filter rows whose indices are in item_unique
image = image.loc[image.index.isin(item_unique)]
text = text.loc[text.index.isin(item_unique)]
price = price.loc[price.index.isin(item_unique)]
transaction = transaction.loc[transaction.index.isin(item_unique)]

# print image, text, price shape
print('After')
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)

Before
image shape:  (8386, 1024)
text shape:  (8386, 1800)
price shape:  (8386, 64)
transaction shape:  (8386, 64)

After
image shape:  (8386, 1024)
text shape:  (8386, 1800)
price shape:  (8386, 64)
transaction shape:  (8386, 64)


In [66]:
# All four feature tables must cover the same item ids (order is ignored)
reference_ids = np.sort(image.index.values)
for other_table in (text, price, transaction):
    assert np.array_equal(reference_ids, np.sort(other_table.index.values))

# Save each table's values as an npy file.
# NOTE(review): the check above ignores row order and each table is written in
# its own current row order -- confirm the rows line up with the item ids
# used in inter.
for file_name, table in (('image_feat.npy', image), ('text_feat.npy', text),
                         ('price_feat.npy', price), ('transaction_feat.npy', transaction)):
    np.save(save_path+file_name, table)