# Import packages

In [33]:
import pandas as pd
import numpy as np
import scipy.sparse
import os
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [34]:
# Name of the NFT collection to preprocess; one of
# ['azuki', 'bayc', 'coolcats', 'doodles', 'meebits'].
COLLECTION = 'azuki'

# Output directory for every artifact written by this notebook. It stays a
# plain string ending in '/' because later cells build file names with
# `save_path + ...`.
save_path = 'dataset/collections/'+COLLECTION+'/'
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking and creating the directory.
os.makedirs(save_path, exist_ok=True)

# Raw transactions of the selected collection. The variable keeps the name
# `df_azuki` whatever COLLECTION is, because the following cells refer to it.
df_azuki = pd.read_csv(f"dataset/transactions/{COLLECTION}.csv")
# One (Buyer, Token ID) pair counts as a single interaction: keep only the
# first row of each pair.
df_azuki = df_azuki.drop_duplicates(subset=['Buyer', 'Token ID'], keep='first')

# Create interactions ('inter')
- input
    - NFT transactions data in 'transactions' folder, collected from Etherscan NFT tracker (https://etherscan.io/nfttracker)
- output
    - An .npy formatted interaction file (user, item, label)

In [35]:
"""
First, exclude items that were first traded after September 2022. Only items that exist in the transaction features file will be left.
"""

transaction = pd.read_csv(f'dataset/item_features/{COLLECTION}_transactions.csv', index_col=0)
# drop rows where Token ID is not in indices of transaction
df_azuki = df_azuki[df_azuki['Token ID'].isin(transaction.index)]

In [36]:
"""
ITEM_CUT: Only items that have been traded three times or more will be used.
"""

# print len of df_azuki
print(f"ITEM CUT 전 거래 개수: {len(df_azuki)}")

# get the list of "Token ID" whose count is more than 3
item_count = df_azuki['Token ID'].value_counts()
item_count = item_count[item_count >= 3]
item_count = item_count.index.tolist()

# drop rows whose "Token ID" is not in item_count
df_azuki = df_azuki[df_azuki['Token ID'].isin(item_count)]


# print len of df_azuki
print(f"ITEM CUT 후 거래 개수: {len(df_azuki)}")

ITEM CUT 전 거래 개수: 22990
ITEM CUT 후 거래 개수: 16401


In [37]:
"""
Generate price labels, for later use of multi-objectives training
"""

# drop rows where 'Price' does not contain '$'
df_azuki = df_azuki[df_azuki['Price'].str.contains("\$")]
# convert 'Price' to the value before 'ETH'
df_azuki['Price'] = df_azuki['Price'].apply(lambda x: x.split(' ')[2][2:-1].replace(',', '').replace('.', ''))
df_azuki['Price'] = df_azuki['Price'].astype(float)

# create a new variable 'Price_diff' which is the difference between the future price and the current price 
# get price differences from the same 'Token ID'
df_azuki['Price_diff'] = df_azuki.groupby('Token ID')['Price'].diff(-1)
# convert rows where 'Price_diff' is NaN into 0
df_azuki['Price_diff'] = df_azuki['Price_diff'].fillna(0)
# put minus to Price_diff
df_azuki['Price_diff'] = df_azuki['Price_diff'].apply(lambda x: -x)
# convert 'Price_diff' to 1 if the value is greater than 0, otherwise 0
df_azuki['Price_diff'] = df_azuki['Price_diff'].apply(lambda x: 1 if x > 0 else 0)

# create an np.array with 'Buyer'
user = df_azuki['Buyer'].values
item = df_azuki['Token ID'].values
price = df_azuki['Price_diff'].values

data = (user, item, price)

# save data as npy file
np.save(save_path + f'{COLLECTION}.npy', data)

In [38]:
# Reload the (user, item, label) triples written by the previous cell.
# allow_pickle=True is passed because the file is this notebook's own output;
# do not point this at untrusted files.
azuki = np.load(save_path + COLLECTION+'.npy', allow_pickle=True)

user = azuki[0]
item = azuki[1]
labels = azuki[2]

# Distinct users / items, computed once and reused for printing and saving
n_unique_users = len(set(user))
n_unique_items = len(set(item))

print('user length: ', n_unique_users)
print('item length: ', n_unique_items)
print('inter length: ', len(labels))

# Persist both counts. The variable is deliberately not called `dict`, which
# would shadow the builtin for the rest of the notebook session.
num_user_item = {'num_user': n_unique_users, 'num_item': n_unique_items}
np.save(save_path + 'num_user_item.npy', num_user_item)

user length:  9380
item length:  3930
inter length:  16398


In [39]:
"""
Change the user addresses to indices starting from 0.
"""

# create a dict where keys are user and values are new indices starting from 0
user_unique = np.unique(user)
mapping_u = {}
for i in range(len(user_unique)):
    mapping_u[user_unique[i]] = i

# apply mapping to user
user = np.array([mapping_u[u] for u in user])

# create a 2D np.array where first columns are users and second column is items
inter = np.array([user, item, labels]).T
# convert inter type as int64
inter = inter.astype(np.int64)
inter

array([[4336, 1430,    0],
       [2629,  845,    1],
       [6055, 1431,    1],
       ...,
       [5096, 4891,    0],
       [5709, 6322,    0],
       [1557, 3792,    0]], dtype=int64)

In [40]:
"""
Ensure that the indices for the user and item do not overlap with each other.
"""

# 1) Change the user idx: start from num_item.

num_item = len(set(item))
user += num_item

inter = np.array([user, item, labels]).T
inter = inter.astype(np.int64)

# 2) Map the item idx: start from 0.

# create a dict where keys are item and values are new indices starting from 0
item_unique = np.unique(item)
mapping_i = {}
for i in range(len(item_unique)):
    mapping_i[item_unique[i]] = i
mapping_i

# convert the second column of inter to new indices using mapping
inter[:, 1] = [mapping_i[i] for i in inter[:, 1]]
inter

array([[8266,  501,    0],
       [6559,  264,    1],
       [9985,  502,    1],
       ...,
       [9026, 1904,    0],
       [9639, 2508,    0],
       [5487, 1484,    0]], dtype=int64)

# Create user features (user_feat.npy)
- input
    - User features data in 'user_features' folder, collected and preprocessed from transactions file
- output
    - An .npy formatted user features file ('# of transactions', 'Avg transaction price', 'avg holding period')

In [41]:
# Build the user feature matrix and save it as 'user_feat.npy'.
df_feature = pd.read_csv('dataset/user_features/user_features.csv', index_col=0).drop(['Unnamed: 0'], axis=1)

# Min-max scale the three numeric feature columns. The scaler is fitted on
# every row of the CSV, before the rows are restricted to this collection.
scaler = MinMaxScaler()
names = ['# of transactions', 'Avg transaction price', 'avg holding period']
df_feature[names] = scaler.fit_transform(df_feature[names])

# Translate 'Buyer' addresses to user indices with mapping_u; addresses that
# are not in mapping_u become NaN.
df_feature['Buyer'] = df_feature['Buyer'].map(mapping_u)
# Drops the unmapped buyers -- and, since no subset is given, also any row
# with a NaN in another column.
df_feature = df_feature.dropna()
df_feature['Buyer'] = df_feature['Buyer'].astype(int)
print('num_user: ', len(df_feature))

# np.save below writes only the values and discards the index, so row
# position is the only remaining link between a feature row and its user.
# Sort by the mapped user index so that row i belongs to user index i, rather
# than leaving the rows in whatever order the CSV had.
df_feature = df_feature.set_index('Buyer').sort_index(kind='stable')

# save df as npy file
np.save(save_path+'user_feat.npy', df_feature, allow_pickle=True)

num_user:  9380


# Create train data (train.npy)

In [42]:
# Random 80 / 10 / 10 split of the interaction rows: first hold out 20 %,
# then cut the held-out part in half. Both calls use the same fixed seed.
train, valid_and_test = train_test_split(inter, test_size=0.2, random_state=2023)
valid, test = train_test_split(valid_and_test, test_size=0.5, random_state=2023)

# Report the size of each part
for label, part in (('train shape: ', train), ('valid shape: ', valid), ('test shape: ', test)):
    print(label, part.shape)

# Only the training rows are written here; valid and test are reshaped and
# saved by later cells.
train_path = save_path+'train.npy'
np.save(train_path, train, allow_pickle=True)

train shape:  (13118, 3)
valid shape:  (1640, 3)
test shape:  (1640, 3)


# Create valid data (val.npy)

In [43]:
# Group the validation rows by user: {user index: [item index, ...]}.
# Items are appended in row order; the label column is not used.
valid_dict = {}
for user_idx, item_idx, _label in valid:
    valid_dict.setdefault(user_idx, []).append(item_idx)

# Peek at the first five (user, items) pairs
list(valid_dict.items())[:5]

[(9183, [3786]),
 (9049, [512]),
 (12689, [1090, 1275]),
 (9874, [2554]),
 (4656, [1008, 580, 1921])]

In [44]:
"""
Extract the item index in the order of the most traded (popular).
"""

# concat all values in valid_dict as a list
valid_list = []
for i in valid_dict.values():
    valid_list += i

# value count valid_list and sort values
value_counts = pd.Series(valid_list).value_counts().sort_values(ascending=False)

# extract indices of value_counts
indices = value_counts.index

# save indices as npy
np.save(save_path+'indices_valid.npy', indices, allow_pickle=True)

In [45]:
"""
Convert to the form required by the model
e.g., 12656: [7314, 4820, 6304] -> list([12656, 7314, 4820, 6304])
"""

# Create an empty numpy array with dtype 'object'
my_array = np.empty(len(valid_dict), dtype=object)

# Assign the lists directly to the elements of the array
for i, (key, val) in enumerate(valid_dict.items()):
    # include key in the list
    my_array[i] = [key] + val

# show the first five items in my_array
my_array[:5]

array([list([9183, 3786]), list([9049, 512]), list([12689, 1090, 1275]),
       list([9874, 2554]), list([4656, 1008, 580, 1921])], dtype=object)

In [46]:
# Write the per-user validation lists (object array, hence allow_pickle)
val_path = save_path+'val.npy'
np.save(val_path, my_array, allow_pickle=True)

# Create test data (test.npy)

In [47]:
# Group the test rows by user: {user index: [item index, ...]}.
# Items are appended in row order; the label column is not used.
test_dict = {}
for user_idx, item_idx, _label in test:
    test_dict.setdefault(user_idx, []).append(item_idx)

# Peek at the first five (user, items) pairs
list(test_dict.items())[:5]

[(9829, [1878]),
 (4000, [3610]),
 (8861, [2705]),
 (11955, [3406, 2144]),
 (11642, [2236])]

In [48]:
# Rank the test items by how often they were traded (most popular first).

# Flatten every user's item list into one list
test_list = [item_idx for items in test_dict.values() for item_idx in items]

# Occurrences per item, ordered from most to least frequent
item_frequency = pd.Series(test_list).value_counts()
value_counts = item_frequency.sort_values(ascending=False)

# The item indices in that order
indices = value_counts.index

# Write the ranking next to the other artifacts
np.save(save_path+'indices_test.npy', indices, allow_pickle=True)

In [49]:
# Convert test_dict to one list per user: the user index first, then that
# user's items.
rows = [[user_idx, *item_ids] for user_idx, item_ids in test_dict.items()]

# The lists have different lengths, so they are stored one by one in a
# pre-allocated 1-D object array.
my_array = np.empty(len(rows), dtype=object)
for pos, row in enumerate(rows):
    my_array[pos] = row

# Peek at the first five entries
my_array[:5]

array([list([9829, 1878]), list([4000, 3610]), list([8861, 2705]),
       list([11955, 3406, 2144]), list([11642, 2236])], dtype=object)

In [50]:
# This approach used to work, but it now raises an error, so it was abandoned.

# # convert test_dict into a 1D np.array where each element is a list
# # a list where the first element is the key of test_dict and the value is the value of test_dict

# test_dict = np.array([[k]+v for k, v in test_dict.items()])
# test_dict

In [51]:
# Write the per-user test lists held in my_array (object array, hence
# allow_pickle)
test_path = save_path+'test.npy'
np.save(test_path, my_array, allow_pickle=True)

# Create adjacency matrix (adj_dict.npy)

In [52]:
# Adjacency lists over ALL interaction rows of `inter` (column 0 = user,
# column 1 = item): {user index: [item index, ...]}, items in row order.
adj_dict = {}
for user_idx, item_idx, _label in inter:
    adj_dict.setdefault(user_idx, []).append(item_idx)

# Peek at the first five (user, items) pairs
list(adj_dict.items())[:5]

[(8266, [501]),
 (6559, [264, 2373]),
 (9985, [502, 998, 1000, 2670, 1542, 845]),
 (5727, [917, 922]),
 (12692, [495])]

In [53]:
# Write the user -> items adjacency dict (pickled inside the .npy file)
adj_path = save_path+'adj_dict.npy'
np.save(adj_path, adj_dict, allow_pickle=True)

In [54]:
# Share of users in adj_dict that interacted with more than one item
count = sum(1 for items in adj_dict.values() if len(items) > 1)
print(count/len(adj_dict))

0.30575692963752665


# Create item features (feat.npy)
When using features, there is no need for tokenID to match inter because the index is used in features.

- input
    - Item features data in 'item_features' folder, collected and preprocessed from OpenSea
- output
    - An .npy formatted item features file (image, text, price, transaction)

In [55]:
# Load the four item-feature tables of the selected collection. The first
# CSV column is used as the row index of each table.
image = pd.read_csv(f'dataset/item_features/{COLLECTION}_image.csv', index_col=0)
text = pd.read_csv(f'dataset/item_features/{COLLECTION}_text.csv', index_col=0)
price = pd.read_csv(f'dataset/item_features/{COLLECTION}_prices.csv', index_col=0)
transaction = pd.read_csv(f'dataset/item_features/{COLLECTION}_transactions.csv', index_col=0)

# Report (rows, columns) of each table
for label, table in (
    ('image shape: ', image),
    ('text shape: ', text),
    ('price shape: ', price),
    ('transaction shape: ', transaction),
):
    print(label, table.shape)

image shape:  (10000, 1024)
text shape:  (10000, 1800)
price shape:  (8480, 64)
transaction shape:  (8480, 64)


In [56]:
"""
Only keep items that appear in inter.
"""

item_unique = np.unique(item)

# for dataset image, text, price, filter rows whose indices are in item_unique
image = image.loc[image.index.isin(item_unique)]
text = text.loc[text.index.isin(item_unique)]
price = price.loc[price.index.isin(item_unique)]
transaction = transaction.loc[transaction.index.isin(item_unique)]

# print image, text, price shape
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)

image shape:  (3930, 1024)
text shape:  (3930, 1800)
price shape:  (3930, 64)
transaction shape:  (3930, 64)


In [57]:
"""
Keep only items that exist in the transaction features data and delete the rest. 
The purpose is to delete items that were first traded after September.
"""

# drop rows whose indices are not in indices of transaction
image = image.loc[image.index.isin(transaction.index)]
text = text.loc[text.index.isin(transaction.index)]
price = price.loc[price.index.isin(transaction.index)]

# print image, text, price shape
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)

image shape:  (3930, 1024)
text shape:  (3930, 1800)
price shape:  (3930, 64)
transaction shape:  (3930, 64)


In [54]:
"""
Optional: If the length of any of image, text, price is less than len(set(item)), fill it with random values.
"""

# compare indices of image and text 
# and fill empty rows with random values
image = image.reindex(text.index)

# convert rows that are nan values to random vector
image = image.fillna(np.random.rand(1)[0])

# print image, text, price shape
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)

image shape:  (928, 1024)
text shape:  (928, 1800)
price shape:  (928, 64)


In [81]:
"""
Optional: If the length of any of image, text, price is less than len(set(item)), fill it with random values.
"""

# compare indices of image and text 
# and fill empty rows with random values
text = text.reindex(image.index)

# convert rows that are nan values to random vector
text = text.fillna(np.random.rand(1)[0])

# print image, text, price shape
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)

image shape:  (1646, 1024)
text shape:  (1646, 1500)
price shape:  (1646, 64)


In [58]:
# Write each item-feature table as its own .npy file. np.save stores the
# values only, so rows are identified by position in the saved arrays.
for file_name, features in (
    ('image_feat.npy', image),
    ('text_feat.npy', text),
    ('price_feat.npy', price),
    ('transaction_feat.npy', transaction),
):
    np.save(save_path+file_name, features)