# Import packages

In [109]:
import pandas as pd
import numpy as np
import scipy.sparse
import os
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [110]:
'''
select NFT collection name for preprocessing, from ['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']
'''

COLLECTION = 'meebits'

# Directory that receives every artifact produced for the selected collection.
save_path = 'dataset/collections/'+COLLECTION+'/'
# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(save_path, exist_ok=True)

# NOTE: the frame keeps the name `df_azuki` (used by the following cells) even
# though it holds whichever collection COLLECTION selects.
df_azuki = pd.read_csv(f"dataset/transactions/{COLLECTION}.csv")
# Keep only the first row of every (Buyer, Token ID) pair.
df_azuki = df_azuki.drop_duplicates(subset=['Buyer', 'Token ID'], keep='first')

# Create interactions ('inter')
- input
    - NFT transactions data in 'transactions' folder, collected from Etherscan NFT tracker (https://etherscan.io/nfttracker)
- output
    - An .npy formatted interaction file (user, item, label)

In [111]:
"""
First, exclude items that were first traded after September 2022. Only items that exist in the transaction features file will be left.
"""

# Per-item transaction features; the first CSV column becomes the row index
# and is compared against 'Token ID' below.
transaction = pd.read_csv(f'dataset/item_features/{COLLECTION}_transactions.csv', index_col=0)
# Keep only the trades whose Token ID has a row in the transaction features.
has_features = df_azuki['Token ID'].isin(transaction.index)
df_azuki = df_azuki[has_features]

In [112]:
"""
ITEM_CUT: Only items that have been traded three times or more will be used.
"""

# Number of trades before the item cut.
print(f"ITEM CUT 전 거래 개수: {len(df_azuki)}")

# Token IDs that appear in at least 3 trades.
trades_per_item = df_azuki['Token ID'].value_counts()
item_count = trades_per_item[trades_per_item >= 3].index.tolist()

# Discard the trades of every item below that threshold.
keep_mask = df_azuki['Token ID'].isin(item_count)
df_azuki = df_azuki[keep_mask]

# Number of trades after the item cut.
print(f"ITEM CUT 후 거래 개수: {len(df_azuki)}")

ITEM CUT 전 거래 개수: 12807
ITEM CUT 후 거래 개수: 7317


In [113]:
"""
Generate price labels, for later use of multi-objectives training
"""

# Keep only the rows whose 'Price' text contains a '$' amount.
# - regex=False matches the literal character (the former pattern "\$" relied
#   on an invalid escape sequence in a non-raw string).
# - na=False treats a missing price as "no match" instead of producing a NaN
#   mask that cannot be used for boolean indexing.
# - .copy() makes the filtered frame independent so the column assignments
#   below do not trigger pandas' chained-assignment warning.
df_azuki = df_azuki[df_azuki['Price'].str.contains('$', regex=False, na=False)].copy()

# Extract the amount from the third space-separated token, dropping its first
# two and last characters and the thousands separators.
# Assumes the text looks like '<amount> ETH ($1,234.56)' — TODO confirm against the CSV.
# The decimal point is kept: stripping it (as before) rescales amounts that
# have a different number of decimal digits inconsistently.
df_azuki['Price'] = df_azuki['Price'].apply(lambda x: x.split(' ')[2][2:-1].replace(',', ''))
df_azuki['Price'] = df_azuki['Price'].astype(float)

# Within each Token ID, diff(-1) is "this row minus the following row";
# negating it gives "following row minus this row".
# NOTE(review): "following" means the next row in the file's current order —
# confirm the CSV is ordered so that the next row is the later trade.
price_change = -df_azuki.groupby('Token ID')['Price'].diff(-1)
# The last row of each token has no following row (NaN) and gets label 0.
# Label 1 means the following price is strictly higher, 0 otherwise.
df_azuki['Price_diff'] = (price_change.fillna(0) > 0).astype(int)

# Column values that make up one interaction triple (user, item, label).
user = df_azuki['Buyer'].values
item = df_azuki['Token ID'].values
price = df_azuki['Price_diff'].values

data = (user, item, price)

# Persist the three equally long arrays in a single .npy file.
np.save(save_path + f'{COLLECTION}.npy', data)

In [114]:
# Reload the interaction file written for COLLECTION (the variable keeps the
# name `azuki` regardless of the collection). allow_pickle=True is required
# because the file stores an object array; only load files produced by this
# notebook this way.
azuki = np.load(save_path + COLLECTION+'.npy', allow_pickle=True)

user = azuki[0]
item = azuki[1]
labels = azuki[2]

# Number of distinct users, distinct items, and interactions.
print('user length: ', len(set(user)))
print('item length: ', len(set(item)))
print('inter length: ', len(labels))

# Persist the two counts. The variable is named `num_user_item` so that the
# builtin `dict` is not shadowed for the rest of the session.
num_user_item = {'num_user': len(set(user)), 'num_item': len(set(item))}
np.save(save_path + 'num_user_item.npy', num_user_item)

user length:  4572
item length:  1693
inter length:  7299


In [115]:
"""
Change the user addresses to indices starting from 0.
"""

# Give every distinct user a dense index starting at 0, following the sorted
# order returned by np.unique.
user_unique = np.unique(user)
mapping_u = {address: idx for idx, address in enumerate(user_unique)}

# Replace each entry of `user` with its index.
user = np.array([mapping_u[address] for address in user])

# Stack into rows of [user, item, label] and store them as int64.
inter = np.array([user, item, labels]).T.astype(np.int64)
inter

array([[ 2209,   843,     0],
       [ 1989,  7057,     0],
       [ 2561, 11782,     0],
       ...,
       [ 1578,  7054,     0],
       [ 1118, 13581,     0],
       [ 3702, 10341,     0]], dtype=int64)

In [116]:
"""
Ensure that the indices for the user and item do not overlap with each other.
"""

# 1) User indices start at num_item so that they never collide with item indices.
item_unique = np.unique(item)
num_item = len(item_unique)
# Recomputed from the raw values in azuki[0] instead of `user += num_item`:
# the in-place add shifted the ids a second time whenever this cell was
# re-run. The result is the same as before on a first run.
user = np.array([mapping_u[address] for address in azuki[0]]) + num_item

# 2) Item indices start at 0, following the sorted order of the Token IDs.
mapping_i = {token_id: idx for idx, token_id in enumerate(item_unique)}

# Rows of [shifted user index, remapped item index, label] as int64.
inter = np.array([user, [mapping_i[token_id] for token_id in item], labels]).T
inter = inter.astype(np.int64)
inter

array([[3902,   73,    0],
       [3682,  603,    0],
       [4254, 1037,    0],
       ...,
       [3271,  602,    0],
       [2811, 1167,    0],
       [5395,  914,    0]], dtype=int64)

# Create user features (user_feat.npy)
- input
    - User features data in 'user_features' folder, collected and preprocessed from transactions file
- output
    - An .npy formatted user features file ('# of transactions', 'Avg transaction price', 'avg holding period')

In [117]:
# Load the per-user feature table and drop its 'Unnamed: 0' column.
df_feature = pd.read_csv('dataset/user_features/user_features.csv', index_col=0).drop(['Unnamed: 0'], axis=1)

# Min-max scale the three numeric features. The scaler is fitted on every row
# of the file, i.e. before the table is restricted to this collection's users.
scaler = MinMaxScaler()
names = ['# of transactions', 'Avg transaction price', 'avg holding period']
df_feature[names] = scaler.fit_transform(df_feature[names])

# Translate 'Buyer' through mapping_u; values that are not keys of mapping_u
# become NaN and those rows are dropped (dropna also drops rows with a NaN in
# any other column).
df_feature['Buyer'] = df_feature['Buyer'].map(mapping_u)
df_feature = df_feature.dropna().copy()
# The mapped values are floats because of the intermediate NaNs.
df_feature['Buyer'] = df_feature['Buyer'].astype(int)
print('num_user: ', len(df_feature))

# Index by user index AND order the rows by it: np.save below stores only the
# values, so the row position is the only remaining link to the user index.
df_feature = df_feature.set_index('Buyer').sort_index()

# Persist the feature values (the index itself is not written).
np.save(save_path+'user_feat.npy', df_feature, allow_pickle=True)

num_user:  4572


# Create train data (train.npy)

In [118]:
# Seeded random split: hold out 20% of the interactions, then cut the hold-out
# in half to obtain the validation and test parts.
train, valid_and_test = train_test_split(inter, test_size=0.2, random_state=2023)
valid, test = train_test_split(valid_and_test, test_size=0.5, random_state=2023)

# Report the shape of each part.
for split_name, split_rows in (('train', train), ('valid', valid), ('test', test)):
    print(f'{split_name} shape: ', split_rows.shape)

# Only the training part is written in this cell.
np.save(save_path+'train.npy', train, allow_pickle=True)

train shape:  (5839, 3)
valid shape:  (730, 3)
test shape:  (730, 3)


# Create valid data (val.npy)

In [119]:
# Group the validation rows per user: {user index: [item indices, in row order]}.
valid_dict = {}
for row in valid:
    valid_dict.setdefault(row[0], []).append(row[1])
valid_dict

{2506: [844],
 3159: [1511],
 3584: [222],
 3643: [1262],
 4004: [568, 588],
 2295: [733],
 3024: [150],
 4131: [260, 169],
 3769: [874],
 2367: [255],
 2533: [284],
 6131: [142, 853, 481, 419],
 2463: [7],
 4683: [1472],
 5259: [817],
 2068: [280],
 4702: [314],
 5129: [1172, 1309, 15],
 5285: [981],
 1950: [210, 1400],
 2640: [828],
 2231: [799, 1507],
 2258: [486],
 3151: [1167, 79, 1549],
 1783: [769],
 4117: [1684],
 3041: [415],
 5315: [1580, 520],
 5071: [1617],
 6151: [1176],
 3886: [792],
 4609: [1109],
 3585: [1186],
 5424: [637],
 4891: [831, 1134],
 5033: [887],
 4046: [1067],
 2370: [171],
 5755: [899, 906, 1211, 385, 486, 1231],
 5433: [81, 946],
 5120: [1014, 383],
 2736: [997],
 5334: [1167],
 2711: [988],
 1754: [857],
 2319: [1148],
 4892: [940],
 3279: [127],
 4631: [112],
 2259: [1079],
 3557: [453],
 4535: [408],
 2350: [427, 392],
 3970: [754],
 5492: [535],
 2305: [461],
 4969: [474],
 3262: [1485],
 4083: [930],
 4684: [444],
 1935: [471],
 2567: [182],
 3864: [

In [120]:
"""
Extract the item index in the order of the most traded (popular).
"""

# Flatten every user's validation items into a single list.
valid_list = [item_idx for items in valid_dict.values() for item_idx in items]

# Occurrences per item, most frequent first.
value_counts = pd.Series(valid_list).value_counts().sort_values(ascending=False)

# The index of the counts is the item ordering by frequency.
indices = value_counts.index

# Persist that ordering.
np.save(save_path+'indices_valid.npy', indices, allow_pickle=True)

In [121]:
"""
Convert to the form required by the model
e.g., 12656: [7314, 4820, 6304] -> list([12656, 7314, 4820, 6304])
"""

# Object array with one slot per validation user; each slot holds a plain list.
my_array = np.empty(len(valid_dict), dtype=object)

# Slot contents: the user index followed by that user's items.
for slot, user_idx in enumerate(valid_dict):
    my_array[slot] = [user_idx, *valid_dict[user_idx]]

my_array

array([list([2506, 844]), list([3159, 1511]), list([3584, 222]),
       list([3643, 1262]), list([4004, 568, 588]), list([2295, 733]),
       list([3024, 150]), list([4131, 260, 169]), list([3769, 874]),
       list([2367, 255]), list([2533, 284]),
       list([6131, 142, 853, 481, 419]), list([2463, 7]),
       list([4683, 1472]), list([5259, 817]), list([2068, 280]),
       list([4702, 314]), list([5129, 1172, 1309, 15]), list([5285, 981]),
       list([1950, 210, 1400]), list([2640, 828]),
       list([2231, 799, 1507]), list([2258, 486]),
       list([3151, 1167, 79, 1549]), list([1783, 769]),
       list([4117, 1684]), list([3041, 415]), list([5315, 1580, 520]),
       list([5071, 1617]), list([6151, 1176]), list([3886, 792]),
       list([4609, 1109]), list([3585, 1186]), list([5424, 637]),
       list([4891, 831, 1134]), list([5033, 887]), list([4046, 1067]),
       list([2370, 171]), list([5755, 899, 906, 1211, 385, 486, 1231]),
       list([5433, 81, 946]), list([5120, 1014, 3

In [122]:
# Write the per-user validation lists held in my_array to val.npy.
np.save(f'{save_path}val.npy', my_array, allow_pickle=True)

# Create test data (test.npy)

In [123]:
# Group the test rows per user: {user index: [item indices, in row order]}.
# Only the first two columns (user, item) of each row are used.
test_dict = {}
for user_idx, item_idx in test[:, :2]:
    test_dict.setdefault(user_idx, []).append(item_idx)
test_dict

{1865: [1263],
 2176: [1035],
 2883: [1435],
 3138: [146],
 5632: [721],
 1768: [116],
 6188: [1322],
 4890: [1279],
 2910: [1444],
 1832: [652],
 2566: [875],
 5951: [1470],
 4280: [744],
 2914: [433, 1267],
 2185: [860],
 5755: [1150, 1299, 1138, 1351, 330, 843],
 2642: [809],
 3020: [1289],
 1950: [495],
 2900: [504],
 2242: [1579],
 5761: [249],
 3735: [1314],
 2854: [377],
 2198: [144, 433],
 4732: [322],
 1695: [830],
 3189: [211, 1597],
 4239: [1129],
 2427: [1361],
 6122: [442, 1339],
 4202: [355],
 4098: [250],
 5337: [65],
 2990: [397, 1301],
 3243: [19],
 1857: [863],
 3226: [281],
 3851: [1641],
 3219: [1397],
 2530: [1397],
 3957: [1511],
 4528: [770],
 3539: [1398],
 2689: [665],
 2511: [439],
 2332: [1656],
 1772: [441],
 2617: [655],
 1696: [1644],
 5637: [472],
 5965: [1403],
 5615: [1103],
 3210: [423],
 2974: [659],
 2579: [1620, 662],
 5091: [1061],
 5502: [1055],
 5554: [1496],
 3398: [594, 1084],
 5430: [14],
 4635: [57],
 3953: [1506],
 2504: [225],
 4790: [1672]

In [124]:
# Flatten every user's test items into a single list.
test_list = [item_idx for items in test_dict.values() for item_idx in items]

# Occurrences per item, most frequent first.
value_counts = pd.Series(test_list).value_counts().sort_values(ascending=False)

# The index of the counts is the item ordering by frequency.
indices = value_counts.index

# Persist that ordering.
np.save(save_path+'indices_test.npy', indices, allow_pickle=True)

In [125]:
# Object array with one slot per test user; each slot holds a plain list.
my_array = np.empty(len(test_dict), dtype=object)

# Slot contents: the user index followed by that user's items.
for slot, user_idx in enumerate(test_dict):
    my_array[slot] = [user_idx, *test_dict[user_idx]]

my_array

array([list([1865, 1263]), list([2176, 1035]), list([2883, 1435]),
       list([3138, 146]), list([5632, 721]), list([1768, 116]),
       list([6188, 1322]), list([4890, 1279]), list([2910, 1444]),
       list([1832, 652]), list([2566, 875]), list([5951, 1470]),
       list([4280, 744]), list([2914, 433, 1267]), list([2185, 860]),
       list([5755, 1150, 1299, 1138, 1351, 330, 843]), list([2642, 809]),
       list([3020, 1289]), list([1950, 495]), list([2900, 504]),
       list([2242, 1579]), list([5761, 249]), list([3735, 1314]),
       list([2854, 377]), list([2198, 144, 433]), list([4732, 322]),
       list([1695, 830]), list([3189, 211, 1597]), list([4239, 1129]),
       list([2427, 1361]), list([6122, 442, 1339]), list([4202, 355]),
       list([4098, 250]), list([5337, 65]), list([2990, 397, 1301]),
       list([3243, 19]), list([1857, 863]), list([3226, 281]),
       list([3851, 1641]), list([3219, 1397]), list([2530, 1397]),
       list([3957, 1511]), list([4528, 770]), list([

In [126]:
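# This one-liner used to work, but it now raises an error, so it was abandoned.
# (Presumably NumPy rejects the ragged nested lists without dtype=object — to be confirmed.)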

# # convert test_dict into a 1D np.array where each element is a list
# # a list where the first element is the key of test_dict and the value is the value of test_dict

# test_dict = np.array([[k]+v for k, v in test_dict.items()])
# test_dict

In [127]:
# Write the per-user test lists held in my_array to test.npy.
np.save(f'{save_path}test.npy', my_array, allow_pickle=True)

# Create adjacency matrix (adj_dict.npy)

In [128]:
# Adjacency lists over every row of `inter`: {user index: [item indices]}.
# Column 0 of inter is the user, column 1 is the item.
# NOTE(review): `inter` also contains the rows that were split off into
# valid/test — confirm that including them here is intended.
adj_dict = {}
for user_idx, item_idx in inter[:, :2]:
    adj_dict.setdefault(user_idx, []).append(item_idx)
adj_dict

{3902: [73],
 3682: [603,
  452,
  1646,
  863,
  86,
  423,
  424,
  271,
  1682,
  905,
  1185,
  1305,
  40,
  1063,
  935,
  318],
 4254: [1037, 50],
 1801: [1018],
 2631: [902, 1192, 603],
 3833: [1127, 304, 982, 9],
 5150: [1192],
 4858: [156],
 5382: [1679],
 5183: [679, 497, 407],
 3386: [1361],
 3994: [76, 1606, 903],
 5173: [1262],
 5232: [130],
 6020: [407],
 3664: [869],
 5424: [990, 1206, 637, 85],
 3910: [1280,
  1497,
  578,
  170,
  1340,
  1679,
  61,
  528,
  813,
  571,
  143,
  724,
  888,
  574],
 1905: [497],
 5186: [410, 1203, 228],
 6249: [433],
 2198: [44, 433, 144, 499, 1650, 52, 193, 356, 266, 1070],
 5626: [1605],
 4389: [881],
 5551: [725],
 3102: [927, 1674, 1159],
 3186: [1466, 767, 612, 340],
 2559: [47],
 5235: [1343],
 6238: [1098],
 4755: [187, 768],
 3304: [34, 1338, 1242, 1221, 596],
 5624: [1104],
 4299: [907],
 4183: [1504],
 5041: [187],
 4720: [1596],
 4522: [601, 907, 667, 692],
 5182: [1135],
 3379: [226],
 6000: [1504, 103, 618, 1451],
 5550:

In [129]:
# Write the user -> items adjacency dict to adj_dict.npy.
np.save(f'{save_path}adj_dict.npy', adj_dict, allow_pickle=True)

In [130]:
# Fraction of users in adj_dict that have more than one interaction.
count = sum(1 for items in adj_dict.values() if len(items) > 1)
print(count/len(adj_dict))

0.23337707786526685


# Create item features (feat.npy)
The feature arrays are addressed by row position (the item index) rather than by Token ID, so the Token IDs themselves do not need to match the ones in `inter`.

- input
    - Item features data in 'item_features' folder, collected and preprocessed from OpenSea
- output
    - An .npy formatted item features file (image, text, price, transaction)

In [131]:
# Load the four per-item feature tables of COLLECTION; the first CSV column
# becomes the row index of each table.
image = pd.read_csv(f'dataset/item_features/{COLLECTION}_image.csv', index_col=0)
text = pd.read_csv(f'dataset/item_features/{COLLECTION}_text.csv', index_col=0)
price = pd.read_csv(f'dataset/item_features/{COLLECTION}_prices.csv', index_col=0)
transaction = pd.read_csv(f'dataset/item_features/{COLLECTION}_transactions.csv', index_col=0)

# Report the shape of each table.
for label, frame in (('image', image), ('text', text), ('price', price), ('transaction', transaction)):
    print(f'{label} shape: ', frame.shape)

image shape:  (20000, 1024)
text shape:  (20000, 1800)
price shape:  (5949, 64)
transaction shape:  (5949, 64)


In [132]:
"""
Only keep items that appear in inter.
"""

item_unique = np.unique(item)

# Keep the rows whose index (Token ID) occurs in the interactions and order
# them by Token ID. mapping_i numbers the items in np.unique's sorted order
# and np.save later drops the index, so row k has to be the k-th smallest
# Token ID for the features to line up with the item indices in `inter`.
image = image.loc[image.index.isin(item_unique)].sort_index()
text = text.loc[text.index.isin(item_unique)].sort_index()
price = price.loc[price.index.isin(item_unique)].sort_index()
transaction = transaction.loc[transaction.index.isin(item_unique)].sort_index()

# Report the shape of each table after filtering.
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)
print('transaction shape: ', transaction.shape)

image shape:  (1693, 1024)
text shape:  (1693, 1800)
price shape:  (1693, 64)
transaction shape:  (1693, 64)


In [133]:
"""
Keep only items that exist in the transaction features data and delete the rest. 
The purpose is to delete items that were first traded after September.
"""

# Restrict image, text and price to the items that have a row in `transaction`.
known_items = transaction.index
image = image[image.index.isin(known_items)]
text = text[text.index.isin(known_items)]
price = price[price.index.isin(known_items)]

# Report the shape of each table after filtering.
for label, frame in (('image', image), ('text', text), ('price', price), ('transaction', transaction)):
    print(f'{label} shape: ', frame.shape)

image shape:  (1693, 1024)
text shape:  (1693, 1800)
price shape:  (1693, 64)
transaction shape:  (1693, 64)


In [54]:
"""
Optional: If the length of any of image, text, price is less than len(set(item)), fill it with random values.
"""

# Align image to the rows of text; items without an image row become all-NaN rows.
image = image.reindex(text.index)

# Replace every NaN cell with its own random draw from [0, 1).
# The former fillna(np.random.rand(1)[0]) drew a single number and wrote that
# same constant into every gap, which is not the "random values" described above.
# Assumes all feature columns are numeric — TODO confirm.
image_values = image.to_numpy(dtype=float, copy=True)
nan_mask = np.isnan(image_values)
if nan_mask.any():
    image_values[nan_mask] = np.random.rand(int(nan_mask.sum()))
    image = pd.DataFrame(image_values, index=image.index, columns=image.columns)

# Report the shape of each table.
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)

image shape:  (928, 1024)
text shape:  (928, 1800)
price shape:  (928, 64)


In [81]:
"""
Optional: If the length of any of image, text, price is less than len(set(item)), fill it with random values.
"""

# Align text to the rows of image; items without a text row become all-NaN rows.
text = text.reindex(image.index)

# Replace every NaN cell with its own random draw from [0, 1).
# The former fillna(np.random.rand(1)[0]) drew a single number and wrote that
# same constant into every gap, which is not the "random values" described above.
# Assumes all feature columns are numeric — TODO confirm.
text_values = text.to_numpy(dtype=float, copy=True)
nan_mask = np.isnan(text_values)
if nan_mask.any():
    text_values[nan_mask] = np.random.rand(int(nan_mask.sum()))
    text = pd.DataFrame(text_values, index=text.index, columns=text.columns)

# Report the shape of each table.
print('image shape: ', image.shape)
print('text shape: ', text.shape)
print('price shape: ', price.shape)

image shape:  (1646, 1024)
text shape:  (1646, 1500)
price shape:  (1646, 64)


In [134]:
# Write each item feature table to '<name>_feat.npy' under save_path
# (np.save stores the values only, not the index).
for stem, frame in (('image', image), ('text', text), ('price', price), ('transaction', transaction)):
    np.save(f'{save_path}{stem}_feat.npy', frame)