# Create atomic files 
- User-item interactions
    - Raw data (.csv) -> Atomic files (.inter)
- Item features
    - Raw data (.csv) -> Atomic files (.itememb)

In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import yaml
import os
import glob
import pickle
from tqdm import tqdm

In [2]:
# Raw transaction .csv files, one per collection.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them
# to keep the collection order reproducible across machines and runs.
files = sorted(glob.glob('./dataset/transactions/*.csv'))
files

['./dataset/transactions\\azuki.csv',
 './dataset/transactions\\bayc.csv',
 './dataset/transactions\\coolcats.csv',
 './dataset/transactions\\doodles.csv',
 './dataset/transactions\\meebits.csv']

In [3]:
# Collection name = file name without its '.csv' extension.
# splitext strips only the final extension, so a name that itself contains a
# dot (e.g. 'cool.cats.csv') is not truncated at the first dot.
names = [os.path.splitext(os.path.basename(path))[0] for path in files]
names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

## .inter

In [5]:
# Convert each collection's raw transactions (.csv) into an .inter atomic file
# with one "user_id<TAB>item_id" line per transaction row.
for name, file in zip(tqdm(names), files):

    transactions = pd.read_csv(file)
    # NOTE: repeated (Buyer, Token ID) rows are kept as-is; de-duplication via
    # drop_duplicates(subset=['Buyer', 'Token ID'], keep='first') is disabled.
    users = transactions['Buyer'].values
    items = transactions['Token ID'].values

    save_path = './dataset/collections/' + name
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(save_path, exist_ok=True)

    # The context manager closes the file even if a row fails to format.
    # Explicit encoding keeps the output independent of the platform default.
    with open(save_path + f"/{name}.inter", 'w', encoding='utf-8') as f:
        f.write("user_id:token\titem_id:token\n")
        # %d requires a numeric Token ID and writes it as an integer.
        f.writelines(
            "%s\t%d\n" % (user_id, item_id)
            for user_id, item_id in zip(users, items)
        )

100%|██████████| 5/5 [00:00<00:00, 15.41it/s]


## .itememb

### img, txt, price

In [11]:
# For every collection and every attribute, write an .itememb_<attribute>
# atomic file: one line per item, "token_id<TAB>v1 v2 ... vk", with the feature
# values cast to float32. Only items that occur in the collection's .inter file
# are kept.
for name in names:
    print('Collection name: ', name)

    # Keep only the items that appear in the interactions of the period we have.
    # The .inter file depends on the collection only, so read it once per
    # collection instead of once per attribute.
    inter = pd.read_csv(f'./dataset/collections/{name}/{name}.inter', sep='\t')
    token_ids = inter['item_id:token'].unique()

    save_path = './dataset/collections/' + name
    os.makedirs(save_path, exist_ok=True)

    for attribute in ['img', 'txt', 'price']:
        print('--- ', attribute)

        # Get raw data file
        features_df = pd.read_csv(f'./dataset/item_features/{name}_{attribute}.csv')
        print('before: ', features_df.shape)

        features_df = features_df[features_df['token_id'].isin(token_ids)].reset_index(drop=True)
        print('after: ', features_df.shape)

        # Everything after the first column is treated as the feature vector
        # (assumes token_id is column 0 -- confirm against the raw CSV layout).
        # Converting the whole matrix once avoids positional indexing into a
        # label-indexed Series (deprecated in pandas 2.x) and is much faster
        # than casting and writing element by element.
        embeddings = features_df.iloc[:, 1:].to_numpy(dtype=np.float32)

        # The context manager closes the file even if writing a row fails.
        with open(save_path + f"/{name}.itememb_{attribute}", 'w') as f:
            f.write(f"iid_{attribute}:token" + '\t' + f'item_emb_{attribute}:float_seq' + '\n')
            for token_id, row in zip(tqdm(features_df['token_id']), embeddings):
                # Space-separated float32 values, no trailing space.
                values = ' '.join(str(value) for value in row)
                f.write(str(token_id) + '\t' + values + '\n')

Collection name:  azuki
---  img
before:  (10000, 65)
after:  (8386, 65)


100%|██████████| 8386/8386 [00:03<00:00, 2247.58it/s]


---  txt
before:  (10000, 1801)
after:  (8386, 1801)


100%|██████████| 8386/8386 [01:14<00:00, 112.69it/s]


---  price
before:  (8386, 2)
after:  (8386, 2)


100%|██████████| 8386/8386 [00:01<00:00, 7565.44it/s]


Collection name:  bayc
---  img
before:  (9983, 65)
after:  (4008, 65)


100%|██████████| 4008/4008 [00:01<00:00, 2337.21it/s]


---  txt
before:  (10000, 1801)
after:  (4025, 1801)


100%|██████████| 4025/4025 [00:35<00:00, 113.05it/s]


---  price
before:  (4025, 2)
after:  (4025, 2)


100%|██████████| 4025/4025 [00:00<00:00, 7607.53it/s]


Collection name:  coolcats
---  img
before:  (9952, 65)
after:  (4908, 65)


100%|██████████| 4908/4908 [00:02<00:00, 2301.61it/s]


---  txt
before:  (9941, 1501)
after:  (4903, 1501)


100%|██████████| 4903/4903 [00:35<00:00, 139.25it/s]


---  price
before:  (4908, 2)
after:  (4908, 2)


100%|██████████| 4908/4908 [00:00<00:00, 7725.33it/s]


Collection name:  doodles
---  img
before:  (9999, 65)
after:  (7738, 65)


100%|██████████| 7738/7738 [00:03<00:00, 2313.45it/s]


---  txt
before:  (10000, 1501)
after:  (7739, 1501)


100%|██████████| 7739/7739 [00:56<00:00, 136.27it/s]


---  price
before:  (7642, 2)
after:  (7642, 2)


100%|██████████| 7642/7642 [00:01<00:00, 7586.61it/s]


Collection name:  meebits
---  img
before:  (12306, 65)
after:  (4942, 65)


100%|██████████| 4942/4942 [00:02<00:00, 2330.22it/s]


---  txt
before:  (20000, 1801)
after:  (5702, 1801)


100%|██████████| 5702/5702 [00:50<00:00, 112.40it/s]


---  price
before:  (5702, 2)
after:  (5702, 2)


100%|██████████| 5702/5702 [00:00<00:00, 7419.86it/s]
