In [None]:
import pandas as pd
import numpy as np
import scipy.sparse
import scipy.sparse as sp
import yaml
import os
import glob
import pickle
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F

from recbole.utils import InputType
from recbole.model.abstract_recommender import GeneralRecommender
from recbole.model.loss import BPRLoss, EmbLoss
from recbole.model.init import xavier_normal_initialization
from recbole.model.layers import BiGNNLayer, SparseDropout

from logging import getLogger
from recbole.quick_start import run_recbole
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.general_recommender import BPR
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger
from recbole.utils import get_model, get_trainer
from recbole.trainer import HyperTuning
from recbole.quick_start import objective_function
# import create_dataset from recbole
from recbole.data import create_dataset

from newmodel import NGCFpretrain,LightGCNpretrain, NGCFconcat, LightGCNconcat, NGCFpretrainMLP, LightGCNpretrainMLP

def get_last_file(path):
    """Return the entry directly inside ``path`` with the largest ctime.

    Args:
        path: Directory to search. The search is not recursive, and names
            starting with a dot are not matched by the ``*`` pattern.

    Returns:
        The matched path (as produced by ``glob.glob(path + '/*')``) whose
        ``os.path.getctime`` value is the greatest.

    Raises:
        ValueError: If nothing matches, i.e. the directory is empty or does
            not exist.
    """
    files = glob.glob(path + '/*')
    if not files:
        # max() on an empty list raises a ValueError that does not mention the
        # directory; keep the exception type but make the message actionable.
        raise ValueError(f"no files found in {path!r}")
    return max(files, key=os.path.getctime)


# Create atomic files 
- Convert the sparse matrix (.npz) files into RecBole atomic files: an interaction (.inter) file and item-feature (.itememb) files per collection.

## .inter

In [None]:
# The .npz files live in './dataset/csr_matrix/'.
# glob.glob() returns matches in arbitrary order, so sort them to keep the
# processing order (and the printed list) reproducible across runs.
npz_files = sorted(glob.glob('./dataset/csr_matrix/*.npz'))
print(npz_files)

['./dataset/csr_matrix/azuki.npz', './dataset/csr_matrix/bayc.npz', './dataset/csr_matrix/coolcats.npz', './dataset/csr_matrix/doodles.npz', './dataset/csr_matrix/meebits.npz']


In [None]:
# Create the output root './dataset/collections'. exist_ok=True keeps the cell
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs('./dataset/collections', exist_ok=True)

In [None]:
# Convert every sparse matrix into a RecBole interaction file, written straight
# into its own folder: ./dataset/collections/<name>/<name>.inter
# Writing to the final location (instead of writing flat files and moving them
# afterwards) keeps the cell re-runnable: a second run simply overwrites the
# existing files instead of failing on folders that already exist.
for npz in npz_files:
    # Derive the collection name from the file name rather than from fixed
    # character offsets, so it does not depend on the length of the input path.
    collection = os.path.splitext(os.path.basename(npz))[0]

    coo_sparse_matrix = scipy.sparse.load_npz(npz).tocoo()
    user = coo_sparse_matrix.row
    item = coo_sparse_matrix.col

    collection_dir = os.path.join('dataset/collections', collection)
    os.makedirs(collection_dir, exist_ok=True)

    # `with` guarantees the file is flushed and closed even if a write fails.
    with open(os.path.join(collection_dir, collection + '.inter'), 'w') as f:
        f.write("user_id:token\titem_id:token\trating:float\n")
        # Every stored entry becomes an interaction with rating 1; the values
        # stored in the matrix are intentionally not used.
        for u, i in zip(user, item):
            f.write("%d\t%d\t%d\n" % (u, i, 1))



FileExistsError: [Errno 17] File exists: 'dataset/collections/'

In [None]:
# The .inter files live one level down, in './dataset/collections/<name>/'.
# Sorted because glob.glob() does not guarantee any particular order.
inter_files = sorted(glob.glob('./dataset/collections/*/*.inter'))
print(inter_files)

['./dataset/collections/azuki/azuki.inter', './dataset/collections/bayc/bayc.inter', './dataset/collections/coolcats/coolcats.inter', './dataset/collections/doodles/doodles.inter', './dataset/collections/meebits/meebits.inter']


In [None]:
# Dataset names are the sub-folders of './dataset/collections/'.
# os.listdir() returns entries in arbitrary order and also lists plain files and
# hidden entries, so keep only visible directories and sort them.
DATASET_names = sorted(
    entry for entry in os.listdir('./dataset/collections/')
    if not entry.startswith('.')
    and os.path.isdir(os.path.join('./dataset/collections/', entry))
)
DATASET_names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

## .itememb

### img

In [None]:
# Collection names are the sub-folders of './dataset/collections'.
# os.listdir() returns entries in arbitrary order and also lists plain files and
# hidden entries, so keep only visible directories and sort them.
collection_names = sorted(
    entry for entry in os.listdir('./dataset/collections')
    if not entry.startswith('.')
    and os.path.isdir(os.path.join('./dataset/collections', entry))
)
collection_names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

In [None]:
# Build one '<collection>.itememb_img' file per collection from the image-feature CSVs.
for collection in collection_names:

    print('--- ', collection)

    # Load '<collection>_image.csv'. The code assumes 'token_ID' is the first
    # column and that every remaining column is one numeric embedding dimension.
    img_file = pd.read_csv(f'./dataset/item_features/{collection}_image.csv')
    print('before: ', img_file.shape)

    # Keep only the items that appear in this collection's interaction file.
    inter = pd.read_csv(f'./dataset/collections/{collection}/{collection}.inter', sep='\t')
    token_ids = inter['item_id:token'].unique()
    img_file = img_file[img_file['token_ID'].isin(token_ids)].reset_index(drop=True)
    print('after: ', img_file.shape)

    # Cast the whole feature block to float32 once. This replaces the per-cell
    # `features[j]` lookup, which relied on positional integer indexing of a
    # label-indexed Series (deprecated in pandas 2.x) and was very slow.
    embeddings = img_file.iloc[:, 1:].to_numpy(dtype=np.float32)

    # Save the .itememb file; `with` closes it even if a write fails.
    with open(f"./dataset/collections/{collection}/{collection}.itememb_img", 'w') as f:
        f.write("iid_img:token" + '\t' + 'item_emb_img:float_seq' + '\n')
        for token_id, embedding in tqdm(zip(img_file['token_ID'], embeddings), total=len(img_file)):
            # One line per item: token id, a tab, then space-separated float32 values.
            f.write(str(token_id) + '\t' + ' '.join(map(str, embedding)) + '\n')

---  azuki
before:  (10000, 65)
after:  (8386, 65)


100%|██████████| 8386/8386 [00:07<00:00, 1195.38it/s]


---  bayc
before:  (9983, 65)
after:  (4008, 65)


100%|██████████| 4008/4008 [00:03<00:00, 1251.17it/s]


---  coolcats
before:  (9952, 65)
after:  (4908, 65)


100%|██████████| 4908/4908 [00:04<00:00, 1137.60it/s]


---  doodles
before:  (9999, 65)
after:  (7641, 65)


100%|██████████| 7641/7641 [00:05<00:00, 1318.04it/s]


---  meebits
before:  (12306, 65)
after:  (4942, 65)


100%|██████████| 4942/4942 [00:03<00:00, 1237.86it/s]


### txt

In [None]:
# Collection names are the sub-folders of './dataset/collections'.
# os.listdir() returns entries in arbitrary order and also lists plain files and
# hidden entries, so keep only visible directories and sort them.
collection_names = sorted(
    entry for entry in os.listdir('./dataset/collections')
    if not entry.startswith('.')
    and os.path.isdir(os.path.join('./dataset/collections', entry))
)
collection_names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

In [None]:
# Build one '<collection>.itememb_txt' file per collection from the text-feature CSVs.
for collection in collection_names:

    print('--- ', collection)

    # Load '<collection>_text.csv'. The code assumes 'Token ID' is the first
    # column and that every remaining column is one numeric embedding dimension.
    txt_file = pd.read_csv(f'./dataset/item_features/{collection}_text.csv')
    print('before: ', txt_file.shape)

    # Keep only the items that appear in this collection's interaction file.
    inter = pd.read_csv(f'./dataset/collections/{collection}/{collection}.inter', sep='\t')
    token_ids = inter['item_id:token'].unique()
    txt_file = txt_file[txt_file['Token ID'].isin(token_ids)].reset_index(drop=True)
    print('after: ', txt_file.shape)

    # Cast the whole feature block to float32 once. This replaces the per-cell
    # `features[j]` lookup, which relied on positional integer indexing of a
    # label-indexed Series (deprecated in pandas 2.x) and was very slow.
    embeddings = txt_file.iloc[:, 1:].to_numpy(dtype=np.float32)

    # Save the .itememb file; `with` closes it even if a write fails.
    with open(f"./dataset/collections/{collection}/{collection}.itememb_txt", 'w') as f:
        f.write("iid_txt:token" + '\t' + 'item_emb_txt:float_seq' + '\n')
        for token_id, embedding in tqdm(zip(txt_file['Token ID'], embeddings), total=len(txt_file)):
            # One line per item: token id, a tab, then space-separated float32 values.
            f.write(str(token_id) + '\t' + ' '.join(map(str, embedding)) + '\n')

---  azuki
before:  (10000, 1801)
after:  (8386, 1801)


100%|██████████| 8386/8386 [01:45<00:00, 79.44it/s]


---  bayc
before:  (10000, 1801)
after:  (4025, 1801)


100%|██████████| 4025/4025 [00:49<00:00, 81.09it/s]


---  coolcats
before:  (9941, 1501)
after:  (4903, 1501)


100%|██████████| 4903/4903 [00:53<00:00, 92.23it/s] 


---  doodles
before:  (10000, 1501)
after:  (7642, 1501)


100%|██████████| 7642/7642 [01:27<00:00, 86.86it/s] 


---  meebits
before:  (20000, 1801)
after:  (5702, 1801)


100%|██████████| 5702/5702 [01:21<00:00, 70.28it/s]


### price

In [None]:
# Collection names are the sub-folders of './dataset/collections'.
# os.listdir() returns entries in arbitrary order and also lists plain files and
# hidden entries, so keep only visible directories and sort them.
collection_names = sorted(
    entry for entry in os.listdir('./dataset/collections')
    if not entry.startswith('.')
    and os.path.isdir(os.path.join('./dataset/collections', entry))
)
collection_names

['azuki', 'bayc', 'coolcats', 'doodles', 'meebits']

In [None]:
# Build one '<collection>.itememb_price' file per collection from the price CSVs.
for collection in collection_names:

    print('--- ', collection)

    # Load '<collection>_price.csv'. The code assumes 'TokenID' is the first
    # column and that every remaining column is a numeric feature.
    price_file = pd.read_csv(f'./dataset/item_features/{collection}_price.csv')
    print('before: ', price_file.shape)

    # Keep only the items that appear in this collection's interaction file.
    inter = pd.read_csv(f'./dataset/collections/{collection}/{collection}.inter', sep='\t')
    token_ids = inter['item_id:token'].unique()
    price_file = price_file[price_file['TokenID'].isin(token_ids)].reset_index(drop=True)
    print('after: ', price_file.shape)

    # Cast the whole feature block to float32 once. This replaces the per-cell
    # `features[j]` lookup, which relied on positional integer indexing of a
    # label-indexed Series (deprecated in pandas 2.x).
    embeddings = price_file.iloc[:, 1:].to_numpy(dtype=np.float32)

    # Save the .itememb file; `with` closes it even if a write fails.
    with open(f"./dataset/collections/{collection}/{collection}.itememb_price", 'w') as f:
        f.write("iid_price:token" + '\t' + 'item_emb_price:float_seq' + '\n')
        for token_id, embedding in tqdm(zip(price_file['TokenID'], embeddings), total=len(price_file)):
            # One line per item: token id, a tab, then space-separated float32 values.
            f.write(str(token_id) + '\t' + ' '.join(map(str, embedding)) + '\n')

---  azuki
before:  (8386, 2)
after:  (8386, 2)


100%|██████████| 8386/8386 [00:01<00:00, 4778.53it/s]


---  bayc
before:  (4025, 2)
after:  (4025, 2)


100%|██████████| 4025/4025 [00:00<00:00, 4726.50it/s]


---  coolcats
before:  (4908, 2)
after:  (4908, 2)


100%|██████████| 4908/4908 [00:01<00:00, 4777.23it/s]


---  doodles
before:  (7642, 2)
after:  (7642, 2)


100%|██████████| 7642/7642 [00:02<00:00, 3524.56it/s]


---  meebits
before:  (5702, 2)
after:  (5702, 2)


100%|██████████| 5702/5702 [00:01<00:00, 3950.85it/s]


# Create config file

In [None]:
# Settings for the baseline runs; written to a YAML config file at the end of this cell.
parameter_dict = {
    
    # environment
    'seed': 0,
    'reproducibility': True,
    'data_path': 'dataset/collections/',
    'checkpoint_dir': 'saved/',
    'show_progress': True,
    'save_dataset': False,
    'log_wandb': False,
    
    # data
    'field_separator': '\t',
    'seq_separator': ' ',
    'USER_ID_FIELD': 'user_id',
    'ITEM_ID_FIELD': 'item_id',
    'RATING_FIELD': 'rating',
    'item_inter_num_interval': '[0,inf)', 
    
    # training
    'epochs': 50,
    'train_batch_size': 2048, # 2048
    'learner': 'adam',
    'learning_rate': 0.1, # 0.001
    'train_neg_sample_args': {'distribution': 'popularity',
                              'sample_num': 5,
                              'dynamic': False,
                              'candidate_num': 0},
    'eval_step': 1,
    # Deliberately enormous (far above `epochs`), which effectively disables
    # early stopping; the trailing "15" is kept from the original cell.
    'stopping_step': 15000000000000000000000000000000, # 15
    'loss_decimal_place': 4,
    
    # evaluation
    'eval_args': {'group_by': 'user',
                  'order': 'RO',
                  'split': {'RS':[8,1,1]},
                  'mode': 'pop100'},
    'metrics': ['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'GAUC'],
    'topk': [1, 2, 5, 10, 20, 50], 
    'valid_metric': 'MRR@1', # for early stopping
    'eval_batch_size': 4096, # 4096
    'metric_decimal_place': 4
    
}

# Convert parameter_dict to a YAML file. Create the target folder first so the
# cell also works when 'config/' does not exist yet. yaml.dump() returns None
# when given a stream, so its result is not stored.
os.makedirs('config', exist_ok=True)
with open(r'config/fixed_config_baseline.yaml', 'w') as file:
    yaml.dump(parameter_dict, file)

In [None]:
# English summary of the note below: an error occurs when `topk` is smaller than
# the K in `valid_metric` (e.g. MRR@10). Keeping the two aligned is awkward, so
# the options were to drop early stopping or to create separate configs; the
# decision was to drop early stopping and use the validation split only for
# selecting the best model.
# NOTE(review): this bare string is the cell's last expression, so it is echoed
# as the cell output; consider moving the note into a markdown cell.
""" 
topk가 valid_metric MRR@10보다 작으면 에러남 -> 근데 얘네들을 맞춰주기가 어려움 -> early stopping을 버리던지 config를 따로 만들어야 함 -> 그냥 early stopping을 버리자 -> valid data 용도는 그냥 best model 뽑기 위한 것만 
"""

# Disabled experiment: write one config file per K, with a matching valid_metric.
# # K = [1, 2, 5, 10, 20, 50, 100] 
 
# parameter_dict['topk'] = 2
# parameter_dict['valid_metric'] = 'MRR@2'
# K = parameter_dict['topk']
# with open(r'config/fixed_config_K{0}.yaml'.format(K), 'w') as file:
#     documents = yaml.dump(parameter_dict, file)

# parameter_dict['topk'] = 5
# parameter_dict['valid_metric'] = 'MRR@5'
# K = parameter_dict['topk']
# with open(r'config/fixed_config_K{0}.yaml'.format(K), 'w') as file:
#     documents = yaml.dump(parameter_dict, file)


# # .yaml files are in './config/multi/'
# yaml_files = glob.glob('./config/multi/*.yaml')
# yaml_files