# gru4rec
RecBole で GRU4Rec を検証する

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Actually invoke load_dotenv: the bare name is a no-op expression, so the
# .env file was never read and every os.getenv() below depended on variables
# already being exported in the shell.
load_dotenv()

# Make the project's utility modules importable, but only when UTILS_PATH is
# configured -- otherwise None would be appended to sys.path.
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import random
from collections import defaultdict

import pandas as pd
import numpy as np
import cudf

In [2]:
# Single seed value shared by the notebook; applied here to Python's built-in
# `random` module so that draws from it are repeatable.
SEED = 42
random.seed(a=SEED)

In [3]:
# Directory layout is read from the environment (see the dotenv import above).
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
PREP_DIR = os.getenv("PREP_DIR")

# os.getenv returns None for a missing variable; fail here with a readable
# message instead of an opaque "NoneType + str" TypeError on the next line.
if PREP_DIR is None:
    raise RuntimeError(
        "PREP_DIR environment variable is not set (check the .env file)"
    )

# Paths in this notebook are built by plain string concatenation, so PREP_DIR
# is expected to already end with a path separator.
TMP_DIR = PREP_DIR + "recbox_data/"

In [4]:
# NOTE(review): CHUNK_N is not referenced in any of the visible cells;
# presumably a chunk count/size for batched processing elsewhere -- confirm
# it is still needed.
CHUNK_N = 400

In [5]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in *candidates* whose range contains [c_min, c_max].

    ``limits`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits; that includes NaN bounds, because every comparison
    against NaN is False.
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of *df* in place to reduce memory usage.

    * object / string columns are converted to ``category``;
    * signed integer columns get the smallest signed integer dtype that
      holds their min/max, unsigned columns the smallest unsigned one;
    * float columns get float16 or float32 when their min/max fit, else
      float64. The choice is made on range only, so precision can be lost
      in the narrower float types;
    * every other dtype (bool, datetime, categorical, nullable extension
      dtypes, ...) is left untouched.

    Memory usage before and after is printed.

    Args:
        df: pandas DataFrame; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / unsigned / float columns are downcast.
        # Checking dtype.kind (instead of the dtype's string prefix) keeps
        # uint and bool columns out of the float branch and avoids min/max
        # comparisons on dtypes that do not support them.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(
                c_min, c_max, (np.int8, np.int16, np.int32, np.int64), np.iinfo
            )
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(
                c_min, c_max, (np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo
            )
        else:
            target = _smallest_fitting_dtype(
                c_min, c_max, (np.float16, np.float32), np.finfo
            )
            if target is None:
                # Out of float32 range, or NaN bounds: fall back to float64.
                target = np.float64

        # target is None only for integer columns whose min/max are NaN
        # (e.g. an empty column); those are left as they are.
        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

学習用データ作成

In [30]:
# Re-encode the week-3 training sessions with compact dtypes and store them
# as parquet next to the source pickle.
name = "train_sessions_week3"
base_path = PREP_DIR + name
df = pd.read_pickle(base_path + ".pkl")

# Event type names -> small integer codes.
event_codes = {"clicks": 0, "carts": 1, "orders": 2}
df["type"] = df["type"].map(event_codes).astype("uint8")

# ts is divided by 1000 before the int32 cast; the exploratory output further
# down shows the resulting values around 1.66e9.
df["ts"] = df["ts"].div(1000).astype("int32")

for int_col in ("session", "aid"):
    df[int_col] = df[int_col].astype("int32")

df.to_parquet(base_path + ".parquet")

In [None]:
# Inspect the column dtypes after the conversion above. The original cell was
# left as the truncated attribute name `df.dty`, which raises AttributeError.
df.dtypes

In [19]:
# Exploratory preview of ts divided by 1000 and cast to int32. Its execution
# count (19) is lower than that of the conversion cell (30), so it was run
# before `ts` was overwritten there.
df["ts"].div(1000).astype("int32")

0           1661157951
1           1661157962
2           1661157993
3           1661158034
4           1661158090
               ...    
25911694    1661723968
25911695    1661723970
25911696    1661723972
25911697    1661723976
25911698    1661723983
Name: ts, Length: 25911699, dtype: int32

In [15]:
# Load data: every prepared session pickle listed below is read and
# concatenated into one interaction table.
files = [
    "train_sessions_week1.pkl",
    "test_sessions_week1.pkl",
    "train_sessions_week2.pkl",
    "test_sessions_week2.pkl",
    "train_sessions_week3.pkl",
    "test_sessions_week3.pkl",
    "test_sessions.pkl"
]

dfs = [pd.read_pickle(PREP_DIR + file) for file in files]
sessions = pd.concat(dfs)

# Keep only the three interaction fields, order them by timestamp and rename
# the headers to RecBole's "<field>:<type>" atomic-file convention.
sessions = sessions[["session", "aid", "ts"]].sort_values("ts").rename(columns={"session":"session:token",
                                                                                "aid": "aid:token",
                                                                                "ts": "ts:float"})

# exist_ok=True so that re-running this cell does not fail with
# FileExistsError once the directory has been created.
os.makedirs(TMP_DIR, exist_ok=True)
sessions.to_csv(TMP_DIR + "recbox_data.inter", index=False, sep="\t")

In [18]:
import _tkinter

ImportError: libX11.so.6: cannot open shared object file: No such file or directory

In [16]:
import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.model.context_aware_recommender import DeepFM
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger

ImportError: libX11.so.6: cannot open shared object file: No such file or directory

In [None]:
# Maximum length of the item history fed to the sequential model.
MAX_ITEM = 30

# RecBole configuration for GRU4Rec on the session data written to
# PREP_DIR/recbox_data/ by the data-preparation cell.
parameter_dict = {
    'data_path': PREP_DIR,
    'seed':SEED, 
    'use_gpu': True,
    # Field names match the column headers of the .inter file (without the
    # ":type" suffix).
    'USER_ID_FIELD': 'session',
    'ITEM_ID_FIELD': 'aid', 
    'TIME_FIELD': 'ts',
    'NEG_PREFIX': 'neg_',
    'user_inter_num_interval': "[0,inf)",
    'item_inter_num_interval': "[0,inf)",
    # Use the constant defined above instead of repeating the literal 30, so
    # the two cannot drift apart.
    'MAX_ITEM_LIST_LENGTH': MAX_ITEM,
    'load_col': {'inter': ['session', 'aid', 'ts'],
                'user': ['session'],
                'item': ['aid']
                }, 
    'learning_rate': 0.002,
    # NOTE(review): both negative-sampling keys are set to None, presumably to
    # cover different RecBole versions -- confirm which one the installed
    # version reads.
    'neg_sampling': None,
    'train_neg_sample_args': None,
    'train_batch_size': 2048,
    'epochs': 40,
    'stopping_step': 1,
    'eval_batch_size': 1024,
    'valid_metric': 'Recall@20',
    'topk': [20],
    'eval_args': {
        'split': {'RS': [9, 1, 0]},
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full'},
}
config = Config(model='GRU4Rec', dataset='recbox_data', config_dict=parameter_dict)

# init random seed
init_seed(config['seed'], config['reproducibility'])