In [1]:
import pandas
import tensorflow

# Report the library versions this notebook was executed with, one per line.
print(pandas.__version__, tensorflow.__version__, sep='\n')

1.3.3
2.6.0


In [2]:
import datetime as dt
from pathlib import Path
import os
import time
from datetime import datetime
from IPython.display import display

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Location of the ratings file under the current user's home directory.
# Path.home() replaces os.getenv('HOME') + '...': getenv returns None when HOME is
# not set, and concatenating None with a string raises TypeError.
data_path = Path.home() / 'aiffel' / 'yoochoose' / 'data'
train_path = data_path / 'ratings.dat'

def load_data(data_path: Path, nrows=None):
    """Read a '::'-separated ratings file into a DataFrame.

    Args:
        data_path: Path of the file to read. The first four fields of each line are
            used as user id, item id, rating and a Unix timestamp in seconds.
        nrows: Optional maximum number of rows to read; None reads the whole file.

    Returns:
        DataFrame with columns 'UserId', 'ItemId', 'Rating' (all int32) and 'Time'
        (converted from seconds to datetime).
    """
    data = pd.read_csv(
        data_path,
        sep='::',
        header=None,
        usecols=[0, 1, 2, 3],
        dtype={0: np.int32, 1: np.int32, 2: np.int32},
        nrows=nrows,
        # A multi-character separator is not supported by the C parser; asking for
        # the python engine explicitly avoids the ParserWarning + silent fallback.
        engine='python',
    )
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    data['Time'] = pd.to_datetime(data['Time'], unit='s')
    return data

# Load every row, then order the interactions by user id and, within a user, by time.
data = load_data(train_path, None).sort_values(['UserId', 'Time'])
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [12]:
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    """Alternate the user filter and the item filter until the row count is stable.

    Removing rare items can shorten a user's history below `shortest`, and removing
    short users can push an item below `least_click`, so both filters are re-applied
    until a full pass removes nothing.
    """
    rows_before_pass = None
    while rows_before_pass != len(data):
        rows_before_pass = len(data)
        data = cleanse_unpopular_item(
            cleanse_short_session(data, shortest),
            least_click,
        )
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    """Keep only the rows of users that have at least `shortest` rows in `data`."""
    rows_per_user = data['UserId'].value_counts()
    long_enough_users = rows_per_user[rows_per_user >= shortest].index
    return data[data['UserId'].isin(long_enough_users)]


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    """Keep only the rows of items that appear at least `least_click` times in `data`."""
    rows_per_item = data['ItemId'].value_counts()
    popular_items = rows_per_item[rows_per_item >= least_click].index
    return data[data['ItemId'].isin(popular_items)]

# Drop users with fewer than 2 rows and items with fewer than 5 rows, repeating
# until no further rows are removed.
data = cleanse_recursive(data, shortest=2, least_click=5)

In [13]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    cutoff_time = final_time - dt.timedelta(n_days)
    
    train = data[data['Time'] < cutoff_time]
    test = data[data['Time'] >= cutoff_time]
    return train, test

# Hold out the last day of interactions as the test set, then the last day of what
# remains as the validation set; everything earlier is the training set.
# NOTE(review): the evaluation log further down reports a single validation batch,
# so a one-day window may be too small for this dataset — confirm the intended n_days.
tr, test = split_by_date(data, n_days=1)
tr, val = split_by_date(tr, n_days=1)

In [14]:
# Build the item-id -> contiguous-index dictionary from the items seen in the train split.
_train_item_ids = tr['ItemId'].unique()
id2idx = dict(zip(_train_item_ids, range(len(_train_item_ids))))

def indexing(df, id2idx):
    """Return a copy of `df` with an 'item_idx' column looked up from `id2idx`.

    Args:
        df: DataFrame with an 'ItemId' column.
        id2idx: dict mapping item id -> integer index.

    Returns:
        A new DataFrame; `df` itself is left untouched. The previous implementation
        assigned the column on `df` directly, which modifies the caller's frame and
        is unreliable when `df` is a filtered slice of another frame.

    Items missing from `id2idx` receive -1. This function only marks them: -1 is not
    a valid class index, so consumers must drop or handle those rows themselves.
    """
    item_idx = df['ItemId'].map(lambda item_id: id2idx.get(item_id, -1))
    return df.assign(item_idx=item_idx)

# Attach the 'item_idx' column to every split using the mapping built from the train items.
tr = indexing(tr, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [15]:
class SessionDataset:
    """Wraps an interaction frame and exposes per-user row boundaries.

    Attributes:
        df: the DataFrame passed in (stored as-is, not copied).
        click_offsets: int32 array of length n_users + 1; entry i is the cumulative
            row count of the first i users in ascending 'UserId' order.
        session_idx: np.arange over the number of distinct users.

    NOTE(review): the offsets only line up with row positions if `df` is already
    ordered by 'UserId' — confirm the caller sorts the frame first.
    """

    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()
        self.session_idx = np.arange(self.df['UserId'].nunique())

    def get_click_offsets(self):
        """Return the cumulative per-user row counts, prefixed with a leading 0."""
        rows_per_user = self.df.groupby('UserId').size()
        user_count = self.df['UserId'].nunique()
        boundaries = np.zeros(user_count + 1, dtype=np.int32)
        boundaries[1:] = rows_per_user.cumsum()
        return boundaries

class FullSequenceDataLoader:
    """Batches per-user item sequences into (input prefix, final item) pairs.

    Args:
        dataset: object exposing `df` (with an 'item_idx' column) and `click_offsets`
            (row boundaries of each user inside `df`).
        batch_size: maximum number of sequences per yielded batch.

    Iterating yields `(input_batch, target_batch)` where `input_batch` is a list of
    arrays (each sequence without its last element) and `target_batch` is the list
    of the corresponding last elements.
    """

    def __init__(self, dataset: 'SessionDataset', batch_size=16):
        self.dataset = dataset
        self.batch_size = batch_size
        self.sessions = self._build_sessions()

    def _build_sessions(self):
        """Slice `item_idx` per user and keep sequences with at least two known items."""
        # Read the column once instead of once per user.
        item_idx = self.dataset.df['item_idx'].values
        offsets = self.dataset.click_offsets

        sessions = []
        for start, end in zip(offsets[:-1], offsets[1:]):
            session = item_idx[start:end]
            # Negative indices mark items without a valid index. One-hot encoding a
            # -1 would wrap around to the last class, so drop them here.
            session = session[session >= 0]
            # At least one input item and one target item are required.
            if len(session) >= 2:
                sessions.append(session)
        return sessions

    def __len__(self):
        """Number of batches produced by one pass (lets progress bars show a total)."""
        return (len(self.sessions) + self.batch_size - 1) // self.batch_size

    # __iter__ makes the loader usable directly in a `for` loop.
    def __iter__(self):
        # Walk the session list in steps of batch_size.
        for i in range(0, len(self.sessions), self.batch_size):
            batch = self.sessions[i:i + self.batch_size]
            input_batch = [session[:-1] for session in batch]   # e.g. [A, B, C]
            target_batch = [session[-1] for session in batch]   # e.g. D

            yield input_batch, target_batch

# Create the data loader over the training split.
# NOTE(review): train_model() builds its own SessionDataset / FullSequenceDataLoader
# from args.tr, so these two objects look unused by the training run — confirm.
train_dataset = SessionDataset(tr)
train_loader = FullSequenceDataLoader(train_dataset, batch_size=16)

In [16]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import categorical_crossentropy

def create_model(args):
    """Build, compile and print a GRU next-item classifier.

    The network takes one-hot item sequences of shape
    (batch, sequence length, args.num_items), runs them through a GRU with
    args.hsz units that returns only its last output, applies dropout with rate
    args.drop_rate, and ends in a softmax over args.num_items classes. It is
    compiled with categorical cross-entropy, Adam(args.lr) and an accuracy metric.

    Returns:
        The compiled Keras Model (its summary is printed as a side effect).
    """
    one_hot_sequence = Input(shape=(None, args.num_items))

    recurrent = GRU(args.hsz, return_sequences=False)
    last_state = recurrent(one_hot_sequence)

    regularizer = Dropout(args.drop_rate)
    last_state = regularizer(last_state)

    classifier = Dense(args.num_items, activation='softmax')
    item_probabilities = classifier(last_state)

    model = Model(inputs=one_hot_sequence, outputs=item_probabilities)
    model.compile(
        loss=categorical_crossentropy,
        optimizer=Adam(args.lr),
        metrics=['accuracy'],
    )
    model.summary()
    return model

# Hyperparameter settings
class Args:
    """Bundle of the data splits and hyperparameters used by the model and training code.

    `num_items` and `num_sessions` are derived from the training split as the number
    of distinct 'ItemId' and 'UserId' values; every other attribute stores the
    constructor argument of the same name unchanged.
    """

    def __init__(self, tr, val, test, batch_size, hsz, drop_rate, lr, epochs, k):
        # Data splits.
        self.tr, self.val, self.test = tr, val, test
        # Sizes derived from the training split.
        self.num_items = tr['ItemId'].nunique()
        self.num_sessions = tr['UserId'].nunique()
        # Model / optimisation / evaluation settings.
        self.batch_size, self.hsz, self.drop_rate = batch_size, hsz, drop_rate
        self.lr, self.epochs, self.k = lr, epochs, k

# Collect the splits and hyperparameters (batch of 16, 50 GRU units, 3 epochs, top-20 metrics).
args = Args(tr, val, test, batch_size=16, hsz=50, drop_rate=0.1, lr=0.001, epochs=3, k=20)

# Build and compile the network; create_model also prints the model summary.
model = create_model(args)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None, 3416)]      0         
_________________________________________________________________
gru_1 (GRU)                  (None, 50)                520200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3416)              174216    
Total params: 694,416
Trainable params: 694,416
Non-trainable params: 0
_________________________________________________________________


In [21]:
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def _encode_inputs(input_seqs, num_items, maxlen=None):
    """Left-pad a batch of item-index sequences and one-hot encode it.

    Padding uses -1 rather than pad_sequences' default 0, because 0 is itself a valid
    item index and would be one-hot encoded as a real item. Every negative position
    is encoded as an all-zero vector instead.

    Args:
        input_seqs: list of 1-D integer sequences of varying length.
        num_items: size of the one-hot dimension.
        maxlen: optional maximum sequence length forwarded to pad_sequences.

    Returns:
        float32 array of shape (batch, padded length, num_items).
    """
    padded = pad_sequences(input_seqs, padding='pre', maxlen=maxlen, value=-1)
    encoded = np.zeros(padded.shape + (num_items,), dtype=np.float32)
    rows, steps = np.nonzero(padded >= 0)
    encoded[rows, steps, padded[rows, steps]] = 1.0
    return encoded


def train_model(model, args):
    """Train `model` on args.tr for args.epochs epochs, validating after each epoch.

    Each batch of input sequences is padded and one-hot encoded, the target item is
    one-hot encoded, and the model is updated with train_on_batch. After every epoch
    Recall@k and MRR@k on args.val are printed.
    """
    train_dataset = SessionDataset(args.tr)
    train_loader = FullSequenceDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        tr_loader = tqdm(train_loader, desc=f'Train Epoch {epoch}')

        for input_seqs, target_items in tr_loader:
            # Pad to the longest sequence in the batch and one-hot encode the inputs.
            input_ohe = _encode_inputs(input_seqs, args.num_items)

            # One-hot encode the target item of every sequence.
            target_ohe = to_categorical(target_items, num_classes=args.num_items)

            result = model.train_on_batch(input_ohe, target_ohe)
            tr_loader.set_postfix(train_loss=result[0], accuracy=result[1])

        # Evaluate on the validation split at the end of every epoch.
        val_recall, val_mrr = get_metrics(args.val, model, args, args.k)
        print(f"\n\t - Recall@{args.k} epoch {epoch}: {val_recall:.4f}")
        print(f"\t - MRR@{args.k}    epoch {epoch}: {val_mrr:.4f}\n")


def get_metrics(data, model, args, k: int):
    """Compute mean Recall@k and MRR@k of `model` on `data`.

    Args:
        data: interaction frame accepted by SessionDataset.
        model: model whose predict() returns one score per item for each sequence.
        args: settings object; batch_size and num_items are read from it.
        k: cutoff of the ranked list.

    Returns:
        (mean recall, mean reciprocal rank). Both are NaN when `data` yields no
        evaluable sequence.
    """
    dataset = SessionDataset(data)
    loader = FullSequenceDataLoader(dataset, batch_size=args.batch_size)

    recall_list, mrr_list = [], []

    for input_seqs, label in tqdm(loader, desc='Evaluation'):
        # Inputs are truncated to the most recent 100 items before encoding.
        input_ohe = _encode_inputs(input_seqs, args.num_items, maxlen=100)  # (B, T, num_items)

        pred = model.predict(input_ohe, batch_size=args.batch_size)  # (B, num_items)
        # Rank items by descending score with NumPy so the per-row metric helpers
        # operate on plain arrays rather than on Tensors.
        pred_arg = np.argsort(-pred, axis=1, kind='stable')

        for i in range(len(label)):
            recall_list.append(recall_k(pred_arg[i], label[i], k))
            mrr_list.append(mrr_k(pred_arg[i], label[i], k))

    if not recall_list:
        # Nothing to average: report NaN explicitly instead of relying on np.mean([]).
        return float('nan'), float('nan')

    return np.mean(recall_list), np.mean(mrr_list)

def recall_k(pred, truth: int, k: int) -> int:
    """Return 1 if `truth` is among the first `k` entries of `pred`, otherwise 0."""
    top_k = pred[:k]
    return 1 if truth in top_k else 0

def mrr_k(pred, truth: int, k: int):
    """Return the reciprocal rank of `truth` within the first `k` entries of `pred`.

    The rank is 1-based; 0 is returned when `truth` is not among those entries.
    """
    hit_positions = np.where(pred[:k] == truth)[0]
    if len(hit_positions) == 0:
        return 0
    return 1 / (hit_positions[0] + 1)

# Start model training.
train_model(model, args)

Train Epoch 1: 378it [03:37,  1.74it/s, accuracy=0, train_loss=6.49]     
Evaluation: 1it [00:00, 11.60it/s]



	 - Recall@20 epoch 1: 0.0000
	 - MRR@20    epoch 1: 0.0000



Train Epoch 2: 378it [03:37,  1.74it/s, accuracy=0, train_loss=6.13]     
Evaluation: 1it [00:00, 11.89it/s]



	 - Recall@20 epoch 2: 0.0000
	 - MRR@20    epoch 2: 0.0000



Train Epoch 3: 378it [03:37,  1.74it/s, accuracy=0.125, train_loss=5.97] 
Evaluation: 1it [00:00, 14.09it/s]


	 - Recall@20 epoch 3: 0.0000
	 - MRR@20    epoch 3: 0.0000




