- increase data by efficient memory usage

# recsys2021 best paper
- paper: https://dl.acm.org/doi/pdf/10.1145/3460231.3474273
- code: https://github.com/hasteck/Higher_Recsys_2021/blob/main/Higher_RecSys_2021.ipynb

In [1]:
import os
import numpy as np 
import pandas as pd 
from tqdm import tqdm
from numba import jit
from copy import deepcopy
from scipy import sparse, linalg

import warnings
warnings.filterwarnings('ignore')

In [2]:
DATA_DIR = "/kaggle/input/h-and-m-personalized-fashion-recommendations/"
df = pd.read_csv(DATA_DIR + "transactions_train.csv")
#article_df = pd.read_csv(DATA_DIR + "articles.csv")
#customer_df = pd.read_csv(DATA_DIR + "customers.csv")
sub_df = pd.read_csv(DATA_DIR + "sample_submission.csv")

In [3]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def apk(actual, predicted, k=12):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if len(predicted)>k:
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

## prepare data

In [4]:
start_date = '2020-09-13'
val_start_date = '2020-09-19' #9/16

df = df[["t_dat", "customer_id", "article_id"]]

# sample check for small memoery usage
df = df.query(f"t_dat >= '{start_date}'").reset_index(drop=True)
train_data = df.query(f"t_dat < '{val_start_date}'").reset_index(drop=True)

In [5]:
unique_uid = df["customer_id"].unique()
unique_sid = df['article_id'].unique()

show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

n_users = max(list(profile2id.values())) + 1
n_items = max(list(show2id.values())) + 1
print(n_users, n_items)

del df, unique_uid, unique_sid

91188 20162


In [6]:
def numerize(tp):
    uid = map(lambda x: profile2id[x], tp['customer_id'])
    sid = map(lambda x: show2id[x], tp['article_id'])
    out = pd.DataFrame(data={'uid': list(uid), 'sid': list(sid)}, columns=['uid', 'sid'])
    return out.sort_values("uid").reset_index(drop=True)

In [7]:
train_data = numerize(train_data)
print(train_data.shape)

(202552, 2)


# training

In [8]:
# function for training
### functions to create the feature-pairs
@jit
def create_list_feature_pairs(XtX, threshold):
    AA= np.triu(np.abs(XtX))
    AA[np.diag_indices(AA.shape[0])]=0.0
    ii_pairs = np.where((AA>threshold)==True)
    return ii_pairs

@jit
def create_matrix_Z(ii_pairs, X):
    MM = np.zeros( (len(ii_pairs[0]), X.shape[1]), dtype=np.float)
    MM[np.arange(MM.shape[0]), ii_pairs[0]]=1.0
    MM[np.arange(MM.shape[0]), ii_pairs[1]]=1.0
    CCmask = 1.0-MM    # see Eq. 8 in the paper
    MM=sparse.csc_matrix(MM.T)
    Z=  X * MM
    Z= (Z == 2.0)
    Z=Z*1.0
    return [Z, CCmask]

### training-function of higher-order model
@jit
def train_higher(XtX, XtXdiag,lambdaBB, ZtZ, ZtZdiag, lambdaCC, CCmask, ZtX, rho, epochs):
    # precompute for BB
    ii_diag=np.diag_indices(XtX.shape[0])
    XtX[ii_diag] = XtXdiag+lambdaBB
    PP=np.linalg.inv(XtX)
    
    # precompute for CC
    ii_diag_ZZ=np.diag_indices(ZtZ.shape[0])
    ZtZ[ii_diag_ZZ] = ZtZdiag+lambdaCC+rho
    #QQ=np.linalg.inv(ZtZ)
    
    # initialize
    CC = np.zeros((ZtZ.shape[0], XtX.shape[0]),dtype=np.float32)
    DD = np.zeros((ZtZ.shape[0], XtX.shape[0]),dtype=np.float32)
    UU = np.zeros((ZtZ.shape[0], XtX.shape[0]),dtype=np.float32) # is Gamma in paper
    XtX[ii_diag] = XtXdiag
    
    for iter in range(epochs):
        print("epoch {}".format(iter))
        # learn BB
        BB= PP.dot(XtX-ZtX.T.dot(CC)) # BB = np.linalg.solve(XtX, XtX-ZtX.T.dot(CC))
        gamma = np.diag(BB) / np.diag(PP)
        BB -= PP * gamma
        
        # learn CC
        #CC = QQ.dot(ZtX-ZtX.dot(BB) +rho *(DD-UU))
        CC = np.linalg.solve(ZtZ, ZtX-ZtX.dot(BB) + rho * (DD-UU))
        
        # learn DD
        DD =  CC  * CCmask 
        #DD= np.maximum(0.0, DD) # if you want to enforce non-negative parameters
        
        # learn UU (is Gamma in paper)
        UU += CC-DD
    return [BB,DD]

#### data formation 
def format_data(tp):
    rows, cols = tp['uid'], tp['sid']
    data = sparse.csr_matrix((np.ones_like(rows),
                             (rows, cols)), dtype='float32',
                             shape=(n_users, n_items))
    return data

In [9]:
### choose the training-hyperparameters
epochs = 10
threshold, lambdaBB, lambdaCC, rho = 100, 500, 5000, 100000

In [10]:
X = format_data(train_data)
del train_data

XtX=np.array((X.transpose() * X).todense()) 
XtXdiag=deepcopy(np.diag(XtX))

In [11]:
### create the list of feature-pairs and the higher-order matrix Z
XtX[np.diag_indices(XtX.shape[0])]=XtXdiag #if code is re-run, ensure that the diagonal is correct
ii_feature_pairs = create_list_feature_pairs(XtX, threshold)
print("number of feature-pairs: {}".format(len(ii_feature_pairs[0])))
Z, CCmask = create_matrix_Z(ii_feature_pairs, X)

number of feature-pairs: 10


In [12]:
### create the higher-order matrices
ZtZ=np.array((Z.transpose() * Z).todense()) 
ZtX=np.array((Z.transpose() * X).todense()) 
ZtZdiag=deepcopy(np.diag(ZtZ))
del X, Z

In [13]:
### iterative training, and evaluation every 10 epochs 
BB, CC = train_higher(XtX, XtXdiag, lambdaBB, ZtZ, ZtZdiag, lambdaCC, CCmask, ZtX, rho, epochs)

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9


In [14]:
del ZtZ, ZtX, ZtZdiag

# prediction

In [15]:
### evaluation-function of higher-order model
@jit
def pred_func(BB,CC,test_data_tr, Z_test_data_tr, N_test, batch_size_test=5000):
    print("Evaluating ...")
    #evaluate in batches
    for bnum, st_idx in tqdm(enumerate(range(0, N_test, batch_size_test))):
        end_idx = min(st_idx + batch_size_test, N_test)
        Xtest = test_data_tr[st_idx:end_idx]
        Ztest = Z_test_data_tr[st_idx:end_idx]
        if sparse.isspmatrix(Xtest):
            Xtest = Xtest.toarray()
            Ztest = Ztest.toarray()
        Xtest = Xtest.astype('float64')
        Ztest = Ztest.astype('float64')
        pred_val = (Xtest).dot(BB) + Ztest.dot(CC)
        #pred_val[Xtest.nonzero()] = -np.inf # exclude examples from training and validation (if any)
        pred_val = np.argpartition(pred_val, -12)[:,-12:]
        if bnum == 0:
            ans = pred_val
        else:
            ans = np.concatenate((ans,pred_val), axis=0)
    return ans

In [16]:
df = pd.read_csv(DATA_DIR + "transactions_train.csv")
df = df[["t_dat", "customer_id", "article_id"]]

valid_data = df.query(f"t_dat >= '{val_start_date}'").reset_index(drop=True)
valid_unq = valid_data.groupby('customer_id')['article_id'].apply(list).reset_index()
valid_unq['valid_true'] = valid_unq['article_id'].map(lambda x: '0'+' 0'.join(str(x)[1:-1].split(', ')))
valid_unq.drop("article_id", axis=1, inplace=True)

valid_data = numerize(valid_data)
print(valid_data.shape)
X_valid = format_data(valid_data)
del valid_data, df

Z_val_data , _ = create_matrix_Z(ii_feature_pairs, X_valid)
N_valid = X_valid.shape[0]

(133281, 2)


In [17]:
id2show = dict((v,"0"+str(k)) for k,v in show2id.items())
id2profile = dict((v,k) for k,v in profile2id.items())

sample_value = sub_df["prediction"][0]

In [18]:
pred = pred_func(BB,CC, X_valid, Z_val_data, N_valid)

Evaluating ...


19it [19:35, 61.87s/it]


In [19]:
# https://stackoverflow.com/questions/16992713/translate-every-element-in-numpy-array-according-to-key
# indexから商品idに変換
pred = np.vectorize(id2show.get)(pred)
pred = pd.DataFrame(pred)

In [20]:
cols = pred.columns
pred['prediction'] = pred[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
pred = pred.drop(cols, axis=1).reset_index(drop=False)
pred.columns = ["customer_id", "prediction"]
pred["customer_id"] = pred["customer_id"].map(id2profile)

In [21]:
sub_df = pd.merge(sub_df["customer_id"], pred, on = "customer_id", how="left")
sub_df["prediction"].fillna(sample_value, inplace=True)
sub_df.to_csv("submission.csv", index=False)

In [22]:
# local score
valid_unq = pd.merge(valid_unq, pred, on ="customer_id", how="left")
mapk(valid_unq['valid_true'].map(lambda x: x.split()), valid_unq['prediction'].map(lambda x: x.split()), k=12)

0.021079123367383944