# load lib and data

In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json

from datetime import timedelta
from tqdm.notebook import tqdm
from collections import Counter
from heapq import nlargest


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_theme()

import warnings
warnings.filterwarnings('ignore')

In [None]:
#paths

# Root folder of the competition files; every other path is derived from it.
data_path = Path("/kaggle/input/otto-recommender-system")
# NOTE: despite the `_df` suffix these two names hold file paths, not DataFrames.
train_df = data_path.joinpath('train.jsonl')
test_df = data_path.joinpath('test.jsonl')
sample_sub_path = data_path.joinpath('sample_submission.csv')

# json to DF

In [None]:
def read_jsonl(target: str, max_chunks: int = 2, chunksize: int = 1000) -> pd.DataFrame:
    """Flatten the first chunks of ``<data_path>/<target>.jsonl`` into one row per event.

    Each input line carries a ``session`` id and a list of ``events``; every
    event contributes one output row with its ``aid``, ``ts`` and ``type``.

    Args:
        target: File stem, e.g. ``'train'`` or ``'test'``.
        max_chunks: Number of chunks to read before stopping. The default of 2
            keeps the original behaviour (only the first two chunks are
            loaded); pass ``None`` to read the whole file.
        chunksize: Number of JSON lines per chunk.

    Returns:
        DataFrame with columns ``session``, ``aid``, ``ts``, ``type`` and a
        fresh 0..n-1 index.
    """
    columns = ('session', 'aid', 'ts', 'type')
    frames = []

    # The chunked reader holds an open file handle; the context manager closes it
    # even though we usually stop before the file is exhausted.
    with pd.read_json(data_path / f'{target}.jsonl', lines=True, chunksize=chunksize) as chunks:
        for e, chunk in enumerate(chunks):
            if max_chunks is not None and e >= max_chunks:
                break
            event_dict = {column: [] for column in columns}
            for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
                for event in events:
                    event_dict['session'].append(session)
                    event_dict['aid'].append(event['aid'])
                    event_dict['ts'].append(event['ts'])
                    event_dict['type'].append(event['type'])
            frames.append(pd.DataFrame(event_dict))

    if not frames:
        return pd.DataFrame(columns=list(columns))
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)
# Flatten the train and test logs into one-row-per-event frames.
# read_jsonl stops after the first two 1000-line chunks, so these are small samples.
train_sessions = read_jsonl('train')
test_sessions = read_jsonl('test')

In [None]:
train_sessions.head(3)

In [None]:
test_sessions.head(3)

In [None]:
# Integer code for each event type; an unknown type raises KeyError.
type2id = {'clicks': 0, 'carts': 1, 'orders': 2}

# Same encoding for both frames, stored in a new 'type2id' column.
for sessions_frame in (train_sessions, test_sessions):
    sessions_frame['type2id'] = sessions_frame['type'].apply(lambda event_type: type2id[event_type])

In [None]:
train_sessions.head()

In [None]:
test_sessions.head()

In [None]:
!pip install polars

import multiprocessing
import polars as pl
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

## calculating the scores used for co-visitation matrix

reference : https://www.kaggle.com/code/radek1/co-visitation-matrix-simplified-imprvd-logic, https://www.kaggle.com/code/alberteinsten/cudf-pandas-proof-of-concept-lgbm-ranker

In [None]:
def add_session_length(df) :
    """Add a ``session_length`` column holding the event count of each row's session.

    The count is computed from ``df`` itself (non-null ``ts`` values per
    ``session``), so the function works for any frame passed in rather than
    always reading the global training frame.

    Args:
        df: Frame with ``session`` and ``ts`` columns. Modified in place.

    Returns:
        The same ``df`` object, with the new column.
    """
    df['session_length'] = df.groupby('session')['ts'].transform('count')
    return df

def add_action_num_reverse_chrono(df):
    """Add ``action_num_reverse_chrono``: how many rows of the session follow this one.

    Uses the row's position within its ``session`` group (in frame order) and
    the existing ``session_length`` column; the last row of a session gets 0.
    Modifies ``df`` in place and returns it.
    """
    position_in_session = df.groupby('session').cumcount()
    df['action_num_reverse_chrono'] = df['session_length'] - position_in_session - 1
    return df

def add_log_recency_score(df):
    """Add ``log_recency_score``, growing from 2**0.1 - 1 (first event) to 1.0 (last event).

    The exponent is interpolated linearly between 0.1 and 1 over the row's
    position in the session. A one-event session divides by zero, which
    produces NaN; those rows are filled with 1.0.
    Modifies ``df`` in place and returns it.
    """
    length = df['session_length']
    step = (1-0.1) / (length-1)
    # Zero-based position counted from the start of the session.
    position = length - df['action_num_reverse_chrono'] - 1
    exponent = 0.1 + step * position
    score = 2 ** exponent - 1
    df['log_recency_score'] = score.fillna(1.0)
    return df

def add_type_weighted_log_recency_score(df):
    """Add ``type_weighted_log_recency_score``: the recency score divided by a per-type weight.

    Weights are keyed by the ``type2id`` code (0 -> 1, 1 -> 6, 2 -> 3).
    NOTE(review): this divides by the weight, while the label-building loops
    later in the notebook multiply by the same numbers — confirm which is intended.
    Modifies ``df`` in place and returns it.
    """
    type_weights = {0:1, 1:6, 2:3}
    divisor = df['type2id'].map(type_weights)
    df['type_weighted_log_recency_score'] = df['log_recency_score'].div(divisor)
    return df

def apply(df, pipeline) :
    """Run ``df`` through each callable of ``pipeline`` in order and return the final result.

    Each step receives the previous step's return value; an empty pipeline
    returns ``df`` unchanged.
    """
    result = df
    for step in pipeline:
        result = step(result)
    return result


In [None]:
# Feature steps in dependency order: steps 2-4 each read a column written by an earlier step.
pipeline = [add_session_length, add_action_num_reverse_chrono, add_log_recency_score, add_type_weighted_log_recency_score]

# Every step assigns its column onto the frame it receives, so both frames are modified in place.
apply(train_sessions, pipeline)
apply(test_sessions, pipeline)

In [None]:
train_sessions.head()

In [None]:
test_sessions.head()

In [None]:
# Flag every test event with gt=1; the merge in the next cell carries this column into `train`.
test_sessions['gt'] =1

In [None]:
# Left-join train events against test events on every shared column, so a train row
# only picks up gt=1 when an identical event (same session, ts, aid, type and scores)
# exists in the test frame.
# NOTE(review): if train and test sessions never overlap, gt will be 0 everywhere — confirm.
train = train_sessions.merge(test_sessions, on=['session','ts','type', 'aid','type2id','session_length','action_num_reverse_chrono','log_recency_score','type_weighted_log_recency_score'], how='left')

# Rows without a matching test event come out of the join as NaN; treat them as negatives.
train['gt'] = train['gt'].fillna(0)
# Use a stable sort: the default algorithm may reorder rows that share a session id,
# which would scramble the chronological order that the later aid -> next-aid pairing uses.
train = train.sort_values('session', kind='mergesort').reset_index(drop=True)

In [None]:
train.gt.value_counts()

# 여기서부터 word2vec의 similarity통한 예측값을 하나의 columns으로 받은 후 그 값을 포함하여 새로운 예측값을 만드는 것.

# Word2Vec

In [None]:
import hashlib
import os
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably does not change string hashing in the already-running kernel — confirm.
os.environ["PYTHONHASHSEED"] = str(42)
def hashfxn(x):
    """Return the MD5 digest of ``str(x)`` as an integer.

    Passed to Word2Vec as its ``hashfxn``; the value depends only on the text
    of ``x``.
    """
    digest = hashlib.md5(str(x).encode()).hexdigest()
    return int(digest, 16)

Preprocess the sessions and create the extra columns.

reference : https://www.kaggle.com/code/alberteinsten/cudf-pandas-proof-of-concept-lgbm-ranker, https://www.kaggle.com/code/radek1/word2vec-how-to-training-and-submission,
https://www.kaggle.com/code/cdeotte/candidate-rerank-model-lb-0-575

In [None]:
# One "sentence" per session: its events rendered as '<aid>_<type>' tokens, in frame order.
# Train sessions come first, then test sessions.
raw_corpus = []
for sessions_frame in (train_sessions, test_sessions):
    for session, group_df in tqdm(sessions_frame.groupby(['session'])):
        tokens = group_df['aid'].astype(str) + '_' + group_df['type']
        raw_corpus.append(list(tokens))

In [None]:
len(raw_corpus)

In [None]:
# Train CBOW (sg=0) embeddings over the '<aid>_<type>' sentences.
# workers must be a positive thread count: with workers=-1 no training threads are
# started and the vectors keep their random initial values. A single worker is used
# because the fixed seed and custom hashfxn only give a repeatable run with one thread;
# raise it if speed matters more than reproducibility.
w2vec = Word2Vec(sentences=raw_corpus, vector_size=100, window=5, min_count=1, sg=0, workers=1, seed=42, hashfxn=hashfxn)

In [None]:
train_sessions.head(2)

In [None]:
train_sessions.head(3)

In [None]:
# Build [session_type, labels] rows: for each test session, the up-to-20 distinct aids
# whose tokens are most similar to the session's own tokens.
sub_list = []

# Group by the bare column name: grouping by a one-element list makes `session` a
# 1-tuple in recent pandas, which would render as "(123,)_clicks" below.
for session, group_df in tqdm(test_sessions.groupby('session')):
    session_tokens = list(group_df['aid'].astype(str) + '_' + group_df['type'])
    results = w2vec.wv.most_similar(positive=session_tokens, topn=100)

    aid_list = []
    for result in results:
        # Tokens look like '<aid>_<type>'; several tokens can share one aid.
        aid = result[0].split('_')[0]
        if aid not in aid_list:
            aid_list.append(aid)
        if len(aid_list) == 20:
            break
    # Join after the loop so sessions with fewer than 20 candidates also get a string.
    aid_list = ' '.join(aid_list)

    sub_list.append([f'{session}_clicks', aid_list])
    sub_list.append([f'{session}_carts', aid_list])
    sub_list.append([f'{session}_orders', aid_list])

sub_list[:1]

In [None]:
# One row per [session_type, labels] entry collected in sub_list.
submission = pd.DataFrame(sub_list, columns=['session_type','labels'])
submission.head(3)

In [None]:
%%time

from annoy import AnnoyIndex

# Position of every Word2Vec vocabulary entry. Despite the name, the keys are the
# '<aid>_<type>' tokens the model was trained on, not bare aids.
aid2idx = dict(zip(w2vec.wv.index_to_key, range(len(w2vec.wv.index_to_key))))
index = AnnoyIndex(100, 'euclidean')

# Item id in the Annoy index == row number in the embedding matrix.
for idx in aid2idx.values():
    index.add_item(idx, w2vec.wv.vectors[idx])
index.build(20)

In [None]:
# This head() is not the cell's last expression, so it displays nothing.
test_sessions.head()
# From here on test aids are strings; the label-building loops below rely on that.
test_sessions['aid']=test_sessions['aid'].astype(str)

In [None]:
# Per-session lists, indexed by session id: the aids and the matching type codes.
sessions_grouped = test_sessions.groupby('session')
test_aids = sessions_grouped['aid'].apply(list)
test_types = sessions_grouped['type2id'].apply(list)

In [None]:
test_aids

In [None]:
test_types

This prediction format is adapted from @radek's notebook:
https://www.kaggle.com/code/radek1/word2vec-how-to-training-and-submission

defaultdict


In [None]:
from collections import defaultdict

# Build up to 20 predicted aids per test session from the Word2Vec neighbours.
labels = []
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
# Inverse of the type2id encoding, needed to rebuild '<aid>_<type>' vocabulary tokens.
id2type = {0: 'clicks', 1: 'carts', 2: 'orders'}

for AIDs, types in zip(test_aids,test_types):
    if len(AIDs) >= 20:
        # Enough events in the session: rank its own aids by recency weight
        # (log-spaced, later events weigh more) times the event-type multiplier.
        weights=np.logspace(0.1,1,len(AIDs),base=2, endpoint=True)-1
        aids_temp=defaultdict(lambda: 0)
        for aid,w,t in zip(AIDs,weights,types):
            aids_temp[aid]+= w * type_weight_multipliers[t]

        sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # Fewer than 20 events: pad with nearest neighbours of the most recent event.
        # The vocabulary (and therefore aid2idx) is keyed by '<aid>_<type>' tokens, so the
        # lookup key must be rebuilt from the last aid and its type.
        most_recent_token = f'{AIDs[-1]}_{id2type[types[-1]]}'
        # Unique session aids, most recent first.
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        seen = {str(a) for a in AIDs}

        nns = []
        if most_recent_token in aid2idx:
            # One aid can appear under up to three type tokens, so fetch 3 * 20 neighbours
            # (plus the query item itself, dropped by [1:]) before collapsing to aids.
            neighbour_rows = index.get_nns_by_item(aid2idx[most_recent_token], 61)[1:]
            for i in neighbour_rows:
                candidate = w2vec.wv.index_to_key[i].split('_')[0]
                if candidate not in seen:
                    seen.add(candidate)
                    nns.append(candidate)

        labels.append((AIDs+nns)[:20])

let's make submission files for word2vec

In [None]:
session_types = ['clicks','carts','orders']
# Each session's predictions as one space-separated string.
labels_as_strings = [' '.join(map(str, lls)) for lls in labels]

predictions = pd.DataFrame(data={'session_type': test_aids.index, 'labels': labels_as_strings})

# The same labels are reused for all three event types: one copy of the frame per
# type, with the session id suffixed by '_<type>'.
prediction_dfs = [
    predictions.assign(session_type=predictions['session_type'].astype('str') + f'_{st}')
    for st in session_types
]

w2v_submission = pd.concat(prediction_dfs).reset_index(drop=True)
w2v_submission

In [None]:
# submission1 is another name for the same frame (no copy); write it out without the index.
submission1 = w2v_submission
submission1.to_csv('submission.csv', index=False)

# FM



https://deepctr-doc.readthedocs.io/en/latest/Features.html#sparsefeat

https://deepctr-doc.readthedocs.io/en/latest/Examples.html

https://projectlog-eraser.tistory.com/41

https://arxiv.org/pdf/1703.04247v1.pdf

https://aplab.tistory.com/entry/%EC%B6%94%EC%B2%9C-%EC%95%8C%EA%B3%A0%EB%A6%AC%EC%A6%98-%ED%83%90%EC%83%89-Deep-FM-%EC%95%8C%EC%95%84%EB%B3%B4%EA%B8%B0

In [None]:
!pip install polars
import polars as pl

# vectorize train and test data

In [None]:
train.head()

In [None]:
# train['aid'] = train['aid'].astype(str)
# Pair every event with the aid that follows it in the same session. Shifting inside
# each session group keeps the last event of one session from being paired with the
# first event of the next; those rows get NaN instead.
train['aid_next'] = train.groupby('session')['aid'].shift(-1)

In [None]:
# Null counts per column; aid_next is NaN wherever an event has no following event.
# The former `train.aid_next.dropna(inplace=True)` was removed: dropping values from a
# single column cannot remove rows from `train`, and it stopped this summary from
# being displayed. The NaN pairs are dropped when train_pairs is built.
train.isnull().sum()

In [None]:
# (aid, next aid) training pairs. dropna() returns a new frame rather than mutating a
# slice of `train`. The shift introduced NaN, which made aid_next float; cast back to
# int64 so the ids can size and index an embedding table.
train_pairs = train[['aid','aid_next']].dropna().astype('int64')

In [None]:
# Persist the pairs so the next cell can reload them with polars.
train_pairs.to_parquet('pl_train_pairs.parquet')

In [None]:
# Same pairs as train_pairs, reloaded as a polars DataFrame.
pl_train_pairs=pl.read_parquet('pl_train_pairs.parquet')

In [None]:
# Largest aid on either side of a pair; the embedding table below is sized cardinality_aids + 1.
cardinality_aids = max(pl_train_pairs['aid'].max(),pl_train_pairs['aid_next'].max())
cardinality_aids

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class ClickDataset(Dataset) :
    """Map-style dataset over (aid, aid_next) pairs.

    ``pairs`` must offer ``pairs['aid']`` and ``pairs['aid_next']`` columns with
    a ``to_numpy()`` method. Item ``idx`` is the two-element list
    ``[aid, aid_next]`` taken from row ``idx``.
    """

    def __init__(self, pairs):
        # Convert both columns to NumPy arrays once, up front.
        self.aid1, self.aid2 = (pairs[column].to_numpy() for column in ('aid', 'aid_next'))

    def __getitem__(self, idx):
        return [self.aid1[idx], self.aid2[idx]]

    def __len__(self):
        return len(self.aid1)

# First 100,000 pairs for training, the remainder held out.
# NOTE: train_ds is reassigned in the next cell.
train_ds = ClickDataset(pl_train_pairs[:100000])
test_ds = ClickDataset(pl_train_pairs[100000:])

In [None]:
# Rebuild the dataset from the full pandas pair frame and wrap it in a shuffling
# PyTorch loader with two worker processes.
train_ds = ClickDataset(train_pairs)
train_dl_pytorch = DataLoader(dataset=train_ds, batch_size=65535, shuffle=True, num_workers=2)

In [None]:
%%time

# One full pass over the PyTorch loader, only to time it; the batches are discarded.
for batch in train_dl_pytorch :
    aid1,aid2 = batch[0], batch[1]

In [None]:
!pip install merlin-dataloader==0.0.2

In [None]:
# Write the first 100,000 pairs and the remainder to separate parquet files.
pl_train_pairs[:100000].to_pandas().to_parquet('train_pairs.parquet')
pl_train_pairs[100000:].to_pandas().to_parquet('valid_pairs.parquet')

In [None]:
# Reload the training split with polars and display it.
pl_train=pl.read_parquet('train_pairs.parquet')
pl_train

In [None]:
from merlin.loader.torch import Loader
from merlin.io import Dataset

In [None]:
# `Dataset` is merlin.io.Dataset here: the import in the previous cell shadows torch's Dataset.
# This also replaces the earlier ClickDataset stored in train_ds.
train_ds = Dataset('train_pairs.parquet')
train_ds

In [None]:
# Merlin loader over the parquet-backed dataset.
# NOTE(review): the positional arguments are presumably batch_size=65535 and shuffle=True — confirm.
train_dl_merlin = Loader(train_ds, 65535, True)

In [None]:
%%time

# One full pass over the Merlin loader, only to time it. Each item unpacks into a pair
# whose first element is indexed by column name.
for batch, _ in train_dl_merlin:
    aid1, aid2 = batch['aid'], batch['aid_next']

In [None]:
class MatrixFactorization(nn.Module) :
    """Score aid pairs by the dot product of their embeddings from one shared table.

    Args:
        n_aids: Number of rows in the embedding table (largest aid + 1).
        n_factors: Embedding dimension.
    """

    def __init__(self, n_aids, n_factors) :
        super().__init__()
        # sparse=True produces sparse gradients, which the SparseAdam optimizer used
        # with this model requires. (The keyword was misspelled `spars`, a TypeError.)
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)

    def forward(self, aid1,aid2) :
        """Return one raw score (logit) per pair: sum over dim 1 of the elementwise product."""
        aid1 = self.aid_factors(aid1)
        aid2 = self.aid_factors(aid2)
        return (aid1*aid2).sum(dim=1)

class AverageMeter(object) :
    """Tracks the latest value and the running weighted average of a metric.

    ``fmt`` is a format-spec suffix (including the leading colon, e.g. ``':.4e'``)
    applied to both numbers when the meter is converted to a string.
    """

    def __init__(self, name, fmt=':f'):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Zero the latest value, the running sum, the count and the average."""
        self.avg = self.val = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # Builds e.g. '{name} {val:.4e} ({avg:.4e})' and fills it from the instance attributes.
        template = ''.join(['{name} {val', self.fmt, '} ({avg', self.fmt, '})'])
        return template.format(**self.__dict__)

# Validation split. The file name must match the one written earlier
# ('valid_pairs.parquet'); it was misspelled 'valid_paris.parquet'.
valid_ds = Dataset('valid_pairs.parquet')
valid_dl_merlin = Loader(valid_ds, 65536, True)

In [None]:
from torch.optim import SparseAdam

num_epochs = 1
lr = 0.1

# One embedding row per aid id from 0 to cardinality_aids inclusive, 32 factors each.
model = MatrixFactorization(cardinality_aids+1, 32)
# SparseAdam pairs with the sparse gradients of the embedding table.
optimizer = SparseAdam(model.parameters(), lr=lr)
# The model outputs raw logits, so use the sigmoid-fused binary cross-entropy.
# (The class name was misspelled `BCEwithLogitsLoss`.)
criterion = nn.BCEWithLogitsLoss()

In [None]:
%%time

for epoch in range(num_epochs) :
    # One meter per epoch, so the reported loss averages every batch rather than
    # being reset on each iteration.
    losses = AverageMeter('Loss', ':.4e')
    model.train()

    for batch,_ in train_dl_merlin :
        # The parquet columns are 'aid' and 'aid_next'.
        aid1,aid2 = batch['aid'], batch['aid_next']
        output_pos = model(aid1, aid2)
        # Negatives: the same left-hand aids paired with a shuffled right-hand side.
        output_neg = model(aid1,aid2[torch.randperm(aid2.shape[0])])

        output = torch.cat([output_pos, output_neg])
        targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])
        loss = criterion(output, targets)
        losses.update(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    with torch.no_grad():
        accuracy = AverageMeter('accuracy')
        for batch, _ in valid_dl_merlin :
            aid1,aid2 = batch['aid'], batch['aid_next']
            output_pos = model(aid1, aid2)
            output_neg = model(aid1,aid2[torch.randperm(aid2.shape[0])])

            # Correct when a true pair scores above 0.5 or a shuffled pair scores below 0.5.
            accuracy_batch = torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5]).float().mean()
            # .item() hands the meter a plain float instead of a tensor.
            accuracy.update(accuracy_batch.item(), aid1.shape[0])

    # ':02d' is the valid integer format; the previous ':0.2d' raises ValueError.
    print(f'{epoch+1:02d} : * Trainloss {losses.avg:.3f} * accuracy {accuracy.avg:.3f}')

In [None]:
# Copy the learned table out as a NumPy array (row i = embedding of aid i).
# .cpu() is a no-op for CPU weights and is required if the model sits on a GPU.
embeddings = model.aid_factors.weight.detach().cpu().numpy()

In [None]:
%%time

from annoy import AnnoyIndex

# Nearest-neighbour index over the matrix-factorisation embeddings (32 factors).
# The metric name is 'euclidean'; items go into `fm`, not into the earlier Word2Vec
# `index`, which is already built and has a different dimension.
fm = AnnoyIndex(32, 'euclidean')
for i, v in enumerate(embeddings) :
    # Item id == embedding row == aid.
    fm.add_item(i,v)

fm.build(10)

In [None]:
fm.get_nns_by_item(123,10)

In [None]:
from collections import defaultdict

# Build up to 20 predicted aids per test session from the matrix-factorisation neighbours.
labels = []
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

for AIDs, types in zip(test_aids,test_types):
    if len(AIDs) >= 20:
        # Enough events in the session: rank its own aids by recency weight
        # (log-spaced, later events weigh more) times the event-type multiplier.
        weights=np.logspace(0.1,1,len(AIDs),base=2, endpoint=True)-1
        aids_temp=defaultdict(lambda: 0)
        for aid,w,t in zip(AIDs,weights,types):
            aids_temp[aid]+= w * type_weight_multipliers[t]

        sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
        labels.append(sorted_aids[:20])
    else:
        # Fewer than 20 events: pad with neighbours of the most recent aid.
        # Unique session aids, most recent first.
        AIDs = list(dict.fromkeys(AIDs[::-1]))
        # Items in `fm` are keyed by the integer aid (its embedding row), while the
        # session aids may be strings at this point.
        most_recent_aid = int(AIDs[0])
        seen = {int(a) for a in AIDs}

        nns = []
        # Aids outside the embedding table have no vector to query with.
        if 0 <= most_recent_aid < fm.get_n_items():
            # Ask for 21 and drop the first hit, which is the query item itself.
            for neighbour in fm.get_nns_by_item(most_recent_aid, 21)[1:]:
                if neighbour not in seen:
                    seen.add(neighbour)
                    nns.append(neighbour)

        labels.append((AIDs+nns)[:20])

In [None]:
session_types = ['clicks','carts','orders']
# Each session's predictions as one space-separated string.
labels_as_strings = [' '.join(map(str, lls)) for lls in labels]

predictions = pd.DataFrame(data={'session_type': test_aids.index, 'labels': labels_as_strings})

# The same labels are reused for all three event types: one copy of the frame per
# type, with the session id suffixed by '_<type>'.
prediction_dfs = [
    predictions.assign(session_type=predictions['session_type'].astype('str') + f'_{st}')
    for st in session_types
]

fm_submission = pd.concat(prediction_dfs).reset_index(drop=True)
fm_submission

# DeepFM

In [None]:
#load library
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, get_feature_names

#load data and preprocessing
# NOTE(review): `data` is not defined anywhere in the visible notebook; this cell
# cannot run until a DataFrame with 'mapping'/'price'/'AMT' columns is loaded.
df = data[[x for x in data.columns if 'mapping' in x or x == 'price']]

#define feature and target
# Every selected column except the first is treated as a sparse feature.
sparse_features = list(df.columns)[1:]
# NOTE(review): 'AMT' matches neither column filter above, so df[target] raises
# KeyError unless the selection is widened — confirm the intended label column.
target = ['AMT']

#define feature columns
fixlen_feature_columns = [SparseFeat(feat, df[feat].nunique(), embedding_dim=6)
                          for feat in sparse_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns=fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

#split train and test set
# 80/20 random split done with pandas: `train_test_split` was called here but never
# imported anywhere in the notebook. Assumes df has a unique index.
test_rows = df.sample(frac=0.2, random_state=42).index
train_part = df.drop(index=test_rows)
x_train, x_test = train_part[sparse_features], df.loc[test_rows, sparse_features]
y_train, y_test = train_part[target], df.loc[test_rows, target]

#input data
# DeepCTR models take a dict of feature name -> value array.
x_train = {name:x_train[name].values for name in feature_names}
x_test = {name:x_test[name].values for name in feature_names}

#create a model then compile
# NOTE: this rebinds `model`, replacing the matrix-factorisation model defined earlier.
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
# DeepFM here is a Keras model, so it needs a Keras optimizer and a loss suited to
# regression; the torch SparseAdam instance and the 'cross_entropy' name do not apply.
# The unused, misspelled torch `BCEWithLogitLoss` criterion line was removed.
model.compile(optimizer='adam', loss='mse')

hist = model.fit(x_train, y_train, batch_size=256, epochs=500, verbose=1,validation_split=0.2)



