# data preparation

In [None]:
# Colab-only step: open a file picker so the Kaggle API token (kaggle.json) can be uploaded.
from google.colab import files

# files.upload() returns a mapping that is indexed by file name below.
uploaded = files.upload()

# Echo the name and byte length of every uploaded file as a sanity check.
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
# chmod 600 makes the credential file readable/writable by the current user only.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 63 bytes


In [None]:
# Download the competition archive with the Kaggle CLI (needs the token installed in ~/.kaggle/).
!kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [02:16<00:00, 210MB/s]
100% 28.7G/28.7G [02:16<00:00, 226MB/s]


In [None]:
# Extract only the four CSV files used below; every other archive member stays zipped.
!unzip h-and-m-personalized-fashion-recommendations.zip articles.csv
!unzip h-and-m-personalized-fashion-recommendations.zip customers.csv
!unzip h-and-m-personalized-fashion-recommendations.zip transactions_train.csv
!unzip h-and-m-personalized-fashion-recommendations.zip sample_submission.csv

Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: articles.csv            
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: customers.csv           
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: transactions_train.csv  
Archive:  h-and-m-personalized-fashion-recommendations.zip
  inflating: sample_submission.csv   


# import modules

In [None]:
import os
import sys
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    TensorFlow, NumPy and PyTorch (CPU and current CUDA device), and asks
    cuDNN to use deterministic algorithms.

    :param seed: integer seed shared by all generators. Default 42.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these takes the seed as its only argument.
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


# Seed once at import time with the default seed.
seed_everything()

# data 

In [None]:
# Load all transactions; article_id is forced to str (presumably to keep leading zeros -- verify against the CSV).
df = pd.read_csv("transactions_train.csv", dtype={"article_id": str})

In [None]:
# Parse the transaction date column to datetimes, then display the most recent date in the data.
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["t_dat"].max()

Timestamp('2020-09-22 00:00:00')

In [None]:
# Last purchase date of every article.
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
# Keep articles bought on or after 2019-08-24.
# NOTE: this reset_index() has no drop=True, so the previous index is kept as an extra "index" column
# (hence 3 columns in the displayed shape); only the "article_id" column is used afterwards.
active_articles = active_articles[active_articles["t_dat"] >= "2019-08-24"].reset_index() 
active_articles.shape

(73239, 3)

In [None]:
# Drop transactions of articles that are not in the active set, then display the remaining shape.
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

(29728257, 5)

In [None]:
# Week index counted backwards from the latest transaction date: 0 = most recent 7 days, 1 = the 7 days before, ...
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7

In [None]:
# Vocabulary of article ids plus one extra "placeholder" class; len(article_ids) is the target width used by HMDataset.
article_ids = np.concatenate([["placeholder"], np.unique(df["article_id"].values)])

# Map article id strings to integer codes and overwrite df["article_id"] with the codes.
# NOTE(review): LabelEncoder orders classes by sorted value, so "placeholder" is only code 0 if it sorts
# before every article id -- confirm, since HMDataset pads histories with code 0.
le_article = LabelEncoder()
le_article.fit(article_ids)
df["article_id"] = le_article.transform(df["article_id"])

In [None]:
def _build_history(transactions, week, week_hist_max):
    """Collect, per customer, the purchases made in the ``week_hist_max`` weeks before ``week``.

    "Before" means a larger week index, because week indices count backwards in time.

    :param transactions: frame with ``customer_id``, ``article_id`` and ``week`` columns
    :param week: reference week; rows with ``week < w <= week + week_hist_max`` are kept
    :param week_hist_max: length of the history window in weeks
    :return: frame with ``customer_id``, ``article_id`` (list) and ``week_history`` (list)
    """
    in_window = (transactions["week"] > week) & (transactions["week"] <= week + week_hist_max)
    return (
        transactions[in_window]
        .groupby("customer_id")
        .agg({"article_id": list, "week": list})
        .reset_index()
        .rename(columns={"week": "week_history"})
    )


def create_dataset(df, week, week_hist_max=None):
    """Build one supervised sample per customer who purchased in ``week``.

    :param df: transactions frame with ``customer_id``, ``article_id`` and ``week`` columns
    :param week: target week index
    :param week_hist_max: history window in weeks; defaults to the notebook-level
        ``WEEK_HIST_MAX`` when omitted
    :return: frame with columns ``customer_id``, ``target`` (list of articles bought in
        ``week``), ``week``, ``article_id`` and ``week_history`` (history lists, NaN for
        customers with no purchase inside the window)
    """
    if week_hist_max is None:
        week_hist_max = WEEK_HIST_MAX

    hist_df = _build_history(df, week, week_hist_max)

    target_df = (
        df[df["week"] == week]
        .groupby("customer_id")
        .agg({"article_id": list})
        .reset_index()
        .rename(columns={"article_id": "target"})
    )
    target_df["week"] = week

    # Left join keeps customers without history; their history columns are NaN.
    return target_df.merge(hist_df, on="customer_id", how="left")


def create_test_dataset(test_df, transactions=None, week_hist_max=None):
    """Attach the most recent purchase history to every customer in ``test_df``.

    The prediction week is fixed at -1, i.e. the week after the latest week (0) in the
    transactions. Unlike the previous version, the input frame is not modified.

    :param test_df: frame with a ``customer_id`` column
    :param transactions: transactions frame; defaults to the notebook-level ``df``
    :param week_hist_max: history window in weeks; defaults to the notebook-level
        ``WEEK_HIST_MAX`` when omitted
    :return: copy of ``test_df`` with ``week`` (= -1), ``article_id`` and ``week_history``
        columns (history is NaN for customers with no purchase inside the window)
    """
    if transactions is None:
        transactions = df
    if week_hist_max is None:
        week_hist_max = WEEK_HIST_MAX

    week = -1
    hist_df = _build_history(transactions, week, week_hist_max)

    # assign() returns a new frame, so the caller's frame keeps its original columns.
    return test_df.assign(week=week).merge(hist_df, on="customer_id", how="left")

In [None]:
# Week 0 (the most recent week) supplies validation targets; weeks 1-3 supply training targets.
val_weeks = [0]
train_weeks = [1, 2, 3]
# Number of weeks of purchase history collected before each target week (read by create_dataset).
WEEK_HIST_MAX = 5

# memory save: keep only transactions whose week index is below (max(train_weeks) + WEEK_HIST_MAX) * 2
df = df[df.week < (max(train_weeks) + WEEK_HIST_MAX) * 2].reset_index(drop=True)

# One frame per target week, stacked: each row is (customer, targets in that week, preceding history).
val_df = pd.concat([create_dataset(df, w) for w in val_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)

# Distinct customers that appear in either the training or the validation frame.
cons_users = list(set(train_df.customer_id).union(set(val_df.customer_id)))
print(len(cons_users))
display(train_df.head())
train_df.shape, val_df.shape

233174


Unnamed: 0,customer_id,target,week,article_id,week_history
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[47084],1,,
1,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,"[29309, 1818]",1,,
2,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,[32003],1,"[52049, 27705, 65845, 69607, 71677, 71677, 716...","[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ..."
3,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,"[14885, 12129, 66912, 66912, 68467]",1,"[72643, 72141, 68849, 71570, 70590, 70807]","[6, 6, 6, 2, 2, 2]"
4,00040239317e877c77ac6e79df42eb2633ad38fcac09fc...,"[65998, 65999, 65998, 65999]",1,,


((228094, 5), (68984, 5))

# setting for nn

In [None]:
# Length of the purchase-history sequence passed to HMDataset.
SEQ_LEN = 16
# DataLoader batch size.
BS = 256
# DataLoader worker process count.
NW = 8
# Experiment label (not referenced by the cells visible in this notebook).
MODEL_NAME = "exp001"
# Seed applied to NumPy at the start of train(); note seed_everything() above used its own default of 42.
SEED = 0
# Dimension of the article embedding in HMModel.
article_emb_size = 256

In [None]:
class HMDataset(Dataset):
    """Per-customer samples: a fixed-length purchase history plus a multi-hot target.

    Each item is a tuple ``(article_hist, week_hist, target)``:

    * ``article_hist`` -- ``LongTensor`` of length ``seq_len`` holding the most recent
      encoded article ids, left-padded with 0.
    * ``week_hist`` -- ``FloatTensor`` of length ``seq_len`` holding
      ``(purchase_week - row.week) / week_hist_max`` for each history slot; padded
      slots are 1.0.
    * ``target`` -- multi-hot ``FloatTensor`` over all articles, or a 2-element zero
      placeholder when ``is_test`` is true.

    NOTE(review): the padding value 0 is itself a valid encoded article id, so padded
    slots are not distinguishable from that article -- confirm this is intended.

    :param df: frame with ``article_id``, ``week_history``, ``week`` and (unless
        ``is_test``) ``target`` columns; rows without history hold NaN instead of lists
    :param seq_len: number of history slots per sample
    :param is_test: when true, no real target is built
    :param n_articles: width of the multi-hot target; defaults to ``len(article_ids)``
        (notebook-level vocabulary) when omitted
    :param week_hist_max: divisor used to scale week offsets; defaults to the
        notebook-level ``WEEK_HIST_MAX`` when omitted
    """

    def __init__(self, df, seq_len, is_test=False, n_articles=None, week_hist_max=None):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test
        # None means "look up the notebook-level value lazily", matching the old behaviour.
        self.n_articles = n_articles
        self.week_hist_max = week_hist_max

    def __len__(self):
        return self.df.shape[0]

    def _num_articles(self):
        """Return the target width, falling back to the notebook-level vocabulary size."""
        return len(article_ids) if self.n_articles is None else self.n_articles

    def _week_scale(self):
        """Return the week-offset divisor, falling back to the notebook-level constant."""
        return WEEK_HIST_MAX if self.week_hist_max is None else self.week_hist_max

    def __getitem__(self, index):
        row = self.df.iloc[index]

        if self.is_test:
            target = torch.zeros(2).float()
        else:
            target = torch.zeros(self._num_articles()).float()
            # Vectorised multi-hot assignment; duplicates and an empty list are both harmless.
            target[torch.LongTensor(list(row.target))] = 1.0

        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()

        history = row.article_id
        # Customers without history carry NaN (not a list); an empty list is skipped as well.
        if isinstance(history, list) and len(history) > 0:
            # Keep the last seq_len purchases and right-align them; shorter histories stay left-padded.
            recent_articles = history[-self.seq_len:]
            recent_weeks = row.week_history[-self.seq_len:]
            n_kept = len(recent_articles)
            article_hist[-n_kept:] = torch.LongTensor(recent_articles)
            week_hist[-n_kept:] = (torch.LongTensor(recent_weeks) - row.week) / self._week_scale()

        return article_hist, week_hist, target

#HMDataset(val_df, 64)[1]

In [None]:
# https://github.com/ChristophReich1996/SmeLU/blob/master/smelu/smelu.py
class SmeLU(nn.Module):
    """
    This class implements the Smooth ReLU (SmeLU) activation function proposed in:
    https://arxiv.org/pdf/2202.06499.pdf

    For ``beta > 0`` the activation is 0 for ``x < -beta``, ``(x + beta)^2 / (4 * beta)``
    for ``|x| <= beta`` and ``x`` for ``x > beta``. ``beta == 0`` is handled as plain ReLU.
    """

    def __init__(self, beta: float = 2.) -> None:
        """
        Constructor method.
        :param beta (float): Beta value of the SmeLU activation function. Default 2.
        """
        # Call super constructor
        super(SmeLU, self).__init__()
        # Check beta
        assert beta >= 0., f"Beta must be equal or larger than zero. beta={beta} given."
        # Save parameter
        self.beta: float = beta

    def __repr__(self) -> str:
        """
        Returns a string representation.
        :return (str): String representation
        """
        return f"SmeLU(beta={self.beta})"

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.
        :param input (torch.Tensor): Tensor of any shape
        :return (torch.Tensor): Output activation tensor of the same shape as the input tensor
        """
        # beta == 0 passes the constructor check, but the quadratic branch divides by 4 * beta;
        # that produced NaN at input == 0 (and NaN gradients), so use the ReLU limit directly.
        if self.beta == 0.:
            return torch.relu(input)
        # Linear region above beta, zero elsewhere.
        outside: torch.Tensor = torch.where(input > self.beta, input, torch.zeros_like(input))
        # Quadratic transition region |input| <= beta.
        quadratic: torch.Tensor = torch.div(torch.square(torch.add(input, self.beta)),
                                            (4. * self.beta)).to(input.dtype)
        return torch.where(torch.abs(input) <= self.beta, quadratic, outside)

In [None]:
class HMModel(nn.Module):
    """Scores every article for a customer from the customer's purchase history.

    For each candidate article the model builds three features and mixes them with a
    small stack of 1x1 convolutions:

    1. the logit of the (clamped) maximum cosine similarity between the candidate's
       embedding and the embeddings of the history items,
    2. the scaled week value of the history item that achieved that maximum,
    3. a learned per-article bias (``article_likelihood``).

    :param article_shape: ``(num_articles, embedding_dim)``
    """

    def __init__(self, article_shape):
        super(HMModel, self).__init__()

        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
        self.article_likelihood = nn.Parameter(torch.zeros(article_shape[0]), requires_grad=True)

        # Three input channels = the three per-article features described above.
        self.top = nn.Sequential(nn.Conv1d(3, 16, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(16, 8, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(8, 1, kernel_size=1))

    def forward(self, inputs):
        """
        :param inputs: sequence whose first two items are ``article_hist`` (integer ids,
            batch x seq) and ``week_hist`` (floats, batch x seq)
        :return: tensor of shape (batch, num_articles) with one logit per article
        """
        article_hist, week_hist = inputs[0], inputs[1]

        # Cosine similarity between each history item and every article: (batch, seq, num_articles).
        hist_emb = F.normalize(self.article_emb(article_hist), dim=2)
        sim = hist_emb @ F.normalize(self.article_emb.weight).T

        # Best-matching history position per candidate article: both (batch, num_articles).
        best_sim, best_pos = sim.max(dim=1)
        # Clamp into (0, 1) so the logit below stays finite.
        best_sim = best_sim.clamp(1e-3, 0.999)
        best_sim = -torch.log(1 / best_sim - 1)

        # Week value of the best-matching history item. This is a direct gather on the
        # (batch, seq) week tensor; the earlier version expanded both operands to
        # (batch, seq, num_articles) and averaged seq identical copies of the same value.
        best_week = week_hist.gather(1, best_pos).unsqueeze(1)

        # expand() broadcasts the bias over the batch without allocating a copy.
        prior = self.article_likelihood[None, None, :].expand(best_sim.shape[0], -1, -1)

        features = torch.cat([best_sim.unsqueeze(1), best_week, prior], dim=1)
        return self.top(features).squeeze(1)

In [None]:
def calc_map(topk_preds, target_array, k=12):
    """Average precision at ``k`` for a single customer.

    :param topk_preds: ranked sequence of predicted article indices (best first); only
        the first ``k`` entries are scored
    :param target_array: multi-hot array indexed by article index (non-zero = purchased)
    :param k: cut-off rank. Default 12.
    :return: sum of precision-at-rank over the ranks that hit a purchased article,
        divided by ``min(k, number of purchased articles)``; 0.0 when nothing was
        purchased (previously a division by zero)
    """
    n_relevant = target_array.sum()
    if n_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for rank, pred in enumerate(topk_preds[:k], start=1):
        if target_array[pred]:
            hits += 1
            # Precision at this rank: hits so far over items seen so far.
            precision_sum += hits / rank

    return precision_sum / float(min(k, n_relevant))

def read_data(data, device="cuda"):
    """Split a batch into model inputs and target and move both to ``device``.

    :param data: sequence of tensors; the last one is the target, all others are inputs
    :param device: destination device. Defaults to "cuda" (the previous hard-coded
        behaviour); pass "cpu" to run without a GPU.
    :return: ``(inputs, target)`` where ``inputs`` is a tuple of tensors
    """
    *inputs, target = data
    return tuple(t.to(device) for t in inputs), target.to(device)


def validate(model, val_loader, k=12):
    """Compute the mean average precision at ``k`` of ``model`` over ``val_loader``.

    :param model: module mapping the batch inputs to per-article logits
    :param val_loader: loader yielding batches whose last element is the multi-hot target
    :param k: number of top-scoring articles evaluated per customer. Default 12.
    :return: mean of the per-customer ``calc_map`` scores
    """
    model.eval()

    maps = []

    with torch.no_grad():
        for data in tqdm(val_loader, file=sys.stdout):
            inputs, target = read_data(data)

            logits = model(inputs)
            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            # Forward k so the metric is normalised with the same cut-off used for top-k
            # (it previously always fell back to calc_map's default of 12).
            maps.extend(calc_map(indices[i], target[i], k=k) for i in range(indices.shape[0]))

    return np.mean(maps)

In [None]:
def dice_loss(y_pred, y_true):
    """Overlap-based loss between predicted probabilities and multi-hot targets.

    :param y_pred: raw logits, one row per sample
    :param y_true: multi-hot targets with the same shape as ``y_pred``
    :return: scalar ``1 - mean(I / (I + sum(y_true) + sum(p)))`` where ``p`` is the
        sigmoid of ``y_pred`` and ``I`` is the per-row sum of ``y_true * p``
    """
    probs = torch.sigmoid(y_pred)
    overlap = torch.sum(y_true * probs, dim=1)
    denominator = overlap + torch.sum(y_true, dim=1) + torch.sum(probs, dim=1)
    per_row_score = overlap / denominator
    return 1 - torch.mean(per_row_score)

def get_optimizer(net):
    """Create an Adam optimizer over the trainable parameters of ``net``.

    :param net: module whose parameters with ``requires_grad`` set are optimized
    :return: ``torch.optim.Adam`` with lr=1e-4, betas=(0.9, 0.999), eps=1e-08
    """
    trainable = (param for param in net.parameters() if param.requires_grad)
    return torch.optim.Adam(trainable, lr=1e-4, betas=(0.9, 0.999), eps=1e-08)

def train(model, train_loader, val_loader, epochs, patience=5,
          checkpoint_path='best-model-parameters.pt'):
    """Train ``model`` with mixed precision, early stopping and best-checkpoint restore.

    Each epoch optimizes BCE-with-logits plus ``dice_loss`` under a one-cycle learning
    rate schedule, then scores the model with ``validate``. The weights with the best
    validation MAP seen in this call are saved to ``checkpoint_path`` and loaded back
    into ``model`` before returning.

    :param model: module to train; expected to already live on the GPU, since batches
        are moved there by ``read_data``
    :param train_loader: loader of training batches
    :param val_loader: loader of validation batches
    :param epochs: maximum number of epochs
    :param patience: stop after this many consecutive epochs without improvement. Default 5.
    :param checkpoint_path: file used to store the best weights
    :return: ``model`` itself, holding the best weights of this run (or its final
        weights if no epoch produced a comparable score)
    """
    np.random.seed(SEED)

    optimizer = get_optimizer(model)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e1,
                                              max_lr=5e-4, epochs=epochs, steps_per_epoch=len(train_loader))
    scaler = torch.cuda.amp.GradScaler()
    criterion = torch.nn.BCEWithLogitsLoss()

    # Start below any real score so the first finished epoch always writes a checkpoint;
    # with 0 a run that never exceeded 0 would reload a stale file from an earlier call.
    best_score = float("-inf")
    checkpoint_saved = False
    epochs_without_improvement = 0

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        loss_list = []
        # Defined up front so the epoch log works even if the loader yields no batch.
        avg_loss = float("nan")

        for data in tbar:
            inputs, target = read_data(data)

            optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                logits = model(inputs)
                loss = criterion(logits, target) + dice_loss(logits, target)

            # The scaler guards the backward pass and optimizer step against fp16 underflow.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scheduler.step()
            scaler.update()

            loss_list.append(loss.detach().cpu().item())

            # Running mean of the epoch's losses, shown multiplied by 100.
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss}")

        val_map = validate(model, val_loader)

        log_text = f"Epoch {e+1}\nTrain Loss: {avg_loss}\nValidation MAP: {val_map}\n"
        print(log_text)
        if val_map > best_score:
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
            best_score = val_map
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        if epochs_without_improvement >= patience:
            break

    # Restore the best weights into the trained model. The earlier version first built a
    # fresh HMModel and then immediately overwrote that variable with ``model``, so the
    # extra construction was dead work and the same object was returned as here.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))

    return model

# first training

In [None]:
# One embedding row per encoded article class (including the extra "placeholder" class).
model = HMModel((len(le_article.classes_), article_emb_size))
model = model.cuda()

# Validation batches keep their order and include the final partial batch.
val_dataset = HMDataset(val_df, SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

# Training batches are shuffled; the final partial batch is dropped.
train_dataset = HMDataset(train_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

# train() returns the model loaded with the best-validation checkpoint.
model = train(model, train_loader, val_loader, epochs=10)

Epoch 1 Loss: 161.5972: 100%|██████████| 890/890 [08:28<00:00,  1.75it/s]
100%|██████████| 270/270 [00:53<00:00,  5.03it/s]
Epoch 1
Train Loss: 161.5972
Validation MAP: 0.021436661000962025

Epoch 2 Loss: 100.74: 100%|██████████| 890/890 [08:28<00:00,  1.75it/s]
100%|██████████| 270/270 [00:53<00:00,  5.04it/s]
Epoch 2
Train Loss: 100.74
Validation MAP: 0.02343437528871703

Epoch 3 Loss: 99.8203: 100%|██████████| 890/890 [08:28<00:00,  1.75it/s]
100%|██████████| 270/270 [00:53<00:00,  5.06it/s]
Epoch 3
Train Loss: 99.8203
Validation MAP: 0.023831265464597143

Epoch 4 Loss: 99.4783: 100%|██████████| 890/890 [08:27<00:00,  1.75it/s]
100%|██████████| 270/270 [00:53<00:00,  5.07it/s]
Epoch 4
Train Loss: 99.4783
Validation MAP: 0.02408301622845486

Epoch 5 Loss: 99.3986: 100%|██████████| 890/890 [08:27<00:00,  1.75it/s]
100%|██████████| 270/270 [00:53<00:00,  5.07it/s]
Epoch 5
Train Loss: 99.3986
Validation MAP: 0.024080457937633727

Epoch 6 Loss: 99.3782: 100%|██████████| 890/890 [08:27<00

# fine tune

In [None]:
# Fine-tune on the training rows plus the validation week.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# NOTE: val_df is now part of the training data while val_loader still evaluates on it,
# so the validation MAP reported during this stage is no longer a held-out score.
train_dataset = HMDataset(pd.concat([train_df[train_df["week"] < 4], val_df]), SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

model = train(model, train_loader, val_loader, epochs=10)

# prediction

In [None]:
# Start from the sample submission (without its "prediction" column) and attach each
# customer's recent purchase history.
test_df = pd.read_csv('sample_submission.csv').drop("prediction", axis=1)
test_df = create_test_dataset(test_df)
# Fraction of customers with no history in the window (article_id is NaN after the left join).
print(test_df["article_id"].isnull().mean())
test_df.head()

In [None]:
# is_test=True makes the dataset return a 2-element zero placeholder instead of a real target.
# shuffle=False keeps predictions aligned with the row order of test_df.
test_ds = HMDataset(test_df, SEQ_LEN, is_test=True)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

def inference(model, loader, k=12):
    """Predict the top-``k`` articles for every sample in ``loader``.

    :param model: module mapping the batch inputs to per-article logits
    :param loader: loader yielding batches in the order predictions should be returned;
        the last element of each batch (the target placeholder) is ignored
    :param k: number of articles predicted per sample. Default 12.
    :return: list with one string per sample: the ``k`` predicted article ids, best
        first, decoded with ``le_article`` and joined by single spaces
    """
    model.eval()

    preds = []

    with torch.no_grad():
        for data in tqdm(loader, file=sys.stdout):
            inputs, _ = read_data(data)

            logits = model(inputs)
            _, indices = torch.topk(logits, k, dim=1)
            indices = indices.detach().cpu().numpy()

            # Decode the whole batch in one call instead of once per row, then restore
            # the (batch, k) layout so each row can be joined separately.
            labels = le_article.inverse_transform(indices.ravel()).reshape(indices.shape)
            preds.extend(" ".join(row) for row in labels)

    return preds


# One space-separated prediction string per row of test_df (the loader preserves row order).
test_df["prediction"] = inference(model, test_loader)

In [None]:
# Write only the two columns required for the submission file, without the index.
test_df.to_csv("submission.csv", index=False, columns=["customer_id", "prediction"])

In [None]:
# For submission (uncomment to upload via the Kaggle CLI)
#!kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f submission.csv -m "second google colab sub"