In [3]:
import numpy as np
import pandas as pd


df = pd.read_csv("../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv", dtype={"article_id": str})
print(df.shape)
df.head()

(31788324, 5)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [4]:
df["t_dat"] = pd.to_datetime(df["t_dat"])
df["t_dat"].max()

Timestamp('2020-09-22 00:00:00')

In [5]:
active_articles = df.groupby("article_id")["t_dat"].max().reset_index()
active_articles = active_articles[active_articles["t_dat"] >= "2019-09-01"].reset_index()
active_articles.shape

(72581, 3)

In [6]:
df = df[df["article_id"].isin(active_articles["article_id"])].reset_index(drop=True)
df.shape

(29634404, 5)

In [7]:
df["week"] = (df["t_dat"].max() - df["t_dat"]).dt.days // 7
df["week"].value_counts()

65     620104
13     549443
42     518403
12     517428
64     508664
        ...  
93     174190
102    164298
104    163143
97     162580
94     152807
Name: week, Length: 105, dtype: int64

In [8]:
from sklearn.preprocessing import LabelEncoder


article_ids = np.concatenate([["placeholder"], np.unique(df["article_id"].values)])

le_article = LabelEncoder()
le_article.fit(article_ids)
df["article_id"] = le_article.transform(df["article_id"])

In [9]:
WEEK_HIST_MAX = 5

def create_dataset(df, week):
    hist_df = df[(df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)]
    hist_df = hist_df.groupby("customer_id").agg({"article_id": list, "week": list}).reset_index()
    hist_df.rename(columns={"week": 'week_history'}, inplace=True)
    
    target_df = df[df["week"] == week]
    target_df = target_df.groupby("customer_id").agg({"article_id": list}).reset_index()
    target_df.rename(columns={"article_id": "target"}, inplace=True)
    target_df["week"] = week
    
    return target_df.merge(hist_df, on="customer_id", how="left")

val_weeks = [0]
train_weeks = [1, 2, 3, 4]


val_df = pd.concat([create_dataset(df, w) for w in val_weeks]).reset_index(drop=True)
train_df = pd.concat([create_dataset(df, w) for w in train_weeks]).reset_index(drop=True)
train_df.shape, val_df.shape

((300129, 5), (68984, 5))

In [10]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

class HMDataset(Dataset):
    def __init__(self, df, seq_len, is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        if self.is_test:
            target = torch.zeros(2).float()
        else:
            if not row.target:
                target = torch.tensor([0]).int()
            else:
                rand_target = np.random.choice(row.target,1)
                target = torch.tensor(rand_target).squeeze().int()

            
        article_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()
        
        
        if isinstance(row.article_id, list):
            if len(row.article_id) >= self.seq_len:
                article_hist = torch.LongTensor(row.article_id[-self.seq_len:])
                week_hist = (torch.LongTensor(row.week_history[-self.seq_len:]) - row.week)/WEEK_HIST_MAX/2
            else:
                article_hist[-len(row.article_id):] = torch.LongTensor(row.article_id)
                week_hist[-len(row.article_id):] = (torch.LongTensor(row.week_history) - row.week)/WEEK_HIST_MAX/2
                
        return article_hist, week_hist, target
    
HMDataset(val_df, 64)[2]

(tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  1310, 31011,  5922, 59838,
         31013,  7950, 52530, 31012]),
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
 

In [11]:
def adjust_lr(optimizer, epoch):
    if epoch < 1:
        lr = 5e-5
    elif epoch < 6:
        lr = 1e-3
    elif epoch < 9:
        lr = 1e-4
    else:
        lr = 1e-5

    for p in optimizer.param_groups:
        p['lr'] = lr
    return lr
    
def get_optimizer(net):
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=3e-4, betas=(0.9, 0.999),
                                 eps=1e-08)
    return optimizer

In [12]:
import torch.nn as nn
import torch.nn.functional as F

In [13]:
class HierarchicalSoftmax(nn.Module):
    def __init__(self, ntokens, nhid, ntokens_per_class = None):
        super(HierarchicalSoftmax, self).__init__()

        # Parameters
        self.ntokens = ntokens#the number of ouput.(72582)
        self.nhid = nhid#dimension: the same length of customer dimension.(512)

        self.ntokens_per_class = ntokens_per_class#how many children one intermidiate node.(20)

        self.nclasses = int(np.ceil(self.ntokens * 1. / self.ntokens_per_class))#intermidiate nodes.(3630)
        self.ntokens_actual = self.nclasses * self.ntokens_per_class#72600

        self.layer_top_W = nn.Parameter(torch.FloatTensor(self.nhid, self.nclasses), requires_grad=True)
        self.layer_top_b = nn.Parameter(torch.FloatTensor(self.nclasses), requires_grad=True)

        self.layer_bottom_W = nn.Parameter(torch.FloatTensor(self.ntokens_per_class, self.nhid), requires_grad=True)
        self.layer_bottom_b = nn.Parameter(torch.FloatTensor(self.nclasses), requires_grad=True)

        self.init_weights()

    def init_weights(self):

        initrange = 0.1
        self.layer_top_W.data.uniform_(-initrange, initrange)
        self.layer_top_b.data.fill_(0)
        self.layer_bottom_W.data.uniform_(-initrange, initrange)
        self.layer_bottom_b.data.fill_(0)


    def forward(self, inputs):
        labels = torch.arange(self.ntokens_actual)###72600 
        batch_size, d = inputs.size()

        label_position_top = (labels / self.ntokens_per_class).long()#which position is the top layer.###[0,0,..,0,....,3659,3659]
        label_position_bottom = (labels % self.ntokens_per_class).long()#which position is the bottom layer.###[0,1,2,..,19,1,2,...,19,..]
        
        layer_top_logits = torch.matmul(inputs, self.layer_top_W) + self.layer_top_b###[256, 3630]

        multi_bias = self.layer_bottom_b[label_position_bottom].repeat(batch_size,1)###[256,72600]
        
        layer_bottom_logits = torch.matmul(inputs,self.layer_bottom_W[label_position_bottom].T) + multi_bias###[256,72600]

        layer_top_logits = layer_top_logits.repeat_interleave(self.ntokens_per_class,dim=1)###[256,72600]#match the top classes and the bottom classes.
        
        target_logits = torch.add(layer_top_logits,layer_bottom_logits)#get the final logits

        return target_logits


In [14]:
class HMModel(nn.Module):
    def __init__(self, article_shape):
        super(HMModel, self).__init__()
        
        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
        self.hier = HierarchicalSoftmax(72582, 512,ntokens_per_class = 20)
        
    def forward(self, inputs):
        article_hist, week_hist = inputs[0], inputs[1]
        x = self.article_emb(article_hist)
        x = F.normalize(x, dim=2)###[256, 16, 512]
        
        x, indices = x.max(axis=1)##customer_emb[256,512]

        ###get logits rather than probability to generate loss function.
        
        logits = self.hier(x)
        logits = logits[:,:72582]#remove virtual leaves.

        return logits
    
    
model = HMModel((len(le_article.classes_), 512))
model = model.cuda()

In [15]:
import sys

def calc_map(topk_preds, target_array, k=12):
    metric = []
    tp, fp = 0, 0
    
    for pred in topk_preds:
        if target_array[pred]:
            tp += 1
            metric.append(tp/(tp + fp))
        else:
            fp += 1
            
    return np.sum(metric) / min(k, target_array.sum())

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader, k=12):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    maps = []
    
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()
            
            for i in range(indices.shape[0]):
                maps.append(calc_map(indices[i], target[i]))
        
    
    return np.mean(maps)

SEQ_LEN = 16

BS = 256
NW = 8

val_dataset = HMDataset(val_df, SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

  cpuset_checked))


### Train and validate

In [16]:
def dice_loss(y_pred, y_true):
    y_pred = y_pred.sigmoid()
    intersect = (y_true*y_pred).sum(axis=1)
    
    return 1 - (intersect/(intersect + y_true.sum(axis=1) + y_pred.sum(axis=1))).mean()


def train(model, train_loader, val_loader, epochs):
    np.random.seed(SEED)
    
    optimizer = get_optimizer(model)
    scaler = torch.cuda.amp.GradScaler()
    
    criterion = torch.nn.functional.cross_entropy
    
    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        
        lr = adjust_lr(optimizer, e)
        
        loss_list = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            optimizer.zero_grad()
            
            with torch.cuda.amp.autocast():
                logits = model(inputs)
#                 print(logits.shape)
#                 print(logits)
#                 print(target.shape)
#                 print(target)
#                 return ###
                loss = criterion(logits, target.long())
            #loss.backward()
            scaler.scale(loss).backward()
            #optimizer.step()
            scaler.step(optimizer)
            scaler.update()
            
            loss_list.append(loss.detach().cpu().item())
            
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")
            
    return model


MODEL_NAME = "exp001"
SEED = 0

train_dataset = HMDataset(train_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

model = train(model, train_loader, val_loader, epochs=10)

Epoch 1 Loss: 1093.7785 lr: 5e-05: 100%|██████████| 1172/1172 [01:38<00:00, 11.85it/s]
Epoch 2 Loss: 1013.3217 lr: 0.001: 100%|██████████| 1172/1172 [01:39<00:00, 11.73it/s]
Epoch 3 Loss: 1006.2875 lr: 0.001: 100%|██████████| 1172/1172 [01:40<00:00, 11.72it/s]
Epoch 4 Loss: 1004.8955 lr: 0.001: 100%|██████████| 1172/1172 [01:39<00:00, 11.73it/s]
Epoch 5 Loss: 1003.6794 lr: 0.001: 100%|██████████| 1172/1172 [01:39<00:00, 11.81it/s]
Epoch 6 Loss: 1002.1637 lr: 0.001: 100%|██████████| 1172/1172 [01:39<00:00, 11.84it/s]
Epoch 7 Loss: 999.4369 lr: 0.0001: 100%|██████████| 1172/1172 [01:38<00:00, 11.84it/s]
Epoch 8 Loss: 998.6596 lr: 0.0001: 100%|██████████| 1172/1172 [01:39<00:00, 11.83it/s]
Epoch 9 Loss: 998.6075 lr: 0.0001: 100%|██████████| 1172/1172 [01:38<00:00, 11.88it/s]
Epoch 10 Loss: 997.993 lr: 1e-05: 100%|██████████| 1172/1172 [01:38<00:00, 11.89it/s] 


### Finetune with more recent data for submission (include validation set)

In [17]:
train_dataset = HMDataset(train_df[train_df["week"] < 4].append(val_df), SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

model = train(model, train_loader, val_loader, epochs=10)

Epoch 1 Loss: 994.2746 lr: 5e-05: 100%|██████████| 1160/1160 [01:37<00:00, 11.85it/s]
Epoch 2 Loss: 994.0558 lr: 0.001: 100%|██████████| 1160/1160 [01:37<00:00, 11.91it/s]
Epoch 3 Loss: 991.9912 lr: 0.001: 100%|██████████| 1160/1160 [01:36<00:00, 11.96it/s]
Epoch 4 Loss: 989.718 lr: 0.001: 100%|██████████| 1160/1160 [01:37<00:00, 11.92it/s] 
Epoch 5 Loss: 987.1725 lr: 0.001: 100%|██████████| 1160/1160 [01:36<00:00, 12.05it/s]
Epoch 6 Loss: 984.365 lr: 0.001: 100%|██████████| 1160/1160 [01:37<00:00, 11.85it/s] 
Epoch 7 Loss: 980.8309 lr: 0.0001: 100%|██████████| 1160/1160 [01:37<00:00, 11.92it/s]
Epoch 8 Loss: 979.9418 lr: 0.0001: 100%|██████████| 1160/1160 [01:37<00:00, 11.87it/s]
Epoch 9 Loss: 979.8323 lr: 0.0001: 100%|██████████| 1160/1160 [01:37<00:00, 11.92it/s]
Epoch 10 Loss: 979.0787 lr: 1e-05: 100%|██████████| 1160/1160 [01:37<00:00, 11.94it/s]


In [18]:
test_df = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv').drop("prediction", axis=1)
print(test_df.shape)
test_df.head()

(1371980, 1)


Unnamed: 0,customer_id
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...


In [19]:
def create_test_dataset(test_df):
    week = -1
    test_df["week"] = week
    
    hist_df = df[(df["week"] > week) & (df["week"] <= week + WEEK_HIST_MAX)]
    hist_df = hist_df.groupby("customer_id").agg({"article_id": list, "week": list}).reset_index()
    hist_df.rename(columns={"week": 'week_history'}, inplace=True)
    
    
    return test_df.merge(hist_df, on="customer_id", how="left")

test_df = create_test_dataset(test_df)
test_df.head()

Unnamed: 0,customer_id,week,article_id,week_history
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,-1,[7154],[2]
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-1,,
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-1,[46435],[1]
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-1,,
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-1,,


In [20]:
test_df["article_id"].isnull().mean()

0.8008965145264508

In [21]:
test_ds = HMDataset(test_df, SEQ_LEN, is_test=True)
test_loader = DataLoader(test_ds, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)


def inference(model, loader, k=12):
    model.eval()
    
    tbar = tqdm(loader, file=sys.stdout)
    
    preds = []
    
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            for i in range(indices.shape[0]):
                preds.append(" ".join(list(le_article.inverse_transform(indices[i]))))
        
    
    return preds


test_df["prediction"] = inference(model, test_loader)

  cpuset_checked))


100%|██████████| 5360/5360 [34:35<00:00,  2.58it/s]


In [None]:
test_df.to_csv("submission.csv", index=False, columns=["customer_id", "prediction"])

In [23]:
test_df

Unnamed: 0,customer_id,week,article_id,week_history,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,-1,[7154],[2],0751471042 0751471041 0751471043 0751471037 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,-1,,,0915526002 0915611003 0915529001 0915453004 09...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,-1,[46435],[1],0762846003 0915453003 0762856001 0915611003 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,-1,,,0915526002 0915611003 0915529001 0915453004 09...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,-1,,,0915526002 0915611003 0915529001 0915453004 09...
...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,-1,"[27463, 28961, 33376, 45860, 49128, 6056]","[2, 2, 2, 2, 2, 2]",0915453004 0863581002 0915611003 0915529003 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,-1,,,0915526002 0915611003 0915529001 0915453004 09...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,-1,"[21885, 66983, 46532, 39429]","[2, 2, 2, 1]",0762846027 0762853002 0762856008 0762846036 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,-1,,,0915526002 0915611003 0915529001 0915453004 09...
