In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
import gc
import math
import time
import pickle
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob
pd.options.display.max_rows = 1000
from sklearn.metrics import roc_auc_score

import cudf

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch.optim import SparseAdam, Adam
from torch.utils.data import Dataset, DataLoader
torch.backends.cudnn.benchmark = True
print(torch.__version__)

def freemem(df):
    """Downcast the 64-bit numeric columns of ``df`` to 32-bit, in place.

    Columns whose dtype equals ``int64`` are cast to ``int32`` and columns
    whose dtype equals ``float64`` are cast to ``float32``; all other columns
    are left untouched. A garbage-collection pass runs afterwards.

    Note: the casts are unchecked, so values must already fit in 32 bits.

    Args:
        df: DataFrame whose columns are reassigned in place.

    Returns:
        None.
    """
    narrower = {'int64': 'int32', 'float64': 'float32'}
    for name in df.columns:
        current = df[name].dtype
        for wide, narrow in narrower.items():
            if current == wide:
                df[name] = df[name].astype(narrow)
                break
    gc.collect()

# Pick the compute device: use the GPU when PyTorch can see one, otherwise fall
# back to the CPU so the PyTorch cells do not fail on `.to(device)` without CUDA.
USE_GPU = torch.cuda.is_available()
device = 'cuda' if USE_GPU else 'cpu'
print('Is GPU available?', USE_GPU)

In [None]:
# Collect every parquet split to load: train, then valid, then test.
# Adjust the patterns to point at your own files.
files = [
    path
    for pattern in ('fold/train-split*', 'fold/valid-split*', 'fold/test-split*')
    for path in glob(pattern)
]
print(len(files))

In [None]:
# Read all splits into a single pandas frame (host memory) first.
train = pd.read_parquet(files)
# Convert to a cuDF frame; rebinding `train` releases the reference to the pandas copy.
train = cudf.from_pandas(train); gc.collect()
# Order events by time within each session; the per-session shift/cumsum
# features built in the following cells depend on this row order.
train = train.sort_values(['session', 'ts'], ascending=[True, True]).reset_index(drop=True)
# Downcast int64/float64 columns to their 32-bit counterparts in place.
freemem(train)

print(train.shape)
train.head()

In [None]:
# Split each session into sub-sessions: a new one starts whenever the gap to the
# previous event of the same session is at least 2*60*60 (two hours if `ts` is in
# seconds — TODO confirm the unit).
# Timestamp of the previous event in the same session (0 for a session's first event).
train[f'ts_diff'] = train.groupby('session')['ts'].shift(1).fillna(0).astype('int32')
# Gap to the previous event; for a session's first event this equals the raw `ts`.
train[f'ts_diff'] = (train[f'ts'] - train[f'ts_diff'])
# Bound the gap to [0, 24*60*60].
train[f'ts_diff'] = train[f'ts_diff'].clip(0, 24*60*60)
# Binarise the gap. The order of the next two writes matters: small gaps are zeroed
# first, so only the untouched (>= threshold) values are turned into 1 afterwards.
train.loc[train.ts_diff<2*60*60 , f'ts_diff'] = 0
train.loc[train.ts_diff>=2*60*60 , f'ts_diff'] = 1
gc.collect()
# Running count of break flags inside each session = sub-session number.
train[f'subses'] = train.groupby('session')['ts_diff'].cumsum()
# Fold (session, sub-session number) into one key and re-encode it as dense codes.
# NOTE(review): keys are only guaranteed distinct while a session has fewer than 128
# breaks, and session*128 must fit the column's integer dtype — confirm on the data.
train[f'subses'] = (train[f'session']*128 + train[f'subses']).factorize()[0]
# Share of events flagged as the start of a new sub-session.
print(train[f'ts_diff'].mean())
train.head(60)

In [None]:
# For every event store the aid shifted by 0..10 positions within its sub-session:
# lag0 is the event's own aid (shift by 0), lag1..lag10 are the preceding aids.
# Positions with no earlier event are filled with -1.
for lag in range(11):
    train[f'lag{lag}'] = train.groupby('subses')['aid'].shift(lag).fillna(-1).astype('int32')
    gc.collect()
    
# `aid` is now duplicated by `lag0`, so the original column can be dropped.
del train['aid']
gc.collect()
# Downcast any remaining 64-bit columns.
freemem(train); gc.collect()

train.head(60)

In [None]:
print(train.shape)
# The break flag was only needed to build `subses`.
del train['ts_diff']; gc.collect()

# Keep rows with type == 0 (presumably click events — confirm the encoding) that have
# at least one earlier aid in their sub-session (lag1 is -1 when there is none).
train = train.loc[(train['type'] == 0) & (train['lag1'] >= 0)].reset_index(drop=True)
del train['type']
gc.collect()

print(train.shape)
train.head(60)

In [None]:
# Despite its name, `hour` is a time bucket of width 8*60*60 counted from the earliest
# timestamp (8-hour buckets if `ts` is in seconds — TODO confirm the unit).
# NOTE: int8 can only represent bucket ids up to 127.
train['hour'] = ((train['ts']-train['ts'].min()) // (8*60*60)).astype('int8') 
# Raw timestamp and sub-session key are not used after this point.
del train['ts'], train['subses']
gc.collect()
train.tail(20)

In [None]:
# Largest time-bucket id; it is later offset and used as an embedding index,
# so it has to stay within the extra rows the model reserves after the aids.
train['hour'].max()

In [None]:
gc.collect()
# Copy the processed frame from cuDF back to pandas and persist it as parquet.
train.to_pandas().to_parquet('train-proc-1.parquet'); gc.collect()

# Reload the DataFrame using RAM to save GPU memory for the model.

In [None]:
# Load the processed features with pandas, replacing the cuDF frame bound to `train`.
train = pd.read_parquet('train-proc-1.parquet'); gc.collect()
print(train.shape)
print(train.head())

In [None]:
# Floor lag1..lag10 at 0 so that no negative value is used as an embedding index.
# NOTE(review): any negative placeholder therefore becomes id 0, which is
# indistinguishable from a genuine aid 0 if one exists.
for i in range(1, 11):
    print(i)
    col = f'lag{i}'
    train[col] = train[col].clip(lower=0)
print(train.head())

In [None]:
# Largest target id; it must be a valid row of the model's embedding table.
train['lag0'].max()

In [None]:
# List the columns of the processed frame.
train.columns

In [None]:
# Shift the time-bucket ids by 1855602 + 1 (the model's default `n_aids`) so they
# land after the aid rows and can be looked up in the same embedding table.
# Not idempotent: re-running this cell shifts the ids a second time.
train['hour'] = train['hour'].astype('int32') + 1855602 + 1
train.head()

# Define the FM model

In [None]:
class MatrixFactorization(nn.Module):
    """Scores a target id against a row of context ids using one shared embedding table.

    Every context id and the target id are embedded and L2-normalised; the dot
    product of each context embedding with the target embedding (a cosine
    similarity) yields one value per context slot, and a linear layer followed
    by a sigmoid turns those values into a single probability.

    Args:
        n_aids: number of item ids; they occupy rows ``0 .. n_aids - 1``.
        n_factors: embedding dimension.
        n_extra: additional rows appended after the item rows, for non-item ids
            that share the table. The default keeps the table at
            ``1855602 + 128`` rows when ``n_aids`` is left at its default.
        n_context: number of context ids per sample, i.e. ``lags.shape[1]``.
    """

    def __init__(self, n_aids=1855602+1, n_factors=32, n_extra=127, n_context=11):
        super().__init__()
        # The table size now follows `n_aids` instead of a hard-coded constant.
        self.aid_emb = nn.Embedding(n_aids + n_extra, n_factors, sparse=False)
        self.aid_emb.weight.data.normal_(mean=0.0, std=0.001)
        self.head = nn.Linear(n_context, 1)

    def forward(self, lags, targets):
        """Return the probability that each target belongs with its context row.

        Args:
            lags: integer tensor of shape ``(batch, n_context)`` with context ids.
            targets: integer tensor with ``batch`` elements (``(batch,)`` or
                ``(batch, 1)``), one target id per row of ``lags``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``(0, 1)``.
        """
        # Embed and normalise each target once, then broadcast it over the
        # context axis instead of repeating the lookup for every context slot.
        tgt = self.aid_emb(targets.reshape(-1))
        tgt = torch.nn.functional.normalize(tgt, p=2.0, dim=1, eps=1e-12).unsqueeze(1)

        ctx = self.aid_emb(lags)
        ctx = torch.nn.functional.normalize(ctx, p=2.0, dim=2, eps=1e-12)

        # One cosine similarity per context slot: (batch, n_context).
        sims = (ctx * tgt).sum(2)

        return self.head(sims).sigmoid()


# Small 32-factor instance (created on the CPU) for trying out the forward pass.
model = MatrixFactorization(n_aids=1855602+1, n_factors=32)
# Switch to training mode; as the cell's last expression this also displays the module.
model.train()

# Try the model

In [None]:
# Random ids in [0, 1855603): 9 samples with 11 context ids each,
# plus one target id per sample.
lags = torch.rand(9, 11).mul(1855603).long()
targets = torch.rand(9).mul(1855603).long()

lags, targets

In [None]:
# Forward-pass smoke test: the model ends in a sigmoid, so this should display
# one value in (0, 1) per sample.
model(lags, targets)

In [None]:
# Preview the first rows of `train`.
train.head()

In [None]:
def train_loop(model, iterator, optimizer, clip):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    For each batch the model scores every context row against its own target
    (label 1) and against a target taken from another position of the same
    batch (label 0, obtained by permuting the batch's targets). Both halves
    are trained jointly with binary cross-entropy.

    Args:
        model: ``nn.Module`` called as ``model(data, target_ids)`` that returns
            probabilities; batches are moved to the device of its parameters.
        iterator: sized iterable yielding ``(data, target)`` batches, where
            ``target[:, 0]`` holds the target ids.
        optimizer: optimizer updating ``model``'s parameters.
        clip: gradient-norm bound. Currently unused: clipping is disabled below.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    # Follow the model's own device rather than a module-level global, so the
    # loop works wherever the model was placed.
    run_device = next(model.parameters()).device

    criterion = nn.BCELoss().to(run_device)
    model.train()
    epoch_loss = 0.
    gc.collect()
    with tqdm(enumerate(iterator), total=len(iterator), miniters=100) as pbar:
        for i, (data, target) in pbar:
            data, target = data.to(run_device), target.to(run_device)

            optimizer.zero_grad()
            output_pos = model(data, target[:, 0])

            # Negative samples: pair each context row with a target from a random
            # other position in the batch. The permutation is created on the
            # tensor's device to avoid a host-to-device index copy every step.
            perm = torch.randperm(target.shape[0], device=target.device)
            output_neg = model(data, target[perm, 0])

            outputs = torch.cat([output_pos, output_neg])
            targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_neg)])
            loss = criterion(outputs, targets)

            del data, target

            loss.backward()
            # Gradient clipping is intentionally disabled:
            #torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

            epoch_loss += loss.item()
            cumloss = epoch_loss / (i+1)
            pbar.set_description(f"Loss {cumloss:.5f}")

    gc.collect()
    return epoch_loss / len(iterator)


def valid_loop(model, iterator):
    """Evaluate ``model`` on ``iterator`` and return ``(mean loss, ROC AUC)``.

    Uses the same scheme as training: each context row is scored against its
    own target (label 1) and against a target from a random other position of
    the batch (label 0). Because the negatives are re-drawn on every call, the
    reported numbers vary slightly from run to run.

    Args:
        model: ``nn.Module`` called as ``model(data, target_ids)`` that returns
            probabilities; batches are moved to the device of its parameters.
            It is left in eval mode.
        iterator: sized iterable yielding ``(data, target)`` batches, where
            ``target[:, 0]`` holds the target ids.

    Returns:
        Tuple ``(loss, auc)``: the sum of the batch losses divided by
        ``len(iterator)``, and the ROC AUC over all positive/negative scores.
    """
    ypred = []
    ytarget = []
    epoch_loss = 0.

    # Follow the model's own device rather than a module-level global.
    run_device = next(model.parameters()).device

    model.eval()
    criterion = nn.BCELoss().to(run_device)
    with torch.no_grad():
        with tqdm(enumerate(iterator), total=len(iterator), miniters=50) as pbar:
            for i, (data, target) in pbar:
                data, target = data.to(run_device), target.to(run_device)

                output_pos = model(data, target[:, 0])

                # Negative samples: targets permuted within the batch; the
                # permutation lives on the same device as the tensor it indexes.
                perm = torch.randperm(target.shape[0], device=target.device)
                output_neg = model(data, target[perm, 0])

                outputs = torch.cat([output_pos, output_neg])
                targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_neg)])
                loss = criterion(outputs, targets)

                ypred.append(outputs.cpu().numpy())
                ytarget.append(targets.cpu().numpy())
                del data, target

                epoch_loss += loss.item()
                cumloss = epoch_loss / (i+1)
                pbar.set_description(f"Loss {cumloss:.5f}")

    ypred = np.concatenate(ypred)
    ytarget = np.concatenate(ytarget)
    gc.collect()

    auc = roc_auc_score(ytarget.flatten(), ypred.flatten())

    return epoch_loss / len(iterator), auc

gc.collect()

In [None]:
# Show the frame size and the configured device.
train.shape, device

In [None]:
# Hold out the sessions with session % 400 == 11 (one residue class out of 400)
# for validation. Boolean-mask selection already yields an independent frame, so
# no extra copy is needed; drop=True discards the old row labels instead of
# keeping them as a useless `index` column.
valid = train.loc[(train.session % 400) == 11].reset_index(drop=True)
valid.shape

In [None]:
# Keep every other session for training. drop=True discards the old row labels
# instead of storing them as an extra `index` column on the large frame.
train = train.loc[(train.session % 400) != 11].reset_index(drop=True)
gc.collect()
train.shape

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Check the column dtypes before converting the frame to arrays.
train.dtypes

In [None]:
# Model inputs are lag1..lag10 followed by the offset time bucket; the target is lag0.
feature_cols = [f'lag{k}' for k in range(1, 11)] + ['hour']
target_cols = ['lag0']

TRAIN = train[feature_cols].values.copy()
VALID = valid[feature_cols].values.copy()
TRAIN_TARGET = train[target_cols].values.copy()
VALID_TARGET = valid[target_cols].values.copy()

# The frames are no longer needed once the arrays exist.
del train, valid
gc.collect()
TRAIN.shape, VALID.shape

In [None]:
# Full-size model (256 factors) placed on the training device; this rebinds
# `model`, replacing the small try-out instance.
model = MatrixFactorization(n_aids=1855602+1, n_factors=256).to(device)
# Adam over all model parameters with a small learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.000025)
gc.collect()
print(model)

In [None]:
from torch.utils.data import TensorDataset

BS = 32 ** 3  # 32768 rows per batch


def _long_dataset(features, labels):
    """Wrap two arrays as a TensorDataset of int64 tensors."""
    return TensorDataset(torch.as_tensor(features).long(), torch.as_tensor(labels).long())


# Training batches are shuffled and the last incomplete batch is dropped.
train_ds = _long_dataset(TRAIN, TRAIN_TARGET)
train_dl = DataLoader(
    train_ds,
    batch_size=BS,
    shuffle=True,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
)

# Validation keeps the original order and every row.
valid_ds = _long_dataset(VALID, VALID_TARGET)
valid_dl = DataLoader(
    valid_ds,
    batch_size=BS,
    shuffle=False,
    num_workers=4,
    drop_last=False,
    pin_memory=True,
)

print(len(valid_ds), len(valid_dl))

In [None]:
# Create the checkpoint directory; unlike `!mkdir`, this does not complain when
# the cell is re-run and does not depend on the shell.
os.makedirs('model-weights', exist_ok=True)

# Train for 15 epochs; after each one validate and save a checkpoint whose file
# name records the epoch and the validation AUC.
for epoch in range(0, 15):
    train_loop(model, train_dl, optimizer, 1000.); gc.collect()
    valloss, auc = valid_loop(model, valid_dl); gc.collect()
    torch.save(model.state_dict(), f'model-weights/fm_split_{epoch}_{auc:.4f}_v10.pt'); gc.collect()
    print(epoch, valloss, auc)

# Extract the normalised embeddings from the trained model (optionally reload a saved checkpoint first)

In [None]:
# To export from a saved checkpoint instead of the in-memory model, uncomment:
# model = MatrixFactorization(n_aids=1855602+1, n_factors=256).to(device)
# model.load_state_dict(torch.load('model-weights/fm_split_9_0.9674_v10.pt'))

# L2-normalise every row of the embedding table (the same normalisation the model
# applies in its forward pass) and copy the result to a NumPy array on the host.
with torch.no_grad():
    embeddings = model.aid_emb.weight
    embeddings = torch.nn.functional.normalize(embeddings, p=2.0, dim=1, eps=1e-12)
    embeddings = embeddings.detach().cpu().numpy()

# Make sure the output directory exists so the save cannot fail after training.
os.makedirs('oof', exist_ok=True)
np.save('oof/embbedings_fm10.npy', embeddings)
embeddings.shape