<a href="https://colab.research.google.com/github/wbendada/playlist-completion-challenge/blob/master/Spotify_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
from math import floor, log2, ceil
import os
import sys
import json
import operator
import numpy as np
from scipy.sparse import csr_matrix, lil_matrix, load_npz, save_npz
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
import random
import pandas as pd
from torch.utils.data import IterableDataset, DataLoader
from torch.utils.data.dataset import Dataset

# Mount Google Drive and expose the "Colab Notebooks" folder on sys.path so
# that the project's `src` package can be imported.
drive.mount('/content/drive')
notebooks_path = '/content/notebooks'
# os.symlink raises FileExistsError when the link already exists, which made
# this cell fail every time it was re-run inside the same runtime.
if not os.path.lexists(notebooks_path):
  os.symlink('/content/drive/My Drive/Colab Notebooks/', notebooks_path)
if notebooks_path not in sys.path:
  sys.path.insert(0, notebooks_path)
#!pip uninstall implicit
#!pip install --target=$nb_path implicit
# Absolute path: the previous "./drive/..." form only resolved while the
# working directory was still /content, i.e. it broke on a second run of this
# cell because of the os.chdir below.
MPD_PATH = "/content/drive/MyDrive/Colab Notebooks/million_playlist_dataset"
os.chdir(MPD_PATH)

Mounted at /content/drive


In [None]:
from src.gated_cnn import GatedCNN
from src.evaluator import Evaluator
from src.data_manager import DataManager
from src.model import ChartsModel, CompletionModel, MatrixFactorizationModel, ItemItemModel, UserUserModel, EnsembleModel

# Build the shared data manager with its default arguments, and an evaluator
# bound to it. Both objects are reused by every later cell.
data_manager = DataManager()
test_evaluator = Evaluator(data_manager)
# Baseline models, currently disabled.
#charts_model = ChartsModel(data_manager)
#mf_model = MatrixFactorizationModel(emb_size=256, data_manager=data_manager)

# Evaluation

In [None]:
# Instantiate the three base recommenders and combine them in an ensemble.
# NOTE(review): retrain=False presumably loads previously fitted models
# instead of refitting -- confirm in src.model.
user_user = UserUserModel(data_manager, retrain=False)
item_item = ItemItemModel(data_manager, retrain=False)
mf_model = MatrixFactorizationModel(emb_size=256, data_manager=data_manager)
mf_model.prepare_item_factors()
em = EnsembleModel(data_manager, models=[user_user, item_item, mf_model])
# Weight search over the ensemble, currently disabled.
#combinations, recos, ndcgs = em.find_optimal_weights(test_evaluator)

In [None]:
class DataManager():
  """Load the processed playlist/track data and build the train/validation views.

  Reads ``playlist_track.npz`` and ``track_info.json`` from ``foldername``,
  loads (or, with ``resplit=True``, regenerates) the saved index files, and
  keeps binarised train/validation matrices plus popularity lookups.

  The raw ``playlist_track`` matrix is deleted at the end of ``__init__``, so
  ``split_sets``, ``get_valid_playlists``, ``get_ground_truth`` and
  ``get_train_set`` cannot be called on a fully constructed instance.

  NOTE(review): the stored values of ``playlist_track`` are compared against
  ``min_songs_test``, so they presumably encode each track's position inside
  its playlist -- confirm against the preprocessing code.
  """

  def __init__(self, foldername = "data/processed_data", test_size=1000, min_songs_test=5, resplit=False):
    """
    Args:
      foldername: directory holding ``playlist_track.npz``, ``track_info.json``
        and the ``*_indices.npy`` files.
      test_size: number of candidate playlists drawn for each of the
        validation and test sets when splitting.
      min_songs_test: matrix values up to this threshold are kept as the
        visible start of a validation playlist; larger values are ground truth.
      resplit: when True, draw a new random split and overwrite the saved
        index files before loading them.
    """
    self.foldername = foldername
    self.test_size = test_size
    self.min_songs_test = min_songs_test
    self.load_playlist_track()
    self.load_track_info()
    self.n_playlists = self.playlist_track.shape[0]
    self.n_tracks = self.playlist_track.shape[1]
    self.train_indices = self.get_indices("train", resplit=resplit)
    self.val_indices = self.get_indices("val")
    self.binary_val_set, self.val_ground_truth = self.get_ground_truth("val")
    self.binary_train_set = self.get_train_set()
    # The full matrix is no longer needed once the binary views exist.
    del self.playlist_track
    self.prepare_charts()

  def load_playlist_track(self):
    """Load the sparse playlist x track matrix into ``self.playlist_track``."""
    self.playlist_track = load_npz("%s/playlist_track.npz" % self.foldername)

  def load_track_info(self):
    """Load the per-track metadata dictionary into ``self.tracks_info``."""
    with open("%s/track_info.json" % self.foldername) as f:
      self.tracks_info = json.load(f)

  def prepare_charts(self):
    """Build the popularity lookups ``ordered_tracks`` and ``tracks_rank``.

    ``ordered_tracks[rank]`` is a track id and ``tracks_rank[track_id]`` is its
    rank. Rank 0 is reserved for the extra id ``n_tracks``; real tracks follow
    by decreasing ``"count"``.
    """
    counts_by_id = {info["id"]: info["count"] for info in self.tracks_info.values()}
    by_popularity = sorted(counts_by_id.items(), key=operator.itemgetter(1), reverse=True)
    ordered = [self.n_tracks]
    ordered.extend(track_id for track_id, _ in by_popularity)
    self.ordered_tracks = np.array(ordered)
    self.tracks_rank = np.zeros(self.n_tracks + 1, dtype=np.int32)
    self.tracks_rank[self.ordered_tracks] = np.arange(len(ordered), dtype=np.int32)

  def split_sets(self):
    """Draw a random train/val/test split and save the three index arrays.

    Candidates are playlists holding at least one stored value greater than
    ``2 * min_songs_test``. ``np.random.choice`` raises ValueError when fewer
    than ``2 * test_size`` candidates exist.
    """
    playlist_track_csc = self.playlist_track.tocsc()
    # In CSC format `.indices` holds row (playlist) numbers.
    long_playlists = np.unique(playlist_track_csc.indices[playlist_track_csc.data > 2*self.min_songs_test])
    candidate_indices = np.random.choice(long_playlists, 2*self.test_size, replace=False)

    tmp_test_indices = candidate_indices[:self.test_size]
    tmp_val_indices = candidate_indices[self.test_size:]
    # setdiff1d replaces a per-playlist `in` scan over the candidate array.
    train_indices = np.setdiff1d(np.arange(self.n_playlists), candidate_indices)

    val_indices = self.get_valid_playlists(train_indices, tmp_val_indices)
    test_indices = self.get_valid_playlists(train_indices, tmp_test_indices)
    np.save('%s/train_indices' % (self.foldername), train_indices)
    np.save('%s/val_indices' % (self.foldername), val_indices)
    np.save('%s/test_indices' % (self.foldername), test_indices)

  def get_indices(self, set_name, resplit = False):
    """Return the saved index array for ``set_name`` ("train", "val" or "test").

    With ``resplit=True`` a new split is generated and saved first.
    """
    if resplit:
      self.split_sets()
    return np.load("%s/%s_indices.npy" % (self.foldername, set_name))

  def get_valid_playlists(self, train_indices, test_indices):
    """Drop held-out playlists containing a track that never occurs in train.

    Args:
      train_indices: row indices of the training playlists.
      test_indices: row indices of the held-out playlists.

    Returns:
      The entries of ``test_indices`` whose playlists only use tracks seen in
      the training rows (possibly an empty array).
    """
    test_indices = np.asarray(test_indices)
    test_rows = self.playlist_track[test_indices]
    train_tracks = set(self.playlist_track[train_indices].indices)
    invalid_tracks = set(test_rows.indices) - train_tracks
    by_track = test_rows.tocsc()
    # A boolean mask stays a valid index even when nothing is kept; the former
    # array of positions became a float array (IndexError) in that case.
    is_valid = np.ones(len(test_indices), dtype=bool)
    for track in invalid_tracks:
      is_valid[by_track.indices[by_track.indptr[track]:by_track.indptr[track+1]]] = False
    return test_indices[is_valid]

  def get_ground_truth(self, set_name, binary = True, resplit=False):
    """Split the playlists of ``set_name`` into visible start and ground truth.

    Entries with a value greater than ``min_songs_test`` form the ground truth;
    the remaining entries are the visible start of each playlist.

    Returns:
      ``(start_data, ground_truth_list)``: a sparse matrix (0/1 valued when
      ``binary`` is True) and one set of ground-truth track ids per playlist.
    """
    indices = self.get_indices(set_name, resplit)
    data = self.playlist_track[indices]
    ground_truth_array = data.multiply(data > self.min_songs_test)
    start_data = data - ground_truth_array
    if binary:
      start_data = 1 * (start_data > 0)
    indptr = ground_truth_array.indptr
    columns = ground_truth_array.indices
    ground_truth_list = [set(columns[indptr[i]:indptr[i+1]]) for i in range(data.shape[0])]
    return start_data, ground_truth_list

  def get_train_set(self, binary = True, resplit=False):
    """Return the training rows of the matrix, 0/1 valued when ``binary`` is True."""
    train_indices = self.get_indices("train", resplit)
    train_set = self.playlist_track[train_indices]
    if binary:
      train_set = 1 * (train_set > 0)
    return train_set

In [None]:
# Next step : https://github.com/jojonki/Gated-Convolutional-Networks 

# Deep Convolutional Networks

In [None]:
# Input data is in the form of a csv, one row per playlist, list of ids separated by commas
# Prerequisite: tracks are ordered by decreasing popularity for adaptive softmax, so that tracks_rank[track_id] == track_rank and ordered_tracks[track_rank] == track_id
#
# 
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#

In [None]:
# Hyper-parameters and paths shared by the sequence-model cells.
# mid_dim is passed to GatedCNN as the embedding size and is also the kernel width.
mid_dim=128
n_layers        = 8
kernel          = (3, mid_dim)
# conv_size / hidden_size are passed as out_chs / hidden_chs of the bottleneck GatedCNN.
conv_size = 128
hidden_size = 64
padding = (1, 0)
res_block_count = 5
batch_size      = 256
seq_len = 6 # 5 + 1 to predict
# Cluster boundaries handed to nn.AdaptiveLogSoftmaxWithLoss inside GatedCNN.
cutoffs = [1000, 4000, 15000, 65000]
# NOTE(review): end_of_seq and k are not referenced by any other visible cell.
end_of_seq = data_manager.n_tracks
# channels is passed as out_chs of the non-bottleneck GatedCNN variant.
channels = mid_dim
k=4 #kernel_width
test_batch_size = 32
# NOTE(review): this path is reassigned to "data/processed_data/playlist_sequence.csv"
# in the cell that builds the datasets, so the value set here is never read.
sequential_data_file = "data/playlists_seq.csv"
train_indices = data_manager.train_indices
val_indices = data_manager.val_indices
item_factors_path = 'data/models/item_factors_frequency_reordered.npy'

In [None]:
from torch.utils.data.dataset import Dataset

# Training: each row is split on commas and converted to ints. If the sequence is shorter than the window (seq_len = 6: 5 context tracks + 1 track to predict), the whole sequence is used and zero-padded at the end. Otherwise a random window of seq_len consecutive tracks is selected.
class SequentialTrainDataset(Dataset):
    """Training dataset yielding fixed-length windows of encoded track ids.

    Each row of the tab-delimited file is one playlist written as
    comma-separated track ids. Ids are mapped through ``index_encoder`` when
    the dataset is built.

    Args:
        filename: path of the playlist sequence file.
        train_indices: positional row indices of the playlists to keep.
        sample_size: length of every returned window (context + target).
        index_encoder: array indexed by track id that returns the encoded id.
    """

    def __init__(self, filename, train_indices, sample_size, index_encoder):
        self.sample_size = sample_size
        self.index_encoder = index_encoder
        # dtype=str keeps single-track rows as text so parse() can split them.
        # .copy() makes the selection an independent frame before the column
        # is overwritten (avoids pandas' chained-assignment warning).
        self.data = pd.read_csv(filename, delimiter='\t', header=None, names=['tracks'], dtype=str).iloc[train_indices].copy()
        self.data['tracks'] = self.data['tracks'].apply(self.parse)

    def parse(self, row):
        """Turn one comma-separated row into an array of encoded track ids."""
        # adaptive softmax requires labels to be sorted by decreasing frequency
        return self.index_encoder[np.array(list(map(int, row.split(','))))]

    def __getitem__(self, index):
        """Return an int array of length ``sample_size`` for playlist ``index``.

        Shorter playlists are copied whole and zero-padded at the end; longer
        ones contribute a randomly placed window of consecutive tracks.
        """
        seq = self.data['tracks'].iloc[index]
        # Use the configured window length rather than a notebook-level
        # global, and always return the same dtype so batches collate cleanly.
        window = self.sample_size
        X = np.zeros(window, dtype=np.int_)
        n = len(seq)
        if n < window:
            X[:n] = seq
        else:
            start = random.randint(0, n - window)  # both ends inclusive
            X[:] = seq[start:start + window]
        return X

    def __len__(self):
        return len(self.data)

In [None]:
# Test : for each row, split text, convert to int, select 5 first tracks. Predict following tracks
class SequentialTestDataset(Dataset):
    """Evaluation dataset: the first ``sample_size`` encoded tracks of each playlist.

    Rows are read from a tab-delimited file holding one comma-separated
    playlist per line; only the rows at ``test_indices`` are kept.
    """

    def __init__(self, filename, test_indices, sample_size, index_encoder):
        self.sample_size = sample_size
        self.index_encoder = index_encoder
        frame = pd.read_csv(filename, delimiter ='\t',  header=None, names = ['tracks'])
        self.data = frame.iloc[test_indices]
        self.data['tracks'] = self.data['tracks'].apply(self.parse)

    def parse(self, row):
        """Encode one comma-separated row and keep its leading ``sample_size`` ids."""
        track_ids = np.array([int(token) for token in row.split(',')])
        # adaptive softmax requires labels to be sorted by decreasing frequency
        encoded = self.index_encoder[track_ids]
        return encoded[:self.sample_size]

    def __getitem__(self, index):
        """Return the stored prefix of playlist ``index`` as a NumPy array."""
        prefix = self.data["tracks"].iloc[index]
        return np.array(prefix)

    def __len__(self):
        """Number of playlists kept in the dataset."""
        return len(self.data)

In [None]:
# Build the train / validation datasets and their loaders from the playlist sequence file.
sequential_data_file = "data/processed_data/playlist_sequence.csv"

# Track ids are re-encoded through data_manager.tracks_rank (popularity rank).
sequential_train_dataset = SequentialTrainDataset(sequential_data_file, train_indices , seq_len, data_manager.tracks_rank)
# NOTE(review): the training loader iterates in file order (shuffle=False);
# confirm that this is intended for SGD training.
train_dataloader = DataLoader(sequential_train_dataset, batch_size = batch_size, shuffle=False)
# The validation dataset only keeps seq_len - 1 tracks per playlist (context without target).
sequential_test_dataset = SequentialTestDataset(sequential_data_file, val_indices , seq_len - 1, data_manager.tracks_rank)
test_dataloader = DataLoader(sequential_test_dataset, batch_size = test_batch_size)

In [None]:
from torch.nn.utils import clip_grad_norm_

def train(model, train_loader, test_loader, optimizer, dev, ordered_tracks_array, evaluator, n_epoch=10, clip=False):
    """Run ``n_epoch`` training epochs, evaluating after each one.

    Every batch is cast to long, moved to ``dev`` and fed to ``model``, which
    must return a ``(prediction, loss)`` pair. When ``clip`` is truthy it is
    used as the maximum L2 norm for gradient clipping. Loss and elapsed time
    are printed every 100 batches; after each epoch ``test`` is called and its
    three metrics are printed.
    """
    print('=========training=========')
    start = time.time()
    for epoch in range(n_epoch):
        print('----epoch', epoch)
        model.train()

        for batch_ct, batch in enumerate(train_loader):
            batch = batch.long().to(dev)
            pred, loss = model(batch) # (bs, ans_size)
            optimizer.zero_grad()
            loss.backward()
            if clip:
                clip_grad_norm_(model.parameters(), max_norm=clip, norm_type=2)
            optimizer.step()
            if batch_ct % 100 != 0:
                continue
            elapsed = time.time() - start
            print('loss: {:.4f}'.format(loss))
            print('batch %d elapsed time %f seconds' % (batch_ct, elapsed))

        r_prec, ndcg, click = test(model, test_loader, evaluator, ordered_tracks_array)
        elapsed = time.time() - start
        print('current performance at epoch %d elapsed time %f seconds' % (epoch, elapsed))
        print('r-precision : %f' % r_prec)
        print('ndcg : %f' % ndcg)
        print('click : %f' % click)

In [None]:
def test(model, test_loader, evaluator, ordered_tracks_array, dev=None):
    """Score ``model`` on ``test_loader`` and return ``(r_precision, ndcg, clicks)``.

    Args:
        model: module that returns per-class scores of shape (batch, vocab)
            in eval mode.
        test_loader: iterable of integer batches.
        evaluator: object providing ``test_size``, ``n_recos`` and the three
            ``compute_*`` metric methods.
            NOTE(review): ``test_size`` is presumably the number of evaluated
            playlists -- confirm in src.evaluator.
        ordered_tracks_array: NumPy array mapping an encoded id back to a
            track id.
        dev: device for the input batches. Defaults to the device of the
            model's first parameter, so the function no longer depends on a
            notebook-level global.
    """
    model.eval()
    if dev is None:
        first_param = next(model.parameters(), None)
        dev = first_param.device if first_param is not None else torch.device("cpu")
    recos_CNN = np.zeros((evaluator.test_size, evaluator.n_recos))
    offset = 0
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for X in test_loader:
            X = X.long().to(dev)
            bs = X.shape[0]
            # Column 0 is the padding / end-of-sequence code: drop it here and
            # shift the selected column numbers back by one afterwards.
            pred = model(X)[:, 1:]
            # top-k returns the n_recos best columns in descending score order
            # without sorting the whole vocabulary.
            coded_recos = torch.topk(pred, evaluator.n_recos, dim=1).indices.cpu().numpy() + 1
            recos_CNN[offset:offset + bs] = ordered_tracks_array.take(coded_recos)
            offset += bs
    r_prec = evaluator.compute_R_precision(recos_CNN)
    ndcg = evaluator.compute_ndcg(recos_CNN)
    click = evaluator.compute_all_click(recos_CNN)
    return r_prec, ndcg, click


In [None]:
# from https://github.com/jojonki/Gated-Convolutional-Networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class GatedCNN(nn.Module):
    '''
        Gated convolutional next-track model with bottlenecked hidden layers.

        In : (N, seq_len) encoded track ids in training mode (the last column
             is the target), (N, seq_len - 1) in eval mode
        Out: training mode -> result of nn.AdaptiveLogSoftmaxWithLoss
             (unpacks as ``output, loss``)
             eval mode     -> (N, vocab_size) log-probabilities

        The embedding is initialised from the first ``embd_size`` columns of
        the array stored at ``init_factors_path`` and is fine-tuned.
    '''
    def __init__(self,
                 seq_len,
                 vocab_size,
                 embd_size,
                 n_layers,
                 kernel,
                 padding,
                 out_chs,
                 hidden_chs,
                 res_block_count,
                 init_factors_path,
                 cutoffs):
        super(GatedCNN, self).__init__()
        self.res_block_count = res_block_count
        self.n_layers = n_layers
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(np.load(init_factors_path)[:,:embd_size]), freeze=False).float()

        # nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        self.conv_0 = nn.Conv2d(1, out_chs, kernel, padding=padding)
        self.b_0 = nn.Parameter(torch.randn(1, out_chs, 1, 1))
        self.conv_gate_0 = nn.Conv2d(1, out_chs, kernel, padding=padding)
        self.c_0 = nn.Parameter(torch.randn(1, out_chs, 1, 1))

        hidden_kernel = (kernel[0], 1)
        # Each hidden layer is out_chs -> hidden_chs -> hidden_chs -> out_chs,
        # once for the linear path and once for the gate path.
        self.down_conv = nn.ModuleList([nn.Conv2d(out_chs, hidden_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.bottle_conv = nn.ModuleList([nn.Conv2d(hidden_chs, hidden_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.up_conv = nn.ModuleList([nn.Conv2d(hidden_chs, out_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])

        self.down_conv_gate = nn.ModuleList([nn.Conv2d(out_chs, hidden_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.bottle_conv_gate = nn.ModuleList([nn.Conv2d(hidden_chs, hidden_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.up_conv_gate = nn.ModuleList([nn.Conv2d(hidden_chs, out_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])

        # The former `self.fc = nn.Linear(out_chs*(seq_len - 1), vocab_size)`
        # was never used by forward(); it only allocated an
        # in_features x vocab_size weight matrix. The adaptive softmax below
        # is the actual output layer.
        self.adapt = nn.AdaptiveLogSoftmaxWithLoss(out_chs*(seq_len - 1), vocab_size, cutoffs, div_value=1.8)
        self.b = nn.ParameterList([nn.Parameter(torch.randn(1, out_chs, 1, 1)) for _ in range(n_layers)])
        self.c = nn.ParameterList([nn.Parameter(torch.randn(1, out_chs, 1, 1)) for _ in range(n_layers)])

    def forward(self, x):
        # x: (N, seq_len)
        bs = x.size(0) # batch size
        if self.training:
            # The last track of every window is the prediction target.
            target = x[:, -1]
            x = x[:, :-1]

        # Embedding
        x = self.embedding(x) # (bs, seq_len, embd_size)

        # CNN
        x = x.unsqueeze(1) # (bs, Cin, seq_len, embd_size), insert Channel-In dim

        # Conv2d
        #    Input : (bs, Cin,  Hin,  Win )
        #    Output: (bs, Cout, Hout, Wout)
        # The (1, Cout, 1, 1) biases broadcast over batch and sequence axes.
        # Shape comments assume kernel=(3, embd_size) and padding=(1, 0).
        A = self.conv_0(x) + self.b_0      # (bs, Cout, seq_len, 1)
        B = self.conv_gate_0(x) + self.c_0 # (bs, Cout, seq_len, 1)
        h = A * torch.sigmoid(B)           # (bs, Cout, seq_len, 1)
        res_input = h # TODO this is h1 not h0

        for i in range(self.n_layers):
            A = self.up_conv[i](self.bottle_conv[i](self.down_conv[i](h))) + self.b[i]
            B = self.up_conv_gate[i](self.bottle_conv_gate[i](self.down_conv_gate[i](h))) + self.c[i]
            h = A * torch.sigmoid(B) # (bs, Cout, seq_len, 1)
            if i % self.res_block_count == 0: # size of each residual block
                h = h + res_input
                res_input = h

        h = h.reshape(bs, -1) # (bs, Cout*seq_len)
        if self.training:
            return self.adapt(h, target)
        return self.adapt.log_prob(h)

In [None]:
# Train the bottleneck GatedCNN on the first GPU when one is visible, else on the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

clip = 0.1
gated_cnn = GatedCNN(
  seq_len, data_manager.n_tracks + 1, mid_dim, n_layers, kernel, padding,
  conv_size, hidden_size, res_block_count, item_factors_path, cutoffs,
).to(dev)
optimizer = torch.optim.SGD(
  gated_cnn.parameters(), lr=1.0, momentum=0.1, nesterov=True,
)
# n_epoch=10**12: effectively runs until the cell is interrupted.
train(
  gated_cnn, train_dataloader, test_dataloader, optimizer, dev,
  data_manager.ordered_tracks, test_evaluator, clip=clip, n_epoch=10**12,
)

In [None]:
# Same bottleneck model, retrained from scratch with lr=0.1 for 80 epochs.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

clip = 0.1
gated_cnn = GatedCNN(
  seq_len, data_manager.n_tracks + 1, mid_dim, n_layers, kernel, padding,
  conv_size, hidden_size, res_block_count, item_factors_path, cutoffs,
).to(dev)
optimizer = torch.optim.SGD(
  gated_cnn.parameters(), lr=0.1, momentum=0.1, nesterov=True,
)
train(
  gated_cnn, train_dataloader, test_dataloader, optimizer, dev,
  data_manager.ordered_tracks, test_evaluator, clip=clip, n_epoch=80,
)

----epoch 0
loss: 15.4725
batch 0 elapsed time 0.221540 seconds
loss: 11.5455
batch 1000 elapsed time 111.656628 seconds
loss: 11.2690
batch 2000 elapsed time 222.844069 seconds
loss: 11.2320
batch 3000 elapsed time 334.014528 seconds
current performance at epoch 0 elapsed time 437.798936 seconds
r-precision : 0.030713
ndcg : 0.093735
click : 16.916452
----epoch 1
loss: 11.2229
batch 0 elapsed time 437.933214 seconds
loss: 11.7884
batch 1000 elapsed time 549.057018 seconds
loss: 11.2956
batch 2000 elapsed time 660.173142 seconds
loss: 11.5965
batch 3000 elapsed time 771.319351 seconds
current performance at epoch 1 elapsed time 875.165737 seconds
r-precision : 0.030689
ndcg : 0.093260
click : 16.771208
----epoch 2
loss: 11.3080
batch 0 elapsed time 875.302786 seconds
loss: 11.5669
batch 1000 elapsed time 986.552961 seconds
loss: 11.4021
batch 2000 elapsed time 1097.762045 seconds
loss: 11.5546
batch 3000 elapsed time 1209.148337 seconds
current performance at epoch 2 elapsed time 1313.

In [None]:
# from https://github.com/jojonki/Gated-Convolutional-Networks
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class GatedCNN(nn.Module):
    '''
        Gated convolutional next-track model (no bottleneck).

        In : (N, seq_len) encoded track ids in training mode (the last column
             is the target), (N, seq_len - 1) in eval mode
        Out: training mode -> result of nn.AdaptiveLogSoftmaxWithLoss
             (unpacks as ``output, loss``)
             eval mode     -> (N, vocab_size) log-probabilities

        The embedding is initialised from the first ``embd_size`` columns of
        the array stored at ``init_factors_path`` and is fine-tuned.
    '''
    def __init__(self,
                 seq_len,
                 vocab_size,
                 embd_size,
                 n_layers,
                 kernel,
                 padding,
                 out_chs,
                 res_block_count,
                 init_factors_path,
                 cutoffs):
        super(GatedCNN, self).__init__()
        self.res_block_count = res_block_count

        self.embedding = nn.Embedding.from_pretrained(torch.tensor(np.load(init_factors_path)[:,:embd_size]), freeze=False).float()

        # nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, ...
        self.conv_0 = nn.Conv2d(1, out_chs, kernel, padding=padding)
        self.b_0 = nn.Parameter(torch.randn(1, out_chs, 1, 1))
        self.conv_gate_0 = nn.Conv2d(1, out_chs, kernel, padding=padding)
        self.c_0 = nn.Parameter(torch.randn(1, out_chs, 1, 1))

        hidden_kernel = (kernel[0], 1)
        self.conv = nn.ModuleList([nn.Conv2d(out_chs, out_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.conv_gate = nn.ModuleList([nn.Conv2d(out_chs, out_chs, hidden_kernel, padding=padding) for _ in range(n_layers)])
        self.b = nn.ParameterList([nn.Parameter(torch.randn(1, out_chs, 1, 1)) for _ in range(n_layers)])
        self.c = nn.ParameterList([nn.Parameter(torch.randn(1, out_chs, 1, 1)) for _ in range(n_layers)])

        # The former `self.fc = nn.Linear(out_chs*(seq_len - 1), vocab_size)`
        # was never used by forward(); it only allocated an
        # in_features x vocab_size weight matrix. The adaptive softmax below
        # is the actual output layer.
        self.adapt = nn.AdaptiveLogSoftmaxWithLoss(out_chs*(seq_len - 1), vocab_size, cutoffs, div_value=1.8)

    def forward(self, x):
        # x: (N, seq_len)
        bs = x.size(0) # batch size
        if self.training:
            # The last track of every window is the prediction target.
            target = x[:, -1]
            x = x[:, :-1]

        # Embedding
        x = self.embedding(x) # (bs, seq_len, embd_size)

        # CNN
        x = x.unsqueeze(1) # (bs, Cin, seq_len, embd_size), insert Channel-In dim

        # Conv2d
        #    Input : (bs, Cin,  Hin,  Win )
        #    Output: (bs, Cout, Hout, Wout)
        # The (1, Cout, 1, 1) biases broadcast over batch and sequence axes.
        # Shape comments assume kernel=(3, embd_size) and padding=(1, 0).
        A = self.conv_0(x) + self.b_0      # (bs, Cout, seq_len, 1)
        B = self.conv_gate_0(x) + self.c_0 # (bs, Cout, seq_len, 1)
        h = A * torch.sigmoid(B)           # (bs, Cout, seq_len, 1)
        res_input = h # TODO this is h1 not h0

        for i, (conv, conv_gate) in enumerate(zip(self.conv, self.conv_gate)):
            A = conv(h) + self.b[i]
            B = conv_gate(h) + self.c[i]
            h = A * torch.sigmoid(B) # (bs, Cout, seq_len, 1)
            if i % self.res_block_count == 0: # size of each residual block
                h = h + res_input
                res_input = h

        h = h.reshape(bs, -1) # (bs, Cout*seq_len)
        if self.training:
            return self.adapt(h, target)
        return self.adapt.log_prob(h)

In [None]:
# Train the non-bottleneck GatedCNN (lr=10.0, default number of epochs).
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

clip = 0.1
gated_cnn = GatedCNN(
  seq_len, data_manager.n_tracks + 1, mid_dim, n_layers, kernel, padding,
  channels, res_block_count, item_factors_path, cutoffs,
).to(dev)
optimizer = torch.optim.SGD(
  gated_cnn.parameters(), lr=10.0, momentum=0.1, nesterov=True,
)
train(
  gated_cnn, train_dataloader, test_dataloader, optimizer, dev,
  data_manager.ordered_tracks, test_evaluator, clip=clip,
)

----epoch 0
loss: 15.2022
batch 0 elapsed time 0.137938 seconds
loss: 10.8786
batch 1000 elapsed time 107.286998 seconds
loss: 10.6975
batch 2000 elapsed time 214.486134 seconds
loss: 10.4046
batch 3000 elapsed time 321.578002 seconds
current performance at epoch 0 elapsed time 421.753333 seconds
r-precision : 0.011551
ndcg : 0.039929
click : 31.848329
----epoch 1
loss: 10.2782
batch 0 elapsed time 421.892482 seconds
loss: 9.9826
batch 1000 elapsed time 529.029066 seconds
loss: 10.2310
batch 2000 elapsed time 636.237832 seconds
loss: 9.7470
batch 3000 elapsed time 743.293571 seconds
current performance at epoch 1 elapsed time 843.531823 seconds
r-precision : 0.010043
ndcg : 0.031696
click : 34.926735
----epoch 2
loss: 9.6227
batch 0 elapsed time 843.667435 seconds
loss: 9.7500
batch 1000 elapsed time 950.715488 seconds
loss: 9.4346
batch 2000 elapsed time 1057.779553 seconds
loss: 9.8620
batch 3000 elapsed time 1164.902343 seconds
current performance at epoch 2 elapsed time 1265.122504

KeyboardInterrupt: ignored

In [None]:
# Bare expression: the notebook displays the module's repr as the cell output.
gated_cnn

In [None]:
# Pickles the whole module object into the file 'dummy_cnn' in the working directory.
# NOTE(review): loading it back needs the same class definition to be importable;
# saving gated_cnn.state_dict() is the more portable form -- confirm which is wanted.
torch.save(gated_cnn, 'dummy_cnn')