In [1]:
import warnings
warnings.filterwarnings(action='ignore')

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from scipy.sparse import *
from collections import Counter
import pandas as pd

In [None]:
raw_train = pd.read_json("data/train.json")
raw_val = pd.read_json("data/val.json")

## Calc Portion

In [None]:
def calc_portion_x(cols,x):
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    
    x_cnt = sum(x[1] for x in cols_cnt_list[:x])
    total = sum(x[1] for x in cols_cnt_list)
    return x_cnt/total

### tag portion

In [None]:
tag_portion = calc_portion_x(raw_train.tags.tolist(),2500)

In [None]:
tag_portion

### song portion

In [None]:
song_portion = calc_portion_x(raw_train.songs.tolist(),50000)

In [None]:
song_portion

## Extract only meaningful data

In [8]:
def get_top_x(cols, x):
    """
    cols : 2D array
    x: int
    """
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    top_cols_cnt = dict()

    for col, cnt in cols_cnt_list[:x]:
        top_cols_cnt[col] = cnt
    return top_cols_cnt

In [9]:
top_songs = get_top_x(raw_train.songs.tolist(), 50000)

In [10]:
top_song_idx = -1
for top_song_num in top_songs.keys():
    top_song_idx = max(top_song_idx, top_song_num)

In [11]:
tmp_tag_dict = dict()
idx = top_song_idx + 1
for tags in raw_train.tags.tolist():
    for tag in tags:
        if tag not in tmp_tag_dict:
            tmp_tag_dict[tag] = idx
            idx+=1
    
for i, row in raw_train.iterrows():
    tmp = []
    for tag in row.tags:
        tmp.append(tmp_tag_dict[tag])
    raw_train.at[i,'tags'] = tmp

In [12]:
top_tags = get_top_x(raw_train.tags.tolist(), 2500)

## Make playlist X (songs + tags ids) table

In [13]:
tr_songs = raw_train.songs.tolist()
tr_tags = raw_train.tags.tolist()
te_songs = raw_val.songs.tolist()
te_tags = raw_val.tags.tolist()
te_ids = raw_val.id.tolist()

In [14]:
tr = []
song_to_idx = {}
tag_to_idx = {} 
idx = 0

for songs in tr_songs:
    for song_id in songs:
        if song_id not in song_to_idx and song_id in top_songs:
            song_to_idx[song_id] = idx
            idx += 1
    song_idxs = [song_to_idx[song_id] for song_id in songs if song_id in top_songs]
    tr.append(song_idxs)
n_items = len(song_to_idx)

for i, tags in enumerate(tr_tags):
    for tag in tags:
        if tag not in tag_to_idx and tag in top_tags:
            tag_to_idx[tag] = idx
            idx += 1
    tr[i].extend([tag_to_idx[tag] for tag in tags if tag in top_tags])
n_tags = len(tag_to_idx)

In [15]:
te = []

idx = 0
for songs in te_songs:
    ret = [] 
    for song_id in songs:
        if song_id not in song_to_idx or song_id not in top_songs:
            continue
        ret.append(song_to_idx[song_id])
    te.append(ret)
idx = 0
for i, tags in enumerate(te_tags):
    ret = []
    for tag in tags:
        if tag not in tag_to_idx or tag not in top_tags:
            continue
        ret.append(tag)
    te[i].extend([tag_to_idx[x] for x in ret])

In [16]:
tr = shuffle(tr)

In [17]:
idx_to_song = {x:y for(y,x) in song_to_idx.items()}
idx_to_tag = {x:y for(y,x) in tag_to_idx.items()}

- n_items: song_to_idx's length
- n_tags: ta_to_dix's length
- song_to_idx: 
    - key: song id
    - value : idx(unique key from 1 to n_item-1)
- tag_to_idx:
    - key: tag id
    - value : idx(unique key from 1 to n_tags-1)
- idx_to_song:
    - key: idx
    - value: song id
- idx_to_tag:
    - key: idx
    - value: tag id

In [18]:
def lil_to_csr(playlists):
    """
    playlists: playlist with top songs and tags
    """
    row = []
    col = []
    data = []
    for row_idx, playlist in enumerate(playlists):
        for info in playlist:
            if info < n_items:
                col.append(idx_to_song[info])
            else:
                col.append(idx_to_tag[info])
            data.append(info)
            row.append(row_idx)
    return row, col, data

In [19]:
csr_row, csr_col, csr_data = lil_to_csr(tr)

In [20]:
r = csr_matrix((csr_data, (csr_row, csr_col)))

In [32]:
te_csr_row, te_csr_col, te_csr_data = lil_to_csr(te)

In [33]:
te_r = csr_matrix((te_csr_data, (te_csr_row, te_csr_col)))

In [21]:
als_model = ALS(factors=128, regularization=0.08)
als_model.fit(r.T * 15.0)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [29]:
als_model.user_factors

array([[  6.9513817 ,  -2.4813073 ,  -9.301803  , ...,   5.9767656 ,
         -2.352466  ,   4.777112  ],
       [  6.9844813 ,   3.3944857 ,   5.158386  , ...,   1.3513653 ,
          1.5316392 ,   6.5867195 ],
       [ -6.815578  ,   1.3902944 ,  -0.04743091, ...,  -2.0536335 ,
         -7.355252  , -10.700894  ],
       ...,
       [ -8.500814  ,  -9.982717  ,  -5.1515574 , ...,  -4.6068625 ,
          1.1491692 ,   9.207828  ],
       [ -2.9057803 ,  -5.2427382 ,   2.2977285 , ...,  11.01917   ,
          5.8742485 ,   1.0605428 ],
       [  1.8127896 ,   0.87577945,   4.213435  , ...,   6.804237  ,
          3.434759  ,   0.96029115]], dtype=float32)

In [22]:
item_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
item_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

In [23]:
item_model.item_factors = als_model.item_factors[:n_items]
tag_model.item_factors = als_model.item_factors[n_items:]

In [28]:
r

<115071x727710 sparse matrix of type '<class 'numpy.longlong'>'
	with 4153638 stored elements in Compressed Sparse Row format>

In [29]:
r[:,n_items:]

<115071x677710 sparse matrix of type '<class 'numpy.int64'>'
	with 3884919 stored elements in Compressed Sparse Row format>

In [30]:
item_rec_csr = r[:, :n_items]
tag_rec_csr = r[:, n_items:]

In [35]:
item_ret = []
tag_ret = []
from tqdm.auto import tqdm
for u in tqdm(range(te_r.shape[0])):
    item_rec = item_model.recommend(u, item_rec_csr, N=100)
    item_rec = [idx_to_song[x[0]] for x in item_rec]
    tag_rec = tag_model.recommend(u, tag_rec_csr, N=100)
    tag_rec = [idx_to_tag[x[0]] for x in tag_rec if x[0] in idx_to_tag]
    item_ret.append(item_rec)
    tag_ret.append(tag_rec)

HBox(children=(FloatProgress(value=0.0, max=23015.0), HTML(value='')))




In [42]:
item_ret[0][:10]

[543123,
 447028,
 484964,
 592094,
 109240,
 440098,
 549563,
 470445,
 562544,
 451418]

In [43]:
tag_ret[:10]

[[712505], [], [], [709102], [], [], [716315], [709586, 708498], [], []]