In [1]:
import warnings
warnings.filterwarnings(action='ignore')

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from scipy.sparse import *
from collections import Counter
import pandas as pd

In [2]:
SONG_TOP_X = 50000
TAG_TOP_X = 2500

In [3]:
raw_train = pd.read_json("data/train.json")
raw_val = pd.read_json("data/val.json")

## Average num of tags

In [4]:
songs_list = list()

total = 0

for playlist in raw_train.tags.tolist():
    total += len(playlist)

print(total/len(raw_train))

4.139453033344631


## Total number of tags and songs

In [5]:
def count_col(data):
    tmp = dict()
    ret = 0

    for row in (data):
        for song in row:
            if song not in tmp:
                tmp[song] = 0
                ret +=1 
    return ret

In [6]:
count_col(raw_train.songs.tolist())

615142

In [7]:
count_col(raw_train.tags.tolist())

29160

## Calc Portion

In [8]:
def calc_portion_x(cols,x):
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    
    x_cnt = sum(x[1] for x in cols_cnt_list[:x])
    total = sum(x[1] for x in cols_cnt_list)
    return x_cnt/total

### tag portion

In [9]:
tag_portion = calc_portion_x(raw_train.tags.tolist(),TAG_TOP_X)

In [10]:
tag_portion

0.8914641289355512

### song portion

In [11]:
song_portion = calc_portion_x(raw_train.songs.tolist(),SONG_TOP_X)

In [12]:
song_portion

0.7054667054871373

## Extract TOP_X songs and tags

In [13]:
def get_top_x(cols, x):
    """
    cols : 2D array
    x: int
    ---------------------------
    song_id : cnt
    in descending order
    """
    all_cols = []
    for col in cols:
        all_cols += col
        
    cols_cnt = dict(Counter(all_cols))
    
    cols_cnt_list = sorted(cols_cnt.items(), key=lambda t: -t[1])
    top_cols_cnt = dict()

    for col, cnt in cols_cnt_list[:x]:
        top_cols_cnt[col] = cnt
    return top_cols_cnt

In [14]:
top_songs = get_top_x(raw_train.songs.tolist(), SONG_TOP_X)

assert len(top_songs) == SONG_TOP_X, "top_songs are not extracted correctly"

In [15]:
top_tags = get_top_x(raw_train.tags.tolist(), TAG_TOP_X)

assert len(top_tags) == TAG_TOP_X, "top_tags are not extracted correctly"

## Remove raw data that song_id is not in top_song and change song_id to idx

song_to_idx:
  - key: song_id (from raw data)
  - value: idx [0 : SONG_TOP_X-1]
 
idx_to_song:
   - key: idx [0 : SONG_TOP_X-1]
   - value: song_id (from raw data)

In [16]:
song_to_idx = dict()
idx_to_song = dict()
idx = 0
for songs in raw_train.songs.tolist():
    for song in songs:
        if song not in song_to_idx and song in top_songs:
            song_to_idx[song] = idx
            idx_to_song[idx] = song
            idx+=1

for i, row in raw_train.iterrows():
    tmp = []
    for songs in raw_train.loc[i,["songs"]]:
        for song in songs:
            if song in top_songs: tmp.append(song_to_idx[song])
    raw_train.at[i,'songs'] = tmp

In [17]:
assert len(song_to_idx) == SONG_TOP_X, "song_to_idx has problem"

## Remove raw data that tag is not in top_tag and change change tags from str to id

tag_to_idx:
  - key: tag_id (from raw data, str)
  - value: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
 
idx_to_tag:
   - key: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
   - value: song_id (from raw data)

In [18]:
tag_to_idx = dict()
idx_to_tag = dict()

for tags in raw_train.tags.tolist():
    for tag in tags:
        if tag not in tag_to_idx and tag in top_tags:
            tag_to_idx[tag] = idx
            idx_to_tag[idx] = tag
            idx+=1
    
for i, row in raw_train.iterrows():
    tmp = []
    for tags in raw_train.loc[i,["tags"]]:
        for tag in tags:
            if tag in top_tags: tmp.append(tag_to_idx[tag])
    raw_train.at[i,'tags'] = tmp

In [19]:
assert len(tag_to_idx) == TAG_TOP_X, "tag_to_idx has problem"

In [20]:
n_items = len(song_to_idx)

## Make playlist X (songs + tags ids) table

In [21]:
tr_songs = raw_train.songs.tolist()
tr_tags = raw_train.tags.tolist()
te_songs = raw_val.songs.tolist()
te_tags = raw_val.tags.tolist()

tr & te:
  - row: playlist
  - col: {song| tag}_idx (from 0 to SONG_TOP_X + TAG_TOP_X)


In [23]:
tr = []

for songs in tr_songs:
    tr.append(songs)

for i, tags in enumerate(tr_tags):
    tr[i].extend(tags)

In [32]:
te = []

for songs in te_songs:
    temp = []
    for song in songs:
        if song in song_to_idx:
            temp.append(song_to_idx[song])
    te.append(temp)

for i, tags in enumerate(te_tags):
    temp = []
    for tag in tags:
        if tag in tag_to_idx:
            temp.append(tag_to_idx[tag])
    te[i].extend(temp)

In [43]:
def lil_to_csr(playlists,playlists2 = []):
    """
    playlists: playlist with top songs and tags
    """
    row = []
    col = []
    data = []
    for row_idx, playlist in enumerate(playlists):
        for idx in playlist:
            col.append(idx)
            data.append(1)
            row.append(row_idx)
    
    for row_idx, playlist in enumerate(playlists2):
        for idx in playlist:
            col.append(idx)
            data.append(1)
            row.append(row_idx)
    return row, col, data

In [44]:
csr_row, csr_col, csr_data = lil_to_csr(te, tr)

In [46]:
r = csr_matrix((csr_data, (csr_row, csr_col)))

In [47]:
te_r= r[:len(te)]

In [48]:
als_model = ALS(factors=128, regularization=0.08)
als_model.fit(r.T * 15.0)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [49]:
als_model.user_factors

array([[ 0.08708247,  0.22377487, -0.28201085, ..., -0.216176  ,
         0.62339747, -0.17579183],
       [ 0.38757157, -0.08192531, -0.26700616, ..., -0.6219379 ,
        -0.36399037,  0.05050861],
       [ 0.06485939,  0.25710782, -0.03657802, ...,  0.3033687 ,
         0.07647658, -0.04850008],
       ...,
       [-0.20174706,  0.4071364 , -0.3314475 , ..., -0.0588234 ,
         0.278058  , -0.49946713],
       [-0.31993487,  0.19406585,  0.09740806, ...,  0.2672826 ,
         0.04594472, -0.13663504],
       [-1.0544006 ,  1.1130584 , -0.08380224, ..., -0.23044313,
         1.174417  , -0.19035602]], dtype=float32)

In [50]:
item_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
item_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

In [51]:
item_model.item_factors = als_model.item_factors[:n_items]
tag_model.item_factors = als_model.item_factors[n_items:]

In [52]:
item_rec_csr = r[:, :n_items]
tag_rec_csr = r[:, n_items:]

In [53]:
item_rec_csr

<115071x50000 sparse matrix of type '<class 'numpy.int64'>'
	with 4021733 stored elements in Compressed Sparse Row format>

In [54]:
tag_rec_csr

<115071x2500 sparse matrix of type '<class 'numpy.int64'>'
	with 447816 stored elements in Compressed Sparse Row format>

In [66]:
te_r.shape

(23015, 52500)

In [67]:
item_ret = []
tag_ret = []
from tqdm.auto import tqdm
for u in tqdm(range(te_r.shape[0])):
    item_rec = item_model.recommend(u, item_rec_csr, N=100)
    item_rec = [idx_to_song[x[0]] for x in item_rec]
    tag_rec = tag_model.recommend(u, tag_rec_csr, N=10)
    tag_rec = [idx_to_tag[x[0]] for x in tag_rec if x[0] in idx_to_tag]
    item_ret.append(item_rec)
    tag_ret.append(tag_rec)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [68]:
tag_ret[0]

[]