In [3]:
import warnings
warnings.filterwarnings(action='ignore')

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from scipy.sparse import *
from collections import Counter
import pandas as pd

In [13]:
from tqdm.notebook import tqdm

In [4]:
# Load the training and validation playlists from their JSON files.
raw_train, raw_val = (
    pd.read_json("data/train.json"),
    pd.read_json("data/val.json"),
)

## Calculate the portion of occurrences covered by the top-x items

In [23]:
def calc_portion_x(cols, x):
    """Return the fraction of all item occurrences covered by the ``x`` most frequent items.

    Parameters
    ----------
    cols : iterable of iterables
        One collection of hashable items per row; every item occurrence is counted.
    x : int
        How many of the most frequent distinct items to include. It is used as a
        slice bound on the descending list of counts.

    Returns
    -------
    float
        Sum of the counts of the top ``x`` items divided by the total number of
        occurrences, or ``0.0`` when ``cols`` contains no items at all (instead of
        raising ``ZeroDivisionError``).
    """
    occurrences = Counter(item for col in cols for item in col)

    total = sum(occurrences.values())
    if total == 0:
        # Nothing was counted, so there is no meaningful ratio to report.
        return 0.0

    # Only the counts matter for the ratio; which item owns a tied count does not.
    counts_desc = sorted(occurrences.values(), reverse=True)
    return sum(counts_desc[:x]) / total

### tag portion

In [24]:
# Share of all tag occurrences accounted for by the 2,500 most frequent tags.
tag_portion = calc_portion_x(cols=raw_train.tags.tolist(), x=2500)

In [36]:
# Display the coverage ratio computed for the top 2,500 tags.
tag_portion

0.8914641289355512

### song portion

In [34]:
# Share of all song occurrences accounted for by the 100,000 most frequent songs.
song_portion = calc_portion_x(cols=raw_train.songs.tolist(), x=100000)

In [35]:
# Display the coverage ratio computed for the top 100,000 songs.
song_portion

0.8096639134780247

## Extract only meaningful data

In [19]:
def get_top_x(cols, x):
    """Count every item across ``cols`` and keep the ``x`` most frequent ones.

    Parameters
    ----------
    cols : iterable of iterables
        One collection of hashable items per row.
    x : int
        Slice bound applied to the items once they are ordered by descending count.

    Returns
    -------
    dict
        Maps each kept item to its occurrence count, ordered from most to least
        frequent. Items with equal counts keep their first-seen order.
    """
    occurrences = Counter(item for col in cols for item in col)

    # A stable sort with reverse=True keeps first-seen order among equal counts.
    ranked = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:x])

In [20]:
# Keep the 2,500 most frequent tags together with their occurrence counts.
top_tags_cnt = get_top_x(cols=raw_train.tags.tolist(), x=2500)

In [21]:
# Keep the 2,500 most frequent songs together with their occurrence counts.
# NOTE(review): the song coverage ratio earlier in this notebook was computed for the
# top 100,000 songs, but only 2,500 are kept here — confirm this cutoff is intended.
top_songs_cnt = get_top_x(cols=raw_train.songs.tolist(), x=2500)

## Build the playlist × (song + tag index) table

In [38]:
# Pull the per-playlist song and tag columns (and the validation ids) out of the
# DataFrames as plain Python lists.
tr_songs, tr_tags = raw_train.songs.tolist(), raw_train.tags.tolist()
te_songs, te_tags = raw_val.songs.tolist(), raw_val.tags.tolist()
te_ids = raw_val.id.tolist()

In [43]:
# Map the kept songs and tags into one shared, contiguous index space: songs receive
# the first indices (in order of first appearance) and tags continue from there.
tr = []
song_to_idx = {}
tag_to_idx = {}
idx = 0

for songs in tr_songs:
    # Only songs present in top_songs_cnt take part in the encoding.
    kept_songs = [song_id for song_id in songs if song_id in top_songs_cnt]
    for song_id in kept_songs:
        if song_id not in song_to_idx:
            song_to_idx[song_id] = idx
            idx += 1
    tr.append([song_to_idx[song_id] for song_id in kept_songs])

# Number of distinct songs; tag indices start right after this value.
n_items = len(song_to_idx)

for row, tags in zip(tr, tr_tags):
    # Only tags present in top_tags_cnt take part in the encoding.
    kept_tags = [tag for tag in tags if tag in top_tags_cnt]
    for tag in kept_tags:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = idx
            idx += 1
    # Append the playlist's tag indices after its song indices.
    row.extend(tag_to_idx[tag] for tag in kept_tags)

n_tags = len(tag_to_idx)

## ToDo

In [10]:
# Sanity check: the largest index stored in `tr` should be exactly one less than
# the combined song + tag vocabulary size printed on the second line.
maximum = max(-1, max((entry for row in tr for entry in row), default=-1))
print(maximum)
print(n_items + n_tags)

644301
644302


In [11]:
# Encode the validation playlists with the training vocabularies. Songs and tags
# that have no entry in song_to_idx / tag_to_idx are dropped.
te = [
    [song_to_idx[song_id] for song_id in songs if song_id in song_to_idx]
    for songs in te_songs
]

for row, tags in zip(te, te_tags):
    # Tag indices go after the song indices of the same playlist.
    row.extend(tag_to_idx[tag] for tag in tags if tag in tag_to_idx)

# The module-level counter is left at 0 once this cell has run.
idx = 0

In [12]:
# Shuffle the order of the training rows. A fixed seed makes the resulting order
# reproducible when the notebook is re-run from a fresh kernel.
tr = shuffle(tr, random_state=42)

In [13]:
# Reverse lookups from index back to the original id. Tag indices are shifted down
# by n_items so that the tag lookup is keyed from 0.
idx_to_song = dict((index, song_id) for song_id, index in song_to_idx.items())
idx_to_tag = dict((index - n_items, tag) for tag, index in tag_to_idx.items())

In [14]:
from scipy.sparse import csr_matrix, vstack

In [15]:
def lil_to_csr(songs, num):
    """Build a binary CSR matrix from a list of per-row column-index lists.

    The matrix is assembled directly in sparse form, so memory use scales with
    the number of stored entries rather than with ``len(songs) * num``.

    Parameters
    ----------
    songs : sequence of iterables of int
        ``songs[i]`` holds the column indices that are set to 1 in row ``i``.
        Repeated indices within a row collapse to a single 1.
    num : int
        Number of columns of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(len(songs), num)`` with float64 entries equal to 1.0
        at every listed position.

    Raises
    ------
    IndexError
        If any column index is negative or not smaller than ``num``.
    """
    n_rows = len(songs)
    indptr = np.zeros(n_rows + 1, dtype=np.int64)
    col_indices = []

    for row_idx, row in enumerate(songs):
        # De-duplicate so that a repeated index yields 1 rather than a summed value.
        col_indices.extend(sorted(set(row)))
        indptr[row_idx + 1] = len(col_indices)

    indices = np.asarray(col_indices, dtype=np.int64)
    if indices.size and (indices.min() < 0 or indices.max() >= num):
        raise IndexError("column index out of range for a matrix with %d columns" % num)

    data = np.ones(indices.size, dtype=np.float64)
    return csr_matrix((data, indices, indptr), shape=(n_rows, num))

In [16]:
# One row per training playlist, one column per song/tag index.
tr_csr = lil_to_csr(tr, n_items + n_tags)

In [17]:
# One row per validation playlist, using the same column layout as the training matrix.
te_csr = lil_to_csr(te, n_items + n_tags)

In [None]:
# Stack the validation rows on top of the training rows into a single matrix.
r = vstack([te_csr, tr_csr])

In [None]:
# Convert the stacked matrix to CSR format before it is handed to the model.
r = csr_matrix(r)

In [None]:
# Fit an ALS model with 128 factors on the transposed matrix scaled by 15.0.
# NOTE(review): whether `fit` expects an item-by-user or a user-by-item matrix depends on
# the installed `implicit` version — confirm the transpose is still required.
# NOTE(review): 15.0 looks like a confidence weight applied to the binary entries — confirm.
als_model = ALS(factors=128, regularization=0.08)
als_model.fit(r.T * 15.0)