In [1]:
import warnings
warnings.filterwarnings(action='ignore')

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from scipy.sparse import *
from collections import Counter
import pandas as pd

In [2]:
# Vocabulary sizes: how many of the most frequent songs / tags are kept.
SONG_TOP_X = 50_000
TAG_TOP_X = 2_500

In [5]:
# Load the training and validation playlists; the cells below read (and later
# overwrite) their `songs` and `tags` columns.
raw_train = pd.read_json("data/train.json")
raw_val = pd.read_json("data/val.json")

## Average number of tags per playlist

In [6]:
songs_list = []  # not referenced by any later cell visible in this notebook

# Sum the tag-list lengths, then divide by the number of playlists.
total = sum(map(len, raw_train.tags.tolist()))

print(total / len(raw_train))

4.139453033344631


## Total number of tags and songs

In [7]:
def count_col(data):
    """Return how many distinct values appear across all rows of ``data``.

    data: iterable of iterables of hashable values
          (e.g. one list of song ids or tags per playlist)
    """
    distinct = set()
    for row in data:
        distinct.update(row)
    return len(distinct)

In [8]:
# Number of distinct song ids across all training playlists.
count_col(raw_train.songs.tolist())

615142

In [9]:
# Number of distinct tags across all training playlists.
count_col(raw_train.tags.tolist())

29160

## Share of all occurrences covered by the top-X items

In [10]:
def calc_portion_x(cols, x):
    """Return the share of all occurrences that the ``x`` most frequent values account for.

    cols : iterable of iterables of hashable values (e.g. one tag list per playlist)
    x    : int, how many of the most frequent values to include
    ---------------------------
    Returns a float; 0.0 when ``cols`` contains no values at all.
    """
    # Occurrence count of every distinct value, largest first.
    counts = sorted(
        Counter(value for col in cols for value in col).values(),
        reverse=True,
    )

    total = sum(counts)
    if total == 0:
        # Nothing was counted; avoid dividing by zero.
        return 0.0
    return sum(counts[:x]) / total

### tag portion

In [11]:
# Fraction of all tag occurrences in the training set covered by the TAG_TOP_X most frequent tags.
tag_portion = calc_portion_x(raw_train.tags.tolist(),TAG_TOP_X)

In [12]:
tag_portion

0.8914641289355512

### song portion

In [13]:
# Fraction of all song occurrences in the training set covered by the SONG_TOP_X most frequent songs.
song_portion = calc_portion_x(raw_train.songs.tolist(),SONG_TOP_X)

In [14]:
song_portion

0.7054667054871373

## Extract TOP_X songs and tags

In [15]:
def get_top_x(cols, x):
    """
    cols : 2D array (one list of ids per playlist)
    x: int, number of most frequent ids to keep
    ---------------------------
    Returns a dict  id : cnt
    holding the x most frequent ids, inserted in descending order of cnt
    """
    frequency = Counter()
    for col in cols:
        frequency.update(col)

    # Stable sort on the negated count: equal counts keep first-seen order.
    ranked = sorted(frequency.items(), key=lambda pair: -pair[1])
    return dict(ranked[:x])

In [16]:
# song_id -> count for the SONG_TOP_X most frequent songs in the training playlists.
top_songs = get_top_x(raw_train.songs.tolist(), SONG_TOP_X)

# Fails when the training data holds fewer than SONG_TOP_X distinct songs.
assert len(top_songs) == SONG_TOP_X, "top_songs are not extracted correctly"

In [17]:
# tag -> count for the TAG_TOP_X most frequent tags in the training playlists.
top_tags = get_top_x(raw_train.tags.tolist(), TAG_TOP_X)

# Fails when the training data holds fewer than TAG_TOP_X distinct tags.
assert len(top_tags) == TAG_TOP_X, "top_tags are not extracted correctly"

## Drop songs that are not in top_songs and replace each song_id with its idx

song_to_idx:
  - key: song_id (from raw data)
  - value: idx [0 : SONG_TOP_X-1]
 
idx_to_song:
   - key: idx [0 : SONG_TOP_X-1]
   - value: song_id (from raw data)

In [18]:
# Dense index for every kept song, numbered in order of first appearance
# in the training playlists.
song_to_idx = dict()
for songs in raw_train.songs.tolist():
    for song in songs:
        if song in top_songs and song not in song_to_idx:
            song_to_idx[song] = len(song_to_idx)

# Reverse lookup: idx -> song_id (from raw data).
idx_to_song = {song_idx: song for song, song_idx in song_to_idx.items()}

# Next free index; the tag-indexing cell continues numbering from this value.
idx = len(song_to_idx)

# Drop songs outside top_songs and replace the remaining ids with their idx.
# NOTE: this overwrites raw_train.songs in place; running the cell a second
# time would apply the filter/mapping to the already-mapped values.
raw_train['songs'] = raw_train.songs.map(
    lambda songs: [song_to_idx[song] for song in songs if song in top_songs]
)

In [19]:
# Apply the same song_id -> idx mapping to the validation playlists,
# dropping songs that are not in top_songs.
# NOTE: overwrites raw_val.songs in place.
raw_val['songs'] = raw_val.songs.map(
    lambda songs: [song_to_idx[song] for song in songs if song in top_songs]
)

In [20]:
# Every one of the SONG_TOP_X kept songs must have received an index.
assert len(song_to_idx) == SONG_TOP_X, "song_to_idx has problem"

## Drop tags that are not in top_tags and replace each tag (str) with its idx

tag_to_idx:
  - key: tag_id (from raw data, str)
  - value: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
 
idx_to_tag:
   - key: idx [SONG_TOP_X : TAG_TOP_X+SONG_TOP_X-1]
   - value: tag (from raw data, str)

In [21]:
tag_to_idx = dict()
idx_to_tag = dict()

# Tag indices follow directly after the song indices. The start is derived
# from song_to_idx so the offset is explicit rather than depending on whatever
# value the global `idx` was left at by an earlier cell.
idx = len(song_to_idx)

# make tag to idx
# make idx to tag
for tags in raw_train.tags.tolist():
    for tag in tags:
        if tag not in tag_to_idx and tag in top_tags:
            tag_to_idx[tag] = idx
            idx_to_tag[idx] = tag
            idx += 1

# Drop tags outside top_tags and replace the remaining ones with their idx.
# NOTE: this overwrites raw_train.tags in place.
raw_train['tags'] = raw_train.tags.map(
    lambda tags: [tag_to_idx[tag] for tag in tags if tag in top_tags]
)

In [22]:
# Apply the same tag -> idx mapping to the validation playlists,
# dropping tags that are not in top_tags.
# NOTE: overwrites raw_val.tags in place.
raw_val['tags'] = raw_val.tags.map(
    lambda tags: [tag_to_idx[tag] for tag in tags if tag in top_tags]
)

In [23]:
# Every one of the TAG_TOP_X kept tags must have received an index.
assert len(tag_to_idx) == TAG_TOP_X, "tag_to_idx has problem"

In [24]:
# Number of indexed songs; used further down as the boundary between song and tag columns.
n_items = len(song_to_idx)

## Make playlist X (songs + tags ids) table

In [25]:
# Per-playlist lists of song / tag indices for the train (tr) and validation (te) sets.
tr_songs, tr_tags = raw_train["songs"].tolist(), raw_train["tags"].tolist()
te_songs, te_tags = raw_val["songs"].tolist(), raw_val["tags"].tolist()

tr & te:
  - row: playlist
  - col: {song| tag}_idx (from 0 to SONG_TOP_X + TAG_TOP_X - 1)


In [26]:
# One row per training playlist: its song indices followed by its tag indices.
# `songs + tags` builds a new list for each playlist, so the lists held by
# tr_songs are left untouched and re-running this cell does not append the
# tags a second time.
tr = [songs + tags for songs, tags in zip(tr_songs, tr_tags)]

In [27]:
# One row per validation playlist: its song indices followed by its tag indices.
# `songs + tags` builds a new list for each playlist, so the lists held by
# te_songs are left untouched and re-running this cell does not append the
# tags a second time.
te = [songs + tags for songs, tags in zip(te_songs, te_tags)]

In [32]:
def lil_to_csr(playlists, playlists2=()):
    """Flatten playlists into (row, col, data) triplets for a sparse matrix.

    Despite the name, no matrix is built here; the three returned lists are
    meant to be handed to a sparse-matrix constructor.

    playlists:  playlist with top songs and tags; playlist i becomes row i
    playlists2: optional second group of playlists, placed in the rows
                directly after those of ``playlists``
    ---------------------------
    Returns three parallel lists ``row, col, data``; every entry of ``data`` is 1.
    """
    row, col, data = [], [], []

    n_first = 0
    for row_idx, playlist in enumerate(playlists):
        n_first = row_idx + 1
        for idx in playlist:
            row.append(row_idx)
            col.append(idx)
            data.append(1)

    # Rows of the second group start right after the first group. The offset is
    # the size of ``playlists`` itself, not the length of any global list.
    for row_idx, playlist in enumerate(playlists2, start=n_first):
        for idx in playlist:
            row.append(row_idx)
            col.append(idx)
            data.append(1)
    return row, col, data

In [29]:
def lil_to_csr2(playlists, playlists2=()):
    """Flatten playlists into (row, col, data) triplets for a sparse matrix.

    No matrix is built here; the three returned lists are meant to be handed
    to a sparse-matrix constructor.

    playlists:  playlist with top songs and tags; playlist i becomes row i
    playlists2: optional second group of playlists, placed in the rows
                directly after those of ``playlists``
    ---------------------------
    Returns three parallel lists ``row, col, data``; every entry of ``data`` is 1.
    """
    row, col, data = [], [], []

    def _emit(row_idx, playlist):
        # One (row, col, 1) triplet per index stored in the playlist.
        for idx in playlist:
            row.append(row_idx)
            col.append(idx)
            data.append(1)

    n_first = 0
    for row_idx, playlist in enumerate(playlists):
        _emit(row_idx, playlist)
        n_first = row_idx + 1

    # The second group is offset by the size of ``playlists`` itself rather
    # than by the length of a global list.
    for row_idx, playlist in enumerate(playlists2, start=n_first):
        _emit(row_idx, playlist)
    return row, col, data

In [30]:
# Validation playlists are passed first, so their triplets use rows 0 .. len(te)-1
# and the training playlists follow.
csr_row, csr_col, csr_data = lil_to_csr(te, tr)

In [31]:
# Playlist x (song | tag) matrix built from the triplets. No shape is passed,
# so it is inferred from the largest row and column index present.
r = csr_matrix((csr_data, (csr_row, csr_col)))

In [33]:
# Validation part of r: the first len(te) rows.
te_r= r[:len(te)]

In [34]:
# Training part of r: every row after the validation rows.
tr_r = r[len(te):]

In [35]:
# Fit one ALS model on the combined matrix (validation + training rows,
# song + tag columns). The matrix is transposed and scaled by 15.0 first.
# NOTE(review): 15.0 looks like a confidence weight for the implicit
# feedback — confirm, and consider moving it to a named constant.
als_model = ALS(factors=128, regularization=0.08)
als_model.fit(r.T * 15.0)



HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [37]:
als_model.user_factors

array([[-0.04299308, -0.23688114, -0.11300412, ..., -0.4819556 ,
        -0.4163514 ,  0.24793236],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.01933972,  0.03257951,  0.00100221, ...,  0.0071633 ,
        -0.02676675, -0.02027103],
       ...,
       [-0.07248535,  0.21526325, -0.41455686, ...,  0.2624069 ,
         0.04358422,  0.28007677],
       [-0.11151101, -0.00137787, -0.2988688 , ..., -0.22309275,
         0.19106367, -0.05040663],
       [-0.28638658, -0.19458656,  0.70601887, ...,  0.4613494 ,
         0.16831812,  0.30358806]], dtype=float32)

In [36]:
# Two unfitted ALS objects, one for songs and one for tags; both are given the
# user factors learned by als_model instead of being trained themselves.
item_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
item_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

In [37]:
# Split the learned item factors at n_items: the first n_items rows go to the
# song model, the remaining rows to the tag model.
item_model.item_factors = als_model.item_factors[:n_items]
tag_model.item_factors = als_model.item_factors[n_items:]

In [38]:
# Matching column split of r: the first n_items columns (songs) and the rest (tags).
# The tag slice is re-based, i.e. its column 0 is column n_items of r.
item_rec_csr = r[:, :n_items]
tag_rec_csr = r[:, n_items:]

In [39]:
# Song recommendations (N=100) for every validation playlist, i.e. rows
# 0 .. te_r.shape[0]-1, translated from column idx back to the raw song ids.
item_ret = []
for u in range(te_r.shape[0]):
    item_rec = []
    for rec in item_model.recommend(u, item_rec_csr, N=100):
        song_idx = rec[0]
        if song_idx in idx_to_song:
            item_rec.append(idx_to_song[song_idx])
    item_ret.append(item_rec)

In [40]:
# Tag recommendations (N=10) for every validation playlist. The tag model's
# columns start at 0, so SONG_TOP_X is added to get back to the keys of idx_to_tag.
tag_ret = []
for u in range(te_r.shape[0]):
    tag_rec = []
    for rec in tag_model.recommend(u, tag_rec_csr, N=10):
        tag_idx = rec[0] + SONG_TOP_X
        if tag_idx in idx_to_tag:
            tag_rec.append(idx_to_tag[tag_idx])
    tag_ret.append(tag_rec)

In [41]:
# NOTE(review): this cell repeats the previous tag-recommendation cell line for
# line and rebuilds tag_ret from scratch; one of the two can probably be dropped.
tag_ret = []
for u in range(te_r.shape[0]):
    tag_rec = tag_model.recommend(u, tag_rec_csr, N=10)
    # tag_model's columns start at 0; adding SONG_TOP_X restores the idx_to_tag key.
    tag_rec = [idx_to_tag[x[0]+SONG_TOP_X] for x in tag_rec if x[0]+SONG_TOP_X in idx_to_tag]
    tag_ret.append(tag_rec)

In [42]:
tag_ret

[['OST', '영화', '영화음악', '영화OST', '디즈니', '애니메이션', '뮤지컬', '해외', '감성', '드라마'],
 ['울고싶을때',
  '하우스음악',
  '러닝머신',
  '낙엽',
  '듀엣곡',
  '팝송모음',
  '사랑노래',
  '행복해',
  '휘트니스',
  '트레이닝'],
 ['발라드', '산책', '여행', '드라이브', '까페', '팝', '가을', '댄스', '재즈', 'OST'],
 ['가을', '발라드', '운동', '아침', '기분전환', '센치', '댄스', '우울', '가을감성', '겨울'],
 ['CCM', '사랑', '찬양', '설렘', '국내ccm', '위로', '잔잔한', '은혜', '파워DJ_CCM', '까페'],
 ['비오는날', '감성', '인디', '우울', 'Pop', '발라드', '카페', '혼자', '가을', '재즈'],
 ['락', '랩', '팝', '힙합', '봄', '일렉', '산책', '벚꽃', '편안한', '봄노래'],
 ['일렉', '팝', '클럽', '겨울', '산책', '퇴근길', '저녁', '알앤비', '트렌디', '소울'],
 ['기분전환', '드라이브', '운동', '신나는', '클럽', '여행', '여름', '감성', '히든트랙', '인디팝'],
 ['울고싶을때',
  '하우스음악',
  '러닝머신',
  '낙엽',
  '듀엣곡',
  '팝송모음',
  '사랑노래',
  '행복해',
  '휘트니스',
  '트레이닝'],
 ['팝', '클래식', '피아노', '재즈', '연주곡', '편안한', '감미로운', 'Jazz', '감성', '연주'],
 ['겨울', '연말', '출근길', '카페', '따뜻한', '추위', '커피', '캐롤', '뉴에이지', '연주곡'],
 ['일렉트로니카',
  '일렉',
  '댄스',
  '신나는',
  '일렉트로닉',
  '클럽',
  '하우스',
  'electronica',
  '파티',
  '여름'],
 ['뉴에이지', '까페', '연

In [45]:
# Re-read the validation file; only its `id` column is used (in the next cell).
valvalval = pd.read_json("data/val.json")

In [46]:
# Validation playlist ids, paired by position with item_ret / tag_ret below.
te_ids = valvalval.id.tolist()

In [48]:
# One record per validation playlist: its id, at most 100 songs and at most 10 tags.
returnval = [
    {"id": _id, "songs": rec[:100], "tags": tag_rec[:10]}
    for _id, rec, tag_rec in zip(te_ids, item_ret, tag_ret)
]

In [49]:
import json

# Serialise all records as a single JSON array; ensure_ascii=False writes
# non-ASCII tag text as-is instead of \u escapes.
with open('ret.json', 'w', encoding='utf-8') as f:
    json.dump(returnval, f, ensure_ascii=False)

In [50]:
!pwd

/Users/kimtaegyun/dev/kakao


In [51]:
!ls

LICENSE               [1m[36mdata[m[m                  requirements.txt
MF_Base.ipynb         [1m[36mexample[m[m               ret.json
Untitled.ipynb        genre_most_popular.py
