In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from arena_util import load_json 
from arena_util import write_json
from scipy.sparse import csr_matrix 
from collections import Counter
import random
import pickle

In [2]:
train = load_json("./arena_data/orig/train.json")
song_meta_json = load_json('./song_meta.json')
question = load_json("./arena_data/questions/val.json")
meta = pd.read_json('./song_meta.json')
genre_table = load_json('./genre_gn_all.json')
answer = load_json('./arena_data/answers/val.json')

In [7]:
for ply1 in train :
    if len(ply1['songs']) < 10 :
        train.remove(ply1)
for ply2 in question :        
    if len(ply2['songs']) < 10 :
        question.remove(ply2)

print(len(train), len(question))

89531 13559


In [8]:
sub_train = train[:2000]
sub_question = question[:300]
q_ids = set([ply["id"] for ply in sub_question])
sub_val = [ply for ply in answer if ply["id"] in q_ids]

In [9]:
dtl_genre = genre_table.copy()
for x in genre_table:
    if x[-1] == '1' or '0' in genre_table[x] :
        dtl_genre.pop(x)
dtl_genre.pop('GN0305', 'GN2604')
dtl_genre_ids = list(dtl_genre.keys()) ### 170개

In [10]:
def get_genre(ply) :
    table_mf = meta[meta['id'].apply(lambda x : x in ply)]
    g_table_mf = table_mf['song_gn_gnr_basket']
    g_list_mf = g_table_mf.values.flatten().tolist()
    g_list_mf = [ x for gs in g_list_mf for x in gs]
    return g_list_mf

def get_dtl_genre(ply):
    table_mf = meta[meta['id'].apply(lambda x : x in ply)]
    g_table_mf = table_mf['song_gn_dtl_gnr_basket']
    g_list_mf = g_table_mf.values.flatten().tolist()
    g_list_mf = [x for gs in g_list_mf for x in gs]
    g_list_mf = list(filter(lambda x: x in dtl_genre_ids, g_list_mf))
    return g_list_mf

def get_gnr_comb(ply):
    g_g_c = Counter(get_genre(ply))
    g_g_c.update(Counter(get_dtl_genre(ply)))
    return g_g_c

def get_mat(train):
    train_matrix = np.zeros(shape = (len(train), len(song_meta_json)))
    for index, playlist in enumerate(train):
        songs = playlist["songs"]
        for id in songs :
            train_matrix[index][id] = 1
    return train_matrix

In [11]:
train_matrix = get_mat(sub_train)
svd_1 = TruncatedSVD(64)
csr_train_1 = csr_matrix(train_matrix)
csr_tsvd_1 = svd_1.fit_transform(csr_train_1)
reconstructed_matrix_1 = csr_tsvd_1 @ svd_1.components_
csr_rec_1 = csr_matrix(reconstructed_matrix_1)

In [12]:
def plyXsong(train_matrix, question, rec, howmany) :
    answers = []
    for i, ply in tqdm(enumerate(question)) :
        q_mean_vec = np.array(np.mean(rec[:, ply['songs']], axis=1))
        similarity = rec.T @ q_mean_vec
        similarity[ply['songs']] = 0
        ind = np.argsort(-similarity, axis = 0 )
        indices = list(ind[:howmany].reshape(howmany))
        answers.append({
            "id" : ply['id'],
            "songs" : indices
        })
    return answers

ans1 = plyXsong(train_matrix, sub_question, csr_rec_1, 1000)

300it [01:18,  3.83it/s]


In [14]:
subq_genre = {}
for ply in tqdm(sub_question) :
    gnr = get_genre(ply['songs'])
    power_genre = list(Counter(gnr))[0]
    dtlgnr = get_dtl_genre(ply['songs'])
    if len(dtlgnr) == 0 :
        power_dtlgenre = None
    else:
        power_dtlgenre = list(Counter(dtlgnr))[0]
    subq_genre[ply['id']] = (power_genre, power_dtlgenre)


100%|██████████| 300/300 [01:49<00:00,  2.75it/s]


In [15]:
num_songs = len(meta)

s_g_matrix = np.zeros((len(dtl_genre), num_songs))

for i in tqdm(range(num_songs)):
    dtlgns = meta.iloc[i]['song_gn_dtl_gnr_basket']
    gn_idx = [dtl_genre_ids.index(g) for g in dtlgns if g[-1] != '1' and '0' not in genre_table[g]]
    s_g_matrix[gn_idx, i] += 1
    gns = meta.iloc[i]['song_gn_gnr_basket']
    if len(gns) == 0 :
        pass
    elif gns[0] not in dtl_genre_ids:
        pass
    else :
        g_idx = dtl_genre_ids.index(gns[0])
    s_g_matrix[g_idx, i] += 1


100%|██████████| 707989/707989 [00:27<00:00, 26069.77it/s]


In [16]:
svd_2 = TruncatedSVD(32)
csr_train_2 = csr_matrix(s_g_matrix)
csr_tsvd_2 = svd_2.fit_transform(csr_train_2)
rec_s_g = csr_tsvd_2 @ svd_2.components_


In [17]:
def genre_filter(rec, ply, genre, howmany):
    gnr_index = dtl_genre_ids.index(genre)
    gnr_vector = rec[gnr_index, :]
    ind = np.argsort(-gnr_vector[ply])
    indices = list(ind[:howmany].reshape(howmany))
    ans = [ ply[i] for i in indices ]
    return ans

In [27]:
get_gnr_comb(my_ply['songs']).items()

dict_items([('GN0300', 5), ('GN0500', 5), ('GN0600', 5), ('GN1300', 1), ('GN1200', 5), ('GN1000', 4), ('GN0100', 2), ('GN0900', 1), ('GN0303', 3), ('GN0503', 4), ('GN1304', 1), ('GN1302', 1), ('GN1004', 1), ('GN0304', 1), ('GN0505', 1), ('GN1003', 1), ('GN0902', 1), ('GN1202', 1)])

In [30]:
list(get_gnr_comb(my_ply['songs']).values())

[5, 5, 5, 1, 5, 4, 2, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1]

In [81]:
ansW = []

for q_ply, i_ply in tqdm((zip(sub_question, ans1))) :
    genre_combination = list(get_gnr_comb(q_ply['songs']).keys())
    genre_weight = list(get_gnr_comb(q_ply['songs']).values())
    genre_idx = [dtl_genre_ids.index(genre) for genre in genre_combination if genre in dtl_genre_ids]
    score = np.zeros(len(meta))
    for i,genre_id in enumerate(genre_idx):
        genre_vec = rec_s_g[genre_id, :]
        score += genre_vec * genre_weight[i]
    target_score = score[i_ply['songs']]
    ind = np.argsort(-target_score)[:30]
    indices = [ i_ply['songs'][i] for i in ind ]
    ansW.append({
            "id" : i_ply['id'],
            "songs" : indices
        })


300it [03:29,  1.43it/s]


In [None]:
ans3 = []
for ply in ans2 :
    genre3 = subq_genre[ply['id']][1]
    if genre3 == None:
        ind3 = ply['songs'][:30]
    else:
        ind3 = genre_filter(rec_s_g, ply['songs'], genre3, 30)
    ans3.append({
            "id" : ply['id'],
            "songs" : ind3
        })

In [18]:
# -*- coding: utf-8 -*-
import fire
import numpy as np

from arena_util import load_json


class ArenaEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec, k):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)
            if i == k-1 :
                break

        return dcg / self._idcgs[len(gt)]
    
    def mAP(self, gt, rec, k):
        score = []
        suc = 0
        for i, r in enumerate(rec):
            user_map = []
            if r in gt:
                suc += 1
                a = suc / (i + 1)
                user_map.append(a)
            if user_map:  
                score.append(sum(user_map) / len(user_map))
            if i == k-1 :
                break
        mAP = sum(score) / len(score) if score else 0 
        return mAP


    def _eval(self, gt_playlists, rec_playlists, k):        
        gt_dict = {g["id"]: g for g in gt_playlists}

        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            print(len(gt_ids), len(rec_ids))
            raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]

        if set(rec_song_counts) != set([k]):
            raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([k]):
            raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")

        music_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:k], k)

        music_ndcg = music_ndcg / len(rec_playlists)
        
        music_mAP = 0
        
        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_mAP += self.mAP(gt["songs"], rec["songs"][:k], k)
                                                   
        music_mAP = music_mAP / len(rec_playlists)                                    

        return music_ndcg, music_mAP

    def evaluate(self, gt_playlists, rec_playlists, k):
        try:
            music_ndcg, mAP = self._eval(gt_playlists, rec_playlists, k)
            print(f"nDCG: {music_ndcg:.6}")
            print(f"mAP: {mAP:.6}")
        except Exception as e:
            print(e)

eval = ArenaEvaluator()

In [19]:
eval.evaluate(sub_val, ans1, 1000)

nDCG: 0.123731
mAP: 0.0532218


In [37]:
eval.evaluate(sub_val, ans2, 200)

nDCG: 0.0321352
mAP: 0.029647


In [85]:
eval.evaluate(sub_val, ansW, 30)

nDCG: 0.014608
mAP: 0.0489869


In [21]:
#### 현 input playlist의 특성 : 힙합 / 락이 거의 반반으로 이루어져있지만, 일반적으로 힙합 곡들이 훨씬 대중적인 픽
my_songs = [88186, 166190, 115874, 125149, 13123, 581746,
            264929, 705815, 161919, 1546, 661432, 463577,
            535405, 35945, 582314, 182243, 64895, 211995, 
            616542, 251980, 583072, 707295] 
my_ply = {
    'id' : 62827586,
    'plylst_title' : 'my_ply',
    'songs' : my_songs ,
    'like_cnt' : 12,
    'updt_date' : '2024-03-25 20:22:34.000'
    
}

In [37]:
my_ans1 = plyXsong(train_matrix, [my_ply], csr_rec_1, 1000)

1it [00:04,  4.97s/it]


In [95]:
my_ans2 = []
for ply in my_ans1 :
    genre2 = 'GN1300'
    ind2 = genre_filter(rec_s_g, ply['songs'], genre2, 200)
    my_ans2.append({
            "id" : ply['id'],
            "songs" : ind2
        })
my_ans3 = []

for ply in my_ans2 :
    genre3 = 'GN1100'
    if genre3 == None:
        ind3 = ply['songs'][:30]
    else:
        ind3 = genre_filter(rec_s_g, ply['songs'], genre3, 30)
    my_ans3.append({
            "id" : ply['id'],
            "songs" : ind3
        })
    
my_ans4=[]
for ply in my_ans1:
    genre_combination = list(get_gnr_comb(my_ply['songs']).keys())
    genre_weight = list(get_gnr_comb(my_ply['songs']).values())
    genre_idx = [dtl_genre_ids.index(genre) for genre in genre_combination]
    score = np.zeros(len(meta))
    for i,genre_id in enumerate(genre_idx):
        genre_vec = rec_s_g[genre_id, :]
        score += genre_vec * genre_weight[i]
    target_score = score[ply['songs']]
    ind = np.argsort(-target_score)[:30]
    indices = [ ply['songs'][i] for i in ind ]
    my_ans4.append({
            "id" : ply['id'],
            "songs" : indices
        })

        
    
    
    

In [96]:
for s in my_ans3[0]['songs'][:10]:
    print(meta.iloc[s]['song_name'], meta.iloc[s]['artist_name_basket'])

Slide (Feat. Frank Ocean & Migos) ['Calvin Harris']
Don`t Let Me Down (Feat. Daya) ['The Chainsmokers']
Beggars ['Krewella', 'Diskord']
모든 날, 모든 순간 (Every day, Every Moment) ['폴킴']
눈의 꽃 ['박효신']
You Like That ['Chris Brown']
Snooze (feat. Cousin Stizz) ['Johnny Yukon']
Sexy (feat. Trey Songz) ['Chris Brown']
No Guidance (feat. Drake) ['Chris Brown']
Take A Risk ['Chris Brown']


In [79]:
for s in my_ans4[0]['songs'][:10]:
    print(meta.iloc[s]['song_name'], meta.iloc[s]['artist_name_basket'])

차렷! (Feat. 다이나믹 듀오, Tablo) ['TBNY']
꿈이 뭐야 (Dream Chaser) (Feat. Dok2 & 크러쉬) ['GRAY (그레이)']
City ['오왼 (Owen)']
We Back ['MC Meta', '나찰', '라임어택 (RHYME-A-)', '넋업샨']
위험해 (Dangerous) (Feat. 박재범) ['GRAY (그레이)']
Same Boy (Feat. Loco) ['크루셜스타 (Crucial Star)']
Good Times (Feat. Babylon) ['팔로알토 (Paloalto)']
빙하 ['짙은']
고백 ['뜨거운 감자']
안아줘요 ['10CM']
