In [1]:
from algo import GreedyArrangeAlgo
from bayesian import BayesianService
from estimator import Estimator

from collections import defaultdict
import numpy as np
from scipy.spatial.distance import cosine
from tqdm import tqdm
import math
import pandas as pd

# settings

## id読み取り

In [7]:
artist_dict = defaultdict()

with open('../modify_dataset/spotify_playlists_dataset/artist_id.tsv', mode = 'r') as f:
    
    for line in f.readlines():
        artistname, id = line.split('\t')
        artist_dict[artistname] = int(id)
        
track_dict = defaultdict()
track_artist_id = defaultdict()

with open('../modify_dataset/spotify_playlists_dataset/track_id.tsv', mode = 'r') as f:
    
    for line in f.readlines():
        artistname, trackname, id = line.split('\t')
        track_dict[ (artistname, trackname) ] = int(id)
        track_artist_id[ int(id) ] = artist_dict[artistname]
        
playlist_list = []
with open('../modify_dataset/spotify_playlists_dataset/playlist_id.tsv', mode = 'r') as f:
    
    for line in f.readlines():
        data = line.split('\t')
        playlist = tuple(data[1:])
        playlist_list.append(playlist)

In [8]:
playlist_artist_counts = {}
artist_in_playlist_count = defaultdict(int)

for p_id, playlist in tqdm( enumerate(playlist_list), total = len(playlist_list) ):

    artist_count = defaultdict(int)

    if len(playlist) < 2:
        continue
    
    for track in playlist:
        artist = track_artist_id[int(track)]
        artist_count[ artist ] += 1
        artist_in_playlist_count[ artist ] += 1
        
    playlist_artist_counts[p_id] = artist_count

  0%|          | 0/192080 [00:00<?, ?it/s]

100%|██████████| 192080/192080 [00:06<00:00, 31282.93it/s]


In [9]:
from scipy.sparse import lil_matrix, coo_matrix

def normalize_data(data):
    
    # 各プレイリストのアーティスト回数について
    for pid, playlist_artist_ids in data.items():
        max_term = max(playlist_artist_ids.values())
        
        # プレイリストの各アーティストについて
        for artist_id, count in playlist_artist_ids.items():
            
            # tf-idf計算
            tf = count / max_term
            idf = np.log(len(data) / artist_in_playlist_count[artist_id])
            
            # 受け取ったプレイリスト-アーティスト回数をtf-idfに
            playlist_artist_ids[artist_id] = tf * idf
            
    return data

def init_matrix(data):
    n = len(artist_dict)
    m = len(data)
    matrix = lil_matrix((m, n), dtype=np.float32)
    for i, playlist_artist_ids in enumerate(data.values()):
        for artist_id, count in playlist_artist_ids.items():
            matrix[i, artist_id] = count
    return coo_matrix(matrix)

In [10]:
data = normalize_data(playlist_artist_counts)
playlist_artist_matrix = init_matrix(data)

In [11]:
track_count = defaultdict(int)

for p_id, playlist in tqdm( enumerate(playlist_list), total = len(playlist_list) ):
    
    for track in playlist:
        track_count[ track ] += 1

pl_pops = {}
for p_id, playlist in tqdm( enumerate(playlist_list), total = len(playlist_list) ):
    
    sum_popularity = 0
    for track in playlist:
        sum_popularity += math.log(track_count[track])

    pl_pops[p_id] = sum_popularity/len(playlist)

100%|██████████| 192080/192080 [00:04<00:00, 46420.85it/s]
100%|██████████| 192080/192080 [00:04<00:00, 46129.63it/s]


In [12]:
with open('./variances.tsv', mode = 'r') as f:
    lines = f.readlines()

var_data = defaultdict(lambda : dict())
for i, line in enumerate(lines):
    line = line.split('\t')
    if len(line) != 6:
        print(i)
    p_id, pid, pl_var, p_c, sq_var, s_c = line
    if p_id != pid:
        print(p_id)
    
    
    if float(pl_var) > 10e-6:
        var_data[int(p_id)]['coherence'] = 1.0 - float(sq_var) /float(pl_var)
        var_data[int(p_id)]['pl_var'] = float(pl_var)
        var_data[int(p_id)]['sq_var'] = float(sq_var) 

In [14]:
import pickle

with open('./pkl/var_data.pkl', 'wb') as f:
    pickle.dump(var_data, f)

PicklingError: Can't pickle <function <lambda> at 0x7fad6180ed40>: attribute lookup <lambda> on __main__ failed

In [13]:
coherence_table = defaultdict(list)
for p_id, value in var_data.items():
    playlist = playlist_list[p_id]
    coherence_table['id'].append(p_id)
    l = len(playlist)
    coherence_table['length'].append(l)
    coherence_table['log_length'].append(math.log(l))
    coherence_table['popularity'].append(pl_pops[p_id])
    coherence_table['coherence'].append(value['coherence'])

coherence_table = pd.DataFrame.from_dict(coherence_table)

# 並び替え

## setting

In [11]:
estimator = Estimator()
estimator.load_from_data(playlist_list, playlist_artist_matrix, track_artist_id, coherence_table)
bayesian = BayesianService(threshold=0.7)

100%|██████████| 111841/111841 [42:20<00:00, 44.03it/s] 


training transition
finished training transition
training coherence
finished training coherence


In [7]:
import pickle

with open('./pkl/estimator.pkl', 'wb') as f:
    pickle.dump(estimator, f)

with open('./pkl/coherence_table.pkl', 'wb') as f:
    pickle.dump(coherence_table, f)
    
with open('./pkl/track_artist_id.pkl', 'wb') as f:
    pickle.dump(track_artist_id, f)
    
with open('./pkl/playlist_artist_matrix.pkl', 'wb') as f:
    pickle.dump(playlist_artist_matrix, f)
    
with open('./pkl/playlist_list.pkl', 'wb') as f:
    pickle.dump(playlist_list, f)

## 読み込み

In [3]:
with open('./variances.tsv', mode = 'r') as f:
    lines = f.readlines()

var_data = defaultdict(lambda : dict())
for i, line in enumerate(lines):
    line = line.split('\t')
    if len(line) != 6:
        print(i)
    p_id, pid, pl_var, p_c, sq_var, s_c = line
    if p_id != pid:
        print(p_id)
    
    
    if float(pl_var) > 0:
        var_data[int(p_id)]['coherence'] = 1.0 - float(sq_var) /float(pl_var)
        var_data[int(p_id)]['pl_var'] = float(pl_var)
        var_data[int(p_id)]['sq_var'] = float(sq_var) 

In [4]:
import pickle

with open('./pkl/estimator.pkl', 'rb') as f:
    estimator = pickle.load(f)

with open('./pkl/coherence_table.pkl', 'rb') as f:
    coherence_table = pickle.load(f)
    
with open('./pkl/track_artist_id.pkl', 'rb') as f:
    track_artist_id = pickle.load(f)
    
with open('./pkl/playlist_artist_matrix.pkl', 'rb') as f:
    playlist_artist_matrix = pickle.load(f)
    
with open('./pkl/playlist_list.pkl', 'rb') as f:
    playlist_list = pickle.load(f)

## 実行

In [None]:
import time
bayesian = BayesianService(threshold=0.7)

p_id = 0
playlist = playlist_list[p_id]
algo = GreedyArrangeAlgo(p_id, playlist)
algo.load_from_data(playlist, estimator, coherence_table,bayesian, playlist_artist_matrix, track_artist_id)

start = time.time()
rearranged_p = algo.train(True)
print(time.time()-start)


  0: dist=  0, coh_error=0.0007
  1: dist=  1, coh_error=0.0000, trans_error=-0.0000
  2: dist=  1, coh_error=0.0000, trans_error=-0.0000
  3: dist=  1, coh_error=0.0000, trans_error=-0.0000
  4: dist=  1, coh_error=0.0000, trans_error=-0.0000
  5: dist=  1, coh_error=0.0000, trans_error=-0.0000
  6: dist=  1, coh_error=0.0000, trans_error= 0.0000
  7: dist=  1, coh_error=0.0000, trans_error= 0.0000
  8: dist=  1, coh_error=0.0000, trans_error= 0.0000
  9: dist=  1, coh_error=0.0000, trans_error= 0.0000
 10: dist=  1, coh_error=0.0000, trans_error= 0.0000
 11: dist=  1, coh_error=0.0000, trans_error= 0.0000
 12: dist=  2, coh_error=0.0000, trans_error= 0.0000
 13: dist= 19, coh_error=0.0000, trans_error= 0.0000
 14: dist=  1, coh_error=0.0000, trans_error= 0.0000
 15: dist=  2, coh_error=0.0000, trans_error= 0.0000
 16: dist=  5, coh_error=0.0000, trans_error= 0.0000
 17: dist=  2, coh_error=0.0000, trans_error= 0.0000
 18: dist= 11, coh_error=0.0000, trans_error= 0.0000
 19: dist= 26,

In [None]:
print(playlist)
print([track.tracks for track in rearranged_p])

('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66\n')
[[0], [1], [7], [3], [4], [5], [6], [2], [8], [10], [9], [11], [14], [12], [13], [15], [16], [36], [19], [21], [20], [18], [22], [23], [24], [25], [26], [27], [39], [29], [30], [57], [33], [32], [34], [35], [17], [37], [38], [28], [40], [41], [58], [63], [44], [45], [46], [47], [49], [48], [50], [51], [52], [53], [54], [55], [42], [31], [56], [59], [61], [60], [43], [62], [64], [65], [66]]


In [10]:
rearranged = []
bayesian = BayesianService(threshold=0.7)

for p_id in tqdm(data.keys(), total=len(data)):
    playlist = playlist_list[p_id]
    if len(playlist) > 40:
        continue
    algo = GreedyArrangeAlgo(p_id, playlist)
    algo.load_from_data(playlist, estimator, coherence_table,bayesian, playlist_artist_matrix, track_artist_id)

    rearranged.append(algo.train())

  0%|          | 1/111841 [00:09<281:36:24,  9.06s/it]


KeyboardInterrupt: 

In [11]:
p_len = []
for p_id in tqdm(data.keys(), total=len(data)):
    p_len.append(len(playlist_list[p_id]))

print(np.percentile(p_len,[0,5,10,25,50,75,90,95,100]))

100%|██████████| 111841/111841 [00:00<00:00, 2836622.83it/s]

[  2.   4.   6.  13.  25.  55. 101. 136. 205.]





## 少し修正して実行

In [9]:
rearranged = []
outlier = []

bayesian = BayesianService(threshold=0.7)

for p_id, data in tqdm(var_data.items(), total=len(var_data)):

    playlist = playlist_list[p_id]
    pl_variance = data['pl_var']
    if len(playlist) > 50 or len(playlist) < 3:
        continue
    algo = GreedyArrangeAlgo(p_id, playlist)
    algo.load_from_data(playlist, estimator, coherence_table,bayesian, playlist_artist_matrix, track_artist_id)

    # try:
    if algo.pl_variance > 10e-6:
        rearranged.append( (p_id, algo.train()) )
    else:
        outlier.append(p_id)
    # except Exception as e:
    #     print(e)
    #     print(algo.pl_variance)
    #     break

  4%|▎         | 4001/111841 [08:36<3:52:09,  7.74it/s] 


KeyboardInterrupt: 

In [5]:
from concurrent.futures import ProcessPoolExecutor

bayesian = BayesianService(threshold=0.7)

def parallel_fuc(p_id, playlist):
    if len(playlist) > 50 or len(playlist) < 3:
        return (p_id, [])
    
    algo = GreedyArrangeAlgo(p_id, playlist)
    algo.load_from_data(playlist, estimator, coherence_table,bayesian, playlist_artist_matrix, track_artist_id)
    
    if algo.pl_variance > 10e-6:
        return (p_id, algo.train())
    else:
        return (p_id, [])

rearranged = []
outlier = []

ids = []
playlists = []

for p_id, data in tqdm(var_data.items(), total=len(var_data)):
    playlist = playlist_list[p_id]
    if len(playlist) > 50 or len(playlist) < 3:
        continue
    ids.append(p_id)
    playlists.append(playlist)

with ProcessPoolExecutor() as executor:
    results = list( tqdm( executor.map( parallel_fuc, ids, playlists ), total = len(ids) ) )

100%|██████████| 111841/111841 [00:00<00:00, 2427111.67it/s]
100%|██████████| 78575/78575 [41:09<00:00, 31.82it/s]  


In [8]:
print(list(var_data.items())[102123-1])
print(len(playlist_list[176212]))

(176212, {'coherence': 0.1212876820832488, 'pl_var': 0.34021938, 'sq_var': 0.29895496})
27


In [9]:
len(results)

78575

In [10]:
with open('./rearranged.tsv',mode = 'w') as f:
    for p_id, playlist in results:

        f.write(str(p_id))

        for t_id in playlist:
            f.write('\t'+str(t_id))
        f.write('\n')

# srec用に変形

## settint

In [11]:
import json

In [12]:
durations = []

with open( '/home/tamak/master_exp/ThirtyMusic/relations/sessions.idomaar', mode = 'r' ) as f:
    lines = f.readlines()

    for line in lines:
        event_type, session_id, timestamp, info, data = line.split()

        data = json.loads(data)
        obs = data['objects']

        for ob in obs:
            durations.append(ob['playtime'])

In [13]:
durations = [ duration for duration in durations if duration > 0 ]

In [14]:
# #nowplaying : 2011/7/11~
min_time = 1310256000	
max_time = 1399766400

## 整形

In [15]:
from random import randint
from random import choice
import random

mdfy_list = []
random.seed(2025)
session_id = -1

with open('./rearranged.tsv', mode = 'r') as f:

    for line in f.readlines():
        time = randint(min_time,max_time)
        data = line.split('\t')
        p_id = data[0]
        session_id += 1
        session = data[2:]

        tmp = []
        if len(session) == 1:
            continue
        for i, track_id in enumerate(session):
            tmp.append( [ str(session_id), str(time), str(track_id.replace('\t','')) ] )

            deltatime = choice(durations)
            time += deltatime

        if time > max_time:
            diff = time-max_time
            for d in tmp:
                d[1] = str(int(d[1]) - diff)

        mdfy_list += tmp


with open('./experiment_train_full.txt', mode = 'w') as f:

    f.write('SessionId\tItemId\tTime\n')

    for data in mdfy_list:
        session_id, timestamp, track_id = data
        f.write(session_id.rstrip('\t')+'\t'+track_id.rstrip('\n')+'\t'+str(timestamp)+'\n')