In [2]:
%pip install rapidfuzz

Note: you may need to restart the kernel to use updated packages.


In [3]:
import json 
from urllib.parse import unquote
from collections import defaultdict
from rapidfuzz.process import cdist
import pandas as pd
import numpy as np

# アーティストの類似度を確認

## 30MUの同一期間アーティスト及び楽曲とのペアのデータを取り出し

In [4]:
track_dict = defaultdict(tuple)

with open( '/home/tamak/master_exp/ThirtyMusic/entities/tracks.idomaar', mode = 'r' ) as f:
    lines = f.readlines()

    for line in lines:
        try:
           entitiy_type, track_id, _, info, extra_info = line.split('\t')
        except:
            print(line)
            for l in line.split():
                print(l)
            break

        
        name = json.loads(info)['name']
        artistname, trackname = name.split('/_/')

        artistname = unquote(artistname.replace('+',' '))
        trackname = unquote(trackname.replace('+',' '))

        if track_id not in track_dict:
            track_dict[ int(track_id) ] = (artistname, trackname)


In [5]:
ThMU_artist = set()
ThMU_dict = defaultdict(list)

with open('30M_dataset/same_duration_data.tsv', mode = 'r') as f:
    lines  = f.readlines()

for line in lines[1:]:
    tmp = []
    data = line.split('\t')

    user_id = data[0]
    session = data[1:]

    tmp.append(user_id)

    for track_id in session:
        artistname, trackname = track_dict[int(track_id)]

    ThMU_artist.add(artistname)
    ThMU_dict[artistname].append(trackname)

## SPDのアーティスト及び楽曲とのペアのデータを取り出し

In [6]:
SPD_artist = set()
SPD_dict = defaultdict(list)

with open('./spotify_playlists_dataset/SPD_modified.tsv', mode = 'r') as f:

    lines = f.readlines()

for line in lines[1:]:
    data = line.split('\t,\t')

    if data[1] != '' and data[2] != '':

        artistname, trackname = (data[1],data[2])

        SPD_artist.add( artistname )
        SPD_dict[artistname].append(trackname)

## 類似度チェック

In [7]:
ThMU_table = np.array(list(ThMU_artist), dtype=str)
SPD_table = np.array(list(SPD_artist), dtype=str)

In [8]:
SPD_table

array(['Rosetta Tharpe', 'Daz Dillinger ft. Bad Azz', 'Betopey', ...,
       'Thomas Andersson', 'Dixie Dee And The Rhythm Rockers', 'Barobax'],
      shape=(289815,), dtype='<U255')

In [9]:
similarity = cdist(ThMU_table[0:1000], SPD_table,workers=-1)

In [10]:
similarity_table = pd.DataFrame({
    '30MU':ThMU_table[0:1000],
    'SPD':SPD_table[similarity.argmax(axis=1)],
    'ratio':(similarity.max(axis=1) / 100).astype(float)
})

similarity_table[ (similarity_table['ratio'] < 1) & (similarity_table['ratio'] > 0.8 )].sort_values('ratio')

Unnamed: 0,30MU,SPD,ratio
21,Ojalá,Oxalá,0.800000
82,4T1A (Quart Primera),4t1a (quart primera),0.800000
117,C.A.R.N.E.,C.A.B.L.E.,0.800000
363,Arnioe,Arno,0.800000
308,Dr. R. Thiagarajan,Prof. Thiagarajan,0.800000
...,...,...,...
597,Tyler Ward Feat. Heather Janssen,Tyler Ward feat. Heather Janssen,0.968750
761,Clare Bowditch and The Feeding Set,Clare Bowditch And The Feeding Set,0.970588
302,Freemasons feat. Sophie Ellis-Bextor,Freemasons ft. Sophie Ellis-Bextor,0.971429
688,Stanley Brinks and the Wave Pictures,Stanley Brinks and The Wave Pictures,0.972222


In [11]:
similarity_table = pd.DataFrame(columns=['30MU', 'SPD', 'ratio'])
similarity_table

Unnamed: 0,30MU,SPD,ratio


In [12]:
i = 0
dlt = 1000

while(i<len(ThMU_table)):

    if (i+dlt)<len(ThMU_table):

        similarity = cdist(ThMU_table[i:i+dlt], SPD_table,workers=-1)

        try:
            tmp_table = pd.DataFrame({
            '30MU':ThMU_table[ i:i+dlt ],
            'SPD':SPD_table[ similarity.argmax(axis=1) ],
            'ratio':(similarity.max(axis=1) / 100).astype(float)
            })
        except:
            print(i)
            print(len(similarity))
            print(len(SPD_table[ similarity.argmax(axis=1) ]))

    else:
        similarity = cdist(ThMU_table[i:], SPD_table,workers=-1)

        tmp_table = pd.DataFrame({
        '30MU':ThMU_table[ i: ],
        'SPD':SPD_table[ similarity.argmax(axis=1) ],
        'ratio':(similarity.max(axis=1) / 100).astype(float)
        })

    similarity_table = pd.concat([similarity_table, tmp_table])
    i+=dlt

  similarity_table = pd.concat([similarity_table, tmp_table])


In [14]:
similarity_table

Unnamed: 0,30MU,SPD,ratio
0,Bosco Clowes,Bosco Rogers,0.750000
1,A punk,Apink,0.727273
2,Хорсс,Вирус,0.400000
3,The Eternals,The Eternals,1.000000
4,Cryptacize,Cryptacize,1.000000
...,...,...,...
801,Henry Rollins,Henry Rollins,1.000000
802,The Ocular Audio Experiment,The Felix Experiment,0.723404
803,Inside Llewyn Davis,Janet Lee Davis,0.647059
804,Cr7z,Cyrz,0.750000


In [15]:
similarity_table.to_csv('artist_pairs.csv', sep = '\t')