# Track matching: KKBOX ↔ Spotify

In [1]:
import pandas as pd

# Read the KKBOX and Spotify artist/track CSV exports (read in the order listed).
k_artist, k_tracks, s_artist, s_tracks = map(
    pd.read_csv,
    [
        'kkbox_data/artists.csv',
        'kkbox_data/rank_tracks.csv',
        'spotify_data/spotify_artists.csv',
        'spotify_data/spotify_tracks.csv',
    ],
)

In [12]:
from opencc import OpenCC
# OpenCC converter built from the 's2t' config (Simplified -> Traditional Chinese).
cc = OpenCC('s2t')

# Normalise KKBOX artist and track names to Traditional Chinese.
# `na_action='ignore'` leaves missing values as NaN instead of handing a float
# to `cc.convert`, which only accepts strings.
k_artist['artist'] = k_artist['artist'].map(cc.convert, na_action='ignore')
k_tracks['name'] = k_tracks['name'].map(cc.convert, na_action='ignore')

In [15]:
from difflib import SequenceMatcher

# similarity score between two titles
def calculate_similarity(title1, title2):
    """Return the difflib ``SequenceMatcher`` ratio of the two titles.

    The score is a float in [0, 1]; 1.0 means the sequences are identical.
    No junk heuristic function is supplied to the matcher.
    """
    matcher = SequenceMatcher(a=title1, b=title2)
    return matcher.ratio()

# find similar titles
def find_similar_titles_with_ids(titles1, titles2, ids1, ids2, threshold=0.85):
    """Pair up titles from two catalogues whose similarity reaches ``threshold``.

    Only titles that share the same first character are compared (a cheap
    blocking step), so two titles that differ in their first character are
    never paired, however similar the rest of the text is.

    Parameters
    ----------
    titles1, titles2 : sequence
        Titles to compare. Entries that are not non-empty strings (for example
        NaN produced by a missing CSV cell, or ``""``) are skipped instead of
        raising on ``title[0]``.
    ids1, ids2 : sequence
        Identifiers aligned by position with ``titles1`` / ``titles2``.
    threshold : float, default 0.85
        Minimum ``calculate_similarity`` score (inclusive) for a pair to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per matching pair with columns ``kkbox_track``, ``kkbox_id``,
        ``spotify_track``, ``spotify_id`` and ``similarity``, ordered by
        position in ``titles1`` and then by position in ``titles2``. The
        columns are present even when no pair matched.
    """
    columns = ["kkbox_track", "kkbox_id", "spotify_track", "spotify_id", "similarity"]

    # Group the second catalogue by first character once, rather than
    # re-scanning all of titles2 for every title in titles1.
    candidates_by_initial = {}
    for idx2, title2 in enumerate(titles2):
        if isinstance(title2, str) and title2:
            candidates_by_initial.setdefault(title2[0], []).append((idx2, title2))

    similar_pairs = []
    for idx1, title1 in enumerate(titles1):
        # Missing or empty titles have no first character to block on.
        if not (isinstance(title1, str) and title1):
            continue
        for idx2, title2 in candidates_by_initial.get(title1[0], ()):
            similarity = calculate_similarity(title1, title2)
            if similarity >= threshold:
                similar_pairs.append({
                    "kkbox_track": title1,
                    "kkbox_id": ids1[idx1],
                    "spotify_track": title2,
                    "spotify_id": ids2[idx2],
                    "similarity": similarity
                })
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(similar_pairs, columns=columns)

# Re-read both track catalogues from disk for the matching run.
# NOTE(review): these are the raw CSV values, so the OpenCC-converted
# `k_tracks['name']` built earlier in the notebook is not what gets matched —
# confirm that is intended.
kkbox_tracks_df = pd.read_csv('kkbox_data/rank_tracks.csv')
spotify_tracks_df = pd.read_csv('spotify_data/spotify_tracks.csv')

# Titles and ids as parallel plain lists: position i of a title list
# corresponds to position i of the matching id list.
kkbox_id = kkbox_tracks_df['id'].to_list()
kkbox_track = kkbox_tracks_df['name'].to_list()
spotify_id = spotify_tracks_df['track_id'].to_list()
spotify_track = spotify_tracks_df['track_name'].to_list()

# Fuzzy-match the titles (default threshold) and keep the pairs in a DataFrame.
similar_titles_df = find_similar_titles_with_ids(
    titles1=kkbox_track,
    titles2=spotify_track,
    ids1=kkbox_id,
    ids2=spotify_id,
)

# Persist the match table without the positional index.
similar_titles_df.to_csv('merge_table.csv', index=False)

In [16]:
# Reload the title-match table from 'merge_table.csv'.
df = pd.read_csv('merge_table.csv')

In [17]:
# Keep only the pairs whose similarity is not exactly 1, i.e. the fuzzy
# matches, and preview the first rows.
check = df.loc[df['similarity'].ne(1)]
check.head()

Unnamed: 0,kkbox_track,kkbox_id,spotify_track,spotify_id,similarity
26,Slow Down,1_QMMRnE_TKl7TLNvU,Slow It Down,6WO7IDGLakjO38lsvI2gHB,0.857143
27,Slow Down,1_QMMRnE_TKl7TLNvU,Slow It Down,51eSHglvG1RJXtL3qI5trr,0.857143
54,在這座城市遺失了你 - 戲劇《他們創業的那些鳥事》插曲,Knb5vH9cKmKv6gIqv0,在這座城市遺失了你 (戲劇《他們創業的那些鳥事》插曲),1Ytgo9ipdlTsf6wlg6sXf3,0.925926
72,LOST,9_IhiWtuL75jKf_f3P,LOST!,02H58MSfVESkKyx4diDgu7,0.888889
75,唯一 - 三立/台視戲劇《戀愛是科學》插曲,GoVSwfbAV7_OcfN2DW,唯一 (三立/臺視戲劇《戀愛是科學》插曲),19fp9nI0tq0lcBl7XoCHAb,0.857143


In [18]:
# Write the `check` frame to 'check_list.csv' without the index column.
check.to_csv('check_list.csv', index=False)

In [26]:
# Load the 2021 KKBOX data file and preview its first rows.
# NOTE(review): the preview shows rank/date/category columns, so this is
# presumably a dated chart listing — confirm against the data source.
kkbox_data_2021 = pd.read_csv('kkbox_data/kkbox_data_2021.csv')

kkbox_data_2021.head()

Unnamed: 0,rank,artist,artist_id,song_id,album_id,date,category
0,1,五月天 (Mayday),9XN-7yg5vg3gYnCdsM,5_DAFKxzMf04hZqXk4,0le7fKde-BNMwoAst3,2021-01-01,mandarin
1,2,盧廣仲 (Crowd Lu),8qCONzjl89Yak9KFxZ,L_Aag3up_NyBpahTJG,4rzv06iAFoOyareMC7,2021-01-01,mandarin
2,3,任然,#,#,#,2021-01-01,mandarin
3,4,艾薇,SkYfuVHgT10-nfv3Tz,XXONe3fIN4kbrmzJqh,XXDPJ346EjhjDZXb-W,2021-01-01,mandarin
4,5,aMEI (張惠妹),Ok6kf9NM0d0grGYNbl,P_c_y1B6adNriKJkm6,Okl8SAlaH9M2s97GAv,2021-01-01,mandarin


In [23]:
# Small working sample: the first five rows of the KKBOX track file.
example = pd.read_csv('kkbox_data/rank_tracks.csv').head()
example

Unnamed: 0.1,Unnamed: 0,id,name,duration,track_number,album_id
0,0,5_DAFKxzMf04hZqXk4,因為你 所以我,281808.0,1,0le7fKde-BNMwoAst3
1,1,L_Aag3up_NyBpahTJG,刻在我心底的名字 - 電影〈刻在你心底的名字〉主題曲,320182.0,1,4rzv06iAFoOyareMC7
2,2,XXONe3fIN4kbrmzJqh,失重前幸福,222484.0,1,XXDPJ346EjhjDZXb-W
3,3,P_c_y1B6adNriKJkm6,緩緩,227343.0,1,Okl8SAlaH9M2s97GAv
4,4,4kDxlI5PWkaL8-hS8a,如果能幸福 - HBO Asia原創影集《戒指流浪記》片尾曲,259604.0,1,D_QwkPGasaFcNVcZCz


In [27]:
# Reduce the 2021 data to an (artist, id) lookup, renaming `song_id` to `id`
# so it lines up with the track table's key column.
# Built as one chained expression rather than with `inplace=True`, so no
# existing frame object is mutated. `rename` ignores a missing `song_id`,
# which keeps the cell safe to re-run after the name has been rebound.
kkbox_data_2021 = kkbox_data_2021.rename(columns={'song_id': 'id'})[['artist', 'id']]

In [29]:
# Attach the artist to each sample track via an inner join on `id`, then drop
# the fully identical rows that the join produces.
joined = pd.merge(example, kkbox_data_2021, on='id', how='inner')
e_df = joined.drop_duplicates()
e_df

Unnamed: 0.1,Unnamed: 0,id,name,duration,track_number,album_id,artist
0,0,5_DAFKxzMf04hZqXk4,因為你 所以我,281808.0,1,0le7fKde-BNMwoAst3,五月天 (Mayday)
335,1,L_Aag3up_NyBpahTJG,刻在我心底的名字 - 電影〈刻在你心底的名字〉主題曲,320182.0,1,4rzv06iAFoOyareMC7,盧廣仲 (Crowd Lu)
700,2,XXONe3fIN4kbrmzJqh,失重前幸福,222484.0,1,XXDPJ346EjhjDZXb-W,艾薇
1065,3,P_c_y1B6adNriKJkm6,緩緩,227343.0,1,Okl8SAlaH9M2s97GAv,aMEI (張惠妹)
1143,4,4kDxlI5PWkaL8-hS8a,如果能幸福 - HBO Asia原創影集《戒指流浪記》片尾曲,259604.0,1,D_QwkPGasaFcNVcZCz,周興哲 (Eric Chou)


In [31]:
# Write the joined sample to 'example.csv' without the index column.
e_df.to_csv('example.csv', index=False)

In [2]:
# Display the KKBOX track table (the notebook truncates long frames in its output).
k_tracks

Unnamed: 0,id,name,duration,track_number,album_id
0,5_DAFKxzMf04hZqXk4,因為你 所以我,281808.0,1,0le7fKde-BNMwoAst3
1,L_Aag3up_NyBpahTJG,刻在我心底的名字 - 電影〈刻在你心底的名字〉主題曲,320182.0,1,4rzv06iAFoOyareMC7
2,XXONe3fIN4kbrmzJqh,失重前幸福,222484.0,1,XXDPJ346EjhjDZXb-W
3,P_c_y1B6adNriKJkm6,緩緩,227343.0,1,Okl8SAlaH9M2s97GAv
4,4kDxlI5PWkaL8-hS8a,如果能幸福 - HBO Asia原創影集《戒指流浪記》片尾曲,259604.0,1,D_QwkPGasaFcNVcZCz
...,...,...,...,...,...
5537,CsY5pAz93ZiY2n9HTH,約定 - Album Version,264243.0,5,CrUljZ_ZyWtohnacfc
5538,0t_tJx0AKUfrDJftPN,我快樂嗎,225465.0,1,1aR6eY-SgvBgNbCJwZ
5539,9X-ALhgy3wYrA9t8RD,Bye Bye Bye,200202.0,1,5a_KHYwYczZYRwdhwl
5540,4lYQUE0a9Z5EZc9Exq,Stray Kids,189074.0,7,H_6b718H2kp-_nQn4A
