In [1]:
import requests as req
import lxml.html
import json
import re, os
import pickle

In [2]:
def get_song_tags(artist = "", track = ""):
    """Fetch the Last.fm tags of a track by scraping its public page.

    Parameters
    ----------
    artist : str
        Artist name; stored unchanged in the result.
    track : str
        Track title; stored unchanged in the result.

    Returns
    -------
    dict
        ``{'artist': artist, 'track': track, 'tags': [...]}``. Each tag is the
        lower-cased text of one item of the page's global tag list with '-'
        replaced by ' '. The list is empty when the page has no such items.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails or does not answer within the timeout.
    """
    from urllib.parse import quote_plus

    # quote_plus still maps spaces to '+', and additionally escapes characters
    # such as '?', '#', '/' or '&' that would otherwise corrupt the URL path.
    tmp_artist = quote_plus(artist.lower())
    tmp_track = quote_plus(track.lower())
    url = f'https://www.last.fm/music/{tmp_artist}/_/{tmp_track}'
    # Without an explicit timeout a stalled connection blocks the whole run.
    html_from_last_fm = req.get(url, timeout=10).text
    html = lxml.html.fromstring(html_from_last_fm)
    tag_items = html.xpath("//ul[@class='tags-list tags-list--global']/li")
    # str() converts lxml's string subclass into a plain str.
    tags = [str(li.text_content().lower().replace('-', ' ')) for li in tag_items]
    return {'artist': artist, 'track': track, 'tags': tags}

In [3]:
def get_track_list(path):
    """Yield ``(artist, track, file_name)`` for the entries of a directory.

    File names are expected to look like ``'<artist> - <track>.<ext>'``.
    Entries that do not contain both an artist and a track part are skipped
    instead of raising ``IndexError``.

    Parameters
    ----------
    path : str
        Directory whose entries are listed with ``os.listdir``.

    Yields
    ------
    tuple of (str, str, str)
        Artist, track title (both stripped of surrounding whitespace) and the
        original file name.
    """
    for file in os.listdir(path):
        # splitext copes with extensions of any length (the old [:-4] did not).
        stem = os.path.splitext(file)[0]
        # Prefer the spaced delimiter so hyphens inside a name survive;
        # fall back to the first bare '-' for names like 'Artist-Track'.
        if ' - ' in stem:
            artist, track = stem.split(' - ', 1)
        else:
            artist, _, track = stem.partition('-')
        artist, track = artist.strip(), track.strip()
        if not artist or not track:
            # Entry does not follow the naming scheme.
            continue
        yield artist, track, file

In [4]:
def get_json(path):
    """Collect tag metadata for every track file found in ``path``.

    Each track yielded by ``get_track_list`` is looked up with
    ``get_song_tags``; the resulting dict gets a ``'file_name'`` entry.
    Tracks whose tag list is empty are left out of the result.

    Returns
    -------
    list of dict
        One metadata dict per track that has at least one tag.
    """
    tagged = []
    for artist, track, file_name in get_track_list(path):
        meta = get_song_tags(artist=artist, track=track)
        meta['file_name'] = file_name
        # Tracks without tags are dropped here.
        if not meta['tags']:
            continue
        tagged.append(meta)
    return tagged

In [5]:
def dump(info, path, user_id):
    """Write ``info`` as indented, key-sorted JSON.

    The target file name is ``path`` directly concatenated with
    ``'<user_id>_info.json'``, so ``path`` must already end with a path
    separator when it names a directory.
    """
    target = path + f'{user_id}_info.json'
    json_options = {'sort_keys': True, 'indent': 4}
    with open(target, 'w') as out_file:
        json.dump(info, out_file, **json_options)

In [6]:
def load_ganres(path):
    """Yield ``(genre, sub_genres)`` for every file in a directory.

    The genre name is the file name without its extension. Each non-blank
    line of the file becomes one sub-genre: stripped, lower-cased and with
    '-' replaced by ' '.

    Parameters
    ----------
    path : str
        Directory holding one text file per genre.

    Yields
    ------
    tuple of (str, list of str)
        Genre name and its normalised sub-genre tags.
    """
    directory = os.path.abspath(path)
    for filename in os.listdir(directory):
        # os.path.join picks the right separator on every platform.
        with open(os.path.join(directory, filename), 'r') as f:
            # Blank lines (e.g. from a trailing newline) would otherwise
            # become an empty '' tag.
            sub_ganres = [
                line.strip().lower().replace('-', ' ')
                for line in f
                if line.strip()
            ]
        # Yield only after the file is closed so no handle is held while the
        # generator is suspended.
        yield os.path.splitext(filename)[0], sub_ganres

In [7]:
def init_ganres_co_occ(path, co_occ_path):
    """Build a zero-initialised genre table and pickle it.

    The table maps every genre yielded by ``load_ganres(path)`` to a dict
    that maps each of its sub-genres to 0. It is pickled to
    ``co_occ_path + 'ganresCoOcc.pkl'`` and returned.
    """
    co_occ = {
        ganre: {sg: 0 for sg in sub_ganres}
        for ganre, sub_ganres in load_ganres(path)
    }
    pickle_file = co_occ_path + 'ganresCoOcc.pkl'
    with open(pickle_file, 'wb') as out:
        pickle.dump(co_occ, out, pickle.HIGHEST_PROTOCOL)
    return co_occ

In [8]:
def who_exist(co_occ_el, tags):
    """Return the tags that are keys of ``co_occ_el``.

    Order and repetitions of ``tags`` are kept in the result.
    """
    known_keys = co_occ_el.keys()
    return [tag for tag in tags if tag in known_keys]

In [9]:
def update_ganres_co_occ(co_occ, dataset_json):
    """Accumulate tag counts from songs into the genre table.

    For each song and each genre whose table already contains at least one of
    the song's tags, every tag of the song is processed: tags that were
    already in the table have their counter incremented, all others are
    inserted with a count of 1. ``co_occ`` is modified in place and returned.
    """
    for song in dataset_json:
        song_tags = song['tags']
        for counters in co_occ.values():
            known = who_exist(counters, song_tags)
            if not known:
                # Song shares no tag with this genre.
                continue
            # NOTE(review): `known` is a snapshot taken before this loop, so a
            # tag repeated within one song is incremented twice when it was
            # already present but stays at 1 when it is new - confirm intended.
            for tag in song_tags:
                if tag not in known:
                    counters[tag] = 1
                else:
                    counters[tag] += 1
    return co_occ

In [10]:
# Build the zeroed genre table, then collect tag metadata for every track file.
co_occ = init_ganres_co_occ('..\\ganres\\', '..\\')
info = get_json(path='..\\tracks\\')
print(info)
dump(info, path='..\\jsons\\', user_id=1)
# TODO: once features are generated, add a 'features' field to the json so it can be accessed directly

[{'artist': '2Pac', 'track': 'Dear Mama', 'tags': ['rap', 'hip hop', '2pac', 'gangsta rap', 'hip hop'], 'file_name': '2Pac - Dear Mama.mp3'}, {'artist': 'AC_DC', 'track': 'Hells Bells', 'tags': ['hard rock', 'rock', 'classic rock', 'heavy metal'], 'file_name': 'AC_DC - Hells Bells.mp3'}, {'artist': 'DJ Snake', 'track': 'Magenta Riddim', 'tags': ['party', '2018', 'moombahton', '2018 single'], 'file_name': 'DJ Snake - Magenta Riddim.mp3'}, {'artist': 'Elthon John', 'track': 'Sacrifice', 'tags': ['pop', '80s', 'elton john', 'classic rock', 'british'], 'file_name': 'Elthon John - Sacrifice.mp3'}, {'artist': 'Eminem', 'track': 'Framed', 'tags': ['american', '2018 single', 'return to origins'], 'file_name': 'Eminem - Framed.mp3'}, {'artist': 'Godsmack', 'track': 'I Stand Alone', 'tags': ['hard rock', 'metal', 'nu metal', 'rock', 'godsmack'], 'file_name': 'Godsmack - I Stand Alone.mp3'}, {'artist': 'Mozart', 'track': 'Requiem in D minor Complete Full', 'tags': ['classical', 'instrumental', 'c

In [11]:
# Fold the collected tags into the genre table and write it as JSON next to the pickle.
updated_co_occ = update_ganres_co_occ(co_occ=co_occ, dataset_json=info)
dump(info=updated_co_occ, path='..\\', user_id='co_occ')