In [104]:
import requests as req
import lxml.html
import json
import re, os
import pickle
import codecs

In [75]:
def tags_filter(tags, artist, track):
    filtered = []
    #garbage = ['indie','alternative', 'audio','happy','love', 'emotional', 'music', 'beautiful', 'russian', 'american', 'ukrainian', 'canadian', 'german', 'hawaiian', 'australian', 'classic', 'classical']+[str(i) for i in range(10)]
    garbage = [str(i) for i in range(10)]
    for tag in tags:
        check = True
        for g in garbage:
            if (g in tag) or (artist in tag) or (track in tag):
                check = False
                break
        if check:
            filtered.append(tag)
    return filtered

In [136]:
def get_song_tags(artist = "", track = ""):
    tmp_artist = artist.lower().replace(' ', '+')
    tmp_track = track.lower().replace(' ', '+')
    url = f'https://www.last.fm/music/{tmp_artist}/_/{tmp_track}'
    html_from_last_fm = req.get(url).text
    html = lxml.html.fromstring(html_from_last_fm)
    track_dict = {'artist': artist, 'track': track, 'tags':[]}
    for li in html.xpath("//ul[@class='tags-list tags-list--global']/li"):
        track_dict['tags'].append(str(li.text_content().lower().replace('-', ' ')))
    track_dict['tags'] = tags_filter(track_dict['tags'], tmp_artist, tmp_track)
    print('{artist}-{track} tags was parsed: {tags}'.format(artist=artist, track=track, tags=track_dict['tags']))
    return track_dict

In [77]:
def get_track_list(path):
    for file in os.listdir(path):
        track_info = re.split('[-]', file[:-4])
        if track_info[0][-1] == ' ':
            track_info[0] = track_info[0][:-1]
        if track_info[1][0] == ' ':
            track_info[1] = track_info[1][1:]
        yield track_info[0], track_info[1], file

In [78]:
def get_json(path):
    info = []
    for artist, track, file_name in get_track_list(path):
        meta = get_song_tags(artist= artist, track= track)
        meta['file_name'] = file_name
        if meta['tags']:
            info.append(meta)
    return info

In [154]:
def dump(info, path, user_id):
    with open(path+f'{user_id}_info.json', 'w') as fp:
        json.dump(info, fp, sort_keys=True, indent=4)

In [80]:
def load_ganres(path):
    for filename in os.listdir(os.path.abspath(path)):
        sub_ganres = []
        with open(os.path.abspath(path)+'\\'+filename, 'r') as f:
            yield filename[:-4], [tag.lower().replace('-', ' ') for tag in f.read().split('\n')]

In [81]:
def dump_ganres_co_occ(co_occ_path, co_occ):
    with open(co_occ_path+'ganresCoOcc.pkl', 'wb') as f:
        pickle.dump(co_occ, f, pickle.HIGHEST_PROTOCOL)    

In [90]:
def init_ganres_co_occ(path, co_occ_path):
    co_occ = {}
    for ganre, sub_ganres in load_ganres(path):
        #co_occ[ganre] = {}
        for sg in sub_ganres:
            #co_occ[ganre][sg] = 0
            co_occ[sg] = 0
    dump_ganres_co_occ(co_occ_path, co_occ)
    return co_occ

In [83]:
def who_exist(co_occ_el, tags):
    first = co_occ_el.keys()
    result = []
    for t in tags:
        if t in first:
            result.append(t)
    return result

In [84]:
def update_ganres_co_occ(co_occ, dataset_json):
    for song in dataset_json:
        tags = song['tags']
        #for key in co_occ:
        #xisted = who_exist(co_occ[key], tags)
        existed = who_exist(co_occ, tags)
        if len(existed):
            for tag in tags:
                if tag in existed:
                    #co_occ[key][tag]+=1
                    co_occ[tag]+=1
                else:
                    #co_occ[key][tag] = 1
                    co_occ[tag] = 1
    return co_occ

In [173]:
def init_co_occ_matrix(path_with_ganres='..\\ganres_full\\', path_to_save='..\\'):
    co_occ = init_ganres_co_occ(path=path_with_ganres, co_occ_path=path_to_save)
    print(co_occ)
    return co_occ

In [174]:
def update_co_occ_matrix(co_occ, path_with_tracks='..\\tracks\\', path_to_save_info='..\\jsons\\', path_to_save_co_occ='..\\'):
    for folder in os.listdir(path_with_tracks):
        info = get_json('{path}{folder}\\'.format(folder=folder, path=path_with_tracks))
        print(folder, info)
        dump(info, path_to_save_info, folder)
        co_occ = update_ganres_co_occ(co_occ, info)
        dump_ganres_co_occ(path_to_save_co_occ, co_occ)
        dump(co_occ,path_to_save_co_occ, 'co_occ')
    return co_occ
#TODO После генерации фитч добавить в json поле 'features' чтобы можно было обращаться на прямую

IndentationError: expected an indented block (<ipython-input-174-4b11df134dba>, line 2)

In [171]:
import operator
sorted_x = sorted(co_occ.items(), key=operator.itemgetter(1))
sorted_x.reverse()
print(sorted_x)

[('rock', 174), ('alternative', 87), ('alternative rock', 77), ('hard rock', 47), ('classic rock', 45), ('indie', 36), ('punk rock', 24), ('pop', 23), ('electronic', 22), ('metal', 20), ('indie rock', 20), ('punk', 17), ('female vocalists', 13), ('progressive rock', 12), ('hip hop', 12), ('british', 11), ('singer songwriter', 11), ('soundtrack', 11), ('heavy metal', 11), ('dance', 11), ('power metal', 10), ('house', 9), ('oldies', 9), ('pop punk', 9), ('green day', 8), ('speed metal', 8), ('garage rock', 8), ('indie pop', 8), ('cover', 7), ('post grunge', 7), ('folk', 7), ('rap', 7), ('country', 7), ('edm', 6), ('my chemical romance', 6), ('instrumental', 6), ('extreme power metal', 6), ('nu metal', 6), ('gothic metal', 6), ('alternative metal', 6), ('trap', 6), ('ballad', 5), ('beautiful', 5), ('fip', 5), ('soul', 5), ('emo', 5), ('glam rock', 5), ('russian', 5), ('gothic', 4), ('piano', 4), ('electro', 4), ('swedish', 4), ('acoustic', 4), ('canadian', 4), ('german', 4), ('awesome', 4