In [104]:
import requests as req
import lxml.html
import json
import re, os
import pickle
import codecs

In [75]:
def tags_filter(tags, artist, track):
    filtered = []
    #garbage = ['indie','alternative', 'audio','happy','love', 'emotional', 'music', 'beautiful', 'russian', 'american', 'ukrainian', 'canadian', 'german', 'hawaiian', 'australian', 'classic', 'classical']+[str(i) for i in range(10)]
    garbage = [str(i) for i in range(10)]
    for tag in tags:
        check = True
        for g in garbage:
            if (g in tag) or (artist in tag) or (track in tag):
                check = False
                break
        if check:
            filtered.append(tag)
    return filtered

In [136]:
def get_song_tags(artist = "", track = ""):
    tmp_artist = artist.lower().replace(' ', '+')
    tmp_track = track.lower().replace(' ', '+')
    url = f'https://www.last.fm/music/{tmp_artist}/_/{tmp_track}'
    html_from_last_fm = req.get(url).text
    html = lxml.html.fromstring(html_from_last_fm)
    track_dict = {'artist': artist, 'track': track, 'tags':[]}
    for li in html.xpath("//ul[@class='tags-list tags-list--global']/li"):
        track_dict['tags'].append(str(li.text_content().lower().replace('-', ' ')))
    track_dict['tags'] = tags_filter(track_dict['tags'], tmp_artist, tmp_track)
    print('{artist}-{track} tags was parsed: {tags}'.format(artist=artist, track=track, tags=track_dict['tags']))
    return track_dict

In [77]:
def get_track_list(path):
    for file in os.listdir(path):
        track_info = re.split('[-]', file[:-4])
        if track_info[0][-1] == ' ':
            track_info[0] = track_info[0][:-1]
        if track_info[1][0] == ' ':
            track_info[1] = track_info[1][1:]
        yield track_info[0], track_info[1], file

In [78]:
def get_json(path):
    info = []
    for artist, track, file_name in get_track_list(path):
        meta = get_song_tags(artist= artist, track= track)
        meta['file_name'] = file_name
        if meta['tags']:
            info.append(meta)
    return info

In [154]:
def dump(info, path, user_id):
    with open(path+f'{user_id}_info.json', 'w') as fp:
        json.dump(info, fp, sort_keys=True, indent=4)

In [80]:
def load_ganres(path):
    for filename in os.listdir(os.path.abspath(path)):
        sub_ganres = []
        with open(os.path.abspath(path)+'\\'+filename, 'r') as f:
            yield filename[:-4], [tag.lower().replace('-', ' ') for tag in f.read().split('\n')]

In [81]:
def dump_ganres_co_occ(co_occ_path, co_occ):
    with open(co_occ_path+'ganresCoOcc.pkl', 'wb') as f:
        pickle.dump(co_occ, f, pickle.HIGHEST_PROTOCOL)    

In [90]:
def init_ganres_co_occ(path, co_occ_path):
    co_occ = {}
    for ganre, sub_ganres in load_ganres(path):
        #co_occ[ganre] = {}
        for sg in sub_ganres:
            #co_occ[ganre][sg] = 0
            co_occ[sg] = 0
    dump_ganres_co_occ(co_occ_path, co_occ)
    return co_occ

In [83]:
def who_exist(co_occ_el, tags):
    first = co_occ_el.keys()
    result = []
    for t in tags:
        if t in first:
            result.append(t)
    return result

In [84]:
def update_ganres_co_occ(co_occ, dataset_json):
    for song in dataset_json:
        tags = song['tags']
        #for key in co_occ:
        #xisted = who_exist(co_occ[key], tags)
        existed = who_exist(co_occ, tags)
        if len(existed):
            for tag in tags:
                if tag in existed:
                    #co_occ[key][tag]+=1
                    co_occ[tag]+=1
                else:
                    #co_occ[key][tag] = 1
                    co_occ[tag] = 1
    return co_occ

In [159]:
co_occ = init_ganres_co_occ(path='..\\ganres_full\\', co_occ_path='..\\')
print(co_occ)

{'african': 0, 'marabi': 0, 'african heavy metal': 0, 'african hip hop xhom': 0, 'afrobeat': 0, 'apala': 0, 'benga': 0, 'bongo flava': 0, 'bikutsi': 0, 'cape jazz': 0, 'chimurenga': 0, 'congolese rumba': 0, 'coupe decale': 0, 'fuji music': 0, 'genge': 0, 'highlife': 0, 'hiplife': 0, 'igbo highlife': 0, 'igbo rap': 0, 'isicathamiya': 0, 'jit': 0, 'juju': 0, 'kapuka aka boomba': 0, 'kadongo kamu': 0, 'kizomba': 0, 'kuduro': 0, 'kwaito': 0, 'kwela': 0, 'makossa': 0, 'maloya': 0, 'marrabenta': 0, 'mbalax': 0, 'mbaqanga': 0, 'mbube': 0, 'amapiano': 0, 'morna': 0, 'ndombolo': 0, 'palm wine': 0, 'rai': 0, 'sakara': 0, 'sega': 0, 'seggae': 0, 'semba': 0, 'shangaan electro': 0, 'soukous': 0, 'kwassa kwassa': 0, 'taarab': 0, "zouglou cote d'ivoire": 0, 'asian': 0, 'east asian': 0, 'south': 0, 'southeast asian': 0, 'fann at tanbura': 0, 'fijiri': 0, 'khaliji': 0, 'liwa': 0, 'sawt': 0, 'anison': 0, 'cantopop': 0, 'c pop': 0, 'enka': 0, 'hong kong english pop': 0, 'j pop': 0, 'kayokyoku': 0, 'k pop

In [161]:
for folder in os.listdir('..\\tracks\\'):
    info = get_json('..\\tracks\\{folder}\\'.format(folder=folder))
    print(folder, info)
    dump(info,'..\\jsons\\', folder)
    co_occ = update_ganres_co_occ(co_occ, info)
    dump_ganres_co_occ('..\\', co_occ)
    dump(co_occ,'..\\', 'co_occ')
#TODO После генерации фитч добавить в json поле 'features' чтобы можно было обращаться на прямую

2Pac-Dear Mama tags was parsed: ['rap', 'hip hop', 'gangsta rap', 'hip hop']
a ha-Crying in the Rain tags was parsed: ['pop', 'new wave', 'a ha', 'norwegian']
AC_DC-Hells Bells tags was parsed: ['hard rock', 'rock', 'classic rock', 'heavy metal']
Andrew Morris-Dust tags was parsed: ['australian']
Barenaked Ladies-Big Bang Theory Theme tags was parsed: ['big bang theory', 'theme song', 'soundtrack', 'nerdcore']
Bee Gees-Spicks And Specks tags was parsed: ['oldies', 'pop', 'bee gees']
Bee Gees-Stayn Alive tags was parsed: ['disco', 'pop', 'classic rock', 'oldies']
Black Veil Brides-In the End tags was parsed: ['hard rock', 'emo', 'emocore', 'music choice: rock']
Blink 182-All The Small Things tags was parsed: ['punk rock', 'pop punk', 'rock', 'punk']
Blink 182-Love is Dangerous tags was parsed: ['pop punk', 'rock', 'punk rock', 'punk']
Blink 182-Phantom of the Opera tags was parsed: ['punk rock', 'pop punk', 'punk', 'rock', 'alternative']
Blink 182-Stockholm Syndrome tags was parsed: ['p

Nirvana-Lithium tags was parsed: ['grunge', 'rock', 'alternative']
Of Monsters and Men-Little Talks tags was parsed: ['indie', 'folk', 'alternative', 'indie folk', 'icelandic']
OneRepublic-Lets Hurt Tonight tags was parsed: ['pop rock', 'rock', 'alternative', 'alternative rock', 'indie']
Patrick Watson-Lighthouse tags was parsed: ['fip', 'folk', 'indie']
Paul McCartney and Michael Jackson-Say Say Say tags was parsed: ['michael jackson', 'pop', 'paul mccartney']
Paul McCartney Wings-Band On The Run tags was parsed: ['classic rock', 'rock', 'paul mccartney', 'british']
Paul McCartney Wings-Live And Let Die tags was parsed: ['classic rock', 'rock', 'soundtrack', 'james bond', 'paul mccartney']
Paul McCartney Wings-Mrs Vandebilt tags was parsed: []
Phoenix-1901 tags was parsed: ['indie', 'indie pop', 'electronic', 'french', 'dance']
Phoenix-Long distance call (S233bastien Tellier remix) tags was parsed: []
Phoenix-North tags was parsed: ['instrumental', 'mellow', 'indie']
Radiohead-Creep [

10 Years-Dying Youth tags was parsed: ['alternative rock', 'hard rock']
10 Years-Wasteland tags was parsed: ['alternative rock', 'rock', 'alternative', 'hard rock']
3 Doors Down-Changes tags was parsed: ['rock', 'alternative rock', 'hard rock', 'alternative']
A Perfect Circle-Counting Bodies Like Sheep... tags was parsed: ['alternative rock', 'progressive rock', 'alternative', 'rock']
A Perfect Circle-The Outsider tags was parsed: ['rock', 'alternative rock', 'progressive rock', 'alternative']
Alanis Morissette-I Was Hoping tags was parsed: ['rock', 'alternative', 'female vocalists', 'singer songwriter']
Alanis Morissette-Sorry To Myself tags was parsed: ['rock', 'singer songwriter', 'alanis morissette', 'canadian']
Alanis Morissette-Versions of Violence tags was parsed: ['pop', 'alternative', 'rock', 'female vocalists']
Alter Bridge-Broken Wings tags was parsed: ['rock', 'alternative rock', 'hard rock', 'alter bridge', 'post grunge', 'alternative']
Altered-Breaking The Silence tags wa

The Frames-When Your Minds Made Up tags was parsed: ['irish', 'indie', 'indie rock', 'rock', 'alternative']
The Mayan Factor-Warflower tags was parsed: ['progressive rock', 'progressive', 'alternative rock', 'guitar']
The Rising-Cradle tags was parsed: ['rock', 'alternative', 'like', 'angels', 'angel', 'michael']
The Royal Scots Dragoon Guards-The Gael tags was parsed: ['celtic']
Three Days Grace-Last to Know tags was parsed: ['alternative rock', 'rock', 'alternative', 'hard rock', 'rock ballad']
Tiamat-Whatever That Hurts tags was parsed: ['gothic metal', 'doom metal', 'metal', 'gothic']
Trapt-Victim tags was parsed: ['alternative rock', 'rock', 'hard rock']
Trillium-Machine Gun tags was parsed: ['symphonic metal', 'rock', 'female fronted rock', 'winter', 'school', 'great lyrics']
W.A.S.P.-Heavens Hung In Black tags was parsed: ['heavy metal', 'metal', 'hard rock', 'awesome']
W.A.S.P.-The Great Misconceptions Of Me tags was parsed: ['heavy metal', 'hard rock', 'metal']
W.A.S.P.-The Id

A Ha-Early Morning tags was parsed: ['pop', 'new wave']
Asia-Only Time Will Tell tags was parsed: ['classic rock', 'progressive rock', 'rock']
Booker T and The MG039s-Green Onion tags was parsed: []
C.W. McCall-Flowers on the wall tags was parsed: ['country', 'classic country', 'trucker']
C.W.McCall-Convoy tags was parsed: ['country', 'classic country', 'trucking anthems']
Chris Hadfield-Beyond The Terra tags was parsed: ['canadian', 'space', 'astronaut']
Chuck Berry-You Never Can Tell tags was parsed: ['rock and roll', 'oldies', 'soundtrack', 'fip', 'classic rock', 'pulp fiction']
Creedence Clearwater Revival-Cotton Fields tags was parsed: ['classic rock', 'southern rock', 'country', 'rock']
Creedence Clearwater Revival-Have You Ever See The Rain tags was parsed: ['classic rock', 'rock', 'southern rock']
David Bowie-The Man Who Sold The World tags was parsed: ['rock', 'classic rock', 'glam rock', 'david bowie']
Fiona Apple-Slow Like Honey (Album Version) tags was parsed: ['female voca

4B-Bring In The Drums tags was parsed: ['electronic', 'chillout', 'ambient', 'experimental']
4B-Fire tags was parsed: ['electronic', 'chillout', 'ambient', 'experimental']
6IX9INE-Kooda tags was parsed: ['rap', 'hip hop', 'post nerdcore', 'gang']
Aftermarket  Max Adrian-Payback tags was parsed: []
Akula-B.O.S.S tags was parsed: []
Alan Walker-Ignite tags was parsed: ['electronic', 'electronica', 'house', 'dance', 'norwegian', 'edm']
Alan Walker-Sing Me To Sleep tags was parsed: ['eletronic', 'house', 'party']
Beowulf-Plomo tags was parsed: ['rac', 'crossover', 'thrash metal', 'heavy metal']
Blasterjaxx  Olly James-Phoenix (Extended Mix) tags was parsed: []
Blasterjaxx-Temple tags was parsed: ['house', 'electro house', 'dutch house', 'electronic']
blvckmania-Kenshin Zelig  tags was parsed: []
Bumble Beezy-Getter tags was parsed: []
Cesqeaux  D039Maduro-Suerte tags was parsed: []
Cesqeaux  San Holo-Who Am I tags was parsed: []
Cesqeaux-Colossal tags was parsed: ['electronic', 'trap', 'ba

In [171]:
import operator
sorted_x = sorted(co_occ.items(), key=operator.itemgetter(1))
sorted_x.reverse()
print(sorted_x)

[('rock', 174), ('alternative', 87), ('alternative rock', 77), ('hard rock', 47), ('classic rock', 45), ('indie', 36), ('punk rock', 24), ('pop', 23), ('electronic', 22), ('metal', 20), ('indie rock', 20), ('punk', 17), ('female vocalists', 13), ('progressive rock', 12), ('hip hop', 12), ('british', 11), ('singer songwriter', 11), ('soundtrack', 11), ('heavy metal', 11), ('dance', 11), ('power metal', 10), ('house', 9), ('oldies', 9), ('pop punk', 9), ('green day', 8), ('speed metal', 8), ('garage rock', 8), ('indie pop', 8), ('cover', 7), ('post grunge', 7), ('folk', 7), ('rap', 7), ('country', 7), ('edm', 6), ('my chemical romance', 6), ('instrumental', 6), ('extreme power metal', 6), ('nu metal', 6), ('gothic metal', 6), ('alternative metal', 6), ('trap', 6), ('ballad', 5), ('beautiful', 5), ('fip', 5), ('soul', 5), ('emo', 5), ('glam rock', 5), ('russian', 5), ('gothic', 4), ('piano', 4), ('electro', 4), ('swedish', 4), ('acoustic', 4), ('canadian', 4), ('german', 4), ('awesome', 4