In [32]:
# Find most popular genres and assign each artist to one of those

import csv
import json
import re
from collections import defaultdict

from pyperclip import copy
from tqdm import tqdm

from utils import artist_data

In [33]:
# How many of the most frequent genres to report.
N_TOP_GENRES = 12

# Tally how often each genre appears across all artists' genre lists.
genre_freq_dict = defaultdict(int)
for data in artist_data.values():
    for g in data["genres"]:
        genre_freq_dict[g] += 1

# (genre, count) pairs, most frequent first. sorted() is stable, so genres
# with equal counts keep the order in which they were first seen.
genre_freq = sorted(genre_freq_dict.items(), key=lambda x: -x[1])

top_genres = [g[0] for g in genre_freq[:N_TOP_GENRES]]
print(f"Top genres: {top_genres}")

# csv.writer quotes any genre name containing a comma or quote, which the
# hand-built f-string rows did not. UTF-8 is set explicitly so non-ASCII
# genre names do not depend on the platform's default encoding.
with open("./data/all_genre_freq.csv", "w", newline="", encoding="utf-8") as fout:
    writer = csv.writer(fout, lineterminator="\n")
    writer.writerow(["genre", "freq"])
    writer.writerows(genre_freq)

Top genres: ['rock', 'pop', 'classic rock', 'rap', 'dance pop', 'album rock', 'modern rock', 'indietronica', 'hip hop', 'soft rock', 'contemporary country', 'alternative metal']


In [34]:
# Coarse label -> regex that is searched (unanchored) inside a genre name.
# Insertion order matters: classify_genre yields labels in this order and
# classify_genre_list returns the first label in this order that matched.
# Raw strings keep regex syntax free of Python string-escape handling
# (the previous "\&" was an invalid string escape).
main_genres = {
    "hip hop": r"(r&b)|(hip hop)",
    "rap": r"(?<!t)rap",  # lookbehind: "rap" not preceded by "t", so "trap" is excluded
    "edm": r"(edm)|(house)|(trap)|(tronic)",
    "indie": r"(indie)|(singer-songwriter)",
    "soul": r"(jazz)|(blues)|(soul)",
    "rock": r"rock",
    "metal": r"(metal)|(grunge)",
    "country": r"country",
    "christian": r"christian",
    "latin": r"(latin)|(salsa)",
    "punk": r"(punk)|(emo)",  # "pop punk" is already matched by "punk"
    "pop": r"pop",
}


def classify_genre(genre):
    """Yield every main-genre label whose pattern occurs in ``genre``.

    Args:
        genre: A single genre name (string) to classify.

    Yields:
        Labels from ``main_genres``, in the dict's insertion order, for each
        pattern that ``re.search`` finds anywhere in ``genre``. Nothing is
        yielded when no pattern matches.
    """
    for label, pattern in main_genres.items():
        if re.search(pattern, genre):
            yield label

In [35]:
# Tab-separated dump for pasting into a spreadsheet. Each line holds the
# genre, its frequency and the comma-joined matching labels, followed by a
# "yes" column when the genre matched more than one label.
rows = []
for genre, freq in genre_freq:
    labels = list(classify_genre(genre))
    row = f"{genre}\t{freq}\t{','.join(labels)}"
    if len(labels) > 1:
        row += "\tyes"
    rows.append(row + "\n")
res = "".join(rows)

# Hand the table to pyperclip so it can be pasted elsewhere.
copy(res)

In [36]:
def classify_genre_list(genre_list):
    """Collapse a list of genre names into one main-genre label.

    Every genre in ``genre_list`` is run through ``classify_genre`` and the
    matching labels are pooled. The result is the first label, in
    ``main_genres`` order, that appears in that pool.

    Args:
        genre_list: Iterable of genre-name strings.

    Returns:
        The winning label, or ``"other"`` when no genre matched any label.
    """
    matched = set()
    for genre in genre_list:
        matched.update(classify_genre(genre))

    # main_genres iteration order doubles as the priority order.
    return next((label for label in main_genres if label in matched), "other")

In [37]:
# Spot check on a tag list that matches several labels: the label that
# comes earliest in main_genres is the one returned.
sample_genres = ["emo", "modern rock", "pop punk", "pov: indie", "rock"]
classify_genre_list(sample_genres)

'indie'

In [38]:
# Tag every artist record in place with a single "main_genre" label, then
# write the enriched records out as indented JSON.
for record in tqdm(artist_data.values()):
    record["main_genre"] = classify_genre_list(record["genres"])

with open("./data/2.5k_artist_data.json", "w") as out_file:
    json.dump(artist_data, out_file, indent=2)

100%|██████████| 2500/2500 [00:00<00:00, 26670.06it/s]
