In [1]:
# Find most popular genres and assign each artist to one of those

from utils import artist_data
from collections import defaultdict
from tqdm import tqdm
import json
import re
from pyperclip import copy
import pandas as pd

In [2]:
# Tally every occurrence of each raw genre across all artists' "genres" lists.
genre_freq_dict = defaultdict(int)
for data in artist_data.values():
    for g in data["genres"]:
        genre_freq_dict[g] += 1

# (genre, count) pairs ordered from most to least frequent. The sort is stable,
# so genres with equal counts keep their first-seen order.
genre_freq = sorted(genre_freq_dict.items(), key=lambda x: -x[1])
all_genres = [g[0] for g in genre_freq]

# Quick look at the twelve most frequent genres.
top_genres = all_genres[:12]
print(f"Top genres: {top_genres}")

# Write with an explicit encoding so the file is identical on every platform
# and can be read back by tools that default to UTF-8 (pandas.read_csv does).
# NOTE(review): values are written unescaped, so a genre name containing a
# comma would corrupt its row — confirm no such genres exist in the data.
with open("./data/all_genre_freq.csv", "w", encoding="utf-8") as fout:
    fout.write("genre,freq\n")
    for genre, freq in genre_freq:
        fout.write(f"{genre},{freq}\n")

Top genres: ['rock', 'pop', 'classic rock', 'rap', 'dance pop', 'album rock', 'modern rock', 'indietronica', 'hip hop', 'soft rock', 'contemporary country', 'alternative metal']


In [3]:
# Load the manually reviewed genre sheet. Every row with a non-empty "reviewed"
# cell becomes a hand-picked override: raw genre -> big-genre label.
df = pd.read_csv("./data/genre brainstorm - all.csv").reset_index()
reviewed_rows = df[df["reviewed"].notna()]
overrides = dict(zip(reviewed_rows["genre"], reviewed_rows["reviewed"]))

print(overrides)
print(f"{len(overrides)} overrides")

{'indietronica': 'indie', 'indie rock': 'indie', 'mellow gold': 'rock', 'pop rap': 'pop', 'pop punk': 'punk', 'pop rock': 'pop', 'christian alternative rock': 'christian', 'neon pop punk': 'punk', 'indie poptimism': 'indie', 'country rock': 'country', 'latin pop': 'latin', 'modern country rock': 'country', 'blues rock': 'soul', 'musica mexicana': 'latin', 'brostep': 'edm', 'folk': 'country', 'indie pop rap': 'indie', 'new americana': 'indie', 'disco': 'edm', 'funk': 'soul', 'indie soul': 'soul', 'indie pop': 'indie', 'trap latino': 'latin', 'rap metal': 'metal', 'latin hip hop': 'latin', 'pixie': 'punk', 'rap rock': 'rock', 'modern blues rock': 'soul', 'socal pop punk': 'punk', 'pop r&b': 'soul', 'pop soul': 'soul', 'pop emo': 'punk', 'christian hip hop': 'christian', 'latin arena pop': 'latin', 'pop edm': 'edm', 'christian indie': 'christian', 'latin rock': 'latin', 'country rap': 'country', 'shoegaze': 'indie', 'progressive bluegrass': 'soul', 'swedish indie pop': 'indie', 'christian

In [4]:
# Big-genre label -> regex searched for inside a raw genre name.
# Dict order matters: consumers iterate this mapping in order and take the
# first hit, so earlier entries win (e.g. "blues rock" matches "jazz" before
# "rock"). Patterns are raw strings so backslashes reach the regex engine
# verbatim and Python does not warn about invalid string escapes.
main_genres = {
    "hip hop": r"(hip hop)",
    # Negative lookbehind keeps "trap" from counting as "rap".
    "rap": r"(?<!t)rap",
    "edm": r"(edm)|(house)|(trap)|(tronic)",
    "indie": r"(indie)",
    "soul": r"(r\&b)|(soul)",
    "jazz": r"(blues)|(jazz)",
    "rock": r"(rock)|(singer-songwriter)",
    "metal": r"(metal)|(grunge)",
    "country": r"country",
    "pop": r"pop",
}

def classify_genre(genre):
    """Return the big-genre label for a single raw genre name.

    A manual entry in ``overrides`` always wins. Otherwise the patterns in
    ``main_genres`` are tried in dict order and the label of the first pattern
    that ``re.search`` finds in ``genre`` is returned. If nothing matches the
    result is ``"other"``.
    """
    if genre not in overrides:
        # Lazily scan the patterns; next() stops at the first hit.
        hits = (
            label
            for label, pattern in main_genres.items()
            if re.search(pattern, genre)
        )
        return next(hits, "other")

    return overrides[genre]

In [5]:
# Export for the site: one JSON object mapping each raw genre to its big genre,
# keyed in the same order as all_genres.
genre_to_big_genre = dict(zip(all_genres, map(classify_genre, all_genres)))
with open("./data/genre_map.json", "w") as genre_map_file:
    json.dump(genre_to_big_genre, genre_map_file, indent=2)

In [52]:
def classify_genre_list(genre_list):
    """Pick one big-genre label for a whole list of raw genres.

    Every raw genre is classified with ``classify_genre``; the result is the
    first label of ``main_genres`` (in dict order) that appears among those
    classifications, or ``"other"`` when none does (including an empty list).

    NOTE(review): labels that ``classify_genre`` can return but that are not
    keys of ``main_genres`` are never returned here, so a list that only maps
    to such labels yields ``"other"`` — confirm this is intended.
    """
    candidates = {classify_genre(raw) for raw in genre_list}
    return next(
        (label for label in main_genres if label in candidates),
        "other",
    )

In [33]:
# Spot check: classify one hand-written list of raw genres.
classify_genre_list(["emo", "modern rock", "pop punk", "pov: indie", "rock"])

'indie'

In [38]:
# Classify everyone: store each artist's big genre under "main_genre"
# (this mutates the records in artist_data in place).
for artist_record in tqdm(artist_data.values()):
    artist_record["main_genre"] = classify_genre_list(artist_record["genres"])

# Persist the enriched artist records.
with open("./data/2.5k_artist_data.json", "w") as artist_file:
    json.dump(artist_data, artist_file, indent=2)

100%|██████████| 2500/2500 [00:00<00:00, 26670.06it/s]


Assign every raw sub-genre to a big genre, then sort each big genre's list of raw sub-genres by descending frequency.

In [53]:
# Bucket every (raw genre, frequency) pair under its big genre.
# Pre-seed the main genres and "other" so those buckets always exist and keep
# a predictable order, even when nothing lands in them.
big_genre_lists = {g: [] for g in main_genres}
big_genre_lists["other"] = []

for raw_genre in genre_freq:
    big_genre = classify_genre(raw_genre[0])
    # classify_genre can return a label taken from the manual overrides that is
    # not a key of main_genres; create that bucket on demand instead of raising
    # KeyError. (classify_genre always returns a label, so no None check.)
    big_genre_lists.setdefault(big_genre, []).append(raw_genre)

In [58]:
# Build a tab-separated table (one line per big genre: its label followed by
# "genre, count" cells, most frequent first) and put it on the clipboard.
table_rows = []
for label, entries in big_genre_lists.items():
    # In-place sort, so big_genre_lists itself ends up ordered by frequency.
    entries.sort(key=lambda entry: entry[1], reverse=True)
    cells = [f"{entry[0]}, {entry[1]}" for entry in entries]
    table_rows.append(f"{label}\t" + "\t".join(cells) + "\n")

res = "".join(table_rows)

copy(res)