In [1]:
import pandas as pd
import re, json, time

import urllib.parse
import urllib.request

### Code for generating data with scraped genres from wiki

In [None]:
## Load the source dataset CSV into `data` and display its (rows, columns) shape
source_csv = 'billboard_24years_lyrics_spotify.csv'
data = pd.read_csv(source_csv)
data.shape

In [None]:
def get_wikipedia_page_data(page_url, timeout=30):
    """Fetch the raw MediaWiki API response for one English-Wikipedia article.

    Parameters
    ----------
    page_url : str
        Article path or URL containing '/wiki/<Title>', for example
        '/wiki/Breathe_(Faith_Hill_song)'. Everything after the last
        '/wiki/' is used as the page title (percent-escapes are decoded).
    timeout : float, optional
        Seconds to wait on the HTTP request before giving up (default 30).
        Without a timeout a single stalled connection would block the whole
        scraping loop indefinitely.

    Returns
    -------
    str
        The API response body decoded as UTF-8. The request asks for
        ``format=json`` with ``prop=revisions`` / ``rvprop=content``.

    Raises
    ------
    urllib.error.URLError
        On network or HTTP failures; a timeout may also surface as a
        socket timeout error. Callers are expected to handle these.
    """
    # Example: '/wiki/Breathe_(Faith_Hill_song)' -> 'Breathe_(Faith_Hill_song)'
    page_title = page_url.split('/wiki/')[-1].strip()
    page_title = urllib.parse.unquote(page_title)  # decode %20 etc.

    # Build the Wikipedia API query
    baseurl = "https://en.wikipedia.org/w/api.php?"
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "titles": page_title,
        "format": "json"
    }

    # Encode parameters into a URL query string
    query = baseurl + urllib.parse.urlencode(params)

    # Add a user-agent header (Wikipedia requires this)
    headers = {"User-Agent": "MyWikipediaClient/1.0 (s214704@dtu.dk)"}
    req = urllib.request.Request(query, headers=headers)

    # Fetch data from Wikipedia; the timeout bounds how long one page can stall us
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8")


In [None]:
def _slice_balanced(text, start):
    """Return substring from start (at '{') until its matching '}' using {{...}} depth."""
    d, i, n = 0, start, len(text)
    while i < n - 1:
        two = text[i:i+2]
        if two == '{{': d += 1; i += 2; continue
        if two == '}}': d -= 1; i += 2; 
        else: i += 1
        if d == 0: return text[start:i]
    return ""

def _extract_infobox(wikitext):
    low = wikitext.lower()
    pos = low.find('{{infobox')
    if pos == -1: return ""
    return _slice_balanced(wikitext, pos)

def _get_param_value(infobox, name='genre'):
    m = re.search(rf'\n\|\s*{re.escape(name)}\s*=\s*', infobox, flags=re.I)
    if not m: return ""
    i, n, d = m.end(), len(infobox), 0
    out = []
    while i < n:
        if d == 0 and infobox.startswith('\n|', i): break
        two = infobox[i:i+2]
        if two == '{{': d += 1; out.append(two); i += 2; continue
        if two == '}}' and d > 0: d -= 1; out.append(two); i += 2; continue
        out.append(infobox[i]); i += 1
    return ''.join(out).strip()

def extract_genres_from_wikitext(wikitext):
    """Extract a de-duplicated list of lower-cased genre names from article wikitext.

    Steps: locate the first infobox, read its ``genre`` parameter, strip HTML
    comments / ``<ref>`` footnotes / ``<br>`` tags, flatten simple
    ``hlist``/``flatlist`` templates, then collect wikilink labels followed by
    any remaining bare text split on ``;``, ``,``, ``/`` and " and ".

    Parameters
    ----------
    wikitext : str
        Raw wikitext of an article (may be empty).

    Returns
    -------
    list[str]
        Genres in first-seen order (links first, then bare text), lower-cased,
        with the standalone word "music" removed. Empty list when there is no
        infobox, no genre parameter, or nothing usable in it.
    """
    if not wikitext:
        return []
    box = _extract_infobox(wikitext)
    if not box:
        return []
    raw = _get_param_value(box, 'genre')
    if not raw:
        return []

    # Remove comments and footnotes. The self-closing form is tried first:
    # otherwise '<ref name="x"/>' is taken as an opening tag and the match
    # runs on to a later '</ref>', deleting every genre in between.
    raw = re.sub(r'<!--.*?-->', '', raw, flags=re.S)
    raw = re.sub(r'<ref\b[^>]*/>|<ref\b[^>]*>.*?</ref>', '', raw, flags=re.S | re.I)

    # <br>, <br/> and <br /> are list separators; left in place they would be
    # split on '/' further down and produce junk tokens such as '<br' and '>'.
    raw = re.sub(r'<br\s*/?>', ', ', raw, flags=re.I)

    # expand simple hlist/flatlist to comma-separated text
    raw = re.sub(r'{{\s*(?:hlist|flatlist)\b[^|}]*\|([^}]*)}}',
                 lambda m: m.group(1).replace('|', ', '), raw, flags=re.I)

    # collect wikilink labels (or targets if no label)
    links = [m.group(2) or m.group(1)
             for m in re.finditer(r'\[\[([^|\]]+)(?:\|([^]]+))?\]\]', raw)]

    # also keep any bare text after removing templates/links
    tmp = re.sub(r'{{[^{}]*}}', '', raw)               # drop leftover templates
    tmp = re.sub(r'\[\[[^]]+\]\]', '', tmp)            # drop links (already captured)
    tmp = re.sub(r'\(.*?\)', '', tmp)                  # drop parens
    tmp = tmp.replace('•', ',').replace('*', ',')
    bare = [p.strip() for p in re.split(r'[;,/]| and ', tmp, flags=re.I) if p.strip()]

    # normalize, remove the word 'music', dedupe while preserving order
    seen, out = set(), []
    for g in links + bare:
        g = re.sub(r'\bmusic\b', '', g, flags=re.I)
        g = re.sub(r'\s+', ' ', g).strip()
        if not g:
            continue
        g = g.lower()
        if g not in seen:
            seen.add(g)
            out.append(g)
    return out

# Helper: unwrap a MediaWiki API JSON response into plain wikitext
def ensure_wikitext(cell_text):
    """Return plain wikitext from either raw wikitext or an API JSON response.

    Non-string or blank input yields "". If the stripped text looks like JSON
    (starts with '{' or '['), it is parsed as a MediaWiki ``query`` response
    and the first page's first revision content is returned. The content is
    read from ``slots.main['*']``, ``'*'`` or ``'content'``, in that order.
    If parsing fails, no pages are present, or the content is not a string,
    the stripped input is returned as-is.
    """
    if not isinstance(cell_text, str):
        return ""
    stripped = cell_text.strip()
    if not stripped:
        return ""

    if stripped[0] in "{[":
        try:
            payload = json.loads(stripped)
            page_map = payload.get("query", {}).get("pages", {})
            if page_map:
                first_page = next(iter(page_map.values()))
                revision = first_page.get("revisions", [{}])[0]
                slot_text = revision.get("slots", {}).get("main", {}).get("*")
                content = slot_text or revision.get("*") or revision.get("content") or ""
                if isinstance(content, str):
                    return content
        except Exception:
            # Best effort: anything that is not the expected JSON shape
            # (including genuine wikitext starting with '{{') falls through.
            pass
    return stripped

def extract_genres_cell(cell):
    """Return the genre list for one cell holding wikitext or an API JSON response."""
    wikitext = ensure_wikitext(cell)
    return extract_genres_from_wikitext(wikitext)


def get_genres_for_songurls(songurls, polite_delay=0.2):
    """Fetch and parse the genres for each Wikipedia song URL.

    Parameters
    ----------
    songurls : iterable of str
        Each item is '/wiki/...' or a full 'https://en.wikipedia.org/wiki/...'.
    polite_delay : float
        Seconds to sleep after every request, to be nice to Wikipedia.

    Returns
    -------
    dict
        ``{original_url: [genres...]}``, keyed by the URL exactly as it was
        passed in, so callers can look results up with the value they supplied.
        A URL whose fetch or parse fails maps to ``[]`` and the error is
        printed; one bad page never aborts the batch.
    """
    results = {}
    for original_url in songurls:
        try:
            # Only the /wiki/... path is sent on; the original URL stays the
            # dict key so full-URL inputs remain retrievable by the caller.
            page_path = original_url
            if page_path.startswith('http'):
                page_path = urllib.parse.urlparse(page_path).path
            data = get_wikipedia_page_data(page_path)
            results[original_url] = extract_genres_cell(data)
        except Exception as e:
            results[original_url] = []
            print(f"Error on {original_url}: {e}")
        time.sleep(polite_delay)  # be nice to Wikipedia
    return results


In [None]:
# Fetch each distinct page only once: the .loc assignment below already writes
# the result to every row sharing that URL, so repeated URLs would only cost
# extra requests. Missing URLs are skipped because they cannot be fetched.
song_url = data['songurl'].dropna().unique().tolist()

for i, url in enumerate(song_url, start=1):
    genres = get_genres_for_songurls([url]).get(url, [])
    ## Add genre to dataframe (all rows that share this URL)
    data.loc[data['songurl'] == url, 'genre'] = ', '.join(genres)

    # Print progress after every 10 processed URLs
    if i % 10 == 0:
        print(f"Processed {i} songs out of {len(song_url)}, being {i/ len(song_url) * 100:.2f}%")

In [None]:
## Rows whose genre is NaN or an empty string
genre_missing = data['genre'].isna() | data['genre'].eq('')
no_genre = data.loc[genre_missing]
print(f"Number of songs where genre is missing: {no_genre.shape[0]}")

In [None]:
## Retry the genre lookup for every row in the no_genre dataframe
for index, row in no_genre.iterrows():
    songurl = row['songurl']
    lookup = get_genres_for_songurls([songurl])
    genres = lookup.get(songurl, [])
    ## Write the result to every row of `data` that shares this URL
    data.loc[data['songurl'] == songurl, 'genre'] = ', '.join(genres)

    print(f"Extracted genre for song: {row['song']} by {row['band_singer']}")

In [None]:
## Rows whose genre is still NaN or an empty string
still_missing = data['genre'].isna() | data['genre'].eq('')
no_genre = data.loc[still_missing]
print(f"Number of songs where genre is missing: {no_genre.shape[0]}")


In [17]:
# Reload the genre-annotated CSV and keep only rows that have a 'genre' value.
# NOTE(review): no earlier cell visible here writes this file before it is
# read — presumably it comes from a previous run; confirm for Restart & Run All.
data = (
    pd.read_csv('billboard_24years_lyrics_spotify_with_genres.csv')
    .dropna(subset=['genre'])
)

In [22]:
# Literal spelling fixes, applied in order. regex=False is explicit because
# the default of Series.str.replace differs between pandas versions.
#   'pop-rap'              -> 'pop rap'
#   'electronic dance'     -> 'edm'
#   'hip hop' / 'hip-hop'  -> 'hiphop'
literal_fixes = {
    'pop-rap': 'pop rap',
    'electronic dance': 'edm',
    'hip hop': 'hiphop',
    'hip-hop': 'hiphop',
}
for old_label, new_label in literal_fixes.items():
    data['genre'] = data['genre'].str.replace(old_label, new_label, regex=False)

# Change 'electro ()' to 'electro' (regex: the parentheses are escaped)
data['genre'] = data['genre'].str.replace(r'electro \(\)', 'electro', regex=True)

# Drop everything from '| length =' to the end of the line. This is presumably
# a scraping artefact where the next infobox parameter was captured as a genre.
data['genre'] = data['genre'].str.replace(r'\| length =.*', '', regex=True)

In [23]:
## Write the cleaned dataframe to CSV without the index column.
## The target is the same file name the cleaned data was loaded from.
output_csv = 'billboard_24years_lyrics_spotify_with_genres.csv'
data.to_csv(output_csv, index=False)