# Code for webscraping KProfiles

In [1]:
import csv

def load_kaggle_csv_data(filepath):
    """
    Opens & loads Kaggle Kpop CSV file contents as a list (each row = list).

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A list of rows; each row is a list of strings.
            Note: Header = kpop_csv_data[0]
    """
    # The context manager closes the file handle even if parsing fails.
    # newline="" is what the csv module asks for so that newlines embedded
    # in quoted fields are passed through untouched.
    with open(filepath, "r", newline="") as csv_file:
        kpop_csv_data = list(csv.reader(csv_file))
    return kpop_csv_data

def get_kpop_artist_list(kpop_csv_data):
    """
    Takes in kpop_csv_data (rows loaded from the CSV file) & generates a list of unique kpop artists.
        The first row is treated as the header and skipped; the artist name is read from the
        first column of every other row. Blank rows are ignored.

    Returns:
        A list of artist names without duplicates, in the order they first appear in the CSV.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (iterating a set of strings is not).
    # `if row` skips the empty lists csv.reader yields for blank lines.
    unique_artists = dict.fromkeys(row[0] for row in kpop_csv_data[1:] if row)
    return list(unique_artists)

def slugify_kpop_artist_list(kpop_artist_list_from_csv):
    """
    Builds URL-ready artist names by swapping every space character for a hyphen.
        Names with spaces (e.g. Stray Kids) need this to form a usable page URL.
        Letter case is left untouched.

    Returns:
        A new list with one slug per input name, in the same order.
    """
    return ["-".join(name.split(" ")) for name in kpop_artist_list_from_csv]

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import unicodedata

def get_album_content(kpop_artist):
    '''
    Returns beautiful soup content for a kpop artist's discography ('disc' for short).
        Soup content criteria: div with the class 'entry-content herald-entry-content'.
        Criteria is true for all group discography pages.

    Args:
        kpop_artist: Artist name already formatted for use in the URL.

    Returns:
        The result of looking up the content div in the parsed page.
    '''
    disc_page = f"https://kprofiles.com/{kpop_artist}-discography/"
    # requests applies no timeout by default; without one a stalled
    # connection would block the whole scraping run indefinitely.
    response = requests.get(disc_page, timeout=30)
    page_soup = BeautifulSoup(response.text, 'html.parser')
    return page_soup.find("div", {'class': "entry-content herald-entry-content"})

def get_album_title_content(disc_soup):
    """
    Gets album titles for all groups -- applies filtering for special cases.
        Some of Treasure's, (G)I-dle's, and ITZY's albums are randomly formatted in separate p tags,
        unlike most group pages. This function grabs all albums & filters out irrelevant info.

    How a <p> tag is read:
        * 1-2 lines: every line is a title candidate. A candidate is dropped when it is empty,
          starts with "release" (any case) or a digit (track numbers like "1."), or contains one
          of the known non-title phrases.
        * 3+ lines: the first line is the title, but only when the second line mentions
          "release date".

    Args:
        disc_soup: Parsed discography content; must provide find_all("p").

    Returns:
        A list of album titles (NFKD-normalized strings) in page order.
    """
    # A real prefix test. The previous character-class pattern ([^Release|\d])
    # rejected every line whose first character was R, e, l, a, s or "|",
    # silently dropping legitimate titles, and let lowercase "release date"
    # lines through as titles.
    not_a_title_start = re.compile(r'^(?:release|\d)', re.IGNORECASE)
    # Phrases that mark page boilerplate / credits rather than album titles.
    excluded_phrases = (
        "album",
        "discography",
        "ost",
        "single",
        "click the songs in blue",
        "songhaena",
        "comment",
        "alouette",
        "special thanks",
        "made by",
        "are marked in bold",
        "credits",
        "journey through iz*one history",
    )
    album_titles = []
    for tag in disc_soup.find_all("p"):
        lines = tag.text.split("\n")
        if len(lines) <= 2:
            for line in lines:
                if not line or not_a_title_start.match(line):
                    continue
                lowered = line.lower()
                if any(phrase in lowered for phrase in excluded_phrases):
                    continue
                album_titles.append(unicodedata.normalize('NFKD', line))
        elif "release date" in lines[1].lower():
            # Standard layout: title on line 0, release date on line 1.
            album_titles.append(unicodedata.normalize('NFKD', lines[0]))
    return album_titles

def get_album_titles(kpop_artist):
    """
    Fetches the artist's discography content and returns the album titles found in it.
        Input is the artist name; output is a list of album titles.
    """
    return get_album_title_content(get_album_content(kpop_artist))

In [3]:
from dateutil import parser
from datetime import datetime

def get_release_dates_content(disc_soup):
    """
    Returns a list of album release dates (as strings) from group discography page.
        Release date criteria: the phrase 'release date' is in the line.
        Returned list is semi clean, with the leading "Release Date:" label removed & nonbreaking
        spaces formatted (NFKD normalization).
        Some lists use "3rd" or "2nd" for dates, so further cleaning is needed.

    How a <p> tag is read:
        * 1-2 lines: every line is checked for 'release date'.
        * 3+ lines: only the second line is checked.

    Args:
        disc_soup: Parsed discography content; must provide find_all("p").

    Returns:
        A list of date strings in page order.
    """
    # Match the label itself. The previous pattern was a 14-character
    # character class, so it only worked when the label was exactly
    # "Release Date: " long (e.g. "Release Date:June ..." was left unstripped).
    label = re.compile(r"^\s*release\s+dates?\s*:?\s*", re.IGNORECASE)
    album_release_dates = []
    for tag in disc_soup.find_all("p"):
        lines = tag.text.split("\n")
        # Short tags: any line may hold the date. Long tags: line 1 only.
        candidates = lines if len(lines) <= 2 else lines[1:2]
        for line in candidates:
            if 'release date' in line.lower():
                normalized = unicodedata.normalize('NFKD', line)
                album_release_dates.append(label.sub('', normalized))
    return album_release_dates

def get_loona_release_dates_content(disc_soup):
    """
    Returns release date strings reduced to a single date each.
        Loona has two release dates for two different albums, so only the first
        "Month D, YYYY" occurrence in each string is kept.

    Args:
        disc_soup: Parsed discography content, passed on to get_release_dates_content.

    Returns:
        A list of date strings, one per release date line found.
    """
    date_pattern = re.compile(r"(\w+ \d{1,2}, \d{4})")
    single_dates = []
    for raw_date in get_release_dates_content(disc_soup):
        found = date_pattern.search(raw_date)
        # search() returns None when the line has no "Month D, YYYY" date
        # (e.g. "August 20th, 2018"); keep the raw text for the later cleaning
        # steps instead of failing on None.group().
        single_dates.append(found.group(0) if found else raw_date)
    return single_dates

def clean_release_dates(album_release_dates):
    """
    Strips ordinal suffixes from the day numbers in a list of release date strings.
        "2nd", "1st", "3rd", "24th" become "2", "1", "3", "24". A suffix is only removed when
        it directly follows a digit, so words such as "August" are left alone.

    Returns:
        A new list of cleaned date strings, in the same order as the input.
    """
    ordinal_suffix = re.compile(r"(?<=\d)(st|nd|rd|th)\b")
    stripped_dates = []
    for raw_date in album_release_dates:
        stripped_dates.append(ordinal_suffix.sub('', raw_date))
    return stripped_dates

def format_release_dates(cleaned_album_dates):
    """
    Converts a list of album release date strings to the MM/DD/YYYY format ('%m/%d/%Y').
        Each string is parsed with dateutil's parser and re-rendered with strftime.

    Returns:
        A list of formatted date strings, in the same order as the input.
    """
    return [parser.parse(date).strftime("%m/%d/%Y") for date in cleaned_album_dates]

def get_album_release_dates(kpop_artist):
    """
    Fetches a Kpop group's discography content and returns its album release dates as MM/DD/YYYY strings.
        The artist "Loona" goes through its own extraction function; every other artist uses
        the general one. The extracted strings are then cleaned and formatted.

    Returns:
        A list of strings, one per release date found.
    """
    page = get_album_content(kpop_artist)
    extract = get_loona_release_dates_content if kpop_artist == "Loona" else get_release_dates_content
    return format_release_dates(clean_release_dates(extract(page)))

In [4]:
def get_album_tracks_if_ordered(disc_soup):
    """
    Collects track names from every ordered list (<ol>) in the discography content.
        Used for web pages with ordered list formatting and consistent ptags.

    Returns:
        A list with one inner list per <ol>, holding the text of each of its <li> items.
    """
    track_lists = []
    for ordered_list in disc_soup.find_all("ol"):
        titles = []
        for item in ordered_list.find_all("li"):
            titles.append(item.text)
        track_lists.append(titles)
    return track_lists

def get_album_tracks_if_verivery(disc_soup):
    """
    Collects the track list of each Verivery album. On this page the tracks sit in the same ptag
        as the album title, and the tracks are numbered by hand, so the numbering is stripped
        with regex.

    Returns:
        A list with one inner list of track names per album.
    """
    track_number = re.compile(r'^[0-9]+\. ')
    albums = []
    for paragraph in disc_soup.find_all("p"):
        lines = paragraph.text.split("\n")
        if len(lines) < 2:
            continue
        # The first two lines are skipped; the rest are read as track lines.
        for entry in lines[2:]:
            if entry == '':
                # Each album's track block begins with an empty string on
                # this page, which opens a fresh track list.
                albums.append([])
                continue
            # Any other line is a track of the most recently opened album.
            albums[-1].append(track_number.sub('', entry))
    return albums

def get_album_tracks_if_treasure(disc_soup):
    '''
    Collects album tracks if artist is 'Treasure'.
        The first ptag is skipped. A ptag mentioning "release date" opens a new album; a ptag
        starting with a track number ("1. ") adds its line(s), numbering removed, to the
        album opened last. Every other ptag is ignored.

    Returns:
        A list with one inner list of track names per album.
    '''
    numbering = re.compile(r'^[0-9]+\. ')
    albums = []
    for paragraph in disc_soup.find_all("p")[1:]:
        text = paragraph.text
        if "release date" in text.lower():
            albums.append([])
            continue
        if not numbering.match(text):
            continue
        # split() yields the whole text as a single item when there is no
        # newline, so one loop covers both single- and multi-track ptags.
        for entry in text.split('\n'):
            albums[-1].append(numbering.sub('', entry))
    return albums

def get_album_tracks_if_ateez(disc_soup):
    '''
    Collects album tracks if artist is 'Ateez'.
        Every <ol> in the content is one album; the text of each <li> inside it is a track.

    Returns:
        A list with one inner list of track names per <ol>.
    '''
    return [
        [item.text for item in ordered_list.find_all('li')]
        for ordered_list in disc_soup.find_all('ol')
    ]

def get_album_tracks(kpop_artist):
    """
    Fetches the artist's discography content and returns the track list of each album.
        'Treasure', 'Verivery' and 'Ateez' each have their own extraction function;
        every other artist uses the ordered-list one.
    """
    page = get_album_content(kpop_artist)
    if kpop_artist == 'Treasure':
        extract = get_album_tracks_if_treasure
    elif kpop_artist == 'Verivery':
        extract = get_album_tracks_if_verivery
    elif kpop_artist == 'Ateez':
        extract = get_album_tracks_if_ateez
    else:
        extract = get_album_tracks_if_ordered
    return extract(page)

In [5]:
def make_discography(kpop_artist):
    """
    Builds the inner dict for one artist. Run for each artist in a later func.
        Album titles, track lists and release dates are gathered separately and paired up by
        position; pairing stops at the shortest of the three lists.

    Returns:
        A dict mapping album title -> {'release date': ..., 'album tracks': ...}.
    """
    titles = get_album_titles(kpop_artist)
    track_lists = get_album_tracks(kpop_artist)
    dates = get_album_release_dates(kpop_artist)
    return {
        title: {'release date': date, 'album tracks': tracks}
        for title, tracks, date in zip(titles, track_lists, dates)
    }

In [6]:
import json

def get_artist_dict_from_csv(filepath):
    """
    Takes in the Kpop CSV file path & returns a dictionary with one discography per artist.
        Loads the CSV, extracts the artist names, formats them for URLs, then builds each
        artist's discography with make_discography.

    Returns:
        A dict mapping the URL-formatted artist name -> that artist's discography dict.
    """
    csv_rows = load_kaggle_csv_data(filepath)
    slugs = slugify_kpop_artist_list(get_kpop_artist_list(csv_rows))
    return {slug: make_discography(slug) for slug in slugs}

In [7]:
# Run this cell to scrape every artist listed in the CSV and save the result as a JSON file.

kpop_filepath = 'kpop_data.csv'
artist_dict = get_artist_dict_from_csv(kpop_filepath)
with open('kpop_artist_dictionary.json', 'w', encoding='utf-8') as fp:
    # ensure_ascii=False keeps non-ASCII characters readable in the output file.
    fp.write(json.dumps(artist_dict, ensure_ascii=False, sort_keys=True, indent=4))