In [None]:
import json
import re
from urllib.request import build_opener, HTTPCookieProcessor

from bs4 import BeautifulSoup

In [None]:
# Opener that keeps cookies between requests. build_opener accepts handler
# classes as well as instances and instantiates the class with no arguments.
page_opener = build_opener(HTTPCookieProcessor)

In [None]:
# Base URL of the leaderboard section, and the ranking listing page under it
site = "https://piugame.com/leaderboard/"
ranking_link = f"{site}over_ranking.php"

In [None]:
# Open the first ranking page. The response is closed as soon as it has been
# parsed (BeautifulSoup reads the whole body in its constructor).
with page_opener.open(ranking_link) as first_page:
    soup = BeautifulSoup(first_page, 'html.parser')

# Get last page: look for the "icon" button whose first nested tag carries the
# "last" class, and read the page number from its onclick ("...page=<n>...").
# Only that button's onclick is parsed, so unrelated icon buttons with a
# different (or missing) onclick cannot break the cell.
last_page = None
for icon in soup.find_all("button", class_="icon"):
    nested_tags = icon.find_all()
    if not nested_tags or "last" not in nested_tags[0].get("class", []):
        continue
    page_match = re.search(r"page=(\d+)", icon.get("onclick", ""))
    if page_match:
        last_page = int(page_match.group(1))

# Fail here with a clear message instead of a TypeError later in the crawl loop
if last_page is None:
    raise ValueError("could not determine the last page number from " + ranking_link)
print(last_page)

In [None]:
# helper
def get_song_title(song_soup):
    """Return the song title found on a parsed song page.

    Args:
        song_soup: Parsed song page (anything offering BeautifulSoup's
            ``find_all``).

    Returns:
        The ``.text`` of the first <p> inside the first
        <div class="songName_w">.

    Raises:
        IndexError: If the page has no such <div>, or the <div> has no <p>.
    """
    title_div = song_soup.find_all("div", class_="songName_w")[0]
    # find_all replaces the deprecated findChildren alias; the search is the same
    return title_div.find_all("p")[0].text

In [None]:
def get_song_level(song_soup):
    """Build the level label of a song from its step-ball images.

    Every <img> inside the first <div class="stepBall_img_wrap"> is inspected
    in document order, and its ``src`` contributes to the label:

    * a step-type letter, "S", "D" or "C", when the src contains "s_text",
      "d_text" or "c_text" (checked in that order, first match wins);
      presumably single / double / co-op, confirm against the site;
    * the single character right after "_num_" when the src contains it
      (e.g. "_num_2.png" contributes "2").

    Args:
        song_soup: Parsed song page.

    Returns:
        The concatenated label, or "" when no image contributes anything.
    """
    step_type_markers = (("s_text", "S"), ("d_text", "D"), ("c_text", "C"))

    ball_wrap = song_soup.find_all("div", class_="stepBall_img_wrap")[0]
    pieces = []
    for img in ball_wrap.find_all("img"):
        src = img['src']

        # step type: at most one letter per image
        letter = next((code for marker, code in step_type_markers if marker in src), None)
        if letter is not None:
            pieces.append(letter)

        # level digit: only the first character after "_num_" is kept
        if "_num_" in src:
            pieces.append(src.split("_num_")[1][0])

    return "".join(pieces)

In [None]:
def get_song_image_id(song_soup):
    """Extract the song image id from a parsed song page.

    Scans the tags nested in the first <div class="songImg_w"> and uses the
    first one whose inline ``style`` contains "/song_img/". The id is the text
    between "/song_img/" and ".png" in that style.

    Args:
        song_soup: Parsed song page.

    Returns:
        The image id as a string, or None when no nested tag has a style that
        references "/song_img/".

    Raises:
        IndexError: If the page has no <div class="songImg_w">.
    """
    image_div = song_soup.find_all("div", class_="songImg_w")[0]
    # find_all() replaces the deprecated findChildren() alias
    for child in image_div.find_all():
        style = child.get("style") or ""
        # Skip tags whose style is unrelated to the song image instead of
        # failing on the split below.
        if "/song_img/" not in style:
            continue
        return style.split("/song_img/")[1].split(".png")[0]
    return None

In [None]:
# Maps a level label (as built by get_song_level) to the list of song summaries
# collected for that level.
result = {}

# Loop from the first page until the last page
for index in range(1, last_page + 1):
    print("current page: ", str(index))
    url_to_access = ranking_link + "?&&page=" + str(index)

    # Close each response right after parsing so sockets are not left open
    # across the many requests this crawl makes.
    with page_opener.open(url_to_access) as curr_page:
        curr_soup = BeautifulSoup(curr_page, 'html.parser')

    # assume exactly 1 list
    div_with_song_list = curr_soup.find_all("ul", "rating_ranking_list")[0]

    # loop through all songs in a page
    items = div_with_song_list.find_all("li")
    for item in items:
        # find_all replaces the deprecated findChildren alias
        song_link = item.find_all("a")[0]['href']

        complete_song_url = site + song_link
        with page_opener.open(complete_song_url) as song_page:
            song_soup = BeautifulSoup(song_page, 'html.parser')

        # get song title, level, and id
        song_name = get_song_title(song_soup)
        level = get_song_level(song_soup)
        song_img_id = get_song_image_id(song_soup)

        # get all scores; they are written with thousands separators ("1,234")
        scores = [
            int(score.find_all("i")[0].string.replace(',', ''))
            for score in song_soup.find_all("div", class_="score")
        ]

        # put into result, grouped by level
        to_insert = {"title": song_name, "img_id": song_img_id, "total": sum(scores), "n": len(scores)}
        result.setdefault(level, []).append(to_insert)

In [None]:
# Serialise the collected results and write them to a JSON file
with open("final_data.json", mode="w") as json_file:
    json_file.write(json.dumps(result))