In [1]:
import requests
from bs4 import BeautifulSoup
import time 
import pandas as pd

In [2]:
def grab_ids(limit_param):
    '''
    This function scrapes a page of the top mangas list on MyAnimeList for the manga ids. Each page is expected
    to provide us with 50 unique ids of manga series.

    Parameters:
        limit_param: (int) Determines the page we will scrape (rankings limit_param + 1 to limit_param + 50, inclusive)
    
    Outputs:
        temp_manga_ids: (list of str) Holds the manga id for the series on this page, in page order
                        (use the manga id to access https://myanimelist.net/manga/<manga_id>).
                        The list is empty when the request fails or does not return status code 200,
                        so the result can always be passed straight to list.extend().
    '''

    temp_manga_ids = []

    try:
        # The timeout keeps a single stalled connection from hanging the whole scrape
        data = requests.get(f'https://myanimelist.net/topmanga.php?limit={limit_param}', timeout=30)
    except requests.RequestException as error:
        print(f'Failed to scrape web page with error: {error}')
        return temp_manga_ids

    if data.status_code != 200:
        print(f'Failed to scrape web page with status code: {data.status_code}')
        return temp_manga_ids
    
    soup = BeautifulSoup(data.text, 'html.parser')
    rows = soup.find_all('tr', {'class': 'ranking-list'}) # Represents each of 50 rows for the 50 series on the page

    for row in rows:
        # Skip rows that do not carry the expected title link instead of crashing on them
        link = row.find('a', {'class': 'fw-b'})
        href = link.get('href') if link is not None else None
        if not href:
            continue

        # Extracts the manga_id from https://myanimelist.net/manga/<manga_id>/<manga_name>
        url_parts = href.split('/')
        if len(url_parts) < 2:
            continue

        temp_manga_ids.append(url_parts[-2])

    return temp_manga_ids 

In [3]:
# Represents the page with the last series that have a rating (rest have N/A)
MAX_SERIES_NUMBER = 16_601
# Collects the manga ids gathered from the ranking pages
manga_ids = list()

In [4]:
# Start from an empty list so that re-running this cell does not append every id a second time
manga_ids = []

for i in range(0, MAX_SERIES_NUMBER, 50):
    # Treat a missing result (None) from grab_ids as an empty page so that extend() never raises a TypeError
    manga_ids.extend(grab_ids(i) or []) # Maintain a list of all manga ids that are scraped
    time.sleep(5) # Make sure to not overflow the MyAnimeList server

In [5]:
# Only the first 16613 scraped ids (positions 0-16612) are kept, since anything after doesn't have a rating
df = pd.DataFrame(
    data=manga_ids[:16613],
    columns=['manga_id'],
)
# Write the ids to disk without the DataFrame index column
df.to_csv(path_or_buf='manga_ids.csv', index=False)