In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
def generate_urls(type: str, offset: int=None):
    assert type in ['manga', 'anime'], 'Please provide `type` of either `manga` or `anime`'
    _url = f'https://myanimelist.net/top{type}.php'

    if offset is not None:
        assert offset % 50 == 0, 'Please provide offset in multiples of 50'
        _url = _url + f'?limit={offset}'

    res = requests.get(_url)
    soup = BeautifulSoup(res.text, 'html.parser')

    if type == 'anime':
        titles = soup.find_all('h3', class_='anime_ranking_h3')
    else:
        titles = soup.find_all('h3', class_='manga_h3')

    results = [
        {
            'title': title.a.contents,
            'url': title.a.get('href')
        } for title in titles
    ]

    continuation_token = soup.find_all('a', class_='next')
    if continuation_token is not None:
        if offset is None:
            offset = 50
        else:
            offset = offset + 50
    else:
        exit

    return offset, results

In [3]:
manga = []
offset = None
type = 'manga'
manga_count = 0

while manga_count < 200:  # Download n manga
    offset, _res = generate_urls(type=type, offset=offset)
    manga += _res
    print(f'{offset} {type} downloaded')
    manga_count += len(_res)

50 manga downloaded
100 manga downloaded
150 manga downloaded
200 manga downloaded


In [4]:
manga[0]['url']

'https://myanimelist.net/manga/2/Berserk'

In [5]:
import requests
from bs4 import BeautifulSoup

def download_reviews(url: str, offset: int=None):
    url = f'{url}/reviews'
    if offset is not None:
        url += f'?p={offset}'

    res = requests.get(url)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    reviews = [review.get_text().strip('\n').lstrip() for review in soup.find_all('div', class_='text')]

    continuation_token = soup.find(attrs={'data-ga-click-type': 'review-more-reviews'})
    if continuation_token:
        if offset is None:
            offset = 2
        else:
            offset += 1

    return offset, reviews

In [6]:
# Iterate over manga items
for idx, item in enumerate(manga):
    offset = 1
    manga[idx]['reviews'] = []


    print(f'Downloading the first review for {item["title"]}')
    offset, reviews = download_reviews(item['url'], offset=offset)

    if reviews:
        manga[idx]['reviews'].append(reviews[0])


#total_reviews = sum(len(item['reviews']) for item in manga)
#print(f'Total reviews downloaded for all manga: {total_reviews}')
 #itll break if you download too many with a 405/https error

Downloading the first review for ['Berserk']
Downloading the first review for ['JoJo no Kimyou na Bouken Part 7: Steel Ball Run']
Downloading the first review for ['Vagabond']
Downloading the first review for ['One Piece']
Downloading the first review for ['Monster']
Downloading the first review for ['Slam Dunk']
Downloading the first review for ['Vinland Saga']
Downloading the first review for ['Fullmetal Alchemist']
Downloading the first review for ['Grand Blue']
Downloading the first review for ['Oyasumi Punpun']
Downloading the first review for ['Kingdom']
Downloading the first review for ['Houseki no Kuni']
Downloading the first review for ['Real']
Downloading the first review for ['20th Century Boys']
Downloading the first review for ['Ashita no Joe']
Downloading the first review for ['Monogatari Series: First Season']
Downloading the first review for ['Monogatari Series: Second Season']
Downloading the first review for ['Kaguya-sama wa Kokurasetai: Tensai-tachi no Renai Zunousen

HTTPError: 405 Client Error: Not Allowed for url: https://myanimelist.net/manga/11734/Watashitachi_no_Shiawase_na_Jikan/reviews?p=1

In [7]:
manga[0].keys()

dict_keys(['title', 'url', 'reviews'])

In [8]:
len(manga[5]['reviews'])

1

In [9]:
import pandas as pd
data = pd.DataFrame(manga)
data

Unnamed: 0,title,url,reviews
0,[Berserk],https://myanimelist.net/manga/2/Berserk,[Story - 9.38\n\r\nThe first three volumes may...
1,[JoJo no Kimyou na Bouken Part 7: Steel Ball Run],https://myanimelist.net/manga/1706/JoJo_no_Kim...,[Serving as a soft reset for Jojo's Bizarre Ad...
2,[Vagabond],https://myanimelist.net/manga/656/Vagabond,"[Some seinen manga, I believe, take the meanin..."
3,[One Piece],https://myanimelist.net/manga/13/One_Piece,[Frequently categorized with both Bleach and N...
4,[Monster],https://myanimelist.net/manga/1/Monster,[I rarely give tens to mangas. But this one ju...
...,...,...,...
195,[Seasons of Blossom],https://myanimelist.net/manga/145539/Seasons_o...,
196,[Tomodachi Game],https://myanimelist.net/manga/62795/Tomodachi_...,
197,[Here U Are],https://myanimelist.net/manga/119072/Here_U_Are,
198,[Gekkan Shoujo Nozaki-kun],https://myanimelist.net/manga/29211/Gekkan_Sho...,


In [12]:
data.drop(data.tail(50).index,
        inplace = True)

In [13]:
data

Unnamed: 0,title,url,reviews
0,[Berserk],https://myanimelist.net/manga/2/Berserk,[Story - 9.38\n\r\nThe first three volumes may...
1,[JoJo no Kimyou na Bouken Part 7: Steel Ball Run],https://myanimelist.net/manga/1706/JoJo_no_Kim...,[Serving as a soft reset for Jojo's Bizarre Ad...
2,[Vagabond],https://myanimelist.net/manga/656/Vagabond,"[Some seinen manga, I believe, take the meanin..."
3,[One Piece],https://myanimelist.net/manga/13/One_Piece,[Frequently categorized with both Bleach and N...
4,[Monster],https://myanimelist.net/manga/1/Monster,[I rarely give tens to mangas. But this one ju...
...,...,...,...
140,[Blue Lock],https://myanimelist.net/manga/114745/Blue_Lock,"[Honestly, out of all the sports anime I have ..."
141,[Given],https://myanimelist.net/manga/79085/Given,[Given is a truly amazing story that features ...
142,[Kakukaku Shikajika],https://myanimelist.net/manga/44307/Kakukaku_S...,"[If you have read or watched Bakuman., you mig..."
143,[Mahoutsukai no Yoru],https://myanimelist.net/manga/26027/Mahoutsuka...,[This is a review of the visual novel of Witch...


In [14]:
data.to_csv('manga_reviews.csv')