In [205]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def generate_urls(type: str, offset: int=None):
    assert type in ['manga', 'anime'], 'Please provide `type` of either `manga` or `anime`'
    _url = f'https://myanimelist.net/top{type}.php'

    if offset is not None:
        assert offset % 50 == 0, 'Please provide offset in multiples of 50'
        _url = _url + f'?limit={offset}'

    res = requests.get(_url)
    soup = BeautifulSoup(res.text, 'html.parser')

    if type == 'anime':
        titles = soup.find_all('h3', class_='anime_ranking_h3')
        dates = soup.find_all('div', class_='information di-ib mt4')
        scores = soup.find_all('div', class_='js-top-ranking-score-col di-ib al')
        ranks = soup.find_all('span', class_='rank')
        member_divs = soup.find_all('div', class_='information di-ib mt4')
    else:
        titles = soup.find_all('h3', class_='manga_h3')
        dates = soup.find_all('div', class_='information di-ib mt4')
        scores = soup.find_all('div', class_ ='js-top-ranking-score-col di-ib al' )
        ranks = soup.find_all('span', class_ = 'rank')
        member_divs = soup.find_all('div', class_='information di-ib mt4')

    results = []
    for title, date, score, rank, member_div in zip(titles, dates, scores, ranks, member_divs):
        _title = title.a.contents
        _url = title.a.get('href')
        date_text = date.contents[2].strip()
        # Extract the first and last year from the date range if present
        years = re.findall(r'\b\d{4}\b', date_text)
        if len(years) == 2:
            start_date = datetime.strptime(years[0], '%Y').date()
            end_date = datetime.strptime(years[1], '%Y').date()
            _date = f"{start_date.strftime('%Y-%m')} - {end_date.strftime('%Y-%m')}"
        elif len(years) == 1:
            _date = datetime.strptime(years[0], '%Y').date().strftime('%Y-%m')
        else:
            _date = None

        _score = float(score.text.strip()) if score else None

        _rank = int(rank.text.strip()) if rank else None

        _members = None
        if member_div:
            member_text = member_div.get_text(strip=True)
            member_count_match = re.search(r'(\d{1,3}(?:,\d{3})*)(?=\s+members)', member_text)
            if member_count_match:
                # Extract the digits and remove any commas
                _members = int(member_count_match.group(1).replace(',', ''))

        results.append({
            'title': _title,
            'url': _url,
            'date_published': _date,
            'score': _score,
            'rank': _rank,
            'members': _members
        })

    continuation_token = soup.find_all('a', class_='next')
    if continuation_token:
        if offset is None:
            offset = 50
        else:
            offset = offset + 50
    else:
        offset = None

    return offset, results
offset, results = generate_urls('anime', offset=None)
for result in results:
    print(result)


{'title': ['Sousou no Frieren'], 'url': 'https://myanimelist.net/anime/52991/Sousou_no_Frieren', 'date_published': '2023-01 - 2024-01', 'score': 9.38, 'rank': 1, 'members': 711203}
{'title': ['Fullmetal Alchemist: Brotherhood'], 'url': 'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood', 'date_published': '2009-01 - 2010-01', 'score': 9.09, 'rank': 2, 'members': 103341696}
{'title': ['Steins;Gate'], 'url': 'https://myanimelist.net/anime/9253/Steins_Gate', 'date_published': '2011-01 - 2011-01', 'score': 9.07, 'rank': 3, 'members': 112561201}
{'title': ['Gintama°'], 'url': 'https://myanimelist.net/anime/28977/Gintama°', 'date_published': '2015-01 - 2016-01', 'score': 9.06, 'rank': 4, 'members': 630216}
{'title': ['Shingeki no Kyojin Season 3 Part 2'], 'url': 'https://myanimelist.net/anime/38524/Shingeki_no_Kyojin_Season_3_Part_2', 'date_published': '2019-01 - 2019-01', 'score': 9.05, 'rank': 5, 'members': 192272982}
{'title': ['Gintama: The Final'], 'url': 'https://myan

In [208]:
manga = []
offset = None
type = 'manga'

while True:
    offset, _res = generate_urls(type=type, offset=offset)
    manga = manga + _res
    print(f'{offset} {type} downloaded')
    if offset == 2000:
        break

50 manga downloaded
100 manga downloaded
150 manga downloaded
200 manga downloaded
250 manga downloaded
300 manga downloaded
350 manga downloaded
400 manga downloaded
450 manga downloaded
500 manga downloaded
550 manga downloaded
600 manga downloaded
650 manga downloaded
700 manga downloaded
750 manga downloaded
800 manga downloaded
850 manga downloaded
900 manga downloaded
950 manga downloaded
1000 manga downloaded
1050 manga downloaded
1100 manga downloaded
1150 manga downloaded
1200 manga downloaded
1250 manga downloaded
1300 manga downloaded
1350 manga downloaded
1400 manga downloaded
1450 manga downloaded
1500 manga downloaded
1550 manga downloaded
1600 manga downloaded
1650 manga downloaded
1700 manga downloaded
1750 manga downloaded
1800 manga downloaded
1850 manga downloaded
1900 manga downloaded
1950 manga downloaded
2000 manga downloaded


In [190]:
manga_df = pd.DataFrame(manga)

In [191]:
num_rows =len(manga_df)
num_rows

150

In [192]:
manga[1]['url']

'https://myanimelist.net/manga/1706/JoJo_no_Kimyou_na_Bouken_Part_7__Steel_Ball_Run'

In [193]:
manga[0]['date_published']

'1989-01'

In [209]:
anime = []
offset = None
type = 'anime'

while True:
    offset, _res = generate_urls(type=type, offset=offset)
    anime = anime + _res
    print(f'{offset} {type} downloaded')
    if offset == 2000:
        break

50 anime downloaded
100 anime downloaded
150 anime downloaded
200 anime downloaded
250 anime downloaded
300 anime downloaded
350 anime downloaded
400 anime downloaded
450 anime downloaded
500 anime downloaded
550 anime downloaded
600 anime downloaded
650 anime downloaded
700 anime downloaded
750 anime downloaded
800 anime downloaded
850 anime downloaded
900 anime downloaded
950 anime downloaded
1000 anime downloaded
1050 anime downloaded
1100 anime downloaded
1150 anime downloaded
1200 anime downloaded
1250 anime downloaded
1300 anime downloaded
1350 anime downloaded
1400 anime downloaded
1450 anime downloaded
1500 anime downloaded
1550 anime downloaded
1600 anime downloaded
1650 anime downloaded
1700 anime downloaded
1750 anime downloaded
1800 anime downloaded
1850 anime downloaded
1900 anime downloaded
1950 anime downloaded
2000 anime downloaded


In [210]:
anime[0]['url']

'https://myanimelist.net/anime/52991/Sousou_no_Frieren'

In [211]:
anime[0]['date_published']

'2023-01 - 2024-01'

In [212]:
manga[0].keys()

dict_keys(['title', 'url', 'date_published', 'score', 'rank', 'members'])

In [213]:
anime[0].keys()

dict_keys(['title', 'url', 'date_published', 'score', 'rank', 'members'])

In [282]:
anime_df = pd.DataFrame(anime)
manga_df = pd.DataFrame(manga)

In [283]:
num_rows =len(manga_df)
num_rows

600

In [284]:
anime_df['title'] = anime_df['title'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)
manga_df['title'] = manga_df['title'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

In [285]:
frames = [anime_df, manga_df]
manga_w_anime = pd.concat(frames)
manga_w_anime = manga_w_anime.merge(anime_df, on='title', how='left', suffixes=('_manga', '_anime'))
manga_w_anime['anime_exists'] = np.where(manga_w_anime['url_anime'].isna(), 'no', 'yes')

manga_w_anime

Unnamed: 0,title,url_manga,date_published_manga,score_manga,rank_manga,members_manga,url_anime,date_published_anime,score_anime,rank_anime,members_anime,anime_exists
0,Sousou no Frieren,https://myanimelist.net/anime/52991/Sousou_no_...,2023-01 - 2024-01,9.38,1,711203,https://myanimelist.net/anime/52991/Sousou_no_...,2023-01 - 2024-01,9.38,1.0,711203.0,yes
1,Fullmetal Alchemist: Brotherhood,https://myanimelist.net/anime/5114/Fullmetal_A...,2009-01 - 2010-01,9.09,2,103341696,https://myanimelist.net/anime/5114/Fullmetal_A...,2009-01 - 2010-01,9.09,2.0,103341696.0,yes
2,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,2011-01 - 2011-01,9.07,3,112561201,https://myanimelist.net/anime/9253/Steins_Gate,2011-01 - 2011-01,9.07,3.0,112561201.0,yes
3,Gintama°,https://myanimelist.net/anime/28977/Gintama°,2015-01 - 2016-01,9.06,4,630216,https://myanimelist.net/anime/28977/Gintama°,2015-01 - 2016-01,9.06,4.0,630216.0,yes
4,Shingeki no Kyojin Season 3 Part 2,https://myanimelist.net/anime/38524/Shingeki_n...,2019-01 - 2019-01,9.05,5,192272982,https://myanimelist.net/anime/38524/Shingeki_n...,2019-01 - 2019-01,9.05,5.0,192272982.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...
1195,Akachan to Boku,https://myanimelist.net/manga/76/Akachan_to_Boku,1991-01 - 1997-01,7.66,1,972604,,,,,,no
1196,DOLL: IC in a Doll,https://myanimelist.net/manga/298/DOLL__IC_in_...,2000-01 - 2002-01,7.66,2,22247,,,,,,no
1197,Penguin☆Brothers,https://myanimelist.net/manga/899/Penguin☆Brot...,1999-01 - 2001-01,7.66,3,15587,,,,,,no
1198,Suzumiya Haruhi no Yuuutsu,https://myanimelist.net/manga/1253/Suzumiya_Ha...,2004-01 - 2004-01,7.66,4,43879,,,,,,no


In [286]:
merged_manga = pd.merge(manga_w_anime, manga_df, on='title')
columns_to_drop = [col for col in merged_manga.columns if col.endswith('_manga')]
merged_manga = merged_manga.drop(columns=columns_to_drop)
merged_manga

Unnamed: 0,title,url_anime,date_published_anime,score_anime,rank_anime,members_anime,anime_exists,url,date_published,score,rank,members
0,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,2011-01 - 2011-01,9.07,3.0,112561201.0,yes,https://myanimelist.net/manga/39859/Steins_Gate,2011-01 - 2012-01,8.58,3,213037
1,Steins;Gate,https://myanimelist.net/anime/9253/Steins_Gate,2011-01 - 2011-01,9.07,3.0,112561201.0,yes,https://myanimelist.net/manga/39859/Steins_Gate,2011-01 - 2012-01,8.58,3,213037
2,One Piece,https://myanimelist.net/anime/21/One_Piece,1999-01,8.72,1.0,2352301.0,yes,https://myanimelist.net/manga/13/One_Piece,1997-01,9.22,4,619021
3,One Piece,https://myanimelist.net/anime/21/One_Piece,1999-01,8.72,1.0,2352301.0,yes,https://myanimelist.net/manga/13/One_Piece,1997-01,9.22,4,619021
4,Kimi no Suizou wo Tabetai,https://myanimelist.net/anime/36098/Kimi_no_Su...,2018-01 - 2018-01,8.55,5.0,965483.0,yes,https://myanimelist.net/manga/117077/Kimi_no_S...,2015-01 - 2015-01,8.68,4,157521
...,...,...,...,...,...,...,...,...,...,...,...,...
649,Akachan to Boku,,,,,,no,https://myanimelist.net/manga/76/Akachan_to_Boku,1991-01 - 1997-01,7.66,1,972604
650,DOLL: IC in a Doll,,,,,,no,https://myanimelist.net/manga/298/DOLL__IC_in_...,2000-01 - 2002-01,7.66,2,22247
651,Penguin☆Brothers,,,,,,no,https://myanimelist.net/manga/899/Penguin☆Brot...,1999-01 - 2001-01,7.66,3,15587
652,Suzumiya Haruhi no Yuuutsu,,,,,,no,https://myanimelist.net/manga/1253/Suzumiya_Ha...,2004-01 - 2004-01,7.66,4,43879


In [288]:
merged_manga.to_csv('merged_manga.csv')