<a href="https://colab.research.google.com/github/trodman201/Data-3000/blob/main/AnimeScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re

def generate_urls(type: str, offset: int=None):
    assert type in ['manga', 'anime'], 'Please provide `type` of either `manga` or `anime`'
    _url = f'https://myanimelist.net/top{type}.php'

    if offset is not None:
        assert offset % 50 == 0, 'Please provide offset in multiples of 50'
        _url = _url + f'?limit={offset}'

    res = requests.get(_url)
    soup = BeautifulSoup(res.text, 'html.parser')

    if type == 'anime':
        titles = soup.find_all('h3', class_='anime_ranking_h3')
        dates = soup.find_all('div', class_='information di-ib mt4')
        scores = soup.find_all('div', class_='js-top-ranking-score-col di-ib al')
        ranks = soup.find_all('span', class_='rank')
        member_divs = soup.find_all('div', class_='information di-ib mt4')
    else:
        titles = soup.find_all('h3', class_='manga_h3')
        dates = soup.find_all('div', class_='information di-ib mt4')
        scores = soup.find_all('div', class_ ='js-top-ranking-score-col di-ib al' )
        ranks = soup.find_all('span', class_ = 'rank')
        member_divs = soup.find_all('div', class_='information di-ib mt4')

    results = []
    for title, date, score, rank, member_div in zip(titles, dates, scores, ranks, member_divs):
        _title = title.a.contents
        _url = title.a.get('href')
        date_text = date.contents[2].strip()
        # Extract the first and last year from the date range if present
        years = re.findall(r'\b\d{4}\b', date_text)
        if len(years) == 2:
            start_date = datetime.strptime(years[0], '%Y').date()
            end_date = datetime.strptime(years[1], '%Y').date()
            _date = f"{start_date.strftime('%Y-%m')} - {end_date.strftime('%Y-%m')}"
        elif len(years) == 1:
            _date = datetime.strptime(years[0], '%Y').date().strftime('%Y-%m')
        else:
            _date = None

        _score = float(score.text.strip()) if score else None

        _rank = int(rank.text.strip()) if rank else None

        _members = None
        if member_div:
            member_text = member_div.get_text(strip=True)
            member_count_match = re.search(r'(\d{1,3}(?:,\d{3})*)(?=\s+members)', member_text)
            if member_count_match:
                # Extract the digits and remove any commas
                _members = int(member_count_match.group(1).replace(',', ''))

        results.append({
            'title': _title,
            'url': _url,
            'date_published': _date,
            'score': _score,
            'rank': _rank,
            'members': _members
        })

    continuation_token = soup.find_all('a', class_='next')
    if continuation_token:
        if offset is None:
            offset = 50
        else:
            offset = offset + 50
    else:
        offset = None

    return offset, results
offset, results = generate_urls('anime', offset=None)
for result in results:
    print(result)


{'title': ['Sousou no Frieren'], 'url': 'https://myanimelist.net/anime/52991/Sousou_no_Frieren', 'date_published': '2023-01 - 2024-01', 'score': 9.39, 'rank': 1, 'members': 688641}
{'title': ['Fullmetal Alchemist: Brotherhood'], 'url': 'https://myanimelist.net/anime/5114/Fullmetal_Alchemist__Brotherhood', 'date_published': '2009-01 - 2010-01', 'score': 9.09, 'rank': 2, 'members': 103335637}
{'title': ['Steins;Gate'], 'url': 'https://myanimelist.net/anime/9253/Steins_Gate', 'date_published': '2011-01 - 2011-01', 'score': 9.07, 'rank': 3, 'members': 112556712}
{'title': ['Gintama°'], 'url': 'https://myanimelist.net/anime/28977/Gintama°', 'date_published': '2015-01 - 2016-01', 'score': 9.06, 'rank': 4, 'members': 628896}
{'title': ['Shingeki no Kyojin Season 3 Part 2'], 'url': 'https://myanimelist.net/anime/38524/Shingeki_no_Kyojin_Season_3_Part_2', 'date_published': '2019-01 - 2019-01', 'score': 9.05, 'rank': 5, 'members': 192267065}
{'title': ['Gintama: The Final'], 'url': 'https://myan

In [None]:
manga = []
offset = None
type = 'manga'

while True:
    offset, _res = generate_urls(type=type, offset=offset)
    manga = manga + _res
    print(f'{offset} {type} downloaded')
    if offset == 3000:
        break

50 manga downloaded
100 manga downloaded
150 manga downloaded
200 manga downloaded
250 manga downloaded
300 manga downloaded
350 manga downloaded
400 manga downloaded
450 manga downloaded
500 manga downloaded
550 manga downloaded
600 manga downloaded
650 manga downloaded
700 manga downloaded
750 manga downloaded
800 manga downloaded
850 manga downloaded
900 manga downloaded
950 manga downloaded
1000 manga downloaded
1050 manga downloaded
1100 manga downloaded
1150 manga downloaded
1200 manga downloaded
1250 manga downloaded
1300 manga downloaded
1350 manga downloaded
1400 manga downloaded
1450 manga downloaded
1500 manga downloaded
1550 manga downloaded
1600 manga downloaded
1650 manga downloaded
1700 manga downloaded
1750 manga downloaded
1800 manga downloaded
1850 manga downloaded
1900 manga downloaded
1950 manga downloaded
2000 manga downloaded
2050 manga downloaded
2100 manga downloaded
2150 manga downloaded
2200 manga downloaded
2250 manga downloaded
2300 manga downloaded
2350 man

In [None]:
manga[1]['url']

'https://myanimelist.net/manga/1706/JoJo_no_Kimyou_na_Bouken_Part_7__Steel_Ball_Run'

In [None]:
manga[0]['date_published']

'1989-01'

In [None]:
anime = []
offset = None
type = 'anime'

while True:
    offset, _res = generate_urls(type=type, offset=offset)
    anime = anime + _res
    print(f'{offset} {type} downloaded')
    if offset == 3500:
        break

50 anime downloaded
100 anime downloaded
150 anime downloaded
200 anime downloaded
250 anime downloaded
300 anime downloaded
350 anime downloaded
400 anime downloaded
450 anime downloaded
500 anime downloaded
550 anime downloaded
600 anime downloaded
650 anime downloaded
700 anime downloaded
750 anime downloaded
800 anime downloaded
850 anime downloaded
900 anime downloaded
950 anime downloaded
1000 anime downloaded
1050 anime downloaded
1100 anime downloaded
1150 anime downloaded
1200 anime downloaded
1250 anime downloaded
1300 anime downloaded
1350 anime downloaded
1400 anime downloaded
1450 anime downloaded
1500 anime downloaded
1550 anime downloaded
1600 anime downloaded
1650 anime downloaded
1700 anime downloaded
1750 anime downloaded
1800 anime downloaded
1850 anime downloaded
1900 anime downloaded
1950 anime downloaded
2000 anime downloaded
2050 anime downloaded
2100 anime downloaded
2150 anime downloaded
2200 anime downloaded
2250 anime downloaded
2300 anime downloaded
2350 ani

In [None]:
anime[0]['url']

'https://myanimelist.net/anime/52991/Sousou_no_Frieren'

In [None]:
anime[0]['date_published']

'2023-01 - 2024-01'

In [None]:
def download_reviews(url: str, offset: int=None):
    url = f'{url}/reviews'
    if offset is not None:
        url = url + f'?p={offset}'

    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')

    reviews = [review.get_text() for review in soup.find_all('div', class_='text')]

    continuation_token = soup.find_all(attrs={'data-ga-click-type': 'review-more-reviews'})
    if continuation_token is not None:
        if offset is None:
            offset = 2
        else:
            offset += 1
    else:
        exit

    return offset, reviews

In [None]:
for idx, item in enumerate(manga):
    offset = None
    manga[idx]['reviews'] = []

    while True:
        print(f'downloading reviews for {item["title"]}, page {offset}')
        offset, _res = download_reviews(item['url'], offset=offset)
        manga[idx]['reviews'] += _res

        if offset == 20:
            break
    if idx > 2:
        break

downloading reviews for ['Berserk'], page None
downloading reviews for ['Berserk'], page 2
downloading reviews for ['Berserk'], page 3
downloading reviews for ['Berserk'], page 4
downloading reviews for ['Berserk'], page 5
downloading reviews for ['Berserk'], page 6
downloading reviews for ['Berserk'], page 7
downloading reviews for ['Berserk'], page 8
downloading reviews for ['Berserk'], page 9
downloading reviews for ['Berserk'], page 10
downloading reviews for ['Berserk'], page 11
downloading reviews for ['Berserk'], page 12
downloading reviews for ['Berserk'], page 13
downloading reviews for ['Berserk'], page 14
downloading reviews for ['Berserk'], page 15
downloading reviews for ['Berserk'], page 16
downloading reviews for ['Berserk'], page 17
downloading reviews for ['Berserk'], page 18
downloading reviews for ['Berserk'], page 19
downloading reviews for ['JoJo no Kimyou na Bouken Part 7: Steel Ball Run'], page None
downloading reviews for ['JoJo no Kimyou na Bouken Part 7: Steel

In [None]:
for idx, item in enumerate(anime):
    offset = None
    anime[idx]['reviews'] = []

    while True:
        print(f'downloading reviews for {item["title"]}, page {offset}')
        offset, _res = download_reviews(item['url'], offset=offset)
        anime[idx]['reviews'] += _res

        if offset == 20:
            break
    if idx > 2:
        break

downloading reviews for ['Sousou no Frieren'], page None
downloading reviews for ['Sousou no Frieren'], page 2
downloading reviews for ['Sousou no Frieren'], page 3
downloading reviews for ['Sousou no Frieren'], page 4
downloading reviews for ['Sousou no Frieren'], page 5
downloading reviews for ['Sousou no Frieren'], page 6
downloading reviews for ['Sousou no Frieren'], page 7
downloading reviews for ['Sousou no Frieren'], page 8
downloading reviews for ['Sousou no Frieren'], page 9
downloading reviews for ['Sousou no Frieren'], page 10
downloading reviews for ['Sousou no Frieren'], page 11
downloading reviews for ['Sousou no Frieren'], page 12
downloading reviews for ['Sousou no Frieren'], page 13
downloading reviews for ['Sousou no Frieren'], page 14
downloading reviews for ['Sousou no Frieren'], page 15
downloading reviews for ['Sousou no Frieren'], page 16
downloading reviews for ['Sousou no Frieren'], page 17
downloading reviews for ['Sousou no Frieren'], page 18
downloading revi

In [None]:
manga[0].keys()

dict_keys(['title', 'url', 'date_published', 'score', 'rank', 'members', 'reviews'])

In [None]:
anime[0].keys()

dict_keys(['title', 'url', 'date_published', 'score', 'rank', 'members', 'reviews'])

In [None]:
import pandas as pd
pd.DataFrame(manga)

Unnamed: 0,title,url,date_published,score,rank,members,reviews
0,[Berserk],https://myanimelist.net/manga/2/Berserk,1989-01,9.47,1,689424,[\n Story - 9.38\n\r\nThe first...
1,[JoJo no Kimyou na Bouken Part 7: Steel Ball Run],https://myanimelist.net/manga/1706/JoJo_no_Kim...,2004-01 - 2011-01,9.30,2,265230,[]
2,[Vagabond],https://myanimelist.net/manga/656/Vagabond,1998-01 - 2015-01,9.25,3,381496,[]
3,[One Piece],https://myanimelist.net/manga/13/One_Piece,1997-01,9.22,4,617283,[]
4,[Monster],https://myanimelist.net/manga/1/Monster,1994-01 - 2001-01,9.16,5,245010,
...,...,...,...,...,...,...,...
895,[Terror Man],https://myanimelist.net/manga/148850/Terror_Man,2016-01 - 2020-01,7.50,1,203549,
896,[Kimi no Yokogao wo Miteita],https://myanimelist.net/manga/150834/Kimi_no_Y...,2022-01,7.50,2,1989,
897,[Muse on Fame],https://myanimelist.net/manga/157571/Muse_on_Fame,2022-01,7.50,3,992,
898,[Boy-Friend's Rule],https://myanimelist.net/manga/164423/Boy-Frien...,2023-01,7.50,4,503,


In [None]:
pd.DataFrame(anime)

Unnamed: 0,title,url,date_published,score,rank,members,reviews
0,[Sousou no Frieren],https://myanimelist.net/anime/52991/Sousou_no_...,2023-01 - 2024-01,9.39,1,688641,[]
1,[Fullmetal Alchemist: Brotherhood],https://myanimelist.net/anime/5114/Fullmetal_A...,2009-01 - 2010-01,9.09,2,103335637,[]
2,[Steins;Gate],https://myanimelist.net/anime/9253/Steins_Gate,2011-01 - 2011-01,9.07,3,112556712,[]
3,[Gintama°],https://myanimelist.net/anime/28977/Gintama°,2015-01 - 2016-01,9.06,4,628896,[]
4,[Shingeki no Kyojin Season 3 Part 2],https://myanimelist.net/anime/38524/Shingeki_n...,2019-01 - 2019-01,9.05,5,192267065,
...,...,...,...,...,...,...,...
1045,[Ple Ple Pleiades × Kagejitsu!],https://myanimelist.net/anime/57034/Ple_Ple_Pl...,2023-01 - 2023-01,7.17,1,237660,
1046,[Precure All Stars Movie New Stage 2: Kokoro n...,https://myanimelist.net/anime/16442/Precure_Al...,2013-01 - 2013-01,7.17,2,135016,
1047,[Sarusuberi: Miss Hokusai],https://myanimelist.net/anime/23987/Sarusuberi...,2015-01 - 2015-01,7.17,3,534973,
1048,[Shin Hakkenden],https://myanimelist.net/anime/1823/Shin_Hakkenden,1999-01 - 1999-01,7.17,4,996586,


In [None]:
anime_df = pd.DataFrame(anime)
manga_df = pd.DataFrame(manga)

anime_df['title'] = anime_df['title'].apply(lambda x: ''.join(x) if isinstance(x, list) else x)
manga_df['title'] = manga_df['title'].apply(lambda x: ''.join(x) if isinstance(x, list) else x)
anime_w_manga = pd.merge(anime_df, manga_df, on='title', how='inner', suffixes=('_anime', '_manga'))
anime_w_manga


Unnamed: 0,title,url_anime,date_published_anime,score_anime,rank_anime,members_anime,reviews_anime,url_manga,date_published_manga,score_manga,rank_manga,members_manga,reviews_manga
0,One Piece,https://myanimelist.net/anime/21/One_Piece,1999-01,8.72,2,2345720,,https://myanimelist.net/manga/13/One_Piece,1997-01,9.22,4,617283,[]
1,Slam Dunk,https://myanimelist.net/anime/170/Slam_Dunk,1993-01 - 1996-01,8.55,5,300220,,https://myanimelist.net/manga/51/Slam_Dunk,1990-01 - 1996-01,9.09,1,170265,
2,Kimi no Suizou wo Tabetai,https://myanimelist.net/anime/36098/Kimi_no_Su...,2018-01 - 2018-01,8.55,1,962518,,https://myanimelist.net/manga/104538/Kimi_no_S...,2016-01 - 2017-01,8.49,3,743375,
3,Grand Blue,https://myanimelist.net/anime/37105/Grand_Blue,2018-01 - 2018-01,8.43,1,776555,,https://myanimelist.net/manga/70345/Grand_Blue,2014-01,9.03,4,175957,
4,Natsume Yuujinchou,https://myanimelist.net/anime/4081/Natsume_Yuu...,2008-01 - 2008-01,8.31,2,539375,,https://myanimelist.net/manga/1859/Natsume_Yuu...,2003-01,8.67,3,43889,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,Amanchu!,https://myanimelist.net/anime/31771/Amanchu,2016-01 - 2016-01,7.24,5,693138,,https://myanimelist.net/manga/11061/Amanchu,2008-01 - 2021-01,7.78,2,219467,
62,Higurashi no Naku Koro ni Gou,https://myanimelist.net/anime/41006/Higurashi_...,2020-01 - 2021-01,7.20,4,274819,,https://myanimelist.net/manga/130251/Higurashi...,2020-01 - 2021-01,7.53,4,212281,
63,Kaijuu no Kodomo,https://myanimelist.net/anime/37981/Kaijuu_no_...,2019-01 - 2019-01,7.20,2,109981,,https://myanimelist.net/manga/5655/Kaijuu_no_K...,2005-01 - 2011-01,7.98,3,116163,
64,Dimension W,https://myanimelist.net/anime/31163/Dimension_W,2016-01 - 2016-01,7.18,5,298509,,https://myanimelist.net/manga/42279/Dimension_W,2011-01 - 2019-01,7.59,3,914941,


In [None]:
anime_row_count = len(anime_df)
anime_row_count

In [None]:
manga_row_count = len(manga_df)
manga_row_count

#Why is there 120 in each but goes down to 6 we combined.

In [None]:
anime_w_manga_df = pd.DataFrame(anime_w_manga)

anime_w_manga_df.to_csv("output.csv")


In [None]:
row_count = len(anime_w_manga_df)
row_count


In [None]:
!pip install fuzzywuzzy
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Assuming anime_df and manga_df are your DataFrames
anime_df = pd.DataFrame(anime)
manga_df = pd.DataFrame(manga)

# Convert manga titles to strings
manga_df['title'] = manga_df['title'].astype(str)

# Define a threshold for string similarity
# This will determine how closely the titles need to match
threshold = 80

# Function to find best match for each title in anime_df
def find_best_match(title, choices):
    match, score = process.extractOne(str(title), choices, scorer=fuzz.token_sort_ratio)
    if score >= threshold:
        return match
    else:
        return None

# Create a list of unique titles from manga_df
manga_titles = manga_df['title'].unique().tolist()

# Apply the find_best_match function to anime_df['title'] and create a new column 'matched_title'
anime_df['matched_title'] = anime_df['title'].apply(lambda x: find_best_match(x, manga_titles))

# Merge anime_df and manga_df based on 'matched_title'
merged_df = pd.merge(anime_df, manga_df, left_on='matched_title', right_on='title', how='inner')

# Drop the 'matched_title' column if you don't need it in the final merged DataFrame
merged_df.drop('matched_title', axis=1, inplace=True)

# Display the resulting merged DataFrame
print(merged_df)


                                               title_x  \
0                                          [One Piece]   
1    [Rurouni Kenshin: Meiji Kenkaku Romantan - Tsu...   
2                                          [Slam Dunk]   
3                          [Kimi no Suizou wo Tabetai]   
4           [Kono Subarashii Sekai ni Shukufuku wo! 3]   
..                                                 ...   
109                                 [Kaijuu no Kodomo]   
110                                  [Kamichama Karin]   
111          [Chuunibyou demo Koi ga Shitai! Ren Lite]   
112                                      [Dimension W]   
113                                           [Suzuka]   

                                                 url_x   date_published_x  \
0           https://myanimelist.net/anime/21/One_Piece            1999-01   
1    https://myanimelist.net/anime/44/Rurouni_Kensh...  1999-01 - 1999-01   
2          https://myanimelist.net/anime/170/Slam_Dunk  1993-01 - 1996-0

In [None]:
merged_df

Unnamed: 0,title_x,url_x,date_published_x,score_x,rank_x,members_x,reviews_x,title_y,url_y,date_published_y,score_y,rank_y,members_y,reviews_y
0,[One Piece],https://myanimelist.net/anime/21/One_Piece,1999-01,8.72,2,2345720,,['One Piece'],https://myanimelist.net/manga/13/One_Piece,1997-01,9.22,4,617283,[]
1,[Rurouni Kenshin: Meiji Kenkaku Romantan - Tsu...,https://myanimelist.net/anime/44/Rurouni_Kensh...,1999-01 - 1999-01,8.70,4,276470,,['Rurouni Kenshin: Meiji Kenkaku Romantan'],https://myanimelist.net/manga/22/Rurouni_Kensh...,1994-01 - 1999-01,8.56,5,997732,
2,[Slam Dunk],https://myanimelist.net/anime/170/Slam_Dunk,1993-01 - 1996-01,8.55,5,300220,,['Slam Dunk'],https://myanimelist.net/manga/51/Slam_Dunk,1990-01 - 1996-01,9.09,1,170265,
3,[Kimi no Suizou wo Tabetai],https://myanimelist.net/anime/36098/Kimi_no_Su...,2018-01 - 2018-01,8.55,1,962518,,['Kimi no Suizou wo Tabetai'],https://myanimelist.net/manga/104538/Kimi_no_S...,2016-01 - 2017-01,8.49,3,743375,
4,[Kono Subarashii Sekai ni Shukufuku wo! 3],https://myanimelist.net/anime/49458/Kono_Subar...,2024-01,8.55,2,309970,,['Kono Subarashii Sekai ni Shukufuku wo!'],https://myanimelist.net/manga/80385/Kono_Subar...,2014-01,8.00,5,23598,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,[Kaijuu no Kodomo],https://myanimelist.net/anime/37981/Kaijuu_no_...,2019-01 - 2019-01,7.20,2,109981,,['Kaijuu no Kodomo'],https://myanimelist.net/manga/5655/Kaijuu_no_K...,2005-01 - 2011-01,7.98,3,116163,
110,[Kamichama Karin],https://myanimelist.net/anime/1808/Kamichama_K...,2007-01 - 2007-01,7.20,5,752603,,['Kamichama Karin Chu'],https://myanimelist.net/manga/2575/Kamichama_K...,2006-01 - 2008-01,7.71,3,87955,
111,[Chuunibyou demo Koi ga Shitai! Ren Lite],https://myanimelist.net/anime/21797/Chuunibyou...,2013-01 - 2014-01,7.18,1,489473,,['Chuunibyou demo Koi ga Shitai!'],https://myanimelist.net/manga/40099/Chuunibyou...,2011-01 - 2017-01,8.05,5,179558,
112,[Dimension W],https://myanimelist.net/anime/31163/Dimension_W,2016-01 - 2016-01,7.18,5,298509,,['Dimension W'],https://myanimelist.net/manga/42279/Dimension_W,2011-01 - 2019-01,7.59,3,914941,
