In [10]:
# Scraping MyAnimeList ID from Top Anime
# Url : https://myanimelist.net/topanime.php?limit=0

from bs4 import BeautifulSoup

In [20]:
# Get Anime Url from Sitemap
# Url : https://myanimelist.net/sitemap/anime-001.xml

from bs4 import BeautifulSoup
import json

# Read the downloaded sitemap. Sitemap files are UTF-8 by specification, so
# the encoding is stated explicitly instead of relying on the platform default.
with open('anime-001.xml', 'r', encoding='utf-8') as f:
    data = f.read()

AnimeData = BeautifulSoup(data, "xml")

# Each <loc> element of the sitemap carries one anime page URL.
url_list = AnimeData.find_all('loc')

# get_text() yields plain `str` values (a NavigableString from `.string`
# would keep the whole parse tree alive) and strip=True removes any
# whitespace/newlines surrounding the URL inside the element.
listID = [loc.get_text(strip=True) for loc in url_list]
# Drop empty <loc> entries so downstream cells never receive a blank URL.
listID = [anime_url for anime_url in listID if anime_url]

with open('anime_url.json', 'w', encoding='utf-8') as json_file:
    json.dump(listID, json_file, indent=4)

In [8]:
# Get anime detail from anime url

from bs4 import BeautifulSoup
import json
import time
import random
import re

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options

# Scrape the "/stats" page of every anime URL with a headless Firefox session.
# Successful results are collected in `allAnimeData` and written to
# anime_data.json; URLs that could not be fetched or parsed go to failed.txt.
options = Options()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options)

with open('anime_url_test.json', 'r') as json_file:
    url_list = json.load(json_file)

# Matches a run of digits with optional thousands separators, e.g. "1,234".
pattern = r'[\d,]+'
# The anime record below reads six counters from the stats rows.
expected_stat_count = 6

allAnimeData = []
try:
    with open('failed.txt', "w") as failedFile:
        for url in url_list:
            # Fetching and parsing share one try block: if the fetch fails we
            # must not parse, because driver.page_source would still hold the
            # previously loaded page and yield data for the wrong anime.
            try:
                print("Fetching {}".format(url))
                driver.get(url+"/stats")
                print("Fetch Complete")

                print("Parsing...")
                content = driver.page_source
                soup = BeautifulSoup(content, 'html.parser')

                title = soup.find('h1', class_='title-name').text
                score = soup.find('span', attrs={'itemprop': 'ratingValue'}).text
                ratingCount = soup.find('span', attrs={'itemprop': 'ratingCount'}).text

                rightSide = soup.find('div', class_='rightside')
                stats = rightSide.find_all("div", class_="spaceit_pad", recursive=False)

                # Take the first number of each stats row and drop the
                # thousands separators.
                summary_stats = []
                for stat in stats:
                    numbers = re.findall(pattern, stat.text)
                    summary_stats.append(numbers[0])
                cleaned_summary = [num.replace(',', '') for num in summary_stats]

                if len(cleaned_summary) < expected_stat_count:
                    raise ValueError(
                        "expected {} stat rows, found {}".format(
                            expected_stat_count, len(cleaned_summary)
                        )
                    )

                anime = {
                    'title': title,
                    'score': score,
                    'rating_count': ratingCount,
                    'watching': cleaned_summary[0],
                    'complete': cleaned_summary[1],
                    'on_hold': cleaned_summary[2],
                    'dropped': cleaned_summary[3],
                    'plan_to_watch': cleaned_summary[4],
                    'total': cleaned_summary[5],
                }

                allAnimeData.append(anime)

                print("Parsing Complete")
            except Exception as error:
                # `Exception` (not a bare except) so that Ctrl-C / kernel
                # interrupt still stops the crawl instead of being logged as
                # a failed URL.
                failedFile.write(url+"\n")
                print(error)

            # Pause after every attempt, including failed ones, so a run of
            # errors does not turn into back-to-back requests.
            random_sleep_time = random.uniform(1, 2)
            print(f"Sleeping for {random_sleep_time:.2f} seconds...")
            time.sleep(random_sleep_time)
finally:
    # Always shut the browser down; otherwise every run leaves a headless
    # Firefox process behind.
    driver.quit()

with open('anime_data.json', 'w') as json_file:
    json.dump(allAnimeData, json_file, indent=4)

Fetching https://myanimelist.net/anime/1/Cowboy_Bebop
Fetch Complete
Parsing...
module 're' has no attribute 'find'


In [24]:
# Crawling using JIKAN API

import requests

# Fetch the full record of every anime from the Jikan REST API and store the
# collected payloads in anime_data.json.
# NOTE(review): this writes to the same anime_data.json as the Selenium cell
# above and will overwrite its output - confirm that is intended.
with open('anime_url.json', 'r') as json_file:
    url_list = json.load(json_file)

dataFull = []

# Number of anime successfully fetched and stored in `dataFull`.
counter = 0

id_pattern = re.compile(r'/anime/(\d+)/')
for url in url_list:
    match = id_pattern.search(url)
    if not match:
        # Without this guard the ID of the previous URL would be reused
        # (or a NameError raised on the very first URL).
        print("Skipping {} (no anime ID found)".format(url))
        continue
    anime_id = match.group(1)  # Extract the ID part

    jikanUrl = "https://api.jikan.moe/v4/anime/{}/full".format(anime_id)

    print("Fetching data {}".format(url))
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        animeData = requests.get(url=jikanUrl, timeout=30)
        # Error responses (e.g. 404 or 429) carry no 'data' key; surface them
        # as exceptions so they are reported and skipped.
        animeData.raise_for_status()
        data = animeData.json()
        dataFull.append(data['data'])
        counter = counter + 1
    except (requests.RequestException, ValueError, KeyError) as error:
        # Report and move on so one bad response does not discard everything
        # collected so far.
        print("Failed {}: {}".format(url, error))

    # Fixed delay between requests, applied after failures as well.
    time.sleep(2)

with open('anime_data.json', 'w') as json_file:
    json.dump(dataFull, json_file, indent=4)

Fetching data https://myanimelist.net/anime/1/Cowboy_Bebop
Fetching data https://myanimelist.net/anime/5/Cowboy_Bebop__Tengoku_no_Tobira
Fetching data https://myanimelist.net/anime/6/Trigun
Fetching data https://myanimelist.net/anime/7/Witch_Hunter_Robin
Fetching data https://myanimelist.net/anime/8/Bouken_Ou_Beet
Fetching data https://myanimelist.net/anime/15/Eyeshield_21
Fetching data https://myanimelist.net/anime/16/Hachimitsu_to_Clover
Fetching data https://myanimelist.net/anime/17/Hungry_Heart__Wild_Striker
Fetching data https://myanimelist.net/anime/18/Initial_D_Fourth_Stage
Fetching data https://myanimelist.net/anime/19/Monster
Fetching data https://myanimelist.net/anime/20/Naruto
Fetching data https://myanimelist.net/anime/21/One_Piece
Fetching data https://myanimelist.net/anime/22/Tennis_no_Oujisama
Fetching data https://myanimelist.net/anime/23/Ring_ni_Kakero_1
Fetching data https://myanimelist.net/anime/24/School_Rumble
Fetching data https://myanimelist.net/anime/25/Sunabouz