In [1]:
import csv
import pandas as pd
import numpy as np
import requests 
from bs4 import BeautifulSoup
import time

In [2]:
# IDs of the series pages to scrape (filled from manga_ids.csv).
manga_ids = list()

# One accumulator list per scraped field. The scraping loop appends exactly one
# entry to each list per series, so position i across all lists describes the
# same series.
series_names = list()
series_synopses = list()
series_english_names = list()
series_japanese_names = list()
series_types = list()
series_volumes = list()
series_chapters = list()
series_status = list()
series_publish_time = list()
series_magazines = list()
series_popularity = list()
series_num_members = list()
series_num_favourites = list()
series_genres = list()
series_themes = list()
series_demo = list()
series_authors = list()
series_scores = list()
series_ranks = list()

In [3]:
# Load the series IDs from the first column of manga_ids.csv.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('manga_ids.csv', newline = '') as f:
    reader = csv.reader(f)
    next(reader, None)  # discard the first row (header) instead of slicing it off afterwards
    # Rebuild the list from scratch so re-running this cell cannot duplicate IDs.
    # csv.reader yields an empty list for a blank line; those rows are skipped
    # because indexing them with [0] would raise IndexError.
    manga_ids = [row[0] for row in reader if row]

In [4]:
def get_name(data):
    ''' 
    Extract the series name from a parsed series page.

    Parameters:
        data: The parsed page to search. It only needs a BeautifulSoup-style
              find(name, attrs) method; the scraping loop passes the page's
              BeautifulSoup object.
    Outputs:
        name: (str/None) Text of the first <span itemprop="name"> element;
              None if the page has no such element
    '''

    # Search the object that was passed in rather than a global `soup`, so the
    # function no longer depends on a variable some other cell happened to define.
    name_tag = data.find('span', {'itemprop': 'name'})
    if name_tag is None:
        return None
    return name_tag.text

In [5]:
def get_synopsis(data):
    ''' 
    Extract the series synopsis from a parsed series page.

    Parameters:
        data: The parsed page to search. It only needs a BeautifulSoup-style
              find(name, attrs) method; the scraping loop passes the page's
              BeautifulSoup object.
    Outputs:
        synopsis: (str/None) Text of the first <span itemprop="description">
                  element; None if the page has no such element
    '''

    # Search the object that was passed in rather than a global `soup`, so the
    # function no longer depends on a variable some other cell happened to define.
    synopsis_tag = data.find('span', {'itemprop': 'description'})
    if synopsis_tag is None:
        return None
    return synopsis_tag.text

In [6]:
def check_field(name, data):
    '''
    Locate the entry whose label contains the given field name.

    Parameters:
        name: (str) The name of the field whose existence you want to check;
              matched as a substring of the label text
        data: (list) The list of data you want to search through; each entry
              must offer a BeautifulSoup-style find(name, attrs) method

    Outputs:
        index: (int) Returns index of the first matching entry; -1 otherwise
    '''

    for index, entry in enumerate(data):
        label = entry.find('span', {'class': 'dark_text'})
        # find() returns None when an entry has no label span. Such an entry
        # cannot match, and reading .text on None would raise AttributeError.
        if label is not None and name in label.text:
            return index

    return -1

In [7]:
def get_info_single(name, data):
    ''' 
    Read the single value stored under one labelled field.

    Parameters:
        name: (str) The label of the field to read; located with check_field
        data: (list) The list that holds the information we want to scrape
    Outputs:
        info: (str/None) Returns string value of the information to scrape; None if it doesn't exist
    '''

    position = check_field(name, data)
    if position == -1:
        return None

    children = data[position].contents
    # 'Type' and 'Serialization' take the .text of the child at index 2; every
    # other field takes the child at index 1, stripped of surrounding whitespace.
    # NOTE(review): presumably those two values are wrapped in their own
    # element on the page -- confirm against the page markup.
    reads_nested_element = name in ('Type', 'Serialization')
    return children[2].text if reads_nested_element else children[1].strip()

In [8]:
def get_info_multi(name, data):
    ''' 
    Read every value stored under one labelled multi-value field.

    Parameters:
        name: (str) The label of the field to read; located with check_field
        data: (list) The list that holds the information we want to scrape
    Outputs:
        info: (str/None) Comma-delimited text of every <span itemprop="genre">
              inside the matching entry; None if the field doesn't exist
    '''

    position = check_field(name, data)
    if position == -1:
        return None

    # Collect the text of each tagged span in document order, then join once.
    texts = []
    for span in data[position].find_all('span', {'itemprop': 'genre'}):
        texts.append(span.text)
    return ','.join(texts)

In [9]:
def get_authors(data):
    ''' 
    Read the author names stored under the 'Authors' field.

    Parameters:
        data: (list) The list that holds the information we want to scrape
    Outputs:
        authors: (str/None) comma-delimited sequence of the authors of the series;
                 None if there is no 'Authors' field
    '''

    position = check_field('Authors', data)
    if position == -1:
        return None

    # Every second child starting at index 2, with the final child excluded.
    # NOTE(review): presumably the skipped children are the role text between
    # author elements -- confirm against the page markup.
    author_nodes = data[position].contents[2:-1:2]
    # ', ' inside a name becomes ' ' so it cannot be confused with the
    # ',' delimiter used in the joined result.
    cleaned_names = (node.text.replace(', ', ' ') for node in author_nodes)
    return ','.join(cleaned_names)

In [10]:
def get_score(data):
    ''' 
    Read the series score stored under the 'Score' field.

    Parameters:
        data: (list) The list that holds the information we want to scrape
    Outputs:
        score: (str/None) The fan voted score out of 10 for the series; None if
               there is no 'Score' field or it holds no rating value
    '''

    index = check_field('Score', data)
    if index == -1:
        return None

    rating = data[index].find('span', {'itemprop': 'ratingValue'})
    # find() returns None when the entry has no ratingValue span (presumably the
    # case for series without a score -- confirm against the page markup).
    # Report that as a missing value instead of raising AttributeError on None.text.
    if rating is None:
        return None
    return rating.text
    

In [11]:
def get_rank(data):
    ''' 
    Read the series rank from a parsed series page.

    Parameters:
        data: The parsed page to search. It only needs a BeautifulSoup-style
              find(name, class_=...) method; the scraping loop passes the
              page's BeautifulSoup object, not the list of info entries.
    Outputs:
        rank: (str/None) The fan determined rank of the series amongst the others;
              None if the statistics div is missing or has fewer than three children
    '''

    rank_block = data.find('div', class_ = "spaceit_pad po-r js-statistics-info di-ib")
    # Consistent with the other getters, a page without the expected block (or
    # with too few children to index) yields None instead of an
    # AttributeError / IndexError that would abort the whole scrape.
    if rank_block is None or len(rank_block.contents) < 3:
        return None

    # The rank text is taken from the third child of the block.
    return rank_block.contents[2].strip()

In [12]:
# Scrape the first 10 series pages and append one value per field to the
# accumulator lists.
for manga_id in manga_ids[:10]:
    try:
        # Without a timeout, requests waits indefinitely on a stalled connection,
        # which would hang the whole run on a single bad response.
        response = requests.get(f'https://myanimelist.net/manga/{manga_id}', timeout = 30)
    except requests.RequestException as error:
        print(f'Request failed ({error}). Restart from: {manga_id}')
        break

    if response.status_code != 200:
        print(f'Failed to scrape the web page. Restart from: {manga_id}')
        break 

    soup = BeautifulSoup(response.text, 'html.parser')
    temp_block = soup.find('td', {'class': 'borderClass'})
    if temp_block is None:
        # The page loaded but lacks the expected info column; stop here instead
        # of crashing on None.find_all below.
        print(f'Failed to scrape the web page. Restart from: {manga_id}')
        break
    info = temp_block.find_all('div', {'class': 'spaceit_pad'})

    # Extract every field BEFORE touching the accumulator lists. If any getter
    # raises, nothing has been appended yet, so all lists keep the same length
    # and can still be combined into a DataFrame.
    row = (
        (series_names, get_name(soup)),
        (series_synopses, get_synopsis(soup)),
        (series_english_names, get_info_single('English', info)),
        (series_japanese_names, get_info_single('Japanese', info)),
        (series_types, get_info_single('Type', info)),
        (series_volumes, get_info_single('Volumes', info)),
        (series_chapters, get_info_single('Chapter', info)),
        (series_status, get_info_single('Status', info)),
        (series_publish_time, get_info_single('Published', info)),
        (series_magazines, get_info_single('Serialization', info)),
        (series_popularity, get_info_single('Popularity', info)),
        (series_num_members, get_info_single('Member', info)),
        (series_num_favourites, get_info_single('Favorite', info)),
        (series_genres, get_info_multi('Genre', info)),
        (series_themes, get_info_multi('Theme', info)),
        (series_demo, get_info_multi('Demographic', info)),
        (series_authors, get_authors(info)),
        (series_scores, get_score(info)),
        (series_ranks, get_rank(soup)),
    )
    for column, value in row:
        column.append(value)

    # Wait 5 seconds between pages (presumably to avoid hammering the site).
    time.sleep(5)

In [13]:
# Combine the per-field lists into a single table with one column per field.
# Pairs are listed in output column order; dict() preserves that order.
df = pd.DataFrame(
    dict(
        [
            ('name', series_names),
            ('english_name', series_english_names),
            ('japanese_name', series_japanese_names),
            ('synopsis', series_synopses),
            ('type', series_types),
            ('volumes', series_volumes),
            ('chapters', series_chapters),
            ('status', series_status),
            ('publishing_period', series_publish_time),
            ('genres', series_genres),
            ('themes', series_themes),
            ('authors', series_authors),
            ('demographic', series_demo),
            ('magazine', series_magazines),
            ('score', series_scores),
            ('rank', series_ranks),
            ('popularity', series_popularity),
            ('members', series_num_members),
            ('favourites', series_num_favourites),
        ]
    )
)

In [14]:
# Write the scraped table to manga_info.csv; index=False leaves the row index out of the file.
df.to_csv(path_or_buf = 'manga_info.csv', index = False)