# Generate Anime Data from HTMLs

In [1]:
import os
import shutil

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

## Get MAL ID List from Directory

In [2]:
# Directory the notebook is run from; the saved pages are read from its
# 'html' sub-directory.
ROOT_PATH = os.getcwd()
HTML_PATH = f'{ROOT_PATH}/html'

In [3]:
# Every '<id>.html' file in HTML_PATH contributes its numeric stem as a MAL ID;
# the IDs are kept in ascending order.
mal_id_list = sorted(
    int(file_name[:-5])  # drop the 5-character '.html' suffix
    for file_name in os.listdir(HTML_PATH)
    if file_name.endswith('.html')
)

## Define HTML Parsing Function

In [4]:
def get_html(mal_id):
    """Return the text of the saved HTML page for ``mal_id``.

    Parameters
    ----------
    mal_id : int
        ID whose page is stored as ``<HTML_PATH>/<mal_id>.html``.

    Returns
    -------
    str
        The full file contents.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    file_name = HTML_PATH + f'/{mal_id}.html'
    # Use a context manager so the handle is closed even if read() fails;
    # the loop below opens thousands of files.
    # NOTE(review): the file is decoded with the platform default encoding,
    # as before — confirm whether an explicit encoding should be passed.
    with open(file_name, 'r') as f:
        return f.read()

In [5]:
def remove_tail(s):
    """Strip a trailing parenthesised or bracketed suffix from ``s``.

    If ``s`` ends with ``)``, everything from the last ``(`` onward is removed.
    If there is no ``(`` at all, the text is cut at the first ``'Source:'``
    marker instead. If ``s`` ends with ``]``, everything from the last ``[``
    onward is removed. When no matching opener (and no ``'Source:'`` marker)
    exists, ``s`` is returned unchanged.

    Parameters
    ----------
    s : str
        Text to clean.

    Returns
    -------
    str
        ``s`` without the trailing suffix, stripped of surrounding whitespace
        when something was removed; otherwise ``s`` itself.
    """
    if not s:
        return s

    if s[-1] == ')':
        open_idx = s.rfind('(')
        if open_idx != -1:
            return s[:open_idx].strip()

        # Unbalanced ')': fall back to cutting at the 'Source:' marker.
        source_idx = s.find('Source:')
        if source_idx != -1:
            return s[:source_idx].strip()

        # Nothing recognisable to remove; keep the text intact rather than
        # slicing with -1 and losing the final character.
        return s

    if s[-1] == ']':
        open_idx = s.rfind('[')
        if open_idx != -1:
            return s[:open_idx].strip()
        # Unbalanced ']': leave the text alone instead of indexing past the
        # start of the string.
        return s

    return s

In [6]:
def get_info(mal_id):
    """Extract name, type, genres and synopsis from the saved page of ``mal_id``.

    Parameters
    ----------
    mal_id : int
        ID passed to ``get_html`` to load the page.

    Returns
    -------
    list
        ``[name, type_, genre, synopsis]`` where ``genre`` is a single
        ``", "``-separated string and ``type_`` is ``'Unknown'`` when the
        page has no type link.

    Raises
    ------
    AttributeError
        If the page lacks the name or synopsis element.
    """
    data = get_html(mal_id)
    soup = BeautifulSoup(data, "html.parser")

    # get name
    div_name = soup.find("div", {"itemprop": "name"})
    name = div_name.find("strong").text.strip()

    # get type: either the span or the link inside it may be absent, in which
    # case the type is reported as 'Unknown'. Explicit None checks replace a
    # bare ``except`` so unrelated errors and interrupts are not swallowed.
    type_span = soup.find("span", {"class": "information type"})
    type_link = type_span.find("a") if type_span is not None else None
    type_ = type_link.text.strip() if type_link is not None else 'Unknown'

    # get genre string
    rows = soup.find_all("span", {"itemprop": "genre"})
    genre = ", ".join(row.text.strip() for row in rows)

    # get synopsis string: collapse all whitespace runs to single spaces,
    # then drop the trailing "(...)" / "[...]" suffix.
    synopsis = soup.find("p", {"itemprop": "description"}).text.strip()
    synopsis = " ".join(synopsis.split())
    synopsis = remove_tail(synopsis)

    return [name, type_, genre, synopsis]

## Get Information from HTMLs

In [7]:
# One row per page: the MAL ID followed by the fields returned by get_info.
full_info = [
    [mal_id, *get_info(mal_id)]
    for mal_id in tqdm(mal_id_list)
]

100%|██████████| 17562/17562 [14:42<00:00, 19.90it/s]


In [8]:
# Tabulate the collected rows; the column order mirrors each row's layout.
info_df = pd.DataFrame(
    full_info,
    columns=['MAL_ID', 'Name', 'Type', 'Genre', 'Synopsis'],
)

In [9]:
# Show the assembled table as the cell output.
info_df

Unnamed: 0,MAL_ID,Name,Type,Genre,Synopsis
0,1,Cowboy Bebop,TV,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,Movie,"Action, Drama, Mystery, Sci-Fi, Space","Another day, another bounty—such is the life o..."
2,6,Trigun,TV,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,TV,"Action, Mystery, Police, Supernatural, Drama, ...",Witches are individuals with special powers li...
4,8,Bouken Ou Beet,TV,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...
...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,ONA,"Adventure, Mystery, Supernatural",No synopsis information has been added to this...
17558,48483,Mieruko-chan,TV,"Comedy, Horror, Supernatural",Miko is a typical high school student whose li...
17559,48488,Higurashi no Naku Koro ni Sotsu,TV,"Mystery, Dementia, Horror, Psychological, Supe...",Sequel to Higurashi no Naku Koro ni Gou .
17560,48491,Yama no Susume: Next Summit,TV,"Adventure, Slice of Life, Comedy",New Yama no Susume anime.


In [10]:
# Persist the table to 'anime.csv' without writing the row index.
info_df.to_csv(path_or_buf='anime.csv', index=False)

In [11]:
# WARNING: destructive. Deletes the source HTML directory, so the cells above
# cannot be re-run afterwards without restoring the pages.
# Uses HTML_PATH (the same directory the pages were read from) instead of a
# shell command with a hard-coded relative path; ignore_errors=True keeps the
# "do nothing if it is already gone" behaviour of `rm -rf`.
shutil.rmtree(HTML_PATH, ignore_errors=True)