# Top 1000 Manhwa Web Scraper
### Tugas Seleksi 1 Calon Asisten Lab Basis Data
### Wan Aufa Azis - 18221001

## Library Import

In [2]:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from datetime import datetime
import time
import requests
import json
import pandas as pd


## Function and Procedure

In [178]:

# Function to get manhwa_data from each url
def get_manhwa_data(url, headers):
    manhwa_webpage = requests.get(url, headers=headers)

    new_soup = BeautifulSoup(manhwa_webpage.content, "html.parser")

    # Get soup left_side of page 
    left_side = new_soup.find('div', attrs={'class' : 'leftside'})
    attrs = left_side.find_all('div', {'class' : 'spaceit_pad'})

    # Get att name
    name = new_soup.find('span', attrs={'class' : 'h1-title'}).text
    
    # Store genre and author list
    gen_list = []
    auth_list = []

    # Loop for extract manhwa attributes from left_side page soup
    for att in attrs:
        if("Volumes" in att.text):
            vol = att.text.strip('Volumes:').strip()
            if(vol=='Unknown'):
                vol = None
            else:
                vol = int(vol)
        elif("Chapters" in att.text):
            chap = att.text.strip('Chapters:').strip()
            if(chap=='Unknown'):
                chap = None
            else:
                chap = int(chap)
        elif("Status" in att.text):
            stat = att.text.strip('Status:').strip()
        elif("Published" in att.text):
            try:
                pubs = att.text.strip('Published:').strip().split(' to ')
                pubs_starts = pubs[0]
                pubs_ends = pubs[1]
                if(pubs_ends=='?'):
                    pubs_ends = None
                else:
                    pubs_ends = datetime.strptime(pubs_ends, "%b %d, %Y")
                pubs_starts = datetime.strptime(pubs_starts, "%b %d, %Y")
            except:
                pubs_starts = None
                pubs_ends = None
        elif("Genre" in att.text):
            gens = att.find_all('span', {'itemprop' : 'genre'})
            for gen in gens:
                gen_list.append(gen.text)
        elif("Serialization" in att.text):
            serial = att.text.strip('Serialization:').strip()
            if(serial=='N'):serial=None
        elif("Authors" in att.text):
            authors = att.find_all('a')
            for auth in authors:
                auth_list.append(auth.text)
        elif("Score" in att.text):
            score = float(att.find('span', {'itemprop' : 'ratingValue'}).text.strip())
        elif("Ranked" in att.text):
            rank = att.find('span').next.next.text.strip()
            rank = int(rank[1:])
        elif("Popularity" in att.text):
            pop_rank = int(att.text.strip('Popularity: #'))
        elif("Members" in att.text):
            memb = att.text.strip('Members:').strip()
            if(',' in memb):
                memb = memb.replace(',','')
            memb = int(memb)
        elif("Favorites" in att.text):
            fav = att.text.strip('Favorites:').strip()
            if(',' in fav):
                fav = fav.replace(',','')
            fav = int(fav)
        
    return name, vol, chap, stat, pubs_starts, pubs_ends, gen_list, serial, auth_list, score, rank, pop_rank, memb, fav

# Function to write data to json file
def write_json(new_data, filename='..\data\data_manhwa.json'):
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        # Join new_data with file_data inside emp_details
        file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file, indent = 5, default=str)

# Function to add each data_lsit to arr
def add_data_to_list(arr, data_list):
    for data in data_list:
        arr.append(data)

## Main Program

In [188]:

if __name__ == '__main__':

    # Headers for request
    headers = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'})
    
    # Webpage url
    url = 'https://myanimelist.net/topmanga.php?type=manhwa&limit=0'

    #Chrome webdriver initialization
    browser = webdriver.Chrome()
    browser.get(url)

    # List of all manhwa links
    links_list = []

    # Loop 20 times to get 1000 record
    for i in range(20):
        # Sub-webpage html
        html = browser.page_source

        # Soup object
        soup = BeautifulSoup(html, "html.parser")

        # Links extraction
        rows = soup.find_all('a', attrs={'class' : 'hoverinfo_trigger fs14 fw-b'})

        # Store the links to links_list
        for row in rows:
            links_list.append(row.get('href'))

        # Search for next sub-webpage button
        next_button = browser.find_element(By.XPATH, '//a[@class="link-blue-box next"]')

        if next_button.is_enabled:
            next_button.click()
            time.sleep(10)

    with open("..\data\data_manhwa.json", "w") as f:
        json.dump([], f)

    # Store all genre and author list
    all_genre_list = []
    all_author_list = []

    # Loop for extracting product details from each link
    for link in links_list:
        name, vol, chap, stat, pubs_starts, pubs_ends, gen_list, serial, auth_list, score, rank, pop_rank, memb, fav = get_manhwa_data(link, headers)

        data = {
            "name": name,
            "volumes": vol,
            "chapters": chap,
            "status": stat,
            "published_start": pubs_starts,
            "published_end": pubs_ends,
            "genres": gen_list,
            "serialization": serial,
            "authors": auth_list,
            "score": score,
            "ranked": rank,
            "popularity_rank": pop_rank,
            "members": memb,
            "favorites": fav
        }

        # Write data to json file
        write_json(data)

        # Add genre and author data to list
        add_data_to_list(all_genre_list, gen_list)
        add_data_to_list(all_author_list, auth_list)

    # Remove duplicates value from list
    all_genre_list = list(dict.fromkeys(all_genre_list))
    all_author_list = list(dict.fromkeys(all_author_list))




ValueError: invalid literal for int() with base 10: 'English: Who Can Define Popularity?'

## Data Converting to csv file

In [198]:

df = pd.read_json('..\data\data_manhwa.json')

# 1. Manhwa_Info Entity
manhwa_info_df = df[['name', 'volumes', 'chapters', 'status', 'published_start', 'published_end', 'serialization']]
manhwa_info_df.insert(0,'idmanhwa', range(1, len(manhwa_info_df)+1))

# 2. Genre Entity
genre_df = pd.DataFrame(all_genre_list, columns=['name'])
genre_df.insert(0, 'idgenre', range(1, len(genre_df)+1))
genre_df.insert(2, 'description', value=None)

genre_df.to_csv('..\data\genre.csv', index=False)

# 3. Manhwa_Genre Entity
manhwa_genre_df = (df[['name', 'genres']]).explode('genres')

manhwa_genre_df = manhwa_genre_df.merge(manhwa_info_df, left_on='name', right_on='name', how='left')
manhwa_genre_df = manhwa_genre_df.drop(['name', 'volumes', 'chapters', 'status', 'published_start', 'published_end', 'serialization'], axis=1)

manhwa_genre_df = manhwa_genre_df.merge(genre_df, left_on='genres', right_on='name', how='left')
manhwa_genre_df = manhwa_genre_df.drop(['name', 'genres', 'description'], axis=1)

manhwa_genre_df.to_csv('..\data\manhwa_genre.csv', index=False)

# 4. Manhwa_Statistic Entity
manhwa_statistic_df = df[['score', 'ranked', 'popularity_rank', 'members', 'favorites']]
manhwa_statistic_df.insert(0,'manhwa', range(1, len(manhwa_statistic_df)+1))

manhwa_statistic_df.to_csv('..\data\manhwa_statistic.csv', index=False)

# 5. Author Entity
author_df = pd.DataFrame(all_author_list, columns=['name'])
author_df.insert(0, 'idauthor', range(1, len(author_df)+1))
author_df.insert(2, 'family_name', value=None)
author_df.insert(3, 'gender', value=None)
author_df.insert(4, 'birthday', value=None)
author_df.insert(5, 'website', value=None)
author_df.insert(6, 'member_favorites', value=None)

author_df.to_csv('../data/author.csv', index=False)

# 6. Manhwa_Author Entity
manhwa_author_df = (df[['name', 'authors']]).explode('authors')

manhwa_author_df = manhwa_author_df.merge(manhwa_info_df, left_on='name', right_on='name', how='left')
manhwa_author_df = manhwa_author_df.drop(['name', 'volumes', 'chapters', 'status', 'published_start', 'published_end', 'serialization'], axis=1)

manhwa_author_df = manhwa_author_df.merge(author_df, left_on='authors', right_on='name', how='left')
manhwa_author_df = manhwa_author_df.drop(['authors', 'name', 'family_name', 'gender', 'birthday', 'website', 'member_favorites'], axis=1)

manhwa_author_df.to_csv('..\data\manhwa_author.csv', index=False)

# 7. serialization entity
serialization_df = df[['serialization']].dropna().drop_duplicates()
serialization_df.insert(0, 'idserialization', range(1, len(serialization_df)+1))
serialization_df.insert(2, 'website', value=None)
serialization_df.insert(3, 'manhwa_published', value=None)
serialization_df.insert(4, 'owner', value=None)
serialization_df.insert(5, 'launched', value=None)

serialization_df.to_csv('..\data\serialization.csv', index=False)

manhwa_info_df = manhwa_info_df.merge(serialization_df, on='serialization', how='left')
manhwa_info_df = manhwa_info_df.drop(['serialization', 'website', 'manhwa_published', 'owner', 'launched'], axis=1)

manhwa_info_df.to_csv('..\data\manhwa_info.csv', index=False)


## Data cleaning remove null and duplicate value

In [None]:

df = pd.read_csv('../data/manhwa_genre.csv')
df = df.dropna()
df = df.drop_duplicates()
df.to_csv('../data/manhwa_genre.csv', index=False)

df = pd.read_csv('../data/manhwa_author.csv')
df = df.dropna()
df = df.drop_duplicates()
df.to_csv('../data/manhwa_author.csv', index=False)