# wuxia-pdf

wuxia-pdf is a tool that creates a PDF of any novel on [wuxiaworld](http://wuxiaworld.com).

My reason for creating this was to easily read my novels in bulk on mobile, in areas where I don't have internet service.

In [1]:
from bs4 import BeautifulSoup, NavigableString
import urllib3
urllib3.disable_warnings()

from IPython.display import IFrame

from reportlab.lib.colors import HexColor
from reportlab.lib.enums import TA_CENTER
from reportlab.lib.units import inch
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont 
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

from PyPDF2 import PdfFileMerger, PdfFileReader
import os

http = urllib3.PoolManager()

def load_chapter(novel, num, pre_index=None):
    """Download one chapter page and split it into a title and a body.

    Args:
        novel: Novel slug used in the ``{novel}-chapter-{num}`` part of the URL.
        num: Chapter number to fetch.
        pre_index: Slug used in the ``{pre}-index`` part of the URL. Falls
            back to ``novel`` when ``None``.

    Returns:
        A dict with the keys ``"body"``, ``"title"``, ``"novel"`` and
        ``"number"``, or ``None`` when the page has no ``articleBody`` div
        or no non-empty element that could serve as the title.
    """
    pre = novel if pre_index is None else pre_index
    url = f"http://www.wuxiaworld.com/{pre}-index/{novel}-chapter-{num}"
    response = http.request("GET", url)
    soup = BeautifulSoup(response.data, "html5lib").find("body")

    article = soup.find("div", itemprop="articleBody")
    if article is None:
        return None

    title = None
    body_parts = []
    for p in article.children:
        # Skip bare text nodes and any element that contains a link.
        if isinstance(p, NavigableString) or p.find("a"):
            continue
        text = p.text
        # The first element with non-empty text is the title. Compare by
        # value (truthiness), not identity: `is not ""` is not a reliable
        # string comparison.
        if title is None and text:
            title = text
        else:
            body_parts.append(f"{text}<br/><br/>")

    if title is None:
        return None

    body = "".join(body_parts)

    # Sometimes a chapter will be inside one <p>, rather than distributed
    # through the div in <p>'s. In that case the "title" holds the whole
    # chapter: use its first line as the title and the full text as the body.
    if len(title) > len(body):
        whole_text = title
        title = whole_text.split("\n")[0]
        body = whole_text.replace("\n", "<br/>")

    return {"body": body, "title": title, "novel": novel, "number": num}

# PDF Generation

Using the reportlab module, you can create readable, customizable chapters.

In [2]:
# Register the Open Sans TrueType font (loaded from OpenSans-Regular.ttf) so
# paragraph styles can refer to it by name.
pdfmetrics.registerFont(TTFont("Open Sans", "OpenSans-Regular.ttf"))

# Paragraph styles used by make_pdf:
#   "default" - reportlab's sample "Normal" style, switched to Open Sans
#   "title"   - derived from "default": larger, centered and colored
styles = {}
styles["default"] = getSampleStyleSheet()["Normal"]
styles["default"].fontName = "Open Sans"
styles["title"] = ParagraphStyle(
    "title",
    parent=styles["default"],
    textColor=HexColor(0x3970D0),
    alignment=TA_CENTER,
    fontSize=18,
)

def make_pdf(chapter, ignore):
    """Render one chapter to ``{novel}_{number}.pdf`` in the working directory.

    Args:
        chapter: Dict with the keys ``"novel"``, ``"number"``, ``"title"``
            and ``"body"``.
        ignore: When true, the title is drawn in the default style and is
            followed by a negative spacer; otherwise it is drawn in the
            title style and followed by a quarter-inch gap.

    Returns:
        The generated PDF opened for binary reading. The caller is
        responsible for closing it.
    """
    file_name = f"{chapter['novel']}_{chapter['number']}.pdf"

    half_inch = inch / 2
    doc = SimpleDocTemplate(
        file_name,
        rightMargin=half_inch,
        leftMargin=half_inch,
        topMargin=inch / 4,
        bottomMargin=half_inch,
    )

    # Pick the title style and the gap after the title together.
    if ignore:
        title_style, gap = styles["default"], -2 * inch / 4
    else:
        title_style, gap = styles["title"], inch / 4

    story = [
        Paragraph(chapter["title"], title_style),
        Spacer(0, gap),
        Paragraph(chapter["body"], styles["default"]),
    ]
    doc.build(story)

    return open(file_name, "rb")

# PDF Merging

PyPDF2 allows for the merging of multiple PDF files.

After each chapter PDF is created, it is merged into the combined file and then promptly deleted.

In [3]:
# Kept so the module-level name still exists for existing users; group_pdf
# now builds its own merger per call so separate calls do not share pages.
merger = PdfFileMerger()


def group_pdf(novel, start=1, depth=1000, pre=None, ignore=False):
    """Build one PDF out of consecutive chapters of a novel.

    Chapters ``start`` .. ``start + depth - 1`` are fetched in order; the
    loop stops early at the first chapter that ``load_chapter`` cannot load.
    Each chapter is rendered to its own PDF, appended to the combined
    document, and the per-chapter file is then closed and deleted.

    Args:
        novel: Novel slug passed to ``load_chapter``.
        start: First chapter number (at least 1).
        depth: Maximum number of chapters to include (at least 1).
        pre: Optional index slug, for novels whose index is named
            differently from their chapters.
        ignore: Passed through to ``make_pdf``.

    Returns:
        The combined PDF, ``{novel}_{start}-{last}.pdf``, opened for binary
        reading, where ``last`` is the final chapter actually merged (or
        ``start`` if no chapter could be loaded).

    Raises:
        ValueError: If ``start`` or ``depth`` is less than 1.
    """
    if start < 1 or depth < 1:
        raise ValueError("start and depth must both be at least 1")

    # A fresh merger per call: a shared one would carry the pages of every
    # previous call into this output.
    combined = PdfFileMerger()
    last_merged = None

    for n in range(start, start + depth):
        # load_chapter treats pre_index=None as "same as the novel slug".
        chap = load_chapter(novel, n, pre_index=pre)
        if chap is None:
            break

        pdf = make_pdf(chap, ignore)
        try:
            combined.append(PdfFileReader(pdf))
        finally:
            # Always clean up the per-chapter file, even if appending fails.
            pdf.close()
            os.remove(pdf.name)

        # Record the chapter only after it has really been merged, so the
        # output name never mentions a chapter that is not in the file.
        last_merged = n

    end = start if last_merged is None else last_merged
    out_file = f"{novel}_{start}-{end}.pdf"
    combined.write(out_file)
    combined.close()

    return open(out_file, "rb")

Most generated PDFs are perfect, and the rest have only a few errors.

Some chapters will look odd, because almost every novel has at least one chapter with a completely different format somewhere in the middle.

Novels whose index is organized by both book and chapter are not supported for now.