In [1]:
import time
import subprocess
import os
from urllib.request import urlopen

import pandas as pd
from bs4 import BeautifulSoup
from pdfrw import PdfReader, PdfWriter

## settings

In [2]:
filenames = [
    '/Users/ygene.lee/Desktop/pdf_raw/회계학 문제집_신은미.pdf',
    '/Users/ygene.lee/Desktop/pdf_raw/객관식민법_이찬석.pdf',
    '/Users/ygene.lee/Desktop/pdf_raw/객관식 경제학_미시편(2판)_함경백.pdf',
    '/Users/ygene.lee/Desktop/pdf_raw/관계법규객관식_이상곤.pdf',
]
intervals = [
    [(1,371),(372,508),(510,669)],
    [(1,222),(224,428)],
    [(1,318),(319,579),(580,745)],
    [(1,168),(170,229),(230,292),(294,365),(366,422),(424,489),(490,546),(548,630)],
]
start_pages = [[s for (s, e) in i] for i in intervals]

save_path = '/Users/ygene.lee/Desktop/pdf_split'
os.makedirs(save_path, exist_ok=True)

## split pdf

In [3]:
for i, f in enumerate(filenames):
    
    name, ext = f.split('/')[-1].split('.')
    starts = start_pages[i]
    
    pdf = PdfReader(f)
    file_num_fmt = '{' + ':0{}d'.format(len(str(len(starts)))) + '}'

    first = True
    for page_num, page in enumerate(pdf.pages, 1):
        if page_num in starts:
            if first:
                writer = PdfWriter()
                first = False
                file_num = 1
                save_name = '{}_{}.{}'.format(name, file_num_fmt.format(file_num), ext)
                print('file name: {}, start page: {}'.format(save_name, page_num))
            else:                
                # save previous file
                writer.write(os.path.join(save_path, save_name))                
                # open new writer
                writer = PdfWriter()
                file_num += 1
                save_name = '{}_{}.{}'.format(name, file_num_fmt.format(file_num), ext)
                print('file name: {}, start page: {}'.format(save_name, page_num))
                
        writer.addpages([page])
        
    # save last file
    writer.write(os.path.join(save_path, save_name))                

file name: 회계학 문제집_신은미_1.pdf, start page: 1
file name: 회계학 문제집_신은미_2.pdf, start page: 372
file name: 회계학 문제집_신은미_3.pdf, start page: 510
file name: 객관식민법_이찬석_1.pdf, start page: 1
file name: 객관식민법_이찬석_2.pdf, start page: 224
file name: 객관식 경제학_미시편(2판)_함경백_1.pdf, start page: 1
file name: 객관식 경제학_미시편(2판)_함경백_2.pdf, start page: 319
file name: 객관식 경제학_미시편(2판)_함경백_3.pdf, start page: 580
file name: 관계법규객관식_이상곤_1.pdf, start page: 1
file name: 관계법규객관식_이상곤_2.pdf, start page: 170
file name: 관계법규객관식_이상곤_3.pdf, start page: 230
file name: 관계법규객관식_이상곤_4.pdf, start page: 294
file name: 관계법규객관식_이상곤_5.pdf, start page: 366
file name: 관계법규객관식_이상곤_6.pdf, start page: 424
file name: 관계법규객관식_이상곤_7.pdf, start page: 490
file name: 관계법규객관식_이상곤_8.pdf, start page: 548


## file sizes

In [4]:
raw_path = '/Users/ygene.lee/Desktop/pdf_raw/'
split_path = '/Users/ygene.lee/Desktop/pdf_split/'

raw_files = os.listdir(raw_path)
split_files = os.listdir(split_path)

raw_sizes = []
for f in raw_files:
    raw_sizes.append(os.path.getsize(os.path.join(raw_path, f)) / 10**6)

split_sizes = []
for f in split_files:
    split_sizes.append(os.path.getsize(os.path.join(split_path, f)) / 10**6)

In [5]:
split_data = pd.DataFrame()
split_data['file_name'] = split_files
split_data['size_mb'] = split_sizes

raw_data = pd.DataFrame()
raw_data['file_name'] = raw_files
raw_data['size_mb'] = raw_sizes

In [6]:
split_data['base_name'] = split_data.file_name.apply(lambda x: '_'.join(x[:-4].split('_')[:-1]))
raw_data['base_name'] = raw_data.file_name.apply(lambda x: x[:-4])

In [7]:
split_data.groupby(['base_name'])['size_mb'].sum().to_frame().reset_index()

Unnamed: 0,base_name,size_mb
0,.DS,0.006148
1,객관식 경제학_미시편(2판)_함경백,316.716778
2,객관식민법_이찬석,225.428205
3,관계법규객관식_이상곤,276.98997
4,회계학 문제집_신은미,280.593954


In [8]:
raw_data[['base_name', 'size_mb']].sort_values('base_name')

Unnamed: 0,base_name,size_mb
0,.DS_S,0.006148
3,객관식 경제학_미시편(2판)_함경백,316.931209
4,객관식민법_이찬석,225.550961
1,관계법규객관식_이상곤,277.171067
2,회계학 문제집_신은미,280.786773
