In [1]:
from io import BytesIO

from IPython.core import display
from PIL import Image

# Installed apps
from PyPDF2 import PdfFileReader
from sorl.thumbnail import ImageField
from wand.image import Image as WandImage
from wand.color import Color
import os.path
# Project apps

def display_pil_image(im):
   """Return the PNG representation of a PIL image for IPython display.

   The image is saved as PNG into an in-memory buffer, the bytes are wrapped
   in an embedded ``display.Image`` and that object's ``_repr_png_()`` result
   is returned.
   """
   with BytesIO() as png_buffer:
      im.save(png_buffer, format='png')
      png_bytes = png_buffer.getvalue()
   return display.Image(
      data=png_bytes, format='png', embed=True,
   )._repr_png_()


# Register display_pil_image with the 'image/png' display formatter so that
# PIL Image.Image objects are rendered through it.
# get_ipython() is not imported here; it is expected to be supplied by the
# running IPython/Jupyter session.
png_formatter = get_ipython().display_formatter.formatters['image/png']
# dpi keeps whatever for_type() returns (presumably the previously registered
# function, if any -- confirm against the IPython formatter docs).
dpi = png_formatter.for_type(Image.Image, display_pil_image)

In [2]:
import os
from glob import glob
from datetime import datetime
from django.conf import settings
from django.core.files.base import ContentFile
from apps.issues.models import PrintIssue, current_issue

# Folder that get_staging_pdf_files() scans for incoming PDF files.
PDF_STAGING = os.path.join(settings.STAGING_ROOT, 'STAGING', 'PDF')
# Second PDF folder under STAGING_ROOT; not referenced in the visible cells.
PDF_FOLDER = os.path.join(settings.STAGING_ROOT, 'pdf')
# str.format template expecting `issue` (with .date.year and .number) and
# `suffix`; presumably `issue` is a PrintIssue -- not used in the visible cells.
FILENAME_PATTERN = 'universitas_{issue.date.year}-{issue.number}{suffix}.pdf'

def get_staging_pdf_files(magazine='1', folder=None, max_age_days=4):
    """Return recent staging PDFs for one magazine, deleting stale ones.

    Files matching ``UNI1<magazine>VER*.pdf`` inside ``folder`` are examined.
    A file whose ctime lies more than ``max_age_days`` whole days in the past
    is removed from disk; all other matches are returned.

    Args:
        magazine: Text inserted after ``UNI1`` in the glob pattern.
        folder: Directory to scan. ``None`` (the default) means the
            module-level ``PDF_STAGING`` folder.
        max_age_days: Files strictly older than this many whole days are
            deleted instead of returned.

    Returns:
        Sorted list of the paths that were kept.
    """
    if folder is None:
        # Resolved lazily so callers can scan another folder without
        # needing the Django settings behind PDF_STAGING.
        folder = PDF_STAGING
    globpattern = '{folder}/UNI1{version}VER*.pdf'.format(
        folder=folder,
        version=magazine,
    )
    now = datetime.now()  # one reference time for the whole scan
    new_files = []
    for pdf_file in glob(globpattern):
        age = now - datetime.fromtimestamp(os.path.getctime(pdf_file))
        if age.days > max_age_days:
            # Side effect: stale staging files are deleted, not just skipped.
            os.remove(pdf_file)
        else:
            new_files.append(pdf_file)
    return sorted(new_files)

# Staging PDFs for the default magazine ('1'). Note that the call also
# deletes matching files it considers too old.
pdf_files = get_staging_pdf_files()

In [16]:
import subprocess
import os
import hashlib
import tempfile

def hash_file(filepath, blocksize=65536):
    """Return the sha256 hex digest of a file on local disk.

    The file is opened in binary mode and consumed ``blocksize`` bytes at a
    time, so it is never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(filepath, 'rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(blocksize), b''):
            digest.update(chunk)
    return digest.hexdigest()


def tmp_file(filepath):
    """Build a content-derived path for ``filepath`` in the temp directory.

    The result is ``<tempdir>/<hash_file(filepath)>.<basename of filepath>``.
    Nothing is created on disk; only the path string is returned.
    """
    cached_name = '{}.{}'.format(
        hash_file(filepath), os.path.basename(filepath))
    return os.path.join(tempfile.gettempdir(), cached_name)

def optimize_page(input_file):
    """Compress images and convert to rgb, caching the result on disk.

    Runs Ghostscript (``/usr/bin/gs``, pdfwrite device) on ``input_file``
    with sRGB colour-conversion and 120 dpi image-downsampling options. The
    output path comes from ``tmp_file(input_file)``; if that file already
    exists Ghostscript is not run again.

    Ghostscript writes to ``<output>.part`` first and the file is moved into
    place only after a successful run, so a failed or interrupted conversion
    can never be mistaken for a cached result by a later call.

    Args:
        input_file: Path of the PDF to optimize.

    Returns:
        Path of the optimized PDF.

    Raises:
        subprocess.CalledProcessError: Ghostscript exited with a non-zero
            status.
    """
    output_file = tmp_file(input_file)
    if os.path.exists(output_file):
        return output_file

    partial_file = output_file + '.part'
    binary = '/usr/bin/gs'
    args = [
        binary,
        '-dColorConversionStrategy=/sRGB',
        '-dColorConversionStrategyForImages=/sRGB',
        '-dBATCH',
        '-dNOPAUSE',
        '-sDEVICE=pdfwrite',
        '-dConvertCMYKImagesToRGB=true',
        '-dDownsampleColorImages=true',
        '-dDownsampleGrayImages=true',
        '-dDownsampleMonoImages=true',
        '-dColorImageResolution=120',
        '-dGrayImageResolution=120',
        '-dMonoImageResolution=120',
        '-o', partial_file,
        input_file,
    ]
    try:
        subprocess.run(args, check=True)
        # Publish the finished file under its cache name in one step.
        os.replace(partial_file, output_file)
    finally:
        # Only still present when the run or the rename failed.
        if os.path.exists(partial_file):
            os.remove(partial_file)
    return output_file

# Optimize every staging PDF; `pages` holds the resulting file paths in the
# same order as pdf_files.
pages = [optimize_page(pdf) for pdf in pdf_files]
print(pages) 

['/tmp/user/1000/a2a512c16b1a055f467655e3c366a2e27d72e7bbe1f9258407768ced70d00bb7.UNI11VER16090201000.pdf', '/tmp/user/1000/14c75687cb83300e15913e3e7dd71e812ca8e99473f65e3e2393a0c631ccba7a.UNI11VER16090202000.pdf', '/tmp/user/1000/513b29826f962aa698f384ff5044a08a6672a1f2f5c7b5910d690d328d87ec7d.UNI11VER16090203000.pdf', '/tmp/user/1000/9ea333f600ea733f95d2554777ee90ec12c3b9e7b45b0254a120c807dec20ef8.UNI11VER16090204000.pdf', '/tmp/user/1000/99bdd3bd5c8a6aa0d2d2d265d1977ff8f1c3671cb1c97d39b132908ff52f11ae.UNI11VER16090205000.pdf', '/tmp/user/1000/a2a3eb5018202a6e98ee037ec914ede9e0e88eabf2db29e76bb3a4274cf2c149.UNI11VER16090206000.pdf', '/tmp/user/1000/eb3adedad63027a744331dc6361539d2382063111a8819def92819de5a93e90b.UNI11VER16090207000.pdf', '/tmp/user/1000/1f58e4202e5f6be8af8115e7ba134b91d16de4cb15ca01126e76e9ecd43923a3.UNI11VER16090208000.pdf', '/tmp/user/1000/514cbded51caf57e1babaf43ef0dfc91a1d57619ba0669a1ec151fa24ba1fdb0.UNI11VER16090209000.pdf', '/tmp/user/1000/b7ab3888f39bf15fd753

In [18]:
# from IPython.display import Image
# from wand.exceptions import BlobError
# from wand.drawing import Drawing
from PyPDF2 import PdfFileWriter, PdfFileReader
from io import BytesIO
from wand.image import Image as WandImage
# from wand.color import Color
from PyPDF2 import PdfFileWriter, PdfFileReader

def pdf_to_wandimage(pdf, page=0, resolution=50):
    """Load one page of a PDF as a wand image.

    The selected page is copied into a new single-page PDF held in memory,
    and that PDF is handed to ``WandImage`` as raw bytes.

    Args:
        pdf: Whatever ``PdfFileReader`` accepts as its source (the notebook
            passes file paths).
        page: Zero-based index of the page to extract (default: first page).
        resolution: Value passed through as ``WandImage``'s ``resolution``.

    Returns:
        The ``WandImage``. The caller owns it and should close it (it can be
        used in a ``with`` statement) once done.
    """
    reader = PdfFileReader(pdf)
    writer = PdfFileWriter()
    writer.addPage(reader.getPage(page))
    with BytesIO() as single_page:
        writer.write(single_page)
        # `blob` is meant to receive bytes, so pass the buffer's content
        # rather than the BytesIO object itself.
        pdf_bytes = single_page.getvalue()
    return WandImage(blob=pdf_bytes, format='pdf', resolution=resolution)

def pdf_to_png(pdf_page, folder='./img'):
    """Render a PDF page to a PNG file and return the PNG's path.

    The PNG is named ``<basename of pdf_page>.png`` inside ``folder`` (created
    if missing). The path is printed on every call. If the PNG already exists
    it is reused and no conversion takes place.

    Args:
        pdf_page: Path of the PDF to render via ``pdf_to_wandimage``.
        folder: Directory that receives the PNG (default ``'./img'``).

    Returns:
        Path of the PNG file.
    """
    os.makedirs(folder, exist_ok=True)
    filename = os.path.join(
        folder, os.path.basename(pdf_page) + '.png')
    print(filename)
    if not os.path.exists(filename):
        # The with-block closes the image after saving, releasing its
        # resources instead of leaving that to garbage collection.
        with pdf_to_wandimage(pdf_page) as img:
            img.save(filename=filename)
    return filename
    
# Render each optimized page to a PNG; `images` holds the PNG paths in the
# same order as `pages`.
images = [pdf_to_png(pdf) for pdf in pages]

./img/a2a512c16b1a055f467655e3c366a2e27d72e7bbe1f9258407768ced70d00bb7.UNI11VER16090201000.pdf.png
./img/14c75687cb83300e15913e3e7dd71e812ca8e99473f65e3e2393a0c631ccba7a.UNI11VER16090202000.pdf.png
./img/513b29826f962aa698f384ff5044a08a6672a1f2f5c7b5910d690d328d87ec7d.UNI11VER16090203000.pdf.png
./img/9ea333f600ea733f95d2554777ee90ec12c3b9e7b45b0254a120c807dec20ef8.UNI11VER16090204000.pdf.png
./img/99bdd3bd5c8a6aa0d2d2d265d1977ff8f1c3671cb1c97d39b132908ff52f11ae.UNI11VER16090205000.pdf.png
./img/a2a3eb5018202a6e98ee037ec914ede9e0e88eabf2db29e76bb3a4274cf2c149.UNI11VER16090206000.pdf.png
./img/eb3adedad63027a744331dc6361539d2382063111a8819def92819de5a93e90b.UNI11VER16090207000.pdf.png
./img/1f58e4202e5f6be8af8115e7ba134b91d16de4cb15ca01126e76e9ecd43923a3.UNI11VER16090208000.pdf.png
./img/514cbded51caf57e1babaf43ef0dfc91a1d57619ba0669a1ec151fa24ba1fdb0.UNI11VER16090209000.pdf.png
./img/b7ab3888f39bf15fd7531787f8cb33abe95e6c310a650304b9c6a1b4a9c92b86.UNI11VER16090210000.pdf.png
./img/aad3

In [19]:
from IPython.display import HTML
# Build one <img> tag per PNG path, each 20% of the container width, and
# join them with newlines.
# NOTE(review): src is emitted without quotes, so a path containing spaces
# or other special characters would presumably break the tag -- confirm.
html = '\n'.join('<img src={} style="display: inline-block; width: 20%;" />'.format(fn) for fn in images)
# Last expression of the cell: the HTML object is the cell's displayed output.
HTML(html)


In [57]:
from PyPDF2 import PdfFileMerger
def merge_pages(pages, bundle_file='bundle.pdf'):
    """Concatenate PDF files into a single PDF and return its path.

    Args:
        pages: Iterable of PDF sources, appended in the given order.
        bundle_file: Destination passed to the merger's ``write()``.

    Returns:
        ``bundle_file``, unchanged.

    The merger is always closed, including when appending or writing fails,
    so that the resources it holds for its inputs are released.
    """
    merger = PdfFileMerger()
    try:
        merger.setPageLayout('/TwoPageRight')
        for page in pages:
            merger.append(page)
        merger.write(bundle_file)
    finally:
        merger.close()
    return bundle_file

    
def optimize_pdf(filename, outfile='bundle2.pdf'):
    """Rewrite a PDF through Ghostscript's pdfwrite device.

    Runs ``/usr/bin/gs`` on ``filename`` with font compression/subsetting,
    duplicate-image detection and compatibility level 1.6, writing the
    result to ``outfile``. The ``CompletedProcess`` is printed so the return
    code can be inspected; the exit status is not otherwise checked.

    Args:
        filename: Path of the PDF to read.
        outfile: Path of the PDF to write (default ``'bundle2.pdf'``).

    Returns:
        None.
    """
    binary = '/usr/bin/gs'
    args = [
        binary,
        '-dNOPAUSE',
        '-dBATCH',
        '-dCompressFonts=true',
        '-dSubsetFonts=true',
        '-dCompatibilityLevel=1.6',
        '-dDetectDuplicateImages=true',
        '-sDEVICE=pdfwrite',
        '-o', outfile,
        filename,
    ]
    result = subprocess.run(args)
    print(result)

# Merge the optimized pages into one PDF (default name 'bundle.pdf'), then
# run the merged file through Ghostscript once more.
filename = merge_pages(pages)
optimize_pdf(filename)

CompletedProcess(args=['/usr/bin/gs', '-dNOPAUSE', '-dBATCH', '-dCompressFonts=true', '-dSubsetFonts=true', '-dCompatibilityLevel=1.6', '-dDetectDuplicateImages=true', '-sDEVICE=pdfwrite', '-o', 'bundle2.pdf', 'bundle.pdf'], returncode=0)


In [23]:
import PyPDF2
