In [1]:
import pdfrw
from pdf2image import convert_from_path # Needs conda install -c conda-forge poppler
from PIL import Image
from collections import OrderedDict

ANNOT_KEY = '/Annots'               # key for all annotations within a page
ANNOT_FIELD_KEY = '/T'              # Name of field. i.e. given ID of field
ANNOT_FORM_type = '/FT'             # Form type (e.g. text/button)
ANNOT_FORM_button = '/Btn'          # ID for buttons, i.e. a checkbox
ANNOT_FORM_text = '/Tx'             # ID for textbox
SUBTYPE_KEY = '/Subtype'
WIDGET_SUBTYPE_KEY = '/Widget'
ANNOT_FIELD_PARENT_KEY = '/Parent'  # Parent key for older pdf versions
ANNOT_FIELD_KIDS_KEY = '/Kids'      # Kids key for older pdf versions
ANNOT_VAL_KEY = '/V'
ANNOT_RECT_KEY = '/Rect'

def get_form_fields(input_pdf_path):
    """
    Retrieves the form fields from a pdf to then be stored as a dictionary and
    passed to the write_fillable_pdf() function. Uses pdfrw.
    Parameters
    ---------
    input_pdf_path: str
        Path to the pdf you want the fields from.
    Returns
    ---------
    A dictionary of form fields and their filled values.
    """
    data_dict = {}

    pdf = pdfrw.PdfReader(input_pdf_path)
    for page in pdf.pages:
        annotations = page[ANNOT_KEY]
        if annotations:
            for annotation in annotations:
                if annotation[SUBTYPE_KEY] == WIDGET_SUBTYPE_KEY:
                    if annotation[ANNOT_FIELD_KEY]:
                        key = annotation[ANNOT_FIELD_KEY][1:-1]
                        data_dict[key] = ''
                        if annotation[ANNOT_VAL_KEY]:
                            value = annotation[ANNOT_VAL_KEY]
                            data_dict[key] = annotation[ANNOT_VAL_KEY]
                            try:
                                if type(annotation[ANNOT_VAL_KEY]) == pdfrw.objects.pdfstring.PdfString:
                                    data_dict[key] = pdfrw.objects.PdfString.decode(annotation[ANNOT_VAL_KEY])
                            except:
                                pass
    return data_dict


def print_form_fields(input_pdf_path):
    """
    Retrieves the form fields from get_form_fields(), then pretty prints
    the data_dict. Uses pdfrw.
    Parameters
    ---------
    input_pdf_path: str
        Path to the pdf you want the fields from.
    Returns
    ---------
    """
    data_dict = get_form_fields(input_pdf_path)
    print("{" + ",\n".join("{!r}: {!r}".format(k, v) for k, v in data_dict.items()) + "}")


def flatten_pdf(input_pdf_path, output_pdf_path, as_images=False):
    """
    Flattens the pdf so each annotation becomes uneditable. This function provides
    two ways to do so, either with the pdfrw function annotation.update(pdfrw.PdfDict(Ff=1))
    or converting the pages to images then reinserting.
    Parameters
    ---------
    input_pdf_path: str
        Path to the pdf you want to flatten.
    output_pdf_path: str
        Path of the new pdf that is generated.
    as_images: bool
        Default is False meaning it will update each individual annotation and set
        it to False. True means it will convert to images and then reinsert into the
        pdf
    Returns
    ---------
    """
    if as_images == True:
        images = convert_from_path(input_pdf_path) 
        im1 = images[0]
        images.pop(0)

        pdf1_filename = output_pdf_path

        im1.save(pdf1_filename, "PDF" ,resolution=100.0, save_all=True, append_images=images)
    else:
        ANNOT_KEY = '/Annots'               # key for all annotations within a page

        template_pdf = pdfrw.PdfReader(input_pdf_path)
        for Page in template_pdf.pages:
            if Page[ANNOT_KEY]:
                for annotation in Page[ANNOT_KEY]:
                    annotation.update(pdfrw.PdfDict(Ff=1))
        template_pdf.Root.AcroForm.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject('true')))
        pdfrw.PdfWriter().write(output_pdf_path, template_pdf)
        

def convert_dict_values_to_string(dictionary):
    """
    Converts dictionary values to string including arrays and tuples.
    Parameters
    ---------
    dictionary: dict
        Any single level dictionary. Specifically made for the data_dict returned from
        the function get_form_fields() from the fillpdf library
    Returns
    ---------
    res: dict
        The resulting dictionary with only string values.
    """
    list_delim, tuple_delim = '-', '^'
  
    res = dict()
    for sub in dictionary:

        # checking data types
        if isinstance(dictionary[sub], list):
            res[sub] = list_delim.join([str(ele) for ele in dictionary[sub]])
        elif isinstance(dictionary[sub], tuple):
            res[sub] = tuple_delim.join(list([str(ele) for ele in dictionary[sub]]))
        else:
            res[sub] = str(dictionary[sub])
            
    return res    
    
    
def write_fillable_pdf(input_pdf_path, output_pdf_path, data_dict, flatten=False):
    """
    Writes the dictionary values to the pdf. Currently supports text and buttons.
    Does so by updating each individual annotation with the contents of the dat_dict.
    Parameters
    ---------
    input_pdf_path: str
        Path to the pdf you want to flatten.
    output_pdf_path: str
        Path of the new pdf that is generated.
    data_dict: dict
        The data_dict returned from the function get_form_fields()
    flatten: bool
        Default is False meaning it will stay editable. True means the annotations
        will be uneditable.
    Returns
    ---------
    """
    data_dict = convert_dict_values_to_string(data_dict)

    template_pdf = pdfrw.PdfReader(input_pdf_path)
    for Page in template_pdf.pages:
        if Page[ANNOT_KEY]:
            for annotation in Page[ANNOT_KEY]:
                target = annotation if annotation[ANNOT_FIELD_KEY] else annotation[ANNOT_FIELD_PARENT_KEY]
                if target and annotation[SUBTYPE_KEY] == WIDGET_SUBTYPE_KEY:
                    key = target[ANNOT_FIELD_KEY][1:-1] # Remove parentheses
                    if key in data_dict.keys():
                        if target[ANNOT_FORM_type] == ANNOT_FORM_button:
                            # button field i.e. a checkbox
                            target.update( pdfrw.PdfDict( V=pdfrw.PdfName(data_dict[key]) , AS=pdfrw.PdfName(data_dict[key]) ))
                            if target[ANNOT_FIELD_KIDS_KEY]:
                                target[ANNOT_FIELD_KIDS_KEY][0].update( pdfrw.PdfDict( V=pdfrw.PdfName(data_dict[key]) , AS=pdfrw.PdfName(data_dict[key]) ))
                        elif target[ANNOT_FORM_type] == ANNOT_FORM_text:
                            # regular text field
                            target.update( pdfrw.PdfDict( V=data_dict[key], AP=data_dict[key]) )
                            if target[ANNOT_FIELD_KIDS_KEY]:
                                target[ANNOT_FIELD_KIDS_KEY][0].update( pdfrw.PdfDict( V=data_dict[key], AP=data_dict[key]) )
                if flatten == True:
                    annotation.update(pdfrw.PdfDict(Ff=1))
    template_pdf.Root.AcroForm.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject('true')))
    pdfrw.PdfWriter().write(output_pdf_path, template_pdf)

In [3]:
pip install fillpdf -U

Collecting fillpdf
  Downloading fillpdf-0.3.2-py3-none-any.whl (5.7 kB)
Installing collected packages: fillpdf
  Attempting uninstall: fillpdf
    Found existing installation: fillpdf 0.2.2
    Uninstalling fillpdf-0.2.2:
      Successfully uninstalled fillpdf-0.2.2
Successfully installed fillpdf-0.3.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from fillpdf import fillpdfs

In [5]:
p = fillpdfs.print_form_fields('ex2.pdf')

{'Given Name Text Box': '',
'Family Name Text Box': '',
'Address 1 Text Box': '',
'House nr Text Box': '',
'Address 2 Text Box': '',
'Postcode Text Box': '',
'City Text Box': '',
'Country Combo Box': '',
'Gender List Box': 'Man',
'Height Formatted Field': '150',
'Driving License Check Box': '/Off',
'Language 1 Check Box': '/Off',
'Language 2 Check Box': '/Yes',
'Language 3 Check Box': '/Off',
'Language 4 Check Box': '/Off',
'Language 5 Check Box': '/Off',
'Favourite Colour List Box': 'Red'}


In [6]:
p

In [4]:
data_dict = {'Given Name Text Box': '',
'Family Name Text Box': 'dd',
'Address 1 Text Box': '',
'House nr Text Box': '',
'Address 2 Text Box': '',
'Postcode Text Box': '',
'City Text Box': '',
'Country Combo Box': '',
'Gender List Box': 'Woman',
'Height Formatted Field': '150',
'Driving License Check Box': '/Off',
'Language 1 Check Box': '/Off',
'Language 2 Check Box': '/Yes',
'Language 3 Check Box': '/Off',
'Language 4 Check Box': '/Off',
'Language 5 Check Box': '/Off',
'Favourite Colour List Box': 'Red'}

In [5]:
write_fillable_pdf('ex2.pdf', 'new.pdf', data_dict)

In [4]:
get_form_fields('new.pdf')

NameError: name 'get_form_fields' is not defined

In [19]:
write_fillable_pdf('ex2.pdf', 'new.pdf', data_dict, flatten=True)

In [20]:
get_form_fields('new.pdf')

{'Given Name Text Box': '',
'Family Name Text Box': 'dd',
'Address 1 Text Box': '',
'House nr Text Box': '',
'Address 2 Text Box': '',
'Postcode Text Box': '',
'City Text Box': '',
'Country Combo Box': '',
'Gender List Box': 'Man',
'Height Formatted Field': '150',
'Driving License Check Box': '//Off',
'Language 1 Check Box': '//Off',
'Language 2 Check Box': '//Yes',
'Language 3 Check Box': '//Off',
'Language 4 Check Box': '//Off',
'Language 5 Check Box': '//Off',
'Favourite Colour List Box': 'Red'}
