# Analyzing a zip of PDFs for submission instruction compatibility

Currently checks:

* Page count limit
* Anonymity: whether there is an email on the first page
* Margin width and height (compared against the first PDF, so make sure that one is correct :)

We highly recommend that you analyze all submissions **24 hours before the submission deadline!**

At ECMLPKDD 2022 we had around 40/1200 desk rejects, most of which were detected by this code. Almost all of the affected authors asked to resubmit a properly formatted/anonymized version, which we unfortunately could not allow because reviewing had already started...


In [1]:
import os
import re
import shutil # for locating the 'gs' executable
import subprocess # for calling 'gs', you need to have 'ghostscript' installed to check for margins!
import tempfile
import zipfile

import pandas as pd
import PyPDF2 # pip install PyPDF2
from tqdm import tqdm

# Maximum number of pages allowed; PDFs with more pages than this are flagged.
page_limit = 16
# Path of the zip archive that contains the submitted PDFs to analyze.
zname = "data/Submissions Fake.zip"

In [2]:
# Scan every PDF inside the zip archive `zname` and collect potential
# desk-reject reasons (page limit, visible email on page 1, margins that
# differ from the first PDF) into the DataFrame `df`.

# emails people use as anonymous placeholders
ignorelist = ["lncs@springer.com", "lncs@springer.comhttp", "blinded@email.com", "author@mail.com", "XXX@XXX.com", "email@domain.com", "mail@mail.mail", "some@email.com", "firstname.lastname@XXXX.com", "anon.email@domain.com", "xxx@xxx.xxx"]
# placeholders are compared case-insensitively, so "XXX@XXX.com" also covers "xxx@xxx.com"
ignorelist_lower = {addr.lower() for addr in ignorelist}

# Based on https://stackabuse.com/python-validate-email-address-with-regular-expressions-regex/
# with two corrections:
#  * the separator class is `[._-]`. The former `[.-_]` was a character *range*
#    from '.' to '_': it matched digits, upper-case letters, '/', ':', '@', ...
#    but not '-', and because it overlapped `[A-Za-z0-9]` it could backtrack
#    exponentially on long upper-case/digit runs.
#  * the TLD class is `[A-Za-z]`; the former `[A-Z|a-z]` also accepted a literal '|'.
regex_email = re.compile(r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+')

# first "%%BoundingBox: llx lly urx ury" line printed by Ghostscript's bbox device
regex_bbox = re.compile(r'%%BoundingBox:\s*(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)')

margin_slack = 4 # how far can the margins diverge from the first PDF?
                 # UNTESTED footnotes may mess things up...

# turns out extracting the bbox from a PDF is non-trivial
# and the best to do is to rerender it with gs, as per:
# https://unix.stackexchange.com/questions/668990/extract-document-information-from-pdf-file-like-width-of-margins
has_gs = shutil.which('gs') is not None
bbox_first = None
if not has_gs:
    print("Warning, Ghostscript (command 'gs') not installed, can not check margins")

data = [] # (reason, explanation, fname)
with tempfile.TemporaryDirectory() as tmpdirname:
    with zipfile.ZipFile(zname, 'r') as archive:
        for fname in tqdm(archive.namelist()):
            # directory entries and non-PDF members cannot be parsed as PDFs
            if fname.endswith('/') or not fname.lower().endswith('.pdf'):
                continue

            # extract() returns the sanitised path it actually wrote to
            tmpname = archive.extract(fname, tmpdirname)

            with open(tmpname, 'rb') as pdfObj:
                # NOTE(review): PdfFileReader/getNumPages/getPage/extractText are the
                # PyPDF2 1.x names; PyPDF2 3.x removed them in favour of PdfReader,
                # len(reader.pages), reader.pages[i] and extract_text(). Pin
                # PyPDF2<3 or migrate once the installed version is confirmed.
                x = PyPDF2.PdfFileReader(pdfObj)

                # check page count
                num_pages = x.getNumPages()
                if num_pages > page_limit:
                    data.append( ("page limit", f"potentially over page limit: {num_pages} pages", fname) )

                # search for email in first page; look at every match so that a
                # leading placeholder address does not hide a real one after it
                txt = x.getPage(0).extractText()
                for match in regex_email.finditer(txt):
                    email = match.group(0)
                    lowered = email.lower()
                    if lowered not in ignorelist_lower and 'anonymous' not in lowered:
                        data.append( ("non-blind", f"potentially non-anonymous: {email}", fname) )
                        break # one row per file is enough

            # check margins (only possible when Ghostscript is available)
            if has_gs:
                # -dBATCH/-dNOPAUSE make gs exit after the file instead of
                # dropping into its interactive prompt; the bbox device
                # writes its result to stderr
                st = subprocess.run(['gs', '-sDEVICE=bbox', '-q', '-dBATCH', '-dNOPAUSE', '-dFirstPage=1', '-dLastPage=1', tmpname], capture_output=True)
                found = regex_bbox.search(st.stderr.decode(errors='replace'))
                if found is None:
                    # gs failed or printed no bounding box: report it instead of crashing
                    data.append( ("margin check failed", "Ghostscript reported no bounding box", fname) )
                else:
                    bbox = [int(v) for v in found.groups()]
                    if bbox_first is None:
                        # the first PDF serves as the reference for all others
                        bbox_first = bbox
                    elif any(abs(fst - ths) > margin_slack for fst, ths in zip(bbox_first, bbox)):
                        # one of the margins over the allowed slack, probably different template
                        data.append( ("different margins", f"Different margins detected: {bbox} versus first paper had {bbox_first}", fname) )

            os.remove(tmpname)

df = pd.DataFrame(data, columns=["reason", "explanation", "filename"])
df.sort_values(['reason', 'explanation'])

100%|██████████| 4/4 [00:00<00:00, 10.63it/s]


Unnamed: 0,reason,explanation,filename
2,different margins,"Different margins detected: [89, 46, 514, 693]...",5/Submission/lncs_a4wide.pdf
0,non-blind,potentially non-anonymous: ecmlpkdd2022@framal...,2/Submission/lncs_authors.pdf
1,page limit,potentially over page limit: 17 pages,3/Submission/lncs_overlength.pdf
