# Analyzing a zip of PDFs for submission instruction compatibility

Currently checks:

* Page count limit
* Anonymity: whether there is an email on the first page

We highly recommend that you analyze all submissions **24 hours before the submission deadline!**

At ECMLPKDD 2022 we had around 40/1200 desk rejects, most of which were detected by this code. Almost all of the affected authors asked to resubmit a properly formatted/anonymized version, which we unfortunately could not allow, as reviewing had already started...

__Feature request__: we also had around 5/1200 desk rejects due to wrong margins, which this code cannot detect. We welcome any contribution that can automatically detect margin size (and hence deviations in margin size)!


In [1]:
import PyPDF2 # pip install PyPDF2
import zipfile
import tempfile
import pandas as pd
import os
import re

# Zip archive containing the submitted PDFs to analyze.
zname = "data/Submissions Fake.zip"

# Submissions with more pages than this are reported as "page limit".
page_limit = 16

In [2]:
# Emails people use as anonymous placeholders; a match that equals one of
# these (exact, case-sensitive comparison) is not reported as a de-anonymizing
# address.
ignorelist = [
    "lncs@springer.com",
    "lncs@springer.comhttp",
    "blinded@email.com",
    "author@mail.com",
    "XXX@XXX.com",
    "email@domain.com",
    "mail@mail.mail",
    "some@email.com",
    "firstname.lastname@XXXX.com",
    "anon.email@domain.com",
    "xxx@xxx.xxx",
]

# Adapted from
# https://stackabuse.com/python-validate-email-address-with-regular-expressions-regex/
#
# Two character classes differ from the pattern on that page:
# * `[._-]` instead of `[.-_]`: the latter is a *range* from '.' to '_', which
#   accepts characters such as ':', '/', '@', digits and uppercase letters as
#   separators, rejects '-', and (because it overlaps with `[A-Za-z0-9]`) makes
#   the nested quantifier backtrack exponentially on long runs of uppercase
#   letters or digits.
# * `[A-Za-z]` instead of `[A-Z|a-z]`: inside a class '|' is a literal pipe,
#   not an alternation.
regex_email = re.compile(
    r'([A-Za-z0-9]+[._-])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Za-z]{2,})+'
)

# Collect one row per detected problem: (reason, explanation, fname).
data = []

# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names; PyPDF2 3.x replaced them with PdfReader / len(pages) /
# pages[i] / extract_text() -- confirm which version is installed.
with tempfile.TemporaryDirectory() as tmpdirname:
    with zipfile.ZipFile(zname, 'r') as archive:
        for info in archive.infolist():
            # Archives may list directories as separate entries; they are not
            # PDFs and can neither be opened as files nor removed with
            # os.remove().
            if info.is_dir():
                continue
            fname = info.filename

            # extract() returns the (sanitized) path it actually wrote, which
            # can differ from a naive join of tmpdirname and the member name.
            tmpname = archive.extract(info, tmpdirname)

            with open(tmpname, 'rb') as pdfObj:
                x = PyPDF2.PdfFileReader(pdfObj)

                # check page count
                num_pages = x.getNumPages()
                if num_pages > page_limit:
                    data.append(("page limit", f"potentially over page limit: {num_pages} pages", fname))

                # search for email in first page (a PDF without pages has none)
                if num_pages > 0:
                    txt = x.getPage(0).extractText()
                    # Look at every address on the page, not only the first
                    # one: a placeholder such as the template's
                    # lncs@springer.com may precede a real author address.
                    for match in regex_email.finditer(txt):
                        email = match.group(0)
                        if email not in ignorelist and 'anonymous' not in email:
                            data.append(("non-blind", f"potentially non-anonymous: {email}", fname))
                            # one report per submission is enough
                            break

            # Delete the extracted file right away to keep the temporary
            # directory small while iterating over the archive.
            os.remove(tmpname)

df = pd.DataFrame(data, columns=["reason", "explanation", "filename"])
# Left as a bare expression (the result is not assigned); presumably this is
# the final expression of a notebook cell so the sorted table gets displayed.
df.sort_values(['reason', 'explanation'])

Unnamed: 0,reason,explanation,filename
0,non-blind,potentially non-anonymous: ecmlpkdd2022@framal...,2/Submission/lncs_authors.pdf
1,page limit,potentially over page limit: 17 pages,3/Submission/lncs_overlength.pdf
