# Assignment 1, ex3, part1
by Raphael Ebner, Nicolas Hellthaler, Bastian Müller

In [2]:
# Imports
# pdf tools
import PyPDF2
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

# file tools
import os

# analyzer tools
from difflib import SequenceMatcher

In [3]:
def write_text_from_pdf(pdf_file_path, method):
    """
    Converts a *.pdf file into plain text using a specific library, then writes the text of the pdf file into a txt file.

    The txt file is written into the directory of the pdf file and is named "<pdf name>_<method>.txt",
    where <method> is the lower-cased name of the library.
    :param pdf_file_path: path to *.pdf file
    :param method: library used for the extraction, either "pypdf2" or "pdfminer" (case-insensitive)
    :raises ValueError: if method is neither "pypdf2" nor "pdfminer"
    """
    target_path = os.path.dirname(pdf_file_path)
    target_name = os.path.splitext(os.path.basename(pdf_file_path))[0]  # splitext: file name without extension

    print("Start working on file", pdf_file_path, "with method", method)

    # Normalise the method once and use it for the dispatch AND the file suffix.
    # Otherwise "PyPDF2" would be accepted but produce "<name>_PyPDF2.txt", which
    # does not match the lower-case file names the analysis code looks for.
    method_key = method.lower()

    if method_key == "pypdf2":
        text = get_text_from_pdf_using_pypdf2(pdf_file_path)
    elif method_key == "pdfminer":
        text = get_text_from_pdf_using_pdfminer(pdf_file_path)
    else:
        # Report the method exactly as the caller spelled it.
        raise ValueError("Method '" + method + "' does not exist.")

    write_text_to_file(target_path, target_name, method_key, text)

In [4]:
def get_text_from_pdf_using_pypdf2(pdf_file_path):
    """
    Returns the text found in the pdf, extracted with PyPDF2.

    The text of all pages is concatenated in page order without any separator.
    :param pdf_file_path: path to pdf file
    :return: text found in pdf file
    """
    with open(pdf_file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file, strict=False)
        # Use the current reader API (pages / extract_text). The legacy names
        # numPages / getPage / extractText are deprecated alongside PdfReader
        # and raise an error in PyPDF2 3.x.
        # The extraction stays inside the with-block so the reader can still
        # access the open file.
        return "".join(page.extract_text() for page in pdf_reader.pages)

In [5]:
def get_text_from_pdf_using_pdfminer(pdf_file_path):
    """
    Returns the text found in the pdf, extracted with pdfminer.

    The converter device and its text buffer are closed even if reading or parsing the pdf fails.
    :param pdf_file_path: path to pdf file
    :return: text found in pdf file
    """

    # Source (slightly adjusted): https://stackoverflow.com/questions/26494211/extracting-text-from-a-pdf-file-using-pdfminer-in-python

    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    # The device writes the text of every processed page into text_buffer.
    device = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(resource_manager, device)
        with open(pdf_file_path, 'rb') as fp:
            # Empty page set and maxpages=0: no restriction on the pages that are read.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the collected text before the buffer is closed in the finally-block.
        return text_buffer.getvalue()
    finally:
        device.close()
        text_buffer.close()

In [6]:
def write_text_to_file(path, name, method, text):
    """
    Stores text in the file "<name>_<method>.txt" inside the directory path.

    An already existing file with that name is overwritten.
    :param path: directory the txt file is written to
    :param name: first part of the txt file name
    :param method: method that was used to extract the text, second part of the txt file name
    :param text: content of the txt file
    """
    file_name = "_".join((name, method)) + ".txt"
    destination = os.path.join(path, file_name)
    with open(destination, "w") as out_file:
        # writelines handles a plain string as well as an iterable of strings
        out_file.writelines(text)

## Run pdf to txt on all provided files using both methods:
## *Warning*: pdfminer takes a long time analyzing 'bundeswehr.pdf'

In [7]:
# Sub-folders of the 'files' directory that contain the provided pdf files.
folders = ['flyers', 'iban', 'scans']

# Paths of the generated txt files, one entry per pdf and in the same order in both lists.
files_method_pypdf2 = []
files_method_pdfminer = []

# The 'files' directory is located two levels above the current working directory.
files_root = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'files')

for folder in folders:
    folder_path = os.path.join(files_root, folder)
    for entry in os.listdir(folder_path):
        stem, suffix = os.path.splitext(entry)
        if suffix != '.pdf':
            # skip everything that is not a pdf, e.g. txt files from an earlier run
            continue

        pdf_path = os.path.join(folder_path, entry)
        for library in ('pypdf2', 'pdfminer'):
            write_text_from_pdf(pdf_path, library)

        # Remember where the txt files were written for the analysis below.
        files_method_pypdf2.append(os.path.join(folder_path, stem + "_" + "pypdf2" + ".txt"))
        files_method_pdfminer.append(os.path.join(folder_path, stem + "_" + "pdfminer" + ".txt"))

Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/wegweiser_senioren.pdf with method pypdf2
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/wegweiser_senioren.pdf with method pdfminer
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/bahnstadt.pdf with method pypdf2
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/bahnstadt.pdf with method pdfminer
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/bundeswehr.pdf with method pypdf2
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/flyers/bundeswehr.pdf with method pdfminer
Start working on file /Users/bastianmuller/Desktop/Programming/Python/ds-project-wc2022/files/iban/liste1.pdf with method pypdf2
Start working on file /Users/bastianmulle

# Analysis
## Part 1: Quality
Quality in this case refers to the readability of the text document and the accuracy of the extracted information.

We observed that 'pypdf2' extracted the information in a more readable way.
An example of this is the 'iban/liste1.pdf' file. Both libraries extracted roughly the same information, but 'pypdf2' puts each IBAN on a single row, whereas 'pdfminer' first lists the country IDs over several rows and only afterwards the numbers belonging to them. Extracting the correct IBANs from the 'pdfminer' text file would therefore take considerably more time.
Another example of this behavior is the 'flyers/bundeswehr.pdf' file: in the 'pdfminer' output it is not really clear which numbers belong where, while in the text file generated by 'pypdf2' we can clearly observe a pattern, which makes later text analytics work easier.

Also, from an accuracy perspective it seems like the 'pypdf2' extraction is better.
For example, we get 'BERGH EIM' from 'pdfminer' and 'BERGHEIM' from 'pypdf2' in the 'flyers/bahnstadt.pdf' file.

## Part 2: Quantity

In [8]:
# Count the extracted characters per file and library; spaces and line breaks are not counted.
drop_spaces_and_newlines = str.maketrans("", "", " \n")

for pypdf2_path, pdfminer_path in zip(files_method_pypdf2, files_method_pdfminer):
    with open(pypdf2_path, 'r') as handle:
        amount_chars_pypdf2 = len(handle.read().translate(drop_spaces_and_newlines))

    with open(pdfminer_path, 'r') as handle:
        amount_chars_pdfminer = len(handle.read().translate(drop_spaces_and_newlines))

    print("Amount characters pypdf2:", amount_chars_pypdf2)
    print("Amount characters pdfminer:", amount_chars_pdfminer)
    print("pypdf2 more?", amount_chars_pypdf2 > amount_chars_pdfminer)
    print("---")

Amount characters pypdf2: 111620
Amount characters pdfminer: 111710
pypdf2 more? False
---
Amount characters pypdf2: 110096
Amount characters pdfminer: 109901
pypdf2 more? True
---
Amount characters pypdf2: 297143
Amount characters pdfminer: 297223
pypdf2 more? False
---
Amount characters pypdf2: 3449
Amount characters pdfminer: 3451
pypdf2 more? False
---
Amount characters pypdf2: 4892
Amount characters pdfminer: 4893
pypdf2 more? False
---
Amount characters pypdf2: 13043
Amount characters pdfminer: 13044
pypdf2 more? False
---


It seems that pdfminer was able to extract slightly more characters (in five of the six files) when not counting new lines and spaces.

## Finally, let's see how similar the two outputs are.

In [9]:
# Compare the two extractions of every pdf; spaces and line breaks are removed before the comparison.
ignored_characters = str.maketrans("", "", " \n")

for pypdf2_path, pdfminer_path in zip(files_method_pypdf2, files_method_pdfminer):
    with open(pypdf2_path, 'r') as handle:
        string_pypdf2 = handle.read().translate(ignored_characters)

    with open(pdfminer_path, 'r') as handle:
        string_pdfminer = handle.read().translate(ignored_characters)

    similarity = SequenceMatcher(None, string_pypdf2, string_pdfminer).ratio()
    print("SequenceMatcher.ratio:", similarity)
    print("---")

SequenceMatcher.ratio: 0.9682443021537634
---
SequenceMatcher.ratio: 0.13267453647095187
---
SequenceMatcher.ratio: 0.28162108868946073
---
SequenceMatcher.ratio: 0.19246376811594204
---
SequenceMatcher.ratio: 0.5951967296882984
---
SequenceMatcher.ratio: 0.26848621918963467
---


We decide to move forward with the 'pypdf2' files for three reasons: we found their quality to be higher, we prefer their formatting, and the difference in the amount of extracted text is very small (almost the same number of characters). The low similarity reported by SequenceMatcher stems from the different formatting. For example, for 'iban/liste1.pdf' the output of 'pypdf2' is more row-based, while the output of 'pdfminer' is more column-based.