# Assignment 1, ex3, part3
by Raphael Ebner, Nicolas Hellthaler, Bastian Müller

In [5]:
# Imports
# File utils
import os
import shutil

# Analyzing tools
import re
import validators
import requests
from itertools import product

Get arrays with paths to all .txt files generated using 'pypdf2'.

In [4]:
folders = ['flyers', 'iban', 'scans']
files = []

for folder in folders:
    for file in os.listdir(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'files', folder)):
        name, ext = os.path.splitext(file)
        if ext == '.pdf':
            path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'files', folder, file)
            files.append(os.path.join(os.path.dirname(path), os.path.splitext(file)[0] + "_" + "pypdf2" + ".txt"))

files_flyers = [file for file in files if "flyers" in file]
files_flyers_copy = []
files_iban = [file for file in files if "iban" in file]
files_iban_copy = []
files_scans = [file for file in files if "scans" in file]
files_scans_copy = []

## (i) Extract phone numbers from PDF files in /flyers
We start by identifying what format the phone numbers are in by looking at our PDF files.

Takeaways bahnstadt.pdf
- Most start with 0
- Have to exclude fax numbers, often prefixed by 'Fax' or 'Telefax'
- Different formats like '0621 / 89..' or '0621 - 89 ..'
- Different length
- Some phone numbers don't have a prefix because the prefix is mentioned in the row above (no solution found)

Takeaways bundeswehr.pdf
- Only use first number, second (if exists) is fax number
- 'Telephone IVB' in third column?

Takeaways wegweiser_senioren.pdf
- Different formats '(0621) 89 ..'
- Again sometimes no prefix, e.g. '(0 62 21) 58-38340, 58-38330'
- Replace thin spaces in txt file

First we copy each txt file to format it later.

In [5]:
def copy_subpart1_files():
    """
    Copies the txt files used for the analysis. Keeps the file name, adds '_subpartX'.
    """
    for flyer in files_flyers:
        shutil.copy(flyer, flyer.replace('.txt','_subpart1.txt'))
        files_flyers_copy.append(flyer.replace('.txt', '_subpart1.txt'))

In [6]:
copy_subpart1_files()

Based on the findings above we write three different algorithms to takle each .pdf file individually.
We start by formatting the txt files and then make use of regex extensively.

In [7]:
numbers = []

In [8]:
def format_bahnstadt():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace(' ', '')  # Get rid of spaces
    with open(bahnstadt_copy, 'w') as bahnstadt_file:
        bahnstadt_file.writelines(text)

In [9]:
format_bahnstadt()

Now, we get all numbers that:
- are so called 'Notrufnummern' (3 to 6 digits)
- start with '0' and are longer than 5 (normal numbers without prefix or with prefix, use prefix from number before)
- are not prefixed by a string containing 'fax'

In [10]:
def find_numbers_bahnstadt() -> list:
    """
    Searches inside the bahnstadt copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    fax_numbers = []
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    for match in re.findall(r'(110|112|115|116117|19222|116006)', text):
        matches.append(match)
    for match in re.findall(r'0[1-9][0-9]{6,12}', text):
        matches.append(match)
    for match in re.findall(r'[a-zA-Z]*fax[a-zA-Z]*(\d+)', text, flags=re.I):  # find all fax numbers
        fax_numbers.append(match)
    matches = [match for match in matches if match not in fax_numbers]  # remove fax numbers from solution
    return list(set(matches))  # convert to set to remove duplicates

Add numbers from bahnstadt to numbers array

In [11]:
for number in find_numbers_bahnstadt():
    numbers.append(number)

Prepare bundeswehr file

In [12]:
def format_bundeswehr():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace(' ', '')  # Get rid of spaces
    with open(bundeswehr_copy, 'w') as bundeswehr_file:
        bundeswehr_file.writelines(text)

In [13]:
format_bundeswehr()

It is (at least for us) impossible to tell when looking at the txt file that was generated what a telephone and what a fax number is.
So for this part, we will ignore all numbers but the first one for each entry. This is fine for the majority of cases.
The only exception is, when we find four numbers, then we add the first and second to last ones to our solution.

In [14]:
def find_numbers_bundeswehr() -> list:
    """
    Searches inside the bundeswehr copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    for match in re.findall(r'(\+\d+)\+\d+(\+\d+)\+\d+', text):  # extract two phone numbers in case four numbers were found
        matches.append(match[0])
        matches.append(match[1])
    for match in re.findall(r'(\+\d+)\+\d+(\+\d+)[^+]', text):  # extract two phone numbers in case three numbers where found. In almost all cases, when three numbers were provided they are in the order phone, fax, phone
        matches.append(match[0])
        matches.append(match[1])
    for match in re.findall(r'(\+\d+)\+\d+[^+]', text):  # extract phone number in case only one number is found
        matches.append(match)
    return list(set(matches))  # convert to set to remove duplicates

In [15]:
for number in find_numbers_bundeswehr():
    numbers.append(number)

Lastly we look at the wegweiser_senioren.pdf.
Again, we start by formatting the file.

In [16]:
def format_wegweiser_senioren():
    """
    Formats the wegweiser_senioren copy txt file and prepares it for analysis.
    """
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace('(', ' ')  # Remove parentheses
    text = text.replace(')', ' ')  # Remove parentheses
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Get rid of thin spaces and other unicode characters
    text = text.replace(' ', '')  # Get rid of spaces
    with open(wegweiser_senioren_copy, 'w') as wegweiser_senioren_file:
        wegweiser_senioren_file.writelines(text)

In [17]:
format_wegweiser_senioren()

Now we start extracting numbers again.
This time we are in a similar situation to the bahnstadt.pdf problem. We have to filter for fax numbers again. Additionally the format is different.

In [18]:
def find_numbers_wegweiser_senioren() -> list:
    """
    Searches inside the wegweiser_senioren copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    fax_numbers = []
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    for match in re.findall(r'(110|112|115|116117|19222|116006)', text):
        matches.append(match)
    for match in re.findall(r'0[1-9][0-9]{6,12}', text):
        matches.append(match)
    for match in re.findall(r'[a-zA-Z]*fax[a-zA-Z]*(\d+)', text, flags=re.I):  # find all fax numbers
        fax_numbers.append(match)
    matches = [match for match in matches if match not in fax_numbers]  # remove fax numbers from solution
    return list(set(matches))  # convert to set to remove duplicates

In [19]:
for number in find_numbers_bundeswehr():
    numbers.append(number)

Now we glue everything together.
First, we get rid of duplicates.
Second, we bring all numbers (except emergency short numbers) in a normalized format.
Third, we write everything in one solution file.

In [20]:
numbers = list(set(numbers))

In [21]:
numbers_normalized = []

for number in numbers:
    if len(number) < 7:
        numbers_normalized.append(number)
    elif '+' in number:
        numbers_normalized.append(number)
    else:
        numbers_normalized.append('+49' + number[1:])


In [22]:
with open(os.path.dirname(files_flyers[0]) + '/numbers.txt', 'w') as writefile:
    for number in numbers_normalized:
        writefile.write(number)
        writefile.write('\n')

## (ii) Extract URLs and mail addresses from PDF files in /flyers
We start  by looking at our PDF files.

Again, we make copies of the original parse work, format the files and then analyze and validate the urls and mail addresses found using the 'validator' library.

In [23]:
files_flyers_copy.clear()
def copy_subpart2_files():
    """
    Copies the txt files used for the analysis. Keeps the file name, adds '_subpartX'.
    """
    for flyer in files_flyers:
        shutil.copy(flyer, flyer.replace('.txt','_subpart2.txt'))
        files_flyers_copy.append(flyer.replace('.txt', '_subpart2.txt'))

In [24]:
copy_subpart2_files()

Define arrays to save urls and mail addresses to.

In [62]:
urls = []
mail_addresses = []

Let's begin with the bahnstadt.pdf file.

In [63]:
def format_bahnstadt_url_mail():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.lower()
    with open(bahnstadt_copy, 'w') as bahnstadt_file:
        bahnstadt_file.writelines(text)

In [64]:
format_bahnstadt_url_mail()

First find urls in bahnstadt text.

In [65]:
def find_urls_bahnstadt() -> list:
    """
    Searches inside the bahnstadt copy txt file for valid urls.
    :return: List of urls found in txt file.
    """
    potential_urls = []
    final_urls = []
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.split(' ')
    for element in text:
        if '.' in element:  # every website contains at least one dot
            potential_urls.append(element)

    # Source: https://stackoverflow.com/questions/7160737/how-to-validate-a-url-in-python-malformed-or-not (Answer by cetver)
    is_url_regex = re.compile(
        r'^www.' # starts with 'www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    for url in potential_urls:
        match = re.match(r'([\w.-]+\.(de|com))\w+', url, flags=re.IGNORECASE)
        if re.match(is_url_regex, url) is not None:
            if match:
                url = match.groups(0)[0]
            try:
                print("Start validating:", url)
                if requests.get('https://' + url, timeout=5).ok or requests.get('http://' + url, timeout=5).ok:
                    final_urls.append(url)
            except requests.ConnectionError:
                print("Error: Url", url, "is not valid.")
            except requests.ReadTimeout:
                print("Error: Timeout for", url)

    return list(set(final_urls))  # convert to set to remove duplicates

In [66]:
for url in find_urls_bahnstadt():
    urls.append(url)

Start validating: www.zahnarzt-heidelberg-kirchheim.de
Start validating: www.frauennotruf-heidelberg.de
Start validating: www.theatergemeinde-heidelberg.de
Start validating: www.kreisverkehrswacht-rhein-neckar.de
Start validating: www.weststadtverein.de
Start validating: www.bundespolizei.de
Start validating: www.malteser.de
Start validating: www.johanniter-baden.de
Error: Url www.johanniter-baden.de is not valid.
Start validating: www.fgz-heidelberg.de
Error: Timeout for www.fgz-heidelberg.de
Start validating: www.selbsthilfe-heidelberg.de
Start validating: www.bahnstadtverein.de/.
Start validating: www.bahnstadtverein.de/
Start validating: www.offener-mrt.com
Start validating: www.offener-mrt.com
Start validating: www.hd-bergheim.de.
Start validating: www.bestattungshaus-heidelberg.de
Start validating: www.fgz-heidelberg.de
Start validating: www.facebook.com/fgzheidelberg
Start validating: www.bibez.de
Start validating: www.weststadtverein.de/festbuch
Start validating: www.ihkkg.de
S

Let's continue with urls from the bundeswehr.pdf file.

In [67]:
def format_bundeswehr_url_mail():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace(' ', '')  # Get rid of spaces
    text = text.lower()
    with open(bundeswehr_copy, 'w') as bundeswehr_file:
        bundeswehr_file.writelines(text)

In [68]:
format_bundeswehr_url_mail()

In [69]:
def find_urls_bundeswehr() -> list:
    """
    Searches inside the bundeswehr copy txt file for valid urls.
    :return: List of urls found in txt file.
    """
    potential_urls = []
    final_urls = []
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()

    for match in re.findall(r'((http|https)://www\.(\w|-|)+\.de)', text):  # All urls in this sheet are structured like this
        potential_urls.append(match[0])

    # The urls are sometimes written over multiple lines with '-' symbols to separate them. Sometimes this makes a found url invalid, since we don't know if the '-' is from a line break or from the url itself. So we have to try all solutions:
    # First, let's get rid of duplicates again
    potential_urls = list(set(potential_urls))

    for potential_url in potential_urls:
        # Source: https://stackoverflow.com/questions/14841652/string-replacement-combinations
        options = [(c,) if c != '-' else ('-', '') for c in potential_url]
        for index, _ in enumerate(list((''.join(o) for o in product(*options)))):
            try:
                if requests.get(_, timeout=5).ok:
                    final_urls.append(potential_url)
                    print("Option", index, "successful with", _)
                    break
            except requests.ConnectionError:
                pass
            except requests.ReadTimeout:
                print("Error: Timeout for", url)

    return list(set(final_urls))  # convert to set to remove duplicates

In [70]:
for _ in find_urls_bundeswehr():  # This takes a while to finish
    urls.append(_)

Option 0 successful with http://www.fz-borstel.de
Option 1 successful with http://www.patentanwalt.de
Option 1 successful with http://www.pflegebevollmaechtigte.de
Option 0 successful with http://www.adk.de
Option 2 successful with http://www.zukunftsforum-familie.de
Option 0 successful with http://www.bnotk.de
Option 1 successful with http://www.dzm-museum.de
Option 0 successful with http://www.stiftung-evz.de
Option 1 successful with http://www.behindertenbeauftragter.de
Option 0 successful with http://www.mdc-berlin.de
Option 0 successful with http://www.acatech.de
Option 0 successful with http://www.helmholtz.de
Option 0 successful with http://www.agj.de
Option 0 successful with http://www.deutsche-kinemathek.de
Option 0 successful with http://www.iom-leipzig.de
Option 0 successful with http://www.thuenen.de
Option 0 successful with http://www.giz.de
Option 0 successful with http://www.awv-net.de
Option 0 successful with http://www.leibniz-ifl.de
Option 0 successful with http://www

Let's focus on the last pdf.

In [71]:
def format_wegweiser_senioren_url_mail():
    """
    Formats the wegweiser_senioren copy txt file and prepares it for analysis.
    """
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.lower()
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Get rid of thin spaces and other unicode characters
    with open(wegweiser_senioren_copy, 'w') as wegweiser_senioren_file:
        wegweiser_senioren_file.writelines(text)

In [72]:
format_wegweiser_senioren_url_mail()

In [73]:
def find_urls_wegweiser_senioren() -> list:
    """
    Searches inside the wegweiser_senioren copy txt file for valid urls.
    :return: List of urls found in txt file.
    """
    potential_urls = []
    final_urls = []
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    text = text.split(' ')
    for element in text:
        if '.' in element:  # every website contains at least one dot
            potential_urls.append(element)

    # Source: https://stackoverflow.com/questions/7160737/how-to-validate-a-url-in-python-malformed-or-not (Answer by cetver)
    is_url_regex = re.compile(
        r'^www.' # starts with 'www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    for url_ in potential_urls:
        match = re.match(r'([\w.-]+\.(de|com))\w+', url_, flags=re.IGNORECASE)
        if re.match(is_url_regex, url_) is not None:
            if match:
                url_ = match.groups(0)[0]
            try:
                print("Start validating:", url_)
                if requests.get('https://' + url_, timeout=5).ok or requests.get('http://' + url_, timeout=5).ok:
                    final_urls.append(url_)
            except requests.ConnectionError:
                print("Error: Url", url_, "is not valid.")
            except requests.ReadTimeout:
                print("Error: Timeout for", url)

    return list(set(final_urls))  # convert to set to remove duplicates

In [74]:
for _ in find_urls_wegweiser_senioren():
    urls.append(_)

Start validating: www.heidelberg.de/senioren
Start validating: www.seniorenzentren-hd.de
Start validating: www.diakonie-heidelberg.de
Start validating: www.diakonie-heidelberg.de
Start validating: www.diakonie-heidelberg.de
Start validating: www.awo-heidelberg.de/
Start validating: www.akademie-fuer-aeltere.de
Start validating: www.vrn.de
Start validating: www.diakonie-heidelberg.de
Start validating: www.selbsthilfe-heidelberg.de
Start validating: www.freiwilligenagentur-heidelberg.de
Start validating: www.tauschring-heidelberg.de
Error: Url www.tauschring-heidelberg.de is not valid.
Start validating: www.diakonie-heidelberg.de
Start validating: www.diakonie-heidelberg.de
Start validating: www.kath-dekanat-hw.de
Start validating: www.kirche-heidelberg.de.
Error: Url www.kirche-heidelberg.de. is not valid.
Start validating: www.caritas-heidelberg.de
Start validating: www.drk-heidelberg.de
Start validating: www.heidelberg.de/senioren
Start validating: www.caritas-heidelberg.de
Start vali

Now let's write all the urls into a dedicated file.
First, normalize the urls.

In [75]:
urls_normalized = []

for _ in list(set(urls)):
    if 'www' == _[:3]:
        urls_normalized.append('https://' + _)
    else:
        urls_normalized.append(_)

In [77]:
with open(os.path.dirname(files_flyers[0]) + '/urls.txt', 'w') as writefile:
    for normalized_url in urls_normalized:
        writefile.write(normalized_url)
        writefile.write('\n')

Woohoo. Now let's focus on finding some mail addresses.

In [85]:
def find_mails_bahnstadt() -> list:
    """
    Searches inside the bahnstadt copy txt file for valid mails.
    :return: List of mails found in txt file.
    """
    potential_mails = []
    final_mails = []
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.split(' ')
    for element in text:
        if '@' in element:  # every mail contains an '@'
            potential_mails.append(element)

    potential_mails = list(set(potential_mails))

    for mail in potential_mails:
        regexp = re.compile(r'(.*\.(de|com|org))\w*')
        match = re.match(regexp, mail)
        if match:
            mail = match.groups(0)[0]
        if validators.email(mail):
            final_mails.append(mail)

    return list(set(final_mails))  # convert to set to remove duplicates

In [87]:
for mail in find_mails_bahnstadt():
    mail_addresses.append(mail)

Off to the next file.

In [100]:
def find_mails_bundeswehr() -> list:
    """
    Searches inside the bundeswehr copy txt file for valid mails.
    :return: List of mails found in txt file.
    """
    potential_mails = []
    final_mails = []
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()

    for match in re.findall(r'[a-z]+\d{5}((\w|-|\.)+@(\w|-|\.)+)(http|\+)', text):  # All mails are preceded by an address (city ([a-z], plz (\d{5})) and tailed by the website (http...) or a phone/fax number
        potential_mails.append(match[0])

    potential_mails = list(set(potential_mails))

    # This part is tricky, because again we face the issue of lots of '-' separating some mail addresses, and we don't know whether they belong to the mail address or stem from the formatting of the pdf.
    # So again, we would need to test all options.
    # The additional problem is that to properly verify a mail address you have to check SMTP servers and so on.
    # However, these services (e.g. Mailboxlayer, isiteralemail, etc.) are paid, so we will just include every possible mail, so we know for sure, that we will reach everyone.

    for potential_mail in potential_mails:
        # Source: https://stackoverflow.com/questions/14841652/string-replacement-combinations
        options = [(c,) if c != '-' else ('-', '') for c in potential_mail]
        for index, _ in enumerate(list((''.join(o) for o in product(*options)))):
            if validators.email(_):
                final_mails.append(_)

    return list(set(final_mails))  # convert to set to remove duplicates

In [102]:
for mail in find_mails_bundeswehr():
    mail_addresses.append(mail)

In [103]:
def find_mails_wegweiser_senioren() -> list:
    """
    Searches inside the wegweiser_senioren copy txt file for valid mails.
    :return: List of mails found in txt file.
    """
    potential_mails = []
    final_mails = []
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    text = text.split(' ')
    for element in text:
        if '@' in element:  # every website contains at least one dot
            potential_mails.append(element)

    potential_mails = list(set(potential_mails))

    for _ in potential_mails:
        regexp = re.compile(r'(.*\.(de|com|org))\w*')
        match = re.match(regexp, _)
        if match:
            _ = match.groups(0)[0]
        if validators.email(_):
            final_mails.append(_)

    return list(set(final_mails))  # convert to set to remove duplicates

In [105]:
for mail in find_mails_wegweiser_senioren():
    mail_addresses.append(mail)

Let's put 'em in a file.

In [106]:
with open(os.path.dirname(files_flyers[0]) + '/mails.txt', 'w') as writefile:
    for mail in mail_addresses:
        writefile.write(mail)
        writefile.write('\n')

After a never ending part ii:
## part iii: Find IBANs in iban.pdf file

In [133]:
ibans = []

In [131]:
def find_ibans_in_iban():
    potential_ibans = []
    final_ibans = []
    with open(files_iban[0], 'r') as iban_read:
        for line in iban_read.readlines():
            potential_ibans.append(line[:-2])

    for iban in potential_ibans:
        match = re.findall(r'([A-Z]{2}[0-9]{2}(?:[ ][A-Z0-9]{4})+)[ ]', iban)
        if match:
            final_ibans.append(match[0])

    return final_ibans

In [134]:
for iban in find_ibans_in_iban():
    ibans.append(iban)

In [135]:
with open(os.path.dirname(files_iban[0]) + '/ibans.txt', 'w') as writefile:
    for iban in ibans:
        writefile.write(iban)
        writefile.write('\n')

# Are we.. are we.. done?? o.O