# Assignment 1, ex3, part3
by Raphael Ebner, Nicolas Hellthaler, Bastian Müller

In [35]:
# Imports
# File utils
import os
import shutil

# Analyzing tools
import re
import validators
import requests

Get arrays with paths to all .txt files generated using 'pypdf2'.

In [2]:
folders = ['flyers', 'iban', 'scans']
files = []

for folder in folders:
    for file in os.listdir(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'files', folder)):
        name, ext = os.path.splitext(file)
        if ext == '.pdf':
            path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'files', folder, file)
            files.append(os.path.join(os.path.dirname(path), os.path.splitext(file)[0] + "_" + "pypdf2" + ".txt"))

files_flyers = [file for file in files if "flyers" in file]
files_flyers_copy = []
files_iban = [file for file in files if "iban" in file]
files_iban_copy = []
files_scans = [file for file in files if "scans" in file]
files_scans_copy = []

## (i) Extract phone numbers from PDF files in /flyers
We start by identifying what format the phone numbers are in by looking at our PDF files.

Takeaways bahnstadt.pdf
- Most start with 0
- Have to exclude fax numbers, often prefixed by 'Fax' or 'Telefax'
- Different formats like '0621 / 89..' or '0621 - 89 ..'
- Different length
- Some phone numbers don't have a prefix because the prefix is mentioned in the row above (no solution found)

Takeaways bundeswehr.pdf
- Only use first number, second (if exists) is fax number
- 'Telephone IVB' in third column?

Takeaways wegweiser_senioren.pdf
- Different formats '(0621) 89 ..'
- Again sometimes no prefix, e.g. '(0 62 21) 58-38340, 58-38330'
- Replace thin spaces in txt file

First we copy each txt file to format it later.

In [3]:
def copy_subpart1_files():
    """
    Copies the txt files used for the analysis. Keeps the file name, adds '_subpartX'.
    """
    for flyer in files_flyers:
        shutil.copy(flyer, flyer.replace('.txt','_subpart1.txt'))
        files_flyers_copy.append(flyer.replace('.txt', '_subpart1.txt'))

In [4]:
copy_subpart1_files()

Based on the findings above we write three different algorithms to takle each .pdf file individually.
We start by formatting the txt files and then make use of regex extensively.

In [5]:
numbers = []

In [6]:
def format_bahnstadt():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace(' ', '')  # Get rid of spaces
    with open(bahnstadt_copy, 'w') as bahnstadt_file:
        bahnstadt_file.writelines(text)

In [7]:
format_bahnstadt()

Now, we get all numbers that:
- are so called 'Notrufnummern' (3 to 6 digits)
- start with '0' and are longer than 5 (normal numbers without prefix or with prefix, use prefix from number before)
- are not prefixed by a string containing 'fax'

In [8]:
def find_numbers_bahnstadt() -> list:
    """
    Searches inside the bahnstadt copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    fax_numbers = []
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    for match in re.findall(r'(110|112|115|116117|19222|116006)', text):
        matches.append(match)
    for match in re.findall(r'0[1-9][0-9]{6,12}', text):
        matches.append(match)
    for match in re.findall(r'[a-zA-Z]*fax[a-zA-Z]*(\d+)', text, flags=re.I):  # find all fax numbers
        fax_numbers.append(match)
    matches = [match for match in matches if match not in fax_numbers]  # remove fax numbers from solution
    return list(set(matches))  # convert to set to remove duplicates

Add numbers from bahnstadt to numbers array

In [9]:
for number in find_numbers_bahnstadt():
    numbers.append(number)

Prepare bundeswehr file

In [10]:
def format_bundeswehr():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace(' ', '')  # Get rid of spaces
    with open(bundeswehr_copy, 'w') as bundeswehr_file:
        bundeswehr_file.writelines(text)

In [11]:
format_bundeswehr()

It is (at least for us) impossible to tell when looking at the txt file that was generated what a telephone and what a fax number is.
So for this part, we will ignore all numbers but the first one for each entry. This is fine for the majority of cases.
The only exception is, when we find four numbers, then we add the first and second to last ones to our solution.

In [12]:
def find_numbers_bundeswehr() -> list:
    """
    Searches inside the bundeswehr copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    for match in re.findall(r'(\+\d+)\+\d+(\+\d+)\+\d+', text):  # extract two phone numbers in case four numbers were found
        matches.append(match[0])
        matches.append(match[1])
    for match in re.findall(r'(\+\d+)\+\d+(\+\d+)[^+]', text):  # extract two phone numbers in case three numbers where found. In almost all cases, when three numbers were provided they are in the order phone, fax, phone
        matches.append(match[0])
        matches.append(match[1])
    for match in re.findall(r'(\+\d+)\+\d+[^+]', text):  # extract phone number in case only one number is found
        matches.append(match)
    return list(set(matches))  # convert to set to remove duplicates

In [13]:
for number in find_numbers_bundeswehr():
    numbers.append(number)

Lastly we look at the wegweiser_senioren.pdf.
Again, we start by formatting the file.

In [14]:
def format_wegweiser_senioren():
    """
    Formats the wegweiser_senioren copy txt file and prepares it for analysis.
    """
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.replace('-', ' ')  # Get rid of dashes
    text = text.replace('/', ' ')  # Get rid of slashes
    text = text.replace('(', ' ')  # Remove parentheses
    text = text.replace(')', ' ')  # Remove parentheses
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Get rid of thin spaces and other unicode characters
    text = text.replace(' ', '')  # Get rid of spaces
    with open(wegweiser_senioren_copy, 'w') as wegweiser_senioren_file:
        wegweiser_senioren_file.writelines(text)

In [15]:
format_wegweiser_senioren()

Now we start extracting numbers again.
This time we are in a similar situation to the bahnstadt.pdf problem. We have to filter for fax numbers again. Additionally the format is different.

In [16]:
def find_numbers_wegweiser_senioren() -> list:
    """
    Searches inside the wegweiser_senioren copy txt file for valid phone numbers.
    :return: List of numbers found in txt file.
    """
    matches = []
    fax_numbers = []
    wegweiser_senioren_copy = [flyer for flyer in files_flyers_copy if 'wegweiser_senioren' in flyer][0]
    text = ""
    with open(wegweiser_senioren_copy, 'r') as wegweiser_senioren_file:
        text += wegweiser_senioren_file.read()
    for match in re.findall(r'(110|112|115|116117|19222|116006)', text):
        matches.append(match)
    for match in re.findall(r'0[1-9][0-9]{6,12}', text):
        matches.append(match)
    for match in re.findall(r'[a-zA-Z]*fax[a-zA-Z]*(\d+)', text, flags=re.I):  # find all fax numbers
        fax_numbers.append(match)
    matches = [match for match in matches if match not in fax_numbers]  # remove fax numbers from solution
    return list(set(matches))  # convert to set to remove duplicates

In [17]:
for number in find_numbers_bundeswehr():
    numbers.append(number)

Now we glue everything together.
First, we get rid of duplicates.
Second, we bring all numbers (except emergency short numbers) in a normalized format.
Third, we write everything in one solution file.

In [18]:
numbers = list(set(numbers))

In [19]:
numbers_normalized = []

for number in numbers:
    if len(number) < 7:
        numbers_normalized.append(number)
    elif '+' in number:
        numbers_normalized.append(number)
    else:
        numbers_normalized.append('+49' + number[1:])


In [20]:
with open(os.path.dirname(files_flyers[0]) + '/numbers.txt', 'w') as writefile:
    for number in numbers_normalized:
        writefile.write(number)
        writefile.write('\n')

## (ii) Extract URLs and mail addresses from PDF files in /flyers
We start  by looking at our PDF files.

Again, we make copies of the original parse work, format the files and then analyze and validate the urls and mail addresses found using the 'validator' library.

In [21]:
files_flyers_copy.clear()
def copy_subpart2_files():
    """
    Copies the txt files used for the analysis. Keeps the file name, adds '_subpartX'.
    """
    for flyer in files_flyers:
        shutil.copy(flyer, flyer.replace('.txt','_subpart2.txt'))
        files_flyers_copy.append(flyer.replace('.txt', '_subpart2.txt'))

In [22]:
copy_subpart2_files()

Define arrays to save urls and mail addresses to.

In [23]:
urls = []
mail_addresses = []

Let's begin with the bahnstadt.pdf file.

In [24]:
def format_bahnstadt_url_mail():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.lower()
    with open(bahnstadt_copy, 'w') as bahnstadt_file:
        bahnstadt_file.writelines(text)

In [25]:
format_bahnstadt_url_mail()

First find urls in bahnstadt text.

In [46]:
def find_urls_bahnstadt() -> list:
    """
    Searches inside the bahnstadt copy txt file for valid urls.
    :return: List of urls found in txt file.
    """
    potential_urls = []
    final_urls = []
    bahnstadt_copy = [flyer for flyer in files_flyers_copy if 'bahnstadt' in flyer][0]
    text = ""
    with open(bahnstadt_copy, 'r') as bahnstadt_file:
        text += bahnstadt_file.read()
    text = text.split(' ')
    for element in text:
        if '.' in element:  # every website contains at least one dot
            potential_urls.append(element)

    # Source: https://stackoverflow.com/questions/7160737/how-to-validate-a-url-in-python-malformed-or-not (Answer by cetver)
    is_url_regex = re.compile(
        r'^www.' # starts with 'www.'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    for url in potential_urls:
        match = re.match(r'([\w.-]+\.(de|com))\w+', url, flags=re.IGNORECASE)
        if re.match(is_url_regex, url) is not None:
            if match:
                url = match.groups(0)[0]
            try:
                print("Start validating:", url)
                if requests.get('https://' + url).ok or requests.get('http://' + url).ok:
                    final_urls.append(url)
            except requests.ConnectionError:
                print("Error: Url", url, "is not valid.")

    return list(set(final_urls))  # convert to set to remove duplicates

In [48]:
for url in find_urls_bahnstadt():
    urls.append(url)

Start validating: www.zahnarzt-heidelberg-kirchheim.de
Start validating: www.frauennotruf-heidelberg.de
Start validating: www.theatergemeinde-heidelberg.de
Start validating: www.kreisverkehrswacht-rhein-neckar.de
Start validating: www.weststadtverein.de
Start validating: www.bundespolizei.de
Start validating: www.malteser.de
Start validating: www.johanniter-baden.de
Error: Url www.johanniter-baden.de is not valid.
Start validating: www.fgz-heidelberg.de
Start validating: www.selbsthilfe-heidelberg.de
Start validating: www.bahnstadtverein.de/.
Start validating: www.bahnstadtverein.de/
Start validating: www.offener-mrt.com
Start validating: www.offener-mrt.com
Start validating: www.hd-bergheim.de.
Start validating: www.bestattungshaus-heidelberg.de
Start validating: www.fgz-heidelberg.de
Start validating: www.facebook.com/fgzheidelberg
Start validating: www.bibez.de
Start validating: www.weststadtverein.de/festbuch
Start validating: www.ihkkg.de
Start validating: www.vhs-hd.de
Start vali

Let's continue with urls from the bundeswehr.pdf file.

In [49]:
def format_bundeswehr_url_mail():
    """
    Formats the bahnstadt copy txt file and prepares it for analysis.
    """
    bundeswehr_copy = [flyer for flyer in files_flyers_copy if 'bundeswehr' in flyer][0]
    text = ""
    with open(bundeswehr_copy, 'r') as bundeswehr_file:
        text += bundeswehr_file.read()
    text = text.replace('\n', ' ')  # Get rid of newlines
    text = text.lower()
    with open(bundeswehr_copy, 'w') as bundeswehr_file:
        bundeswehr_file.writelines(text)

In [50]:
format_bundeswehr_url_mail()