In [619]:
import pandas as pd
import numpy as np
from pathlib import Path
import re

# Load the enriched campus plan table that the cells below clean up.
enriched_csv = Path("data") / "campusplan_enriched.csv"
df = pd.read_csv(enriched_csv)

In [620]:
# Remove building IDs of the form " (dd.dd)" from the natural-language names.
ID_pattern = r' \(\d{2}\.\d{2}\)'
raw_titles = df['title']
# astype(str) lets the .str accessor work on non-string cells, but it also
# turns missing values into the literal text 'nan'. The trailing .where()
# puts real NaN back wherever the original title was missing.
df['title'] = (
    raw_titles.astype(str)
    .str.replace(ID_pattern, '', regex=True)
    .where(raw_titles.notna())
)

In [621]:
# Replace categories with their German translations.
category_translations = {
    'lecturehall': 'Hörsaal',
    'institute': 'Institut',
    'facility': 'Einrichtung',
}
raw_categories = df['category']
translated_categories = raw_categories.astype(str)
# Literal (non-regex) substring replacement, applied in the order listed above.
for english, german in category_translations.items():
    translated_categories = translated_categories.str.replace(english, german, regex=False)
# astype(str) turned missing categories into the text 'nan'; restore real NaN
# so that missing values are not exported as a bogus category name.
df['category'] = translated_categories.where(raw_categories.notna())
# Any category that mentions 'building' is cleared to NaN.
df.loc[df['category'].str.contains('building', na=False), 'category'] = np.nan

In [622]:
# Remove all types that are not buildings.
non_building_types = ['fire_extinguisher', 'house', 'outdoor_seating', 'first_aid_kit', 'pedestrian', 'reception_desk', 'unclassified', 'works', 'elevator', 'platform', 'bus_stop', 'grit_bin', 'insurance', 'track', 'internet_cafe', 'garden', 'yes', 'fire_alarm_box', 'university', 'path']
# Compare whole values instead of substrings: a substring test for 'house'
# would also wipe 'greenhouse', which is a building type that is translated
# further down. isin() also avoids a loop variable shadowing the builtin `type`.
df.loc[df['rev_type'].isin(non_building_types), 'rev_type'] = np.nan

In [623]:
# Translate building types to German.
# The two lists are parallel: entry i of the English list maps to entry i of
# the German list.
building_types_en = ['library', 'bicycle_parking', 'atm', 'auditorium', 'toilets', 'doctors', 'kitchen', 'garage', 'language_school', 'administrative', 'artwork', 'stadium', 'food_sharing', 'greenhouse', 'parking', 'kindergarten', 'sports_centre', 'memorial', 'government', 'office', 'dormitory', 'research_institute', 'research', 'residential', 'apartments', 'restaurant', 'cafe', 'education_centre', 'canteen', 'service', 'fire_station', 'tower', 'company', 'commercial', 'retail', 'association', 'hackerspace']
building_types_de = ['Bibliothek', 'Fahrradstellplätze', 'Bankautomat', 'Auditorium', 'Toiletten', 'Ärzte', 'Küche', 'Garage', 'Sprachschule', 'Verwaltung', 'Kunstwerk', 'Stadion', 'Foodsharing', 'Gewächshaus', 'Parkplatz', 'Kindergarten', 'Sportzentrum', 'Denkmal', 'Regierung', 'Büro', 'Wohnheim', 'Forschungsinstitut', 'Forschung', 'Wohnen', 'Apartments', 'Restaurant', 'Cafe', 'Bildungszentrum', 'Kantine', 'Service', 'Feuerwehr', 'Turm', 'Unternehmen', 'Kommerziell', 'Einzelhandel', 'Verein', 'Hackerspace']

# Walk the pairs in list order; every value that contains the English term is
# overwritten with the German one before the next pair is examined.
for english_type, german_type in zip(building_types_en, building_types_de):
    contains_english_type = df['rev_type'].str.contains(english_type, na=False)
    df.loc[contains_english_type, 'rev_type'] = german_type

In [624]:
# Remove names that do not fit buildings.
non_building_names = ['Bruchsaler Straße', 'Büchenauer Straße', 'Neureuter Straße', 'Friedrichstaler Straße', 'Eggensteiner Straße', 'Leopoldshafener Allee', 'Blankenlocher Straße', 'Karlsruher Allee', 'Linkenheimer Straße', 'Weingartener Straße', 'Spöcker Straße', 'Neutharder Straße', 'Campus Nord', 'Hochstetter Straße', 'Karlsruher Institut für Technologie', 'Büchiger Allee']
name_id_pattern = r'\d{2}\.\d{2}'
name_num_pattern = r'\d{3}'

# Clear every name that contains one of the listed street / site names.
for unwanted_name in non_building_names:
    contains_unwanted_name = df['rev_name'].str.contains(unwanted_name, na=False)
    df.loc[contains_unwanted_name, 'rev_name'] = np.nan

# Clear names that contain a "dd.dd" sequence, then names that contain three
# consecutive digits.
for digit_pattern in (name_id_pattern, name_num_pattern):
    contains_digits = df['rev_name'].str.contains(pat=digit_pattern, regex=True, na=False)
    df.loc[contains_digits, 'rev_name'] = np.nan

In [625]:
# Clear any address that carries no information beyond "this is part of KIT".
generic_KIT_address = 'Karlsruher Institut für Technologie, Ehingen (Donau), Gemeindeverwaltungsverband Ehingen (Donau), Alb-Donau-Kreis, Baden-Württemberg, 89584, Deutschland'
# Literal (non-regex) substring test; missing addresses never match.
is_generic_address = df['rev_display_name'].str.contains(generic_KIT_address, regex=False, na=False)
df.loc[is_generic_address, 'rev_display_name'] = np.nan

In [626]:
# Drop the columns that are not used in the evaluation.
unused_columns = ['aliasList', 'rev_category', 'det_contact:phone']
df = df.drop(columns=unused_columns)

In [627]:
# Rename the remaining columns to their German output names.
german_column_names = {
    'title': 'identifikator',
    'category': 'kategorie',
    'positionList': 'koordinaten',
    'url': 'webseite',
    'rev_type': 'funktion',
    'rev_name': 'name',
    'rev_display_name': 'adresse',
    'det_opening_hours': 'oeffnungszeiten',
    'det_wheelchair': 'rollstuhlgerechtigkeit',
    'det_wheelchair:description': 'rollstuhlbeschreibung',
}
df = df.rename(columns=german_column_names)

In [628]:
def restructure_address(address):
    """Split a comma-separated address string into street address, postal code and quarter.

    The string is processed positionally: leading components that look like a
    copy of the building name or a building ID are dropped, a leading house
    number is moved behind the street name, and the trailing components are
    peeled off from the right (country, postal code, two region components).
    NOTE(review): the positional rules presumably target Nominatim-style
    display names for the Karlsruhe area -- confirm against the input data.

    Args:
        address: The full address as a single string (must not be NaN/None).

    Returns:
        A tuple ``(address, plz, quarter)``:
        ``address`` is the remaining street address as a string (may be empty),
        ``plz`` is the postal code component or ``None`` if none was left,
        ``quarter`` is the quarter/area component or ``None`` if none was left.
    """
    # Split the address by commas
    parts = [part.strip() for part in address.split(",")]

    # Remove the copy of the name in front of the address
    if len(parts) == 9 and not re.search(r'(campus)', parts[0], re.IGNORECASE):
        parts = parts[1:]
    elif len(parts) == 10:
        parts = parts[2:]

    # Drop one more leading component if it does not look like a place
    # (street, square, alley, ... or something containing a number)
    if parts and not (re.search(r'(strasse|straße|allee|ring|platz|weg|hof|park|heim|hafen|\d+)', parts[0], re.IGNORECASE) or parts[0] == "Campus Nord"):
        parts = parts[1:]
    # Drop a leading building ID of the form "d.d" .. "dd.dd"
    if parts and re.search(r'^\d{1,2}\.\d{1,2}$', parts[0]):
        parts = parts[1:]

    # A leading component starting with a digit is treated as the house number:
    # swap it behind the following component (the street name). The length
    # check avoids an IndexError when the number is the only component left.
    if len(parts) > 1 and re.match(r'^\d', parts[0]):
        parts[0], parts[1] = parts[1], parts[0]

    # Remove trailing "Deutschland"
    parts = parts[0:-1]

    # Take the PLZ out so it can go into its own column
    plz = parts.pop() if parts else None

    # Remove "Baden-Württemberg" and "Karlsruhe"
    parts = parts[0:-2]
    if parts and parts[-1] == "Karlsruhe":
        parts = parts[0:-1]

    # Remove the less useful of the two area/quarter names
    if parts:
        if len(parts) == 4:
            if parts[2] == "Leopoldshafen":
                parts.pop(3)
            else:
                parts.pop(2)
        elif len(parts) == 3:
            if parts[1] == "Leopoldshafen":
                parts.pop(2)
            else:
                parts.pop(1)

    # Take the quarter out so it can go into its own column
    quarter = parts.pop() if parts else None

    # Join the parts of the address together
    address = ", ".join(parts)

    # Remove the separator between the street name and the house number.
    # Only the first comma that is directly followed by " <digit>" is removed,
    # so an earlier comma (e.g. after "Campus Nord") stays intact.
    address = re.sub(r',(?=\s\d)', '', address, count=1)

    return address, plz, quarter


# Split every non-missing address into street address, postal code and quarter.
# Rows without an address keep their missing value and get NaN in both new
# columns. Building the columns in one pass guarantees that 'postleitzahl' and
# 'stadtviertel' always exist (even if no row has an address) and does not
# depend on the index labels being unique.
restructured = [
    restructure_address(raw_address) if not pd.isna(raw_address) else (raw_address, np.nan, np.nan)
    for raw_address in df['adresse']
]
df['adresse'] = pd.Series([result[0] for result in restructured], index=df.index, dtype=object)
df['postleitzahl'] = pd.Series([result[1] for result in restructured], index=df.index, dtype=object)
df['stadtviertel'] = pd.Series([result[2] for result in restructured], index=df.index, dtype=object)

# Move PLZ and quarter column behind the address column.
# The two columns are selected by name rather than by "last two columns", so
# unrelated columns can never be moved by accident.
# NOTE(review): positions 8 and 9 are hard-coded -- confirm that they sit
# directly behind 'adresse' for the actual column layout of the input CSV.
plz = 'postleitzahl'
quarter = 'stadtviertel'
new_order = [column for column in df.columns if column not in (plz, quarter)]
new_order.insert(8, plz)
new_order.insert(9, quarter)
df = df[new_order]

In [629]:
# Write the cleaned table to its own file (without the index column).
data_base = Path("data")
evaluation_csv = data_base / "campusplan_evaluation.csv"
df.to_csv(evaluation_csv, index=False)