In [181]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the enriched campus plan table; every later cell edits this frame.
df = pd.read_csv(Path("data") / "campusplan_enriched.csv")

In [182]:
# Remove building IDs from natural language names.
# The pattern matches a space followed by a parenthesised "NN.NN" number,
# e.g. " (12.34)", anywhere in the title.
ID_pattern = r' \(\d{2}\.\d{2}\)'

# astype(str) is kept so that non-string cells do not break the .str accessor,
# but it also turns missing values into the literal text 'nan'. Remember which
# rows were missing and blank them again afterwards so they stay missing in
# the exported file.
title_missing = df['title'].isna()
df['title'] = (
    df['title']
    .astype(str)
    .str.replace(ID_pattern, '', regex=True)
    .mask(title_missing)
)

In [183]:
# Replace categories with German translations.
# Each key is replaced as a literal substring (regex=False) by its value.
category_translations = {
    'lecturehall': 'Hörsaal',
    'institute': 'Institut',
    'facility': 'Einrichtung',
}

# astype(str) would otherwise leave the text 'nan' in rows that had no
# category, next to the real missing values written for 'building' rows below.
# Record the missing rows up front so both end up as proper missing values.
category_missing = df['category'].isna()
category = df['category'].astype(str)
for english_term, german_term in category_translations.items():
    category = category.str.replace(english_term, german_term, regex=False)

# Any category mentioning 'building' is blanked rather than translated.
is_building = category.str.contains('building', na=False)
df['category'] = category.mask(category_missing | is_building)

In [184]:
# Remove all types that are not buildings.
non_building_types = ['fire_extinguisher', 'house', 'outdoor_seating', 'first_aid_kit', 'pedestrian', 'reception_desk', 'unclassified', 'works', 'elevator', 'platform', 'bus_stop', 'grit_bin', 'insurance', 'track', 'internet_cafe', 'garden', 'yes', 'fire_alarm_box', 'university', 'path']

# Compare whole values instead of substrings: a substring test for 'house'
# also hits 'greenhouse', which would blank a type that the translation step
# maps to 'Gewächshaus'. isin() also avoids a loop variable named `type`,
# which used to shadow the builtin for every cell executed afterwards.
# NOTE(review): assumes each rev_type cell holds a single type token — confirm
# against the data if compound values are possible.
is_non_building = df['rev_type'].isin(non_building_types)
df.loc[is_non_building, 'rev_type'] = np.nan

In [185]:
# Translate the remaining building types to German.
# The two lists are parallel: the English term at position i is rendered as
# the German term at the same position.
building_types_en = ['library', 'bicycle_parking', 'atm', 'auditorium', 'toilets', 'doctors', 'kitchen', 'garage', 'language_school', 'administrative', 'artwork', 'stadium', 'food_sharing', 'greenhouse', 'parking', 'kindergarten', 'sports_centre', 'memorial', 'government', 'office', 'dormitory', 'research_institute', 'research', 'residential', 'apartments', 'restaurant', 'cafe', 'education_centre', 'canteen', 'service', 'fire_station', 'tower', 'company', 'commercial', 'retail', 'association', 'hackerspace']
building_types_de = ['Bibliothek', 'Fahrradstellplätze', 'Bankautomat', 'Auditorium', 'Toiletten', 'Ärzte', 'Küche', 'Garage', 'Sprachschule', 'Verwaltung', 'Kunstwerk', 'Stadion', 'Foodsharing', 'Gewächshaus', 'Parkplatz', 'Kindergarten', 'Sportzentrum', 'Denkmal', 'Regierung', 'Büro', 'Wohnheim', 'Forschungsinstitut', 'Forschung', 'Wohnen', 'Apartments', 'Restaurant', 'Cafe', 'Bildungszentrum', 'Kantine', 'Service', 'Feuerwehr', 'Turm', 'Unternehmen', 'Kommerziell', 'Einzelhandel', 'Verein', 'Hackerspace']

# Matching is by substring and the whole cell is overwritten, so list order
# matters: 'research_institute' comes before 'research' and is therefore
# translated first, after which the shorter term no longer matches that row.
for english_term, german_term in zip(building_types_en, building_types_de):
    matches_term = df['rev_type'].str.contains(english_term, na=False)
    df.loc[matches_term, 'rev_type'] = german_term

In [186]:
# Blank out names that do not describe a building.
non_building_names = ['Bruchsaler Straße', 'Büchenauer Straße', 'Neureuter Straße', 'Friedrichstaler Straße', 'Eggensteiner Straße', 'Leopoldshafener Allee', 'Blankenlocher Straße', 'Karlsruher Allee', 'Linkenheimer Straße', 'Weingartener Straße', 'Spöcker Straße', 'Neutharder Straße', 'Campus Nord', 'Hochstetter Straße', 'Karlsruher Institut für Technologie', 'Büchiger Allee']
# Two digits, a dot, two digits (the same number shape that is stripped from titles).
name_id_pattern = r'\d{2}\.\d{2}'
# Any run of three digits.
name_num_pattern = r'\d{3}'

# Collect every row whose name contains one of the listed names or matches one
# of the two numeric patterns, then blank them all in a single assignment.
# Missing names never match (na=False).
rev_names = df['rev_name']
drop_name = pd.Series(False, index=df.index)
for unwanted_name in non_building_names:
    drop_name = drop_name | rev_names.str.contains(unwanted_name, na=False)
for numeric_pattern in (name_id_pattern, name_num_pattern):
    drop_name = drop_name | rev_names.str.contains(pat=numeric_pattern, regex=True, na=False)

df.loc[drop_name, 'rev_name'] = np.nan

In [187]:
# Blank out addresses that say nothing beyond the building belonging to KIT.
generic_KIT_address = 'Karlsruher Institut für Technologie, Ehingen (Donau), Gemeindeverwaltungsverband Ehingen (Donau), Alb-Donau-Kreis, Baden-Württemberg, 89584, Deutschland'

# Literal match (regex=False) because the address contains parentheses.
is_generic_address = df['rev_display_name'].str.contains(generic_KIT_address, regex=False, na=False)
df.loc[is_generic_address, 'rev_display_name'] = np.nan

In [188]:
# Drop the columns that the evaluation does not use.
unused_columns = ['aliasList', 'rev_category', 'det_contact:phone']
df = df.drop(columns=unused_columns)

In [189]:
# Write the cleaned table to its own CSV file, without the row index.
data_base = Path("data")
evaluation_csv = data_base / "campusplan_evaluation.csv"
df.to_csv(evaluation_csv, index=False)