# Get Geo-Information via google maps API

In [30]:
# import libraries

import pandas as pd
import googlemaps
from time import sleep

In [31]:
# load dataset

df = pd.read_csv('/Users/bastianlenkers/VSCode_Stuff/Repos/layered-populate-data-pool-da/tram_bus_data_modelling/bus_trams/sources/bus_tram_preliminary.csv')

In [None]:
# initialize gmaps client

api_key = '' # use your own key :P

gmaps = googlemaps.Client(key=api_key)

In [35]:
# create new columns
df['adress'] = ''
df['Ort'] = ''
df['district'] = ''
df['neighborhood'] = ''

# iterate thru rows
for index, row in df.iterrows():
    lat = row['lat']
    lon = row['lon']
    
    # request reverse geocode
    try:
        reverse_geocode_result = gmaps.reverse_geocode((lat, lon))
        
        if reverse_geocode_result:
            # get complete adress
            formatted_address = reverse_geocode_result[0]['formatted_address']
            df.at[index, 'adress'] = formatted_address

            # Parsen der einzelnen Adresskomponenten
            components = reverse_geocode_result[0]['address_components']
            for component in components:
                types = component['types']
                if 'locality' in types:
                    df.at[index, 'Ort'] = component['long_name']
                elif 'administrative_area_level_2' in types:
                    df.at[index, 'district'] = component['long_name']
                elif 'sublocality' in types or 'sublocality_level_1' in types:
                    df.at[index, 'neighborhood'] = component['long_name']

    except Exception as e:
        print(f"Error with coordinates: {lat}, {lng}: {e}")
        df.at[index, 'adress'] = 'Error'

    # delay for API limits
    sleep(0.1)

In [90]:
# create copy of dataframe only with valid columns

df2 = df[['id', 'name', 'lat', 'lon', 'tram', 'bus', 'adress']]

In [49]:
# save dataframe as csv

df2.to_csv('/Users/bastianlenkers/VSCode_Stuff/Repos/layered-populate-data-pool-da/tram_bus_data_modelling/bus_trams/sources/bus_tram_with_adress.csv')

# Create District and Neighborhood columns based on address

In [75]:
df2['adress']

0              Ollenhauerstraße 24, 13403 Berlin, Germany
1                   Adersleber Weg, 12685 Berlin, Germany
2                Ahrenshooper Str., 13051 Berlin, Germany
3           Berlin, Albertinenstr., 13088 Berlin, Germany
4            Alfred-Kowalke-Straße, 10319 Berlin, Germany
                              ...                        
3017                 Züricher Str., 12205 Berlin, Germany
3018              Zweibrücker Str., 13583 Berlin, Germany
3019    Zweiwinkelweg, Zweiwinkelweg, 13591 Berlin, Ge...
3020              Adlergestell 745, 12527 Berlin, Germany
3021      Kurt-Schumacher-Damm 205, 13405 Berlin, Germany
Name: adress, Length: 3022, dtype: object

In [None]:
# extract postal code from address

df2['postal_code'] = df2['adress'].apply(lambda x: x.split(",")[-2].split(" ")[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['postal_code'] = df2['adress'].apply(lambda x: x.split(",")[-2].split(" ")[1])


In [None]:
# there were a handfull of wrong values in the postal code column
# filter for only postal codes

df3 = df2[df2['postal_code'].str.startswith('1')]

In [125]:
# list of berlin zip codes

berlin_zip_codes = {
    10115: 'Mitte,Wedding',
    10117: 'Mitte,Tiergarten',
    10119: 'Pankow,Prenzlauer Berg',
    10178: 'Mitte,Mitte',
    10179: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10243: 'Friedrichshain-Kreuzberg,Friedrichshain',
    10245: 'Treptow-Köpenick,Alt-Treptow',
    10247: 'Friedrichshain-Kreuzberg,Friedrichshain',
    10249: 'Pankow,Prenzlauer Berg',
    10315: 'Lichtenberg,Lichtenberg',
    10317: 'Lichtenberg,Rummelsburg',
    10318: 'Lichtenberg,Karlshorst',
    10319: 'Lichtenberg,Friedrichsfelde',
    10365: 'Lichtenberg,Alt-Hohen-Schönhausen',
    10367: 'Lichtenberg,Lichtenberg',
    10369: 'Lichtenberg,Fennpfuhl',
    10405: 'Pankow,Prenzlauer Berg',
    10407: 'Pankow,Prenzlauer Berg',
    10409: 'Pankow,Weißensee',
    10435: 'Pankow,Prenzlauer Berg',
    10437: 'Mitte,Gesundbrunnen',
    10439: 'Mitte,Gesundbrunnen',
    10551: 'Mitte,Moabit',
    10553: 'Mitte,Moabit',
    10555: 'Mitte,Hansaviertel',
    10557: 'Mitte,Hansaviertel',
    10559: 'Mitte,Moabit',
    10585: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10587: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10589: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10623: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10625: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10627: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10629: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10707: 'Charlottenburg-Wilmersdorf,Halensee',
    10709: 'Charlottenburg-Wilmersdorf,Halensee',
    10711: 'Charlottenburg-Wilmersdorf,Grunewald',
    10713: 'Charlottenburg-Wilmersdorf,Wilmersdorf',
    10715: 'Charlottenburg-Wilmersdorf,Wilmersdorf',
    10717: 'Charlottenburg-Wilmersdorf,Wilmersdorf',
    10719: 'Charlottenburg-Wilmersdorf,Wilmersdorf',
    10777: 'Tempelhof-Schöneberg,Schöneberg',
    10779: 'Tempelhof-Schöneberg,Schöneberg',
    10781: 'Tempelhof-Schöneberg,Schöneberg',
    10783: 'Tempelhof-Schöneberg,Schöneberg',
    10785: 'Tempelhof-Schöneberg,Schöneberg',
    10787: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10789: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    10823: 'Tempelhof-Schöneberg,Schöneberg',
    10825: 'Tempelhof-Schöneberg,Schöneberg',
    10827: 'Tempelhof-Schöneberg,Friedenau',
    10829: 'Tempelhof-Schöneberg,Schöneberg',
    10961: 'Friedrichshain-Kreuzberg,Kreuzberg', 
    10963: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10965: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10967: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10969: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10997: 'Friedrichshain-Kreuzberg,Kreuzberg',
    10999: 'Friedrichshain-Kreuzberg,Kreuzberg',
    12043: 'Neukölln,Neukölln',
    12045: 'Neukölln,Neukölln',
    12047: 'Neukölln,Neukölln',
    12049: 'Neukölln,Neukölln',
    12051: 'Neukölln,Neukölln',
    12053: 'Neukölln,Neukölln',
    12055: 'Neukölln,Neukölln',
    12057: 'Treptow-Köpenick,Baumschulenweg',
    12059: 'Treptow-Köpenick,Alt-Treptow',
    12099: 'Neukölln,Britz',
    12101: 'Tempelhof-Schöneberg,Tempelhof',
    12103: 'Tempelhof-Schöneberg,Tempelhof',
    12105: 'Tempelhof-Schöneberg,Mariendorf',
    12107: 'Neukölln,Britz',
    12109: 'Tempelhof-Schöneberg,Mariendorf',
    12157: 'Steglitz-Zehlendorf,Steglitz',
    12159: 'Tempelhof-Schöneberg,Friedenau',
    12161: 'Tempelhof-Schöneberg,Friedenau',
    12163: 'Tempelhof-Schöneberg,Friedenau',
    12165: 'Steglitz-Zehlendorf,Lichterfelde',
    12167: 'Steglitz-Zehlendorf,Lankwitz',
    12169: 'Steglitz-Zehlendorf,Steglitz',
    12203: 'Steglitz-Zehlendorf,Dahlem',
    12205: 'Steglitz-Zehlendorf,Dahlem',
    12207: 'Steglitz-Zehlendorf,Lichterfelde',
    12209: 'Steglitz-Zehlendorf,Lankwitz',
    12247: 'Steglitz-Zehlendorf,Lankwitz',
    12249: 'Steglitz-Zehlendorf,Lankwitz',
    12277: 'Steglitz-Zehlendorf,Lankwitz',
    12279: 'Steglitz-Zehlendorf,Lankwitz',
    12305: 'Neukölln,Buckow',
    12307: 'Steglitz-Zehlendorf,Lichtenrade',
    12309: 'Steglitz-Zehlendorf,Lichtenrade',
    12347: 'Neukölln,Britz',
    12349: 'Neukölln,Britz',
    12351: 'Neukölln,Britz',
    12353: 'Neukölln,Buckow',
    12355: 'Neukölln,Rudow',
    12357: 'Neukölln,Gropiusstadt',
    12359: 'Neukölln,Rudow',
    12435: 'Neukölln,Neukölln',
    12437: 'Treptow-Köpenick,Johannisthal',
    12439: 'Treptow-Köpenick,Niederschöneweide',
    12459: 'Treptow-Köpenick,Köpenick',
    12487: 'Treptow-Köpenick,Baumschulenweg',
    12489: 'Treptow-Köpenick,Adlershof',
    12524: 'Treptow-Köpenick,Altglienicke',
    12526: 'Treptow-Köpenick,Bohnsdorf',
    12527: 'Treptow-Köpenick,Grünau',
    12529: 'Schönefeld,Schönefeld',
    12555: 'Marzahn-Hellersdorf,Biesdorf',
    12557: 'Treptow-Köpenick,Köpenick',
    12559: 'Treptow-Köpenick,Altglienicke',
    12587: 'Treptow-Köpenick,Friedrichshagen',
    12589: 'Treptow-Köpenick,Friedrichshagen',
    12619: 'Marzahn-Hellersdorf,Hellersdorf',
    12621: 'Marzahn-Hellersdorf,Kaulsdorf',
    12623: 'Marzahn-Hellersdorf,Kaulsdorf',
    12627: 'Marzahn-Hellersdorf,Hellersdorf',
    12629: 'Marzahn-Hellersdorf,Hellersdorf',
    12679: 'Marzahn-Hellersdorf,Marzahn',
    12681: 'Lichtenberg,Friedrichsfelde',
    12683: 'Marzahn-Hellersdorf,Biesdorf',
    12685: 'Marzahn-Hellersdorf,Marzahn',
    12687: 'Marzahn-Hellersdorf,Marzahn',
    12689: 'Lichtenberg,Falkenberg',
    13051: 'Lichtenberg,Alt-Hohenschönhausen',
    13053: 'Lichtenberg,Alt-Hohenschönhausen',
    13055: 'Lichtenberg,Alt-Hohenschönhausen',
    13057: 'Lichtenberg,Falkenberg',
    13059: 'Lichtenberg,Neu-Hohenschönhausen',
    13086: 'Pankow,Heinersdorf',
    13088: 'Pankow,Stadtrandsiedlung Malchow',
    13089: 'Pankow,Heinersdorf',
    13125: 'Pankow,Buch',
    13127: 'Pankow,Französisch Buchholz',
    13129: 'Pankow,Blankenburg',
    13156: 'Pankow,Niederschönhausen',
    13158: 'Pankow,Blankenburg',
    13159: 'Pankow,Blankenfelde',
    13187: 'Pankow,Niederschönhausen',
    13189: 'Pankow,Pankow',
    13347: 'Mitte,Gesundbrunnen',
    13349: 'Mitte,Wedding',
    13351: 'Mitte,Wedding',
    13353: 'Mitte,Gesundbrunnen',
    13355: 'Mitte,Gesundbrunnen',
    13357: 'Mitte,Gesundbrunnen',
    13359: 'Mitte,Gesundbrunnen',
    13403: 'Reinickendorf,Borsigwalde',
    13405: 'Reinickendorf,Reinickendorf',
    13407: 'Reinickendorf,Reinickendorf',
    13409: 'Mitte,Gesundbrunnen',
    13435: 'Reinickendorf,Märkisches Viertel',
    13437: 'Reinickendorf,Reinickendorf',
    13439: 'Reinickendorf,Lübars',
    13465: 'Reinickendorf,Frohnau',
    13467: 'Reinickendorf,Hermsdorf',
    13469: 'Reinickendorf,Lübars',
    13503: 'Reinickendorf,Heiligensee',
    13505: 'Reinickendorf,Heiligensee',
    13507: 'Reinickendorf,Reinickendorf',
    13509: 'Reinickendorf,Borsigwalde',
    13581: 'Spandau,Spandau',
    13583: 'Spandau,Falkenhagener Feld',
    13585: 'Spandau,Falkenhagener Feld',
    13587: 'Spandau,Falkenhagener Feld',
    13589: 'Spandau,Falkenhagener Feld',
    13591: 'Spandau,Falkenhagener Feld',
    13593: 'Spandau,Spandau',
    13595: 'Spandau,Spandau',
    13597: 'Spandau,Spandau',
    13599: 'Spandau,Hakenfelde',
    13627: 'Charlottenburg-Wilmersdorf,Charlottenburg',
    13629: 'Charlottenburg-Wilmersdorf,Charlottenburg-Nord',
    14050: 'Spandau,Siemensstadt',
    14052: 'Charlottenburg-Wilmersdorf,Westend',
    14053: 'Charlottenburg-Wilmersdorf,Westend',
    14055: 'Charlottenburg-Wilmersdorf,Westend',
    14057: 'Charlottenburg-Wilmersdorf,Westend',
    14059: 'Charlottenburg-Wilmersdorf,Westend',
    14089: 'Spandau,Gatow',
    14109: 'Spandau,Kladow',
    14129: 'Steglitz-Zehlendorf,Nikolassee',
    14163: 'Steglitz-Zehlendorf,Nikolassee',
    14165: 'Steglitz-Zehlendorf,Zehlendorf',
    14167: 'Steglitz-Zehlendorf,Zehlendorf',
    14169: 'Steglitz-Zehlendorf,Dahlem',
    14193: 'Charlottenburg-Wilmersdorf,Grunewald',
    14195: 'Charlottenburg-Wilmersdorf,Grunewald',
    14197: 'Tempelhof-Schöneberg,Friedenau',
    14199: 'Charlottenburg-Wilmersdorf,Schmargendorf'

}

In [None]:
# add district and neighborhood according to zip code

df3['district'] = ''
df3['neighborhood'] = ''

df3['postal_code'] = df3['postal_code'].astype(int)

for index, row in df3.iterrows():

    if  df3.loc[index, 'postal_code'] in (berlin_zip_codes):
    
        df3.loc[index, 'district'] = berlin_zip_codes[df3.loc[index, 'postal_code']].split(",")[0]
        df3.loc[index, 'neighborhood'] = berlin_zip_codes[df3.loc[index, 'postal_code']].split(",")[1]

    else:
        df3.loc[index, 'district'] = 'unknown'
        df3.loc[index, 'neighborhood'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['district'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['neighborhood'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['postal_code'] = df3['postal_code'].astype(int)


In [116]:
df3.head()

Unnamed: 0,id,name,lat,lon,tram,bus,adress,district,neighborhood,postal_code
0,900086106,Auguste-Viktoria-A./Humboldtstr. (Berlin),52.568845,13.329852,False,True,"Ollenhauerstraße 24, 13403 Berlin, Germany",Reinickendorf,Borsigwalde,13403
1,900170515,Adersleber Weg (Berlin),52.537896,13.560318,True,True,"Adersleber Weg, 12685 Berlin, Germany",Marzahn-Hellersdorf,Marzahn,12685
2,900151501,Ahrenshooper Str. (Berlin),52.566212,13.501888,True,True,"Ahrenshooper Str., 13051 Berlin, Germany",Lichtenberg,Alt-Hohenschönhausen,13051
3,900140005,Albertinenstr. (Berlin),52.549788,13.457778,True,True,"Berlin, Albertinenstr., 13088 Berlin, Germany",Pankow,Stadtrandsiedlung Malchow,13088
4,900161517,Alfred-Kowalke-Str. (Berlin),52.505714,13.519704,True,True,"Alfred-Kowalke-Straße, 10319 Berlin, Germany",Lichtenberg,Friedrichsfelde,10319


# Create district id column

In [127]:
# dict with district id's

berlin_district_ids = {
    'Mitte': 11001001,
    'Friedrichshain-Kreuzberg': 11002002,
    'Pankow': 11003003,
    'Charlottenburg-Wilmersdorf': 11004004,
    'Spandau': 11005005,
    'Steglitz-Zehlendorf': 11006006,
    'Tempelhof-Schöneberg': 11007007,
    'Neukölln': 11008008,
    'Treptow-Köpenick': 11009009,
    'Marzahn-Hellersdorf': 11010010,
    'Lichtenberg': 11011011,
    'Reinickendorf': 11012012
}

In [None]:
# add id according to district column


df3['district_id'] = ''

for index, row in df3.iterrows():

    if  df3.loc[index, 'district'] in berlin_district_ids:
    
        df3.loc[index, 'district_id'] = berlin_district_ids[df3.loc[index, 'district']]

    else:
        df3.loc[index, 'district_id'] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['district_id'] = ''


In [None]:
# filter out zip codes outside berlin area

df4 = df3[df3['district'] != 'unknown']

In [131]:
df4

Unnamed: 0,id,name,lat,lon,tram,bus,adress,district,neighborhood,postal_code,district_id
0,900086106,Auguste-Viktoria-A./Humboldtstr. (Berlin),52.568845,13.329852,False,True,"Ollenhauerstraße 24, 13403 Berlin, Germany",Reinickendorf,Borsigwalde,13403,11012012
1,900170515,Adersleber Weg (Berlin),52.537896,13.560318,True,True,"Adersleber Weg, 12685 Berlin, Germany",Marzahn-Hellersdorf,Marzahn,12685,11010010
2,900151501,Ahrenshooper Str. (Berlin),52.566212,13.501888,True,True,"Ahrenshooper Str., 13051 Berlin, Germany",Lichtenberg,Alt-Hohenschönhausen,13051,11011011
3,900140005,Albertinenstr. (Berlin),52.549788,13.457778,True,True,"Berlin, Albertinenstr., 13088 Berlin, Germany",Pankow,Stadtrandsiedlung Malchow,13088,11003003
4,900161517,Alfred-Kowalke-Str. (Berlin),52.505714,13.519704,True,True,"Alfred-Kowalke-Straße, 10319 Berlin, Germany",Lichtenberg,Friedrichsfelde,10319,11011011
...,...,...,...,...,...,...,...,...,...,...,...
3017,900066255,Züricher Str. (Berlin),52.427103,13.290767,False,True,"Züricher Str., 12205 Berlin, Germany",Steglitz-Zehlendorf,Dahlem,12205,11006006
3018,900027456,Zweibrücker Str. (Berlin),52.546894,13.188254,False,True,"Zweibrücker Str., 13583 Berlin, Germany",Spandau,Falkenhagener Feld,13583,11005005
3019,900037152,Zweiwinkelweg (Berlin),52.532826,13.148378,False,True,"Zweiwinkelweg, Zweiwinkelweg, 13591 Berlin, Ge...",Spandau,Falkenhagener Feld,13591,11005005
3020,900185501,Zum Seeblick (Berlin),52.379173,13.644232,True,True,"Adlergestell 745, 12527 Berlin, Germany",Treptow-Köpenick,Grünau,12527,11009009


# Finalize

In [None]:
# re-organize data frame

bus_tram_stops_berlin_df = df4[['id','district_id', 'name', 'adress', 'lat', 'lon', 'neighborhood', 'district']]

In [135]:
# rename some columns

mapper = {
    'id':'stop_id',
    'district_id':'district_id',
    'name':'name',
    'adress':'address',
    'lat':'latitude',
    'lon':'longitude',
    'neighborhood':'neighborhood',
    'district':'district'
}

bus_tram_stops_berlin_df.rename(columns=mapper, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bus_tram_stops_berlin_df.rename(columns=mapper, inplace=True)


In [137]:
bus_tram_stops_berlin_df.reset_index(inplace=True)

In [140]:
# save dataframe as csv

bus_tram_stops_berlin_df.to_csv('/Users/bastianlenkers/VSCode_Stuff/Repos/layered-populate-data-pool-da/tram_bus_data_modelling/bus_trams/sources/bus_tram_stops.csv', index=False)