In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import json

from zipfile import ZipFile, Path
from urllib.request import urlopen
from io import BytesIO

# Download the street-name archive from interactive.zeit.de and open it
# straight from memory. ZipFile needs a seekable file object, so the whole
# response body is read into a BytesIO buffer first.
# The timeout makes a stalled connection raise an error instead of blocking
# the notebook indefinitely.
with urlopen(
    "https://interactive.zeit.de/2017/strassennamen/datensatz_deutsche_strassennamen.zip",
    timeout=60,
) as f_zip:
    # metadata_encoding asks zipfile to decode member names as UTF-8; the
    # archive is later accessed through a non-ASCII directory name.
    f = ZipFile(BytesIO(f_zip.read()), metadata_encoding="utf-8")

In [3]:
# NOTE: `Path` here is zipfile.Path (imported above), not pathlib.Path.
streets_member = Path(f) / "Straßennamen" / "streets.geojson"

# The member is decoded as Latin-1 at this point. Latin-1 maps every byte to
# a character, so decoding cannot fail; the property strings are re-decoded
# as UTF-8 by `__encode` further down.
# The `with` block closes the member's file handle once parsing is done.
with streets_member.open(encoding="latin-1") as fp_streets:
    geojson = json.load(fp_streets)

print(len(geojson["features"]))

1264915


In [6]:
def __encode(x):
    """Repair a UTF-8 string that was decoded as Latin-1, then trim whitespace.

    Despite its name, this helper *decodes*: the text is turned back into
    bytes via Latin-1 and those bytes are then decoded as UTF-8.

    Args:
        x: A string that was produced by decoding UTF-8 bytes as Latin-1.

    Returns:
        The re-decoded string with leading and trailing whitespace removed.

    Raises:
        UnicodeEncodeError: If ``x`` contains characters outside Latin-1.
        UnicodeDecodeError: If the recovered bytes are not valid UTF-8.
    """
    original_bytes = bytes(x, "latin-1")
    return original_bytes.decode("utf-8").strip()


# Column-oriented accumulator: one list per output column.
df_dict = {"street_name": [], "municipality": [], "postcode": []}

for feature in geojson["features"]:
    props = feature["properties"]
    raw_name, raw_municipality, raw_postcodes = props["NAME"], props["GEMEINDE"], props["PLZ"]

    # A feature is only usable when all three properties are present.
    if None in (raw_name, raw_municipality, raw_postcodes):
        continue

    # PLZ is split on commas; one output row is emitted per resulting part.
    postcode_parts = raw_postcodes.split(",")
    street_name = __encode(raw_name)
    municipality = __encode(raw_municipality)
    postcodes = [__encode(part) for part in postcode_parts]

    # Repeat name and municipality once for every postcode of this feature.
    df_dict["street_name"].extend([street_name] * len(postcodes))
    df_dict["municipality"].extend([municipality] * len(postcodes))
    df_dict["postcode"].extend(postcodes)

df = pd.DataFrame(df_dict, dtype=str)
df

Unnamed: 0,street_name,municipality,postcode
0,1,Konstanz,78465
1,10,Dresden,01099
2,10,Dresden,01109
3,10 AS Hörstel,Hörstel,48477
4,10 AS Richtung Amsterdam,Hörstel,48477
...,...,...,...
1324270,Zyriakusweg,Deidesheim,67146
1324271,Zywiecstraße,Unterhaching,82008
1324272,zZ nicht nutzbar,Berlin,10551
1324273,z.Z. schecht begehbar,Püttlingen,66346


In [7]:
# Discard every row whose street name contains a digit
# (entries such as "10 AS Hörstel" in the frame above).
has_digit = df["street_name"].str.contains(r"\d")
df_streets = df.loc[~has_digit]
df_streets

Unnamed: 0,street_name,municipality,postcode
152,Aabauerschaft,Steinfurt,48493
153,Aabauerschaft,Steinfurt,48565
154,Aabauerschaft,Steinfurt,48493
155,Aabauerschaft,Steinfurt,48565
156,Aachbachstraße,Stockach,78333
...,...,...,...
1324270,Zyriakusweg,Deidesheim,67146
1324271,Zywiecstraße,Unterhaching,82008
1324272,zZ nicht nutzbar,Berlin,10551
1324273,z.Z. schecht begehbar,Püttlingen,66346


In [8]:
# Count, for every street name, how many distinct municipalities it occurs in.
# Repeated (street_name, municipality) pairs are collapsed first so that each
# municipality contributes at most once per street name.
df_name_per_municipality = df_streets.drop_duplicates(subset=["street_name", "municipality"])
srs_street_name_freq = df_name_per_municipality.value_counts(subset="street_name")
srs_street_name_freq

street_name
Hauptstraße           5928
Schulstraße           4785
Gartenstraße          4494
Bahnhofstraße         4211
Dorfstraße            3904
                      ... 
Handwerkerpfad           1
Handwerkerallee          1
Handwerker-Passage       1
Handwercherstraße        1
Žitavská                 1
Name: count, Length: 446917, dtype: int64

In [9]:
# Keep only the street names whose frequency is at or above the 99th
# percentile of all frequencies (for this dataset that drops street names
# with fewer than 23 occurrences).
freq_threshold_q99 = srs_street_name_freq.quantile(q=0.99)
srs_street_name_freq_q99 = srs_street_name_freq.loc[srs_street_name_freq >= freq_threshold_q99]
srs_street_name_freq_q99

street_name
Hauptstraße      5928
Schulstraße      4785
Gartenstraße     4494
Bahnhofstraße    4211
Dorfstraße       3904
                 ... 
Hopfenberg         23
Am Berghang        23
Brunnenberg        23
Brennerstraße      23
Stiege             23
Name: count, Length: 4549, dtype: int64

In [10]:
# Restrict the digit-free rows to the frequent street names selected above.
frequent_names = srs_street_name_freq_q99.index
is_frequent = df_streets["street_name"].isin(frequent_names)
df_streets_frequent = df_streets[is_frequent]
df_streets_frequent

Unnamed: 0,street_name,municipality,postcode
160,Aachener Straße,Aachen,52074
161,Aachener Straße,Bad Soden-Salmünster,63628
162,Aachener Straße,Blankenheim,53945
163,Aachener Straße,Bornheim,53332
164,Aachener Straße,Bremen,28327
...,...,...,...
1324264,Zypressenweg,Schenefeld,22869
1324265,Zypressenweg,Schmelz,66839
1324266,Zypressenweg,St. Leon-Rot,68789
1324267,Zypressenweg,Stockstadt am Main,63811


In [17]:
# Build a reproducible random row mask: every row of df_streets_frequent is
# selected independently with probability 0.05, so roughly (not exactly) 5%
# of the rows end up selected. The fixed seed keeps the sample stable
# across runs.
import numpy as np

rng = np.random.default_rng(seed=1337)
arr_uniform = rng.random(size=len(df_streets_frequent))
arr_rand_mask = arr_uniform < 0.05

In [18]:
# Row counts after each narrowing step: digit-free rows, rows with frequent
# street names, and the random sample of those.
n_streets = len(df_streets)
n_frequent = len(df_streets_frequent)
n_sampled = len(df_streets_frequent[arr_rand_mask])
print(n_streets, ">", n_frequent, ">", n_sampled)

1316954 > 545204 > 27163


In [19]:
# Export the random sample. DataFrame.value_counts() collapses identical
# (street_name, municipality, postcode) rows into one entry each and adds
# their number of occurrences, so the CSV carries a count column next to the
# three key columns.
# NOTE(review): if only the unique rows are wanted, drop_duplicates() would
# express that more directly -- confirm whether the count column is used.
df_streets_sample = df_streets_frequent[arr_rand_mask]
srs_sample_counts = df_streets_sample.value_counts()
srs_sample_counts.to_csv("street-municipality-postcode.csv")