In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated input file into a DataFrame.
df = pd.read_csv("sweden.tsv", sep="\t")

### Cleaning data
Remove unnecessary columns, and add street addresses to locations that have no address but do have a latitude and longitude.

In [3]:
# Remove the columns listed below; every other column is kept for the later cells.
df = df.drop(
    columns=[
        "alternative_names",
        "osm_type",
        "osm_id",
        "class",
        "type",
        "place_rank",
        "importance",
        "country",
        "country_code",
        "wikidata",
        "wikipedia",
        "east",
        "south",
        "west",
        "north",
    ]
)

In [5]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [6]:
# Nominatim geocoder client; "geocode" is the user agent string it is configured with.
geolocator = Nominatim(user_agent="geocode")

### Adding missing street address to locations
The street name is obtained by reverse geocoding each location's latitude and longitude.

In [7]:
# Rows that lack a street but have both coordinates, i.e. the ones that can be reverse geocoded.
df_missing_street = df.loc[
    df["street"].isna() & df[["lat", "lon"]].notna().all(axis="columns")
]

In [8]:
# Rate-limited wrapper around geolocator.reverse, created on first use so that
# every lookup shares one limiter (and therefore one request clock).
_RATE_LIMITED_REVERSE = None


def get_street(lat, lon, reverse_fn=None):
    """Return the road name for a coordinate via reverse geocoding.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location to look up.
    reverse_fn : callable, optional
        Function called as ``reverse_fn((lat, lon), language='en')`` that
        returns an object with a ``raw`` dict (or ``None``). When omitted, a
        rate-limited ``geolocator.reverse`` is used.

    Returns
    -------
    str or None
        The ``"road"`` entry of the result's address, or ``None`` when the
        lookup fails, finds nothing, or the address has no road.
    """
    global _RATE_LIMITED_REVERSE
    if reverse_fn is None:
        if _RATE_LIMITED_REVERSE is None:
            # Nominatim's usage policy allows at most one request per second.
            _RATE_LIMITED_REVERSE = RateLimiter(
                geolocator.reverse, min_delay_seconds=1
            )
        reverse_fn = _RATE_LIMITED_REVERSE

    try:
        location = reverse_fn((lat, lon), language='en')
    except Exception:
        # Deliberately best-effort: one failed lookup must not abort the whole
        # batch. Unlike a bare ``except``, this lets KeyboardInterrupt through.
        return None

    if location is None:
        return None
    address = location.raw.get("address")
    if not address:
        return None
    return address.get("road")

In [9]:
# Fill in the street for each row of df_missing_street, calling get_street once per row.
for idx, lat, lon in zip(
    df_missing_street.index, df_missing_street["lat"], df_missing_street["lon"]
):
    df.at[idx, "street"] = get_street(lat, lon)

Drop every row that still has no street, so that no location with a missing street remains.

In [11]:
# Keep only the rows whose street value is present.
df_nonan_street = df[df["street"].notna()]

In [12]:
# Write the street-complete rows to a tab-separated file, leaving out the index column.
df_nonan_street.to_csv("sweden_with_street.tsv", sep="\t", index=False)

In [None]:
# Load the tab-separated file of rows that have a street.
df = pd.read_csv("sweden_with_street.tsv", sep="\t")

In [None]:
from geopy.distance import geodesic

Define the cities our attendant groups will be active in. Each city's location is its most central spot.

In [None]:
# stad: (latitud, longitud)
cities = {
    "Stockholm": (59.3251172, 18.0710935),
    "Göteborg": (57.7072326, 11.9670171),
    "Malmö": (55.6052931, 13.0001566),
    "Uppsala": (59.8586126, 17.6387436),
    "Linköping": (58.4098135, 15.6245252),
    "Västerås": (59.6110992, 16.5463679),
    "Örebro": (59.2747287, 15.2151181),
    "Helsingborg": (56.0442098, 12.703706),
    "Jönköping": (57.7825634, 14.165719),
    "Norrköping": (58.5909124, 16.1903511)
}

A location is assigned to its nearest city if it lies within 30 km of that city's central spot; otherwise it is assigned to no city.

In [None]:
# Cutoff in kilometres: find_nearest_city compares geodesic distances (in km) against it.
max_distance = 30

In [None]:
def find_nearest_city(lat, lon, cities, max_km=None):
    """Return the closest city to a coordinate, if it is within range.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the location to classify.
    cities : dict
        Maps a city name to its ``(latitude, longitude)`` reference point.
    max_km : float, optional
        Largest accepted geodesic distance in kilometres (inclusive). When
        omitted, the notebook-level ``max_distance`` is used.

    Returns
    -------
    pandas.Series
        ``[city_name, distance_km]`` with the distance rounded to two
        decimals, or ``[None, None]`` when a coordinate is missing, no city
        is given, or the nearest city is farther away than the cutoff.
    """
    # geodesic() rejects non-finite coordinates, so a row with a missing
    # latitude or longitude is reported as "no city" rather than raising and
    # aborting the surrounding DataFrame.apply.
    if pd.isna(lat) or pd.isna(lon):
        return pd.Series([None, None])

    if max_km is None:
        max_km = max_distance

    # min() keeps the first city on ties, matching a strict "<" scan.
    nearest, min_dist = min(
        ((city, geodesic((lat, lon), coords).km) for city, coords in cities.items()),
        key=lambda pair: pair[1],
        default=(None, float('inf')),
    )

    if min_dist <= max_km:
        return pd.Series([nearest, round(min_dist, 2)])
    return pd.Series([None, None])

In [None]:
# Compute, row by row, the nearest city and its distance and store them as two new columns.
df[['nearest_city', 'distance_km']] = df.apply(
    lambda record: find_nearest_city(record['lat'], record['lon'], cities),
    axis="columns",
)

In [None]:
# Keep only the rows that were matched to a city.
nonc_df = df[df["nearest_city"].notna()]

Remove the letter suffix from each house number, if present, and convert the comma-separated string of house numbers into a list.

In [None]:
def extract_numeric_housenumber(housenumber):
    """Split a comma-separated house-number string into its numeric parts.

    A trailing letter suffix is removed from each entry, so ``"12A"`` and
    ``"12 A"`` both yield ``"12"``. Entries that are still not purely digits
    after that (for example ``"12-14"`` or an empty entry) are left out.

    Parameters
    ----------
    housenumber : str or any
        Raw value from the ``housenumbers`` column. Non-string values such
        as NaN or None produce an empty list.

    Returns
    -------
    list of str
        The numeric house numbers, in their original order.
    """
    if not isinstance(housenumber, str):
        return []

    numbers = []
    for part in housenumber.split(','):
        token = part.strip()
        # Walk back over the letter suffix (e.g. the "B" in "14B").
        end = len(token)
        while end > 0 and token[end - 1].isalpha():
            end -= 1
        # rstrip handles the space in forms like "14 B".
        digits = token[:end].rstrip()
        if digits.isdigit():
            numbers.append(digits)
    return numbers

In [None]:
# Build a new frame in which the raw house-number strings are replaced by parsed lists.
nonc_df = nonc_df.assign(
    housenumbers=nonc_df["housenumbers"].apply(extract_numeric_housenumber)
)

In [None]:
# Write the city-matched rows to a comma-separated file, leaving out the index column.
nonc_df.to_csv('addresses_with_city.csv', index=False)