In [8]:
import csv

import geopandas as gpd
import pandas as pd
from pyproj import CRS
from shapely.geometry import Point

def get_gazetteer():
    """
    Build a gazetteer of GeoNames toponyms inside the Helsinki Metropolitan Area.

    Reads the GeoNames dump ``data/FI.txt``, turns every record into a point,
    reprojects the points to ETRS89 / TM35FIN (EPSG:3067) and keeps only those
    that intersect the areas in ``data/PKS_postinumeroalueet_2020.shp``.

    Returns:
        geopandas.GeoDataFrame: result of the point/area intersection in
        EPSG:3067, with the ``name`` column converted to lower case.
    """
    column_names = [
        "geonameid", "name", "asciiname", "alternatenames", "latitude",
        "longitude", "feature class", "feature code", "country code", "cc2",
        "admin1 code", "admin2 code", "admin3 code", "admin4 code",
        "population", "elevation", "dem", "timezone", "modification date",
    ]

    # The GeoNames dump is plain tab-separated text without field quoting, so
    # quote characters inside names must be read literally; with the default
    # quoting a stray '"' can make the parser merge several records into one.
    geonames = pd.read_csv(
        r"data/FI.txt",
        sep="\t",
        header=None,
        quoting=csv.QUOTE_NONE,
    )
    geonames.columns = column_names

    # Build all point geometries in one vectorised call (x = longitude,
    # y = latitude) and declare the source CRS (WGS84) at construction time.
    points = gpd.points_from_xy(geonames["longitude"], geonames["latitude"])
    geonames = gpd.GeoDataFrame(geonames, geometry=points, crs="EPSG:4326")

    # Reproject the toponyms to ETRS89 / TM35FIN.
    geonames = geonames.to_crs(epsg=3067)

    # Areas covering the Helsinki Metropolitan Area, in the same CRS as the
    # points so that the overlay compares like with like.
    hma = gpd.read_file(r"data/PKS_postinumeroalueet_2020.shp")
    hma = hma.to_crs(epsg=3067)

    # Keep only the toponyms that fall inside the metropolitan area.
    hmanames = gpd.overlay(geonames, hma, how="intersection")

    # Both inputs are already in EPSG:3067; only declare the CRS if the overlay
    # result lacks one, instead of overriding an existing CRS.
    if hmanames.crs is None:
        hmanames.crs = "EPSG:3067"

    # Lower-case all place names for further processing.
    hmanames["name"] = hmanames["name"].str.lower()

    # NOTE: the general city names "helsinki" and "espoo" give ambiguous
    # results; they are intentionally kept here and removed in post-processing.

    print("Gazetteer ready for use")
    return hmanames


# Build the gazetteer and write it to disk as data/hmagazetteer.shp.
# NOTE(review): the shapefile format presumably truncates long column names
# (e.g. "alternatenames", "modification date") on write — confirm that
# downstream readers expect the shortened field names.
df = get_gazetteer()
df.to_file("data/hmagazetteer.shp")

Gazetteer ready for use
