In [1]:
import warnings

from pytorch_lightning import seed_everything
from srai.embedders import Hex2VecEmbedder
from srai.joiners import IntersectionJoiner
from srai.loaders import OSMOnlineLoader
from srai.neighbourhoods import H3Neighbourhood
# from srai.plotting import plot_regions, plot_numeric_data
from srai.regionalizers import H3Regionalizer, geocode_to_region_gdf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single seed for the whole notebook; it is also what the (currently
# commented-out) KMeans cell at the end uses as its random_state.
SEED = 71
# seed_everything is the last expression of the cell, so its return value
# (the applied seed) is echoed as the cell output.
seed_everything(SEED)

Seed set to 71


71

## Load data from OSM

First, use geocoding to get the boundary of the area of interest (here, the city of Wrocław).

In [3]:
# Free-text place name handed to the geocoder. The resulting `area_gdf` is the
# area of interest consumed by the loader and regionalizer cells further down.
AREA_NAME = "Wrocław, Poland"
area_gdf = geocode_to_region_gdf(AREA_NAME)
# plot_regions(area_gdf, tiles_style="CartoDB positron")

Next, download the data for the selected region and the specified tags. We're using `OSMOnlineLoader` here, as it's faster for a small number of tags. In a real-life scenario with more tags, you would likely want to use the `OSMPbfLoader` instead.

In [4]:
# OSM tag filter handed to `loader.load(...)` at the end of this cell.
# Each key is an OSM tag key; each value is either a single tag value or a
# list of accepted values (7 key/value pairs in total for this small demo set).
tags = {
    "leisure": "park",
    "landuse": "forest",
    "amenity": ["bar", "restaurant", "cafe"],
    "water": "river",
    "sport": "soccer",
}

# {
#     "sport" :  [
#         "shooting", 
#         "cycling", 
#         "boxing", 
#         "horse", 
#         "table", 
#         "pelota", 
#         "athletics", 
#         "yoga", 
#         "gymnastics", 
#         "soccer", 
#         "boules", 
#         "surfing", 
#         "crossfit", 
#         "swimming", 
#         "basketball", 
#         "motocross", 
#         "judo", 
#         "climbing", 
#         "rowing", 
#         "canoe", 
#         "running", 
#         "volleyball", 
#         "multi", 
#         "badminton", 
#         "equestrian", 
#         "motor", 
#         "bmx", 
#         "fitness", 
#         "tennis", 
#         "skateboard", 
#         "karate", 
#         "beachvolleybal",
#     ],
#     "amenity" :  [
#         "cinema", 
#         "studio", 
#         "pub", 
#         "college", 
#         "planetarium", 
#         "language", 
#         "bbq", 
#         "nightclub", 
#         "arts", 
#         "music", 
#         "social", 
#         "theatre", 
#         "cafe", 
#         "food", 
#         "library", 
#         "casino", 
#         "kindergarten", 
#         "community", 
#         "fountain", 
#         "brothel", 
#         "restaurant", 
#         "university", 
#         "fast", 
#         "bar", 
#         "school", 
#         "ice"
#     ],
#     "office" :  [
#         "financial", 
#         "water", 
#         "company", 
#         "foundation", 
#         "lawyer", 
#         "quango", 
#         "government", 
#         "estate", 
#         "architect", 
#         "coworking", 
#         "accountant", 
#         "notary", 
#         "diplomatic", 
#         "telecommunication", 
#         "newspaper", 
#         "research", 
#         "ngo", 
#         "engineer", 
#         "advertising", 
#         "logistics", 
#         "it", 
#         "insurance", 
#         "yes", 
#         "association",
#     ],
#     "shop" :  [
#         "kiosk", 
#         "deli", 
#         "carpet", 
#         "lottery", 
#         "craft", 
#         "butcher", 
#         "military", 
#         "electronics", 
#         "fashion", 
#         "water", 
#         "hifi", 
#         "tailor", 
#         "shoes", 
#         "religion", 
#         "trade", 
#         "locksmith", 
#         "hardware", 
#         "farm", 
#         "convenience", 
#         "ticket", 
#         "vacant", 
#         "appliance", 
#         "frame", 
#         "chocolate", 
#         "lighting", 
#         "money", 
#         "copyshop", 
#         "confectionery", 
#         "jewelry", 
#         "dry", 
#         "garden", 
#         "car", 
#         "electrical", 
#         "ice", 
#         "anime", 
#         "nutrition", 
#         "houseware", 
#         "bag", 
#         "music", 
#         "books", 
#         "seafood", 
#         "agrarian", 
#         "chemist", 
#         "doityourself", 
#         "motorcycle", 
#         "perfumery", 
#         "fabric", 
#         "funeral", 
#         "baby", 
#         "pawnbroker", 
#         "stationery", 
#         "furniture", 
#         "radiotechnics", 
#         "pastry", 
#         "travel", 
#         "bicycle", 
#         "hearing", 
#         "tiles", 
#         "interior", 
#         "supermarket", 
#         "mall", 
#         "health", 
#         "pet", 
#         "second", 
#         "musical", 
#         "art", 
#         "tyres", 
#         "alcohol", 
#         "optician", 
#         "gas", 
#         "erotic", 
#         "cosmetics", 
#         "tobacco", 
#         "fishing", 
#         "medical", 
#         "beverages", 
#         "kitchen", 
#         "bed", 
#         "mobile", 
#         "coffee", 
#         "newsagent", 
#         "general", 
#         "photo", 
#         "florist", 
#         "sewing", 
#         "laundry", 
#         "outdoor", 
#         "department", 
#         "wholesale", 
#         "bakery", 
#         "variety", 
#         "glaziery", 
#         "toys", 
#         "gift", 
#         "sports", 
#         "beauty", 
#         "video", 
#         "herbalist", 
#         "party", 
#         "clothes", 
#         "hairdresser", 
#         "tea", 
#         "computer", 
#         "paint", 
#         "storage"
#     ],
#     "leisure" :  [
#         "fishing",
#         "track",
#         "water",
#         "playground",
#         "marina",
#         "horse",
#         "picnic",
#         "adult",
#         "stadium",
#         "dog",
#         "pitch",
#         "fitness",
#         "golf",
#         "park",
#         "summer",
#         "garden",
#         "dance",
#     ],
#     "aeroway" :  ["aerodrome", "helipad"],
#     "tourism" :  [
#         "artwork",
#         "picnic",
#         "motel",
#         "hotel",
#         "museum",
#         "zoo",
#         "attraction",
#         "information",
#         "viewpoint",
#         "guest",
#         "chalet", 
#         "camp", 
#         "hostel",
#         "apartment", 
#         "gallery", 
#         "theme",
#     ] 
# }

# Download the OSM features matching `tags` within `area_gdf`.
# The progress bar printed by this cell reports one step per tag key/value pair.
loader = OSMOnlineLoader()

features_gdf = loader.load(area_gdf, tags)

# Optional visual check (needs the commented-out `plot_regions` import in the first cell):
# folium_map = plot_regions(area_gdf, colormap=["rgba(0,0,0,0)"], tiles_style="CartoDB positron")
# features_gdf.explore(m=folium_map)

Downloading leisure: park      :   0%|          | 0/7 [00:00<?, ?it/s]

Downloading sport: soccer      : 100%|██████████| 7/7 [00:50<00:00,  7.16s/it]


## Prepare the data for embedding

After downloading the data, we need to prepare it for embedding. Namely, we need to regionalize the selected area (split it into H3 cells) and join the features with the resulting regions.

In [5]:
# H3 resolution used to split the area into regions; named so it is easy to tune.
H3_RESOLUTION = 9

regionalizer = H3Regionalizer(resolution=H3_RESOLUTION)
regions_gdf = regionalizer.transform(area_gdf)
# plot_regions(regions_gdf, tiles_style="CartoDB positron")

In [6]:
# Join the regions with the downloaded features. The result is passed to the
# embedder together with `regions_gdf` and `features_gdf`.
# NOTE(review): presumably pairs each region with the features that intersect
# it -- confirm against the IntersectionJoiner documentation.
joiner = IntersectionJoiner()
joint_gdf = joiner.transform(regions_gdf, features_gdf)
# joint_gdf

## Embedding

After preparing the data, we can proceed with generating embeddings for the regions.

In [7]:
# Hyperparameters of this embedding run, named so they are easy to find and tune.
# NOTE(review): ENCODER_SIZES is passed positionally to Hex2VecEmbedder and looks
# like the encoder layer widths -- confirm against the srai documentation.
ENCODER_SIZES = [15, 10]
MAX_EPOCHS = 5
BATCH_SIZE = 100

neighbourhood = H3Neighbourhood(regions_gdf)
embedder = Hex2VecEmbedder(ENCODER_SIZES)

# Every warning raised while fitting is silenced to keep the cell output short.
# NOTE(review): this blanket filter also hides warnings that may matter; narrow
# it to specific categories if training starts misbehaving.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embeddings = embedder.fit_transform(
        regions_gdf,
        features_gdf,
        joint_gdf,
        neighbourhood,
        # Training is pinned to the CPU and capped at MAX_EPOCHS epochs.
        trainer_kwargs={"max_epochs": MAX_EPOCHS, "accelerator": "cpu"},
        batch_size=BATCH_SIZE,
    )
# embeddings

100%|██████████| 3168/3168 [00:00<00:00, 10135.46it/s]
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /home/victor/Documentos/HAVANA-2.0/notebooks/lightning_logs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 280   
---------------------------------------
280       Trainable params
0         Non-trainable params
280       Total params
0.001     Total estimated model params size (MB)


Epoch 4: 100%|██████████| 185/185 [00:04<00:00, 43.29it/s, v_num=0]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 185/185 [00:04<00:00, 43.15it/s, v_num=0]


### Visualizing the embeddings' similarity

In [None]:
# NOTE: this clustering step is currently disabled. Uncommenting it adds a
# "cluster" column to `embeddings`, which the plotting cell below relies on.
# from sklearn.cluster import KMeans

# clusterizer = KMeans(n_clusters=5, random_state=SEED)
# clusterizer.fit(embeddings)

# embeddings["cluster"] = clusterizer.labels_
# embeddings

In [None]:
# NOTE: disabled. Needs `plot_numeric_data` (its import in the first cell is
# commented out) and the "cluster" column created by the KMeans cell above.
# plot_numeric_data(regions_gdf, "cluster", embeddings)