# Подготовка обучающей выборки

- Этап необходим для обучения модели предсказания СЭР `blocksnet.machine_learning.regression.social`. 
- Может быть пропущен, если модель уже обучена.

## 1. Получение данных о муниципальных образованиях

Для получения данных используется обертка `prostor.fetchers`, реализующая интерфейс над эндпоинтами `UrbanAPI`.

In [1]:
from prostor import fetchers
from blocksnet.config import log_config
import pandas as pd
import geopandas as gpd
from tqdm import tqdm

log_config.set_logger_level('ERROR')

### 1.1. Получение данных о регионах платформы

### 1.2. Вспомогательный код

Получение регионов. Будет использоваться внутри некоторых функций.

In [None]:
regions_df = fetchers.territories.get_regions()
regions_df.head()

Unnamed: 0_level_0,territory_type,parent,name,level,properties,admin_center,target_city_type,okato_code,oktmo_code,is_city,created_at,updated_at
territory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"{'id': 1, 'name': 'Субъект Федерации'}","{'id': 12639, 'name': 'Россия'}",Ленинградская область,2,"{'Малые города': 15, 'Крупные города': 0, 'Вуз...","{'id': 2082, 'name': 'город Гатчина'}",,41000000000,,False,2024-06-16T21:35:40.801621Z,2024-06-16T21:35:40.801621Z
3138,"{'id': 17, 'name': 'Город федерального значения'}","{'id': 12639, 'name': 'Россия'}",Санкт-Петербург,2,"{'Малые города': 10, 'Крупные города': 0, 'Чис...","{'id': 192713, 'name': 'город Санкт-Петербург'}","{'id': 15, 'name': 'Ядро агломерации', 'descri...",40000000000,,False,2024-06-16T21:35:40.801621Z,2025-05-07T12:13:57.865728Z
3268,"{'id': 17, 'name': 'Город федерального значения'}","{'id': 12639, 'name': 'Россия'}",Москва,2,"{'Малые города': 16, 'Крупные города': 1, 'Чис...","{'id': 192712, 'name': 'город Москва'}","{'id': 15, 'name': 'Ядро агломерации', 'descri...",45000000000,,False,2024-06-16T21:35:40.801621Z,2025-05-07T12:12:56.457146Z
3427,"{'id': 1, 'name': 'Субъект Федерации'}","{'id': 12639, 'name': 'Россия'}",Волгоградская область,2,"{'Малые города': 32, 'Крупные города': 1, 'Вуз...","{'id': 46184, 'name': 'город Волгоград'}",,18000000000,,False,2024-06-16T21:35:40.801621Z,2025-05-06T10:59:35.368223Z
3902,"{'id': 1, 'name': 'Субъект Федерации'}","{'id': 12639, 'name': 'Россия'}",Тульская область,2,"{'Малые города': 20, 'Крупные города': 1, 'Вуз...","{'id': 121476, 'name': 'город Тула'}",,70000000000,,False,2024-06-16T21:35:40.801621Z,2025-05-06T11:01:34.290377Z


Получение МО выбранного субъекта.

In [None]:
def get_region_mos(region_id : int) -> pd.DataFrame | None:
    territory_type_id = regions_df.loc[region_id, 'territory_type']['id']
    territories = fetchers.territories.get_territories(region_id)
    if territory_type_id == 17: # federal city type
        return territories[territories['territory_type'].apply(lambda tt : tt['id']) == 2] # territory type is MO
    if territory_type_id == 1: # subject type
        cities = territories[territories.is_city]
        mos_ids = cities.parent.apply(lambda p : p['id']).unique()
        return territories.loc[mos_ids]
    return None

Получение слоя функциональных зон выбранной территории:

- Получение источника функциональных зон. PZZ в данном случае приоритетнее, чем OSM, при этом выбирается наиболее свежий год.
- Получение слоя функциональных зон.
- Адаптация слоя в соответствие с требованиями `blocksnet`.

In [None]:
from prostor.adapters import adapt_functional_zones

SOURCES_PRIORITY = ['PZZ', 'OSM']

def _get_functional_zones_source(territory_id : int) -> tuple[int | None, str | None]:
    sources_df = fetchers.territories.get_functional_zones_sources(territory_id)
    sources = sources_df['source'].unique()
    for source in SOURCES_PRIORITY:
        if source in sources:
            sources_df = sources_df[sources_df['source'] == source]
            year = sources_df.year.max()
            return int(year), source
    return None, None

def get_functional_zones(territory_id : int) -> gpd.GeoDataFrame:
    year, source = _get_functional_zones_source(territory_id)
    functional_zones = fetchers.territories.get_functional_zones(territory_id, year, source)
    return adapt_functional_zones(functional_zones)

Получения слоя зданий выбранной территории:

- Получение слоя зданий.
- Адаптация слоя в соответствие с требованиями `blocksnet`.
- Импутация недостающих значений.

In [None]:
from prostor.adapters import adapt_buildings
from blocksnet.preprocessing.imputing import impute_buildings

def get_buildings(territory_id : int):
    buildings = fetchers.territories.get_physical_objects(territory_id, physical_object_function_id=1)
    buildings = adapt_buildings(buildings)
    buildings['number_of_floors'] = pd.to_numeric(buildings['number_of_floors'], errors='coerce')
    
    duplicated = buildings.index.duplicated()
    buildings = buildings.loc[~duplicated].copy()

    old_crs = buildings.crs
    new_crs = buildings.estimate_utm_crs()
    return impute_buildings(buildings.to_crs(new_crs)).to_crs(old_crs)

### 1.3. Получение признаков и таргетов 

Получение муниципальных образований.

In [6]:
from multiprocessing import Pool

mos_dfs = []

def _wrapper(region_id : int):
    return get_region_mos(region_id)

with Pool(processes=16) as pool:
    mos_dfs = list(tqdm(pool.imap_unordered(_wrapper, regions_df.index), total=len(regions_df.index)))

mos_df = pd.concat(mos_dfs).sort_index()

100%|██████████| 89/89 [00:20<00:00,  4.35it/s]


Получение данных о площади и населении (признаки)

In [93]:
import asyncio

In [None]:
POPULATION_INDICATOR_ID = 1
AREA_INDICATOR_ID = 4
METERS_IN_KILOMETER = 1_000

def _get_general_indicators_values(territory_id : int) -> dict:
    values_df = fetchers.territories.get_indicators_values(territory_id, [POPULATION_INDICATOR_ID, AREA_INDICATOR_ID])
    values_df['indicator_id'] = values_df['indicator'].apply(lambda i : i['indicator_id'])
    values_df = values_df.set_index('indicator_id')
    return {
        'population': int(values_df.loc[POPULATION_INDICATOR_ID, 'value']),
        'area': float(values_df.loc[AREA_INDICATOR_ID, 'value']) * METERS_IN_KILOMETER
    }

async def get_general_indicators_values(territory_id : int) -> dict:
    return await asyncio.to_thread(_get_general_indicators_values, territory_id)

Получение социальных индикаторов (таргеты)

In [None]:
social_indicators_df = fetchers.territories.get_indicators(308)

def _get_social_indicators_values(territory_id : int) -> dict:
    social_indicators_ids = list(social_indicators_df.index)
    values_df = fetchers.territories.get_indicators_values(territory_id, social_indicators_ids)
    values_df['indicator_name'] = values_df['indicator'].apply(lambda i : i['name_full'])
    return values_df.set_index('indicator_name')['value'].to_dict()

async def get_social_indicators_values(territory_id : int) -> dict:
    return await asyncio.to_thread(_get_social_indicators_values, territory_id)

Получение данных о зданиях (признаки)

In [96]:
def _get_buildings_indicators_values(territory_id : int) -> dict:
    buildings = get_buildings(territory_id)
    result = buildings.drop(columns=['geometry', 'number_of_floors', 'population']).sum().to_dict()
    return {
        **result,
        'buildings_count': len(buildings)
    }

async def get_buildings_indicators_values(territory_id : int) -> dict:
    return await asyncio.to_thread(_get_buildings_indicators_values, territory_id)

Получение данных о функциональных зонах (признаки)

In [None]:
from blocksnet.enums import LandUse

async def get_land_use_indicators_values(territory_id : int) -> dict:
    functional_zones, territory_geometry = await asyncio.gather(
        asyncio.to_thread(get_functional_zones, territory_id),
        asyncio.to_thread(fetchers.territories.get_territory_geometry, territory_id),
    )
    crs = functional_zones.estimate_utm_crs()
    functional_zones = functional_zones.to_crs(crs)

    territory_gdf= gpd.GeoDataFrame(geometry=[territory_geometry], crs=4326).to_crs(crs)
    functional_zones.geometry = functional_zones.make_valid()
    functional_zones = functional_zones.clip(territory_gdf)

    
    functional_zones = functional_zones[~functional_zones['functional_zone'].isna()].copy()
    functional_zones['functional_zone'] = functional_zones['functional_zone'].apply(lambda lu : lu.value)
    functional_zones['site_area'] = functional_zones.area
    
    result = functional_zones.groupby('functional_zone').agg({'site_area':'sum'})['site_area'].to_dict()
    for lu in LandUse:
        if lu.value not in result:
            result[lu.value] = 0.0
    return result

## 1.4. Получение данных для обучения по всем муниципальным образованиям

In [102]:
semaphore = asyncio.Semaphore(32)

async def get_data(territory_id : int) -> dict:
    async with semaphore:
        try:
            general_indicators, buildings_indicators, land_use_indicators, social_indicators = await asyncio.gather(
                # features
                get_general_indicators_values(territory_id),
                get_buildings_indicators_values(territory_id),
                get_land_use_indicators_values(territory_id),
                # targets
                get_social_indicators_values(territory_id)
            )
            return territory_id, {
                **general_indicators,
                **buildings_indicators,
                **land_use_indicators,
                **social_indicators
            }
        except:
            return territory_id, None

In [103]:
from tqdm.asyncio import tqdm as tqdm_asyncio

async def main(mo_ids : list[int] = []):
    if len(mo_ids) == 0:
        mo_ids = mos_df.index.tolist()
    tasks = [get_data(mo_id) for mo_id in mo_ids]
    results = await tqdm_asyncio.gather(*tasks, desc="Loading data")
    return {r[0]: r[1] for r in results}

#### 1.4.1 First try

In [None]:
mos_data = await main()

In [54]:
df = pd.DataFrame.from_dict({key:data for key,data in mos_data.items() if data is not None}, orient='index')
df.to_pickle('data.pickle')

#### 1.4.2 Second try

In [104]:
mos_ids = [key for key,data in mos_data.items() if data is None]
mos_data_1 = await main(mos_ids)

Loading data: 100%|██████████| 12089/12089 [2:55:04<00:00,  1.15it/s]  


In [105]:
df_1 = pd.DataFrame.from_dict({key:data for key,data in mos_data_1.items() if data is not None}, orient='index')
df_1.to_pickle('data_1.pickle')

In [106]:
df_1

Unnamed: 0,population,area,is_living,footprint_area,build_floor_area,living_area,non_living_area,buildings_count,agriculture,business,...,Количество хостелов,Количество туристических баз,Количество общеобразовательных учреждений,Количество плоскостных спортивных сооружений,Количество больничных учреждений (стационаров),Количество амбулаторно-поликлинических учреждений,Количество станций скорой медицинской помощи,Количество объектов общественного питания,Количество аптек,Количество лечебно-профилактических медицинских учреждений
65,9718,387000.0,279.0,768579.209704,877792.340407,166623.380255,711962.960152,3255,2.122423e+07,36095.399005,...,0.0,2.0,1.0,9.0,0.0,1.0,0.0,3.0,1.0,0.0
3285,108657,9000.0,12.0,254813.829746,254813.829746,38296.177317,216517.652429,264,2.188738e+04,355647.139730,...,0.0,0.0,17.0,73.0,3.0,0.0,0.0,0.0,0.0,0.0
3305,113015,10000.0,67.0,461484.676817,461484.676817,58130.367428,403354.309389,269,0.000000e+00,200134.032100,...,0.0,0.0,10.0,52.0,1.0,0.0,0.0,7.0,0.0,0.0
3306,112246,8000.0,9.0,70187.852011,70187.852011,11600.158619,58587.693392,153,0.000000e+00,82703.031778,...,0.0,0.0,20.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0
3316,64132,8000.0,25.0,109786.527935,109786.527935,29420.148564,80366.379371,157,2.430311e+02,37122.966967,...,0.0,0.0,5.0,27.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143404,265642,134000.0,0.0,0.000000,0.000000,0.000000,0.000000,888,0.000000e+00,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144069,134629,170000.0,0.0,0.000000,0.000000,0.000000,0.000000,415,1.891814e+06,43078.895796,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150232,368,55984000.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000e+00,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150233,256,35757000.0,0.0,0.000000,0.000000,0.000000,0.000000,1,0.000000e+00,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
