# Combine Component 1 and Component 2 Fields

This notebook combines Component 1 (Mass movement susceptibility) and Component 2 (Social Vulnerability Index) measuring the overall risk of an area

## Input
- Component 1: Mass movement susceptibility
  - Landslide susceptibility
  - Flows susceptibility
- Component 2: Social Vulnerability Index

> Input files are produced in `03_susceptibility_model/` and `04_index_calculation/`, respectively.

## Process
- To calculate the overall risk, we simply need to multiply the value of susceptibility and social vulnerability index

## Output 
- Combined dataset that includes the selected features and areas with the overall risk


In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import geopandas as gpd
import pandas as pd
import polars as pl
import sys
from datetime import datetime
import subprocess
from loguru import logger



In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR

In [23]:
COMPONENT_1_DIR = DATA_DIR / "output/component_1"
TARGET_LANDSLIDE_FILE = COMPONENT_1_DIR / "landslide_susceptibility.gpkg"
TARGET_FLOWS_FILE = COMPONENT_1_DIR / "flows_susceptibility.gpkg"

GCS_ROLLOUT_DIR = "gs://immap-susceptibility-model"

COMPONENT_2_DIR = DATA_DIR / "output/component_2"
TARGET_COMPONENT_2_FILE = COMPONENT_2_DIR / "cdc_simplified_features_20240528.gpkg"

VERSION = datetime.now().strftime("%Y%m%d")
OUTPUT_DIR = DATA_DIR / "output/risk_index"
OUTPUT_FPATH = OUTPUT_DIR / f"risk_index_{VERSION}.gpkg"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
OUTPUT_GCS_BUCKET = "gs://immap-risk-index"

## Verify target files

In [24]:
for target_file in [TARGET_FLOWS_FILE, TARGET_LANDSLIDE_FILE, TARGET_COMPONENT_2_FILE]:
    if not target_file.is_file():
        logger.info(f"File {target_file} does not exist!")
    else:
        logger.info(f"File {target_file} exists!")

2024-05-21 11:54:23.428 | INFO     | __main__:<module>:9 - File /home/jc_tm/project_repos/immap-evidem/data/output/component_1/flows_susceptibility.gpkg exists!
2024-05-21 11:54:23.429 | INFO     | __main__:<module>:9 - File /home/jc_tm/project_repos/immap-evidem/data/output/component_1/landslide_susceptibility.gpkg exists!
2024-05-21 11:54:23.430 | INFO     | __main__:<module>:9 - File /home/jc_tm/project_repos/immap-evidem/data/output/component_2/cdc_simplified_features_20240426.gpkg exists!


## Load and inspect components

In [9]:
component_1_landslide_gdf = gpd.read_file(TARGET_LANDSLIDE_FILE)
component_1_landslide_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 221152 entries, 0 to 221151
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype   
---  ------          --------------   -----   
 0   quadkey         221152 non-null  object  
 1   MPIO_CCNCT      221152 non-null  object  
 2   MPIO_CNMBR      221152 non-null  object  
 3   MPIO_CNMBR_EN   221152 non-null  object  
 4   DPTO_CNMBR      221152 non-null  object  
 5   DPTO_CNMBR_EN   221152 non-null  object  
 6   Municipio       221152 non-null  object  
 7   Municipio_EN    221152 non-null  object  
 8   DPTO_CCDGO      221152 non-null  object  
 9   MPIO_CCDGO      221152 non-null  object  
 10  MPIO_CRSLC      221152 non-null  object  
 11  MPIO_NAREA      221152 non-null  float64 
 12  MPIO_NANO       221152 non-null  int64   
 13  SHAPE_AREA      221152 non-null  float64 
 14  SHAPE_LEN       221152 non-null  float64 
 15  LANDSLIDE_SUSC  221152 non-null  float64 
 16  geometry        221152 non-nul

In [10]:
component_1_flows_gdf = gpd.read_file(TARGET_FLOWS_FILE)
component_1_flows_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 221152 entries, 0 to 221151
Data columns (total 17 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   quadkey        221152 non-null  object  
 1   MPIO_CCNCT     221152 non-null  object  
 2   MPIO_CNMBR     221152 non-null  object  
 3   MPIO_CNMBR_EN  221152 non-null  object  
 4   DPTO_CNMBR     221152 non-null  object  
 5   DPTO_CNMBR_EN  221152 non-null  object  
 6   Municipio      221152 non-null  object  
 7   Municipio_EN   221152 non-null  object  
 8   DPTO_CCDGO     221152 non-null  object  
 9   MPIO_CCDGO     221152 non-null  object  
 10  MPIO_CRSLC     221152 non-null  object  
 11  MPIO_NAREA     221152 non-null  float64 
 12  MPIO_NANO      221152 non-null  int64   
 13  SHAPE_AREA     221152 non-null  float64 
 14  SHAPE_LEN      221152 non-null  float64 
 15  FLOWS_SUSC     221152 non-null  float64 
 16  geometry       221152 non-null  geometry
dtypes:

In [25]:
# component_1_gdf = gpd.read_file(TARGET_COMPONENT_1_FILE)
# component_1_gdf.info()

In [26]:
component_2_gdf = gpd.read_file(TARGET_COMPONENT_2_FILE)
component_2_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 8079 entries, 0 to 8078
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   quadkey                            8079 non-null   object  
 1   MPIO_CCNCT                         8079 non-null   object  
 2   MPIO_CNMBR_EN                      8079 non-null   object  
 3   poverty_index_p_rank               8079 non-null   float64 
 4   vul_pop_index_p_rank               8079 non-null   float64 
 5   vul_asset_index_p_rank             8079 non-null   float64 
 6   resource_deprivation_index_p_rank  8079 non-null   float64 
 7   svi_mean_p_rank                    8079 non-null   float64 
 8   svi_extreme_count                  8079 non-null   int64   
 9   poverty_index                      8079 non-null   float64 
 10  vul_pop_index                      8079 non-null   float64 
 11  vul_asset_index                    

## Select fields for final output

In [27]:
component_1_landslide_gdf.columns

Index(['quadkey', 'MPIO_CCNCT', 'MPIO_CNMBR', 'MPIO_CNMBR_EN', 'DPTO_CNMBR',
       'DPTO_CNMBR_EN', 'Municipio', 'Municipio_EN', 'DPTO_CCDGO',
       'MPIO_CCDGO', 'MPIO_CRSLC', 'MPIO_NAREA', 'MPIO_NANO', 'SHAPE_AREA',
       'SHAPE_LEN', 'LANDSLIDE_SUSC', 'geometry'],
      dtype='object')

In [28]:
component_1_landslide_fields = [
    "quadkey",
    "MPIO_CCNCT",
    "MPIO_CNMBR",
    "MPIO_CNMBR_EN",
    "DPTO_CNMBR",
    "DPTO_CNMBR_EN",
    "LANDSLIDE_SUSC",
    "geometry",
]
process_component_1_landslide_gdf = component_1_landslide_gdf.loc[
    :, component_1_landslide_fields
]
process_component_1_landslide_gdf.head(3)

Unnamed: 0,quadkey,MPIO_CCNCT,MPIO_CNMBR,MPIO_CNMBR_EN,DPTO_CNMBR,DPTO_CNMBR_EN,LANDSLIDE_SUSC,geometry
0,32232230100333331,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.017111,"POLYGON ((-76.90567 1.31962, -76.90567 1.32099..."
1,32232230100333323,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.017194,"POLYGON ((-76.90842 1.31824, -76.90842 1.31962..."
2,32232230100333332,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.094674,"POLYGON ((-76.90704 1.31824, -76.90704 1.31962..."


In [29]:
component_1_flows_fields = [
    "quadkey",
    "FLOWS_SUSC",
]
process_component_1_flows_gdf = component_1_flows_gdf.loc[:, component_1_flows_fields]
process_component_1_flows_gdf.head(3)

Unnamed: 0,quadkey,FLOWS_SUSC
0,32232230100333331,0.020255
1,32232230100333323,0.024567
2,32232230100333332,0.014525


In [30]:
# ## Rename the probability fields based on the model type
# process_component_1_landslide_gdf = process_component_1_landslide_gdf.rename(
#     {
#         "pred_proba_1": "landslide_pred_probability",
#     },
#     axis=1,
# )
# process_component_1_flows_gdf = process_component_1_flows_gdf.rename(
#     {"pred_proba_2": "flows_pred_probability"}, axis=1
# )
# display(process_component_1_landslide_gdf.info())
# display(process_component_1_flows_gdf.info())

In [31]:
component_2_gdf.columns

Index(['quadkey', 'MPIO_CCNCT', 'MPIO_CNMBR_EN', 'poverty_index_p_rank',
       'vul_pop_index_p_rank', 'vul_asset_index_p_rank',
       'resource_deprivation_index_p_rank', 'svi_mean_p_rank',
       'svi_extreme_count', 'poverty_index', 'vul_pop_index',
       'vul_asset_index', 'resource_deprivation_index', 'geometry'],
      dtype='object')

In [34]:
component_2_fields = [
    "quadkey",
    "svi_mean_p_rank",
    # 'svi_extreme_count',
    "poverty_index_p_rank",
    "vul_pop_index_p_rank",
    "vul_asset_index_p_rank",
    "resource_deprivation_index_p_rank",
    # 'poverty_index',
    # 'vul_pop_index',
    # 'vul_asset_index',
    # 'resource_deprivation_index'
]

process_component_2_gdf = component_2_gdf.loc[:, component_2_fields]
process_component_2_gdf.head(10)

Unnamed: 0,quadkey,svi_mean_p_rank,poverty_index_p_rank,vul_pop_index_p_rank,vul_asset_index_p_rank,resource_deprivation_index_p_rank
0,32232230102303131,0.611848,0.979381,0.356083,0.111929,1.0
1,32232230102320131,0.594711,0.979381,0.353116,0.060383,0.985965
2,32232230102321232,0.730462,0.979381,0.905045,0.079529,0.957895
3,32232230102321322,0.720782,0.979381,0.910979,0.010309,0.982456
4,32232230102332031,0.740095,0.979381,0.908012,0.104566,0.968421
5,32232230102332132,0.719075,0.958763,0.913947,0.091311,0.912281
6,32232230102322331,0.541573,0.979381,0.661721,0.005891,0.519298
7,32232230102323220,0.591205,0.979381,0.661721,0.004418,0.719298
8,32232230102323223,0.553507,0.979381,0.667656,0.044183,0.522807
9,32232230120100133,0.494345,0.907216,0.709199,0.139912,0.221053


In [35]:
combined_gdf = pd.merge(
    process_component_1_landslide_gdf,
    process_component_1_flows_gdf,
    on="quadkey",
    how="left",
)
combined_gdf = pd.merge(combined_gdf, process_component_2_gdf, on="quadkey", how="left")
combined_gdf

Unnamed: 0,quadkey,MPIO_CCNCT,MPIO_CNMBR,MPIO_CNMBR_EN,DPTO_CNMBR,DPTO_CNMBR_EN,LANDSLIDE_SUSC,geometry,FLOWS_SUSC,svi_mean_p_rank,poverty_index_p_rank,vul_pop_index_p_rank,vul_asset_index_p_rank,resource_deprivation_index_p_rank
0,032232230100333331,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.017111,"POLYGON ((-76.90567 1.31962, -76.90567 1.32099...",0.020255,,,,,
1,032232230100333323,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.017194,"POLYGON ((-76.90842 1.31824, -76.90842 1.31962...",0.024567,,,,,
2,032232230100333332,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.094674,"POLYGON ((-76.90704 1.31824, -76.90704 1.31962...",0.014525,,,,,
3,032232230100333333,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.044252,"POLYGON ((-76.90567 1.31824, -76.90567 1.31962...",0.007499,,,,,
4,032232230101222222,86749,SIBUNDOY,SIBUNDOY,PUTUMAYO,PUTUMAYO,0.069744,"POLYGON ((-76.90430 1.31824, -76.90430 1.31962...",0.006635,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221147,032231022220323022,54250,EL TARRA,EL TARRA,NORTE DE SANTANDER,NORTE DE SANTANDER,0.112018,"POLYGON ((-73.07007 8.49954, -73.07007 8.50090...",0.099677,,,,,
221148,032231022220323023,54250,EL TARRA,EL TARRA,NORTE DE SANTANDER,NORTE DE SANTANDER,0.085717,"POLYGON ((-73.06870 8.49954, -73.06870 8.50090...",0.220074,,,,,
221149,032231022220323032,54250,EL TARRA,EL TARRA,NORTE DE SANTANDER,NORTE DE SANTANDER,0.011917,"POLYGON ((-73.06732 8.49954, -73.06732 8.50090...",0.227047,,,,,
221150,032231022220323033,54250,EL TARRA,EL TARRA,NORTE DE SANTANDER,NORTE DE SANTANDER,0.021103,"POLYGON ((-73.06595 8.49954, -73.06595 8.50090...",0.271647,,,,,


## Combine the two components
Method: Multiply component 1 and component 2 to get the combined risk index. We calculate combined risk index for landslides and flows separately.

In [36]:
combined_gdf["risk_index_landslide"] = (
    combined_gdf["LANDSLIDE_SUSC"] * combined_gdf["svi_mean_p_rank"]
)
combined_gdf["risk_index_flows"] = (
    combined_gdf["FLOWS_SUSC"] * combined_gdf["svi_mean_p_rank"]
)
reordered_columns = [col for col in combined_gdf.columns if col != "geometry"] + [
    "geometry"
]
combined_gdf = combined_gdf.loc[:, reordered_columns]
combined_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 221152 entries, 0 to 221151
Data columns (total 16 columns):
 #   Column                             Non-Null Count   Dtype   
---  ------                             --------------   -----   
 0   quadkey                            221152 non-null  object  
 1   MPIO_CCNCT                         221152 non-null  object  
 2   MPIO_CNMBR                         221152 non-null  object  
 3   MPIO_CNMBR_EN                      221152 non-null  object  
 4   DPTO_CNMBR                         221152 non-null  object  
 5   DPTO_CNMBR_EN                      221152 non-null  object  
 6   LANDSLIDE_SUSC                     221152 non-null  float64 
 7   FLOWS_SUSC                         221152 non-null  float64 
 8   svi_mean_p_rank                    8079 non-null    float64 
 9   poverty_index_p_rank               8079 non-null    float64 
 10  vul_pop_index_p_rank               8079 non-null    float64 
 11  vul_asset_index_p_

## Export to file

In [37]:
combined_gdf.to_file(OUTPUT_FPATH, index=False)