# Create Training Data

In this noteboook, our goal is to label each of our 150x150m grids to one of the three classes: landslide, flows, and non_landslide. Make sure to download the necssary input files and match them to the directories listed below.

### Inputs
- Master feature table
- Landslide coordinates from the landslide inventory
- Non-landslide samples

### Process
- Spatial intersectiono of the coordinates/polygons to add labels to each qaudkey

### Output
- Tranining data with quadkey, features and labels

# Imports and Set-up

*DO NOT SKIP THIS SECTION.* This section imports the packages needed to run this notebook and initializes the data file paths.

In [1]:
%load_ext autoreload
%autoreload 2

# Standard imports
import sys
import pandas as pd

# geospatial
import geopandas as gpd



In [2]:
# Util imports
sys.path.append("../../")  # include parent directory
from src.settings import DATA_DIR, LOCAL_CRS, PROJ_CRS

In [3]:
ALINGED_DIR = DATA_DIR / "aligned/csv"
MUNICIPALITIES_DIR = DATA_DIR / "admin_bounds"
VECTOR_DIR = DATA_DIR / "vectors"
TRAINING_DIR = VECTOR_DIR / "training_labels"
TRAINING_DIR.mkdir(exist_ok=True)

MASTERFILE_VERSION = "20240213"

dtype = {"quadkey": str}

AOI_FPATH = MUNICIPALITIES_DIR / "grids_landslide_w_xyz_zoomlevel18_20240320.parquet"
FEATURES_FPATH = ALINGED_DIR / "aligned_dataset_consolidated_20240507.csv"
IMMAP_DATA_FPATH = DATA_DIR / "vectors/client-data/cleaned_data"
IMMAP_DATA_FPATH.mkdir(exist_ok=True, parents=True)
MASTERFILE_FPATH = (
    IMMAP_DATA_FPATH / f"landslide_event_reference_file_{MASTERFILE_VERSION}.csv"
)

SAM_POLYGONS_DIR = DATA_DIR / "vectors/sam-outputs"
SAM_POLYGONS_DIR.mkdir(exist_ok=True)
SAM_POLYGONS_FPATH = SAM_POLYGONS_DIR / "sam_consolidated_w_qa20240213.gpkg"


NEG_LABEL_VERSION = "20240402"
NEG_LABEL_EXC_BUFFER_SIZE = 500
NEG_LABELS_FPATH = (
    TRAINING_DIR
    / f"non_landslide_sampled_grids_{NEG_LABEL_EXC_BUFFER_SIZE}m_{NEG_LABEL_VERSION}.gpkg"
)

VERSION = pd.to_datetime("today").strftime("%Y%m%d")
OUT_CSV_FPATH = DATA_DIR / f"models/training_data/training_data_{VERSION}.csv"
OUT_PARQUET_FPATH = DATA_DIR / f"models/training_data/training_data_{VERSION}.parquet"

REDUCE_LANDSLIDES = False  # set to True to create training data for Flows

## Load Data

### Labels

This section loads both the positive and negative labels. For the positive labels, this will come from the masterfile (combination of landslide coordinates from Catalog and Inventory). For negative labels, this was generated beforehand using the buffer approach method.

In [4]:
masterfile = pd.read_csv(MASTERFILE_FPATH)
neg_labels_gdf = gpd.read_file(NEG_LABELS_FPATH)

Reading the data from cached files...


In [7]:
masterfile.describe(), masterfile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43664 entries, 0 to 43663
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        43664 non-null  int64  
 1   OBJECTID          43664 non-null  int64  
 2   MOV_TYPE          43664 non-null  object 
 3   MOV_DATE          43654 non-null  object 
 4   DPTO              43664 non-null  object 
 5   MUNICIPIO         43664 non-null  object 
 6   VEREDA            43664 non-null  object 
 7   LAT               43664 non-null  float64
 8   LON               43664 non-null  float64
 9   source            43664 non-null  object 
 10  Comments/Actions  687 non-null    object 
 11  comments          33131 non-null  object 
 12  qa                676 non-null    object 
 13  notes             670 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 4.7+ MB


(         Unnamed: 0      OBJECTID           LAT           LON
 count  43664.000000  43664.000000  43664.000000  43664.000000
 mean   21831.500000  14594.119641      5.139735    -74.677325
 std    12604.855414  10319.255293      1.884579      2.273906
 min        0.000000      1.000000     -4.211928    -78.781000
 25%    10915.750000   5458.750000      4.016927    -75.795556
 50%    21831.500000  12672.500000      5.360002    -75.287741
 75%    32747.250000  23588.250000      6.295846    -73.101608
 max    43663.000000  34504.000000     11.553918      0.000000,
 None)

In [8]:
masterfile["qa"].value_counts(dropna=False)

NaN     42988
pass      338
fail      338
Name: qa, dtype: int64

In [9]:
# filter out coordinates that were part of SAM pipeline
masterfile = masterfile[masterfile["qa"].isna()]

In [10]:
masterfile.columns

Index(['Unnamed: 0', 'OBJECTID', 'MOV_TYPE', 'MOV_DATE', 'DPTO', 'MUNICIPIO',
       'VEREDA', 'LAT', 'LON', 'source', 'Comments/Actions', 'comments', 'qa',
       'notes'],
      dtype='object')

In [11]:
masterfile.drop(
    columns=[
        "Unnamed: 0",
        "DPTO",
        "MUNICIPIO",
        "VEREDA",
        "Comments/Actions",
        "comments",
        "qa",
        "notes",
    ],
    inplace=True,
)

#### Filter to events 2000 onwards and to landslide, flows and topple 

In [12]:
# format date
masterfile["MOV_DATE"] = pd.to_datetime(masterfile.MOV_DATE, errors="coerce")

In [13]:
pos_label = masterfile[
    (masterfile.MOV_DATE >= pd.to_datetime("20000101", format="%Y%m%d"))
    & (masterfile.MOV_TYPE.isin(["flows", "landslide"]))
]

In [14]:
pos_label.shape

(31833, 6)

In [15]:
pos_label.groupby(["MOV_TYPE", "source"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,OBJECTID,MOV_DATE,LAT,LON
MOV_TYPE,source,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
flows,landslide_catalog,4437,4437,4437,4437
flows,landslide_inventory,1125,1125,1125,1125
landslide,landslide_catalog,21142,21142,21142,21142
landslide,landslide_inventory,5129,5129,5129,5129


In [16]:
pos_labels_gdf = gpd.GeoDataFrame(
    pos_label,
    geometry=gpd.points_from_xy(pos_label.LON, pos_label.LAT),
    crs="EPSG:4326",
)

#### Check duplicate `OBJECTID`

In [17]:
pos_labels_gdf[pos_labels_gdf.duplicated(subset=["OBJECTID"], keep=False)]

Unnamed: 0,OBJECTID,MOV_TYPE,MOV_DATE,LAT,LON,source,geometry
41,42,landslide,2000-01-04,4.757758,-76.227065,landslide_catalog,POINT (-76.22706 4.75776)
42,43,landslide,2000-01-06,4.082055,-76.189929,landslide_catalog,POINT (-76.18993 4.08206)
43,44,landslide,2000-01-11,3.738116,-76.268108,landslide_catalog,POINT (-76.26811 3.73812)
92,93,landslide,2008-11-27,4.238959,-76.307156,landslide_catalog,POINT (-76.30716 4.23896)
93,94,landslide,2008-11-27,4.228730,-76.300748,landslide_catalog,POINT (-76.30075 4.22873)
...,...,...,...,...,...,...,...
43657,9154,flows,2013-04-04,1.208444,-76.125528,landslide_inventory,POINT (-76.12553 1.20844)
43658,9155,landslide,2014-06-14,1.137611,-76.276833,landslide_inventory,POINT (-76.27683 1.13761)
43659,9156,landslide,2014-01-23,1.625639,-75.381106,landslide_inventory,POINT (-75.38111 1.62564)
43660,9157,landslide,2014-01-23,1.636456,-75.396614,landslide_inventory,POINT (-75.39661 1.63646)


In [18]:
pos_labels_gdf[pos_labels_gdf["OBJECTID"] == 94]

Unnamed: 0,OBJECTID,MOV_TYPE,MOV_DATE,LAT,LON,source,geometry
93,94,landslide,2008-11-27,4.22873,-76.300748,landslide_catalog,POINT (-76.30075 4.22873)
34597,94,landslide,2014-04-05,10.718889,-73.401667,landslide_inventory,POINT (-73.40167 10.71889)


#### Check if any coordinates still overlap with SAM polygons

In [5]:
sam_polygons = gpd.read_file(SAM_POLYGONS_FPATH)
sam_polygons.head()

Unnamed: 0,value,OBJECTID,notes,qa,geometry
0,255.0,1038,none,pass,"POLYGON ((-72.48084 7.48027, -72.48039 7.48027..."
1,255.0,1109,erroneous polygon,fail,"POLYGON ((-72.88634 7.86215, -72.88634 7.84445..."
2,255.0,1126,erroneous polygon,fail,"POLYGON ((-72.96072 7.81220, -72.96072 7.79441..."
3,255.0,11676,no image,fail,"POLYGON ((-76.64857 2.72037, -76.64830 2.72037..."
4,255.0,1217,none,pass,"POLYGON ((-72.80046 7.75102, -72.80028 7.75102..."


In [20]:
sam_polygons = sam_polygons[sam_polygons["qa"] == "pass"]

In [21]:
# first check for existing points that overlap with SAM polygons
check_overlaps = pos_labels_gdf.sjoin(sam_polygons)
check_overlaps

Unnamed: 0,OBJECTID_left,MOV_TYPE,MOV_DATE,LAT,LON,source,geometry,index_right,value,OBJECTID_right,notes,qa
20559,20560,flows,2015-10-29,6.857705,-72.132864,landslide_catalog,POINT (-72.13286 6.85771),63,255.0,20723,none,pass
20718,20719,flows,2019-08-28,6.942717,-72.186196,landslide_catalog,POINT (-72.18620 6.94272),60,255.0,20720,none,pass
27091,27092,flows,2015-01-01,7.46667,-72.541583,landslide_catalog,POINT (-72.54158 7.46667),269,255.0,26161,none,pass
27257,27258,landslide,2020-01-06,7.247175,-72.328458,landslide_catalog,POINT (-72.32846 7.24718),272,255.0,26185,none,pass
27257,27258,landslide,2020-01-06,7.247175,-72.328458,landslide_catalog,POINT (-72.32846 7.24718),308,255.0,26441,none,pass
27594,27595,landslide,2018-10-18,7.32405,-72.285842,landslide_catalog,POINT (-72.28584 7.32405),363,255.0,27286,none,pass
35625,1122,landslide,2010-11-01,7.749978,-72.800492,landslide_inventory,POINT (-72.80049 7.74998),4,255.0,1217,none,pass
41619,7116,landslide,2000-01-01,2.258422,-76.738869,landslide_inventory,POINT (-76.73887 2.25842),517,255.0,7173,none,pass


In [22]:
events_to_remove = check_overlaps["OBJECTID_left"].tolist()

# remove events that overlapped with SAM polygons
pos_labels_gdf = pos_labels_gdf[~pos_labels_gdf["OBJECTID"].isin(events_to_remove)]
pos_labels_gdf

Unnamed: 0,OBJECTID,MOV_TYPE,MOV_DATE,LAT,LON,source,geometry
41,42,landslide,2000-01-04,4.757758,-76.227065,landslide_catalog,POINT (-76.22706 4.75776)
42,43,landslide,2000-01-06,4.082055,-76.189929,landslide_catalog,POINT (-76.18993 4.08206)
43,44,landslide,2000-01-11,3.738116,-76.268108,landslide_catalog,POINT (-76.26811 3.73812)
92,93,landslide,2008-11-27,4.238959,-76.307156,landslide_catalog,POINT (-76.30716 4.23896)
93,94,landslide,2008-11-27,4.228730,-76.300748,landslide_catalog,POINT (-76.30075 4.22873)
...,...,...,...,...,...,...,...
43657,9154,flows,2013-04-04,1.208444,-76.125528,landslide_inventory,POINT (-76.12553 1.20844)
43658,9155,landslide,2014-06-14,1.137611,-76.276833,landslide_inventory,POINT (-76.27683 1.13761)
43659,9156,landslide,2014-01-23,1.625639,-75.381106,landslide_inventory,POINT (-75.38111 1.62564)
43660,9157,landslide,2014-01-23,1.636456,-75.396614,landslide_inventory,POINT (-75.39661 1.63646)


### AOI

In [23]:
AOI_FPATH

Path('/home/abbymoreno/immap-evidem/data/admin_bounds/grids_landslide_w_xyz_zoomlevel18_20240320.parquet')

In [24]:
AOI_FPATH.exists()

True

In [7]:
USE_CACHED_GRIDS = True  # use cached grids if available

In [8]:
simple_aoi_grids = pd.read_parquet(AOI_FPATH)

Loading aoi grids from file /home/abbymoreno/immap-evidem-risk-mapping/data/admin_bounds/grids_landslide_w_xyz_zoomlevel18_20240320.parquet
CPU times: user 496 ms, sys: 377 ms, total: 872 ms
Wall time: 820 ms


In [27]:
simple_aoi_grids_geom = gpd.read_file(
    MUNICIPALITIES_DIR / "grids_landslide_wadm_zoomlevel18_20240304.gpkg"
)
simple_aoi_grids_geom = simple_aoi_grids_geom[["quadkey", "geometry"]]
simple_aoi_grids_geom

Unnamed: 0,quadkey,geometry
0,032232230000210321,"POLYGON ((-77.31491 1.35257, -77.31491 1.35394..."
1,032232230000210330,"POLYGON ((-77.31354 1.35257, -77.31354 1.35394..."
2,032232230000210331,"POLYGON ((-77.31216 1.35257, -77.31216 1.35394..."
3,032232230000211220,"POLYGON ((-77.31079 1.35257, -77.31079 1.35394..."
4,032232230000210323,"POLYGON ((-77.31491 1.35119, -77.31491 1.35257..."
...,...,...
1798235,032231022220323022,"POLYGON ((-73.07007 8.49954, -73.07007 8.50090..."
1798236,032231022220323023,"POLYGON ((-73.06870 8.49954, -73.06870 8.50090..."
1798237,032231022220323032,"POLYGON ((-73.06732 8.49954, -73.06732 8.50090..."
1798238,032231022220323033,"POLYGON ((-73.06595 8.49954, -73.06595 8.50090..."


In [28]:
simple_aoi_grids = simple_aoi_grids.merge(simple_aoi_grids_geom, on="quadkey")
simple_aoi_grids = gpd.GeoDataFrame(simple_aoi_grids, geometry="geometry", crs=PROJ_CRS)
simple_aoi_grids

Unnamed: 0,quadkey,x,y,z,geometry
0,032232230000210321,74773,130086,18,"POLYGON ((-77.31491 1.35257, -77.31491 1.35394..."
1,032232230000210330,74774,130086,18,"POLYGON ((-77.31354 1.35257, -77.31354 1.35394..."
2,032232230000210331,74775,130086,18,"POLYGON ((-77.31216 1.35257, -77.31216 1.35394..."
3,032232230000211220,74776,130086,18,"POLYGON ((-77.31079 1.35257, -77.31079 1.35394..."
4,032232230000210323,74773,130087,18,"POLYGON ((-77.31491 1.35119, -77.31491 1.35257..."
...,...,...,...,...,...
1798235,032231022220323022,77864,124859,18,"POLYGON ((-73.07007 8.49954, -73.07007 8.50090..."
1798236,032231022220323023,77865,124859,18,"POLYGON ((-73.06870 8.49954, -73.06870 8.50090..."
1798237,032231022220323032,77866,124859,18,"POLYGON ((-73.06732 8.49954, -73.06732 8.50090..."
1798238,032231022220323033,77867,124859,18,"POLYGON ((-73.06595 8.49954, -73.06595 8.50090..."


In [29]:
simple_aoi_grids = gpd.GeoDataFrame(simple_aoi_grids, geometry="geometry", crs=PROJ_CRS)
simple_aoi_grids

Unnamed: 0,quadkey,x,y,z,geometry
0,032232230000210321,74773,130086,18,"POLYGON ((-77.31491 1.35257, -77.31491 1.35394..."
1,032232230000210330,74774,130086,18,"POLYGON ((-77.31354 1.35257, -77.31354 1.35394..."
2,032232230000210331,74775,130086,18,"POLYGON ((-77.31216 1.35257, -77.31216 1.35394..."
3,032232230000211220,74776,130086,18,"POLYGON ((-77.31079 1.35257, -77.31079 1.35394..."
4,032232230000210323,74773,130087,18,"POLYGON ((-77.31491 1.35119, -77.31491 1.35257..."
...,...,...,...,...,...
1798235,032231022220323022,77864,124859,18,"POLYGON ((-73.07007 8.49954, -73.07007 8.50090..."
1798236,032231022220323023,77865,124859,18,"POLYGON ((-73.06870 8.49954, -73.06870 8.50090..."
1798237,032231022220323032,77866,124859,18,"POLYGON ((-73.06732 8.49954, -73.06732 8.50090..."
1798238,032231022220323033,77867,124859,18,"POLYGON ((-73.06595 8.49954, -73.06595 8.50090..."


### Feature table

In [9]:
features_df = pd.read_csv(FEATURES_FPATH, dtype=dtype)

  features_df = pd.read_csv(FEATURES_FPATH, dtype=dtype)


# Spatial Joins

Label the 150x150m grids either Landslide, Flows, and Non-landslide using a geospatial join.

## Positive Labels

In [33]:
pos_labels_gdf.columns

Index(['OBJECTID', 'MOV_TYPE', 'MOV_DATE', 'LAT', 'LON', 'source', 'geometry'], dtype='object')

In [34]:
# add the labels to grids
simple_aoi_grids.sjoin(pos_labels_gdf).shape

(17788, 12)

In [35]:
landslide_grids = simple_aoi_grids.sjoin(pos_labels_gdf)

In [36]:
# get all duplicates
duplicate_grids = landslide_grids[
    landslide_grids.duplicated(subset=["quadkey"], keep=False)
]

For this next code block, it looks at the duplicates and checks the data source. If a grid intersected with a point from both Catalog and Inventory we exclude that grid. The section belows outputs a list of quadkeys to exclude. 

In [37]:
def get_inv_cat_dup(df):
    # Get quadkeys/grids that were tagged by both catalog and inventory
    # Group by quadkey and count occurrences of each landslide source
    grouped = df.groupby(by=["quadkey", "source"]).size().unstack(fill_value=0)
    # Filter quadkeys with duplicates in both 'catalog' and 'inventory'
    filtered_quadkeys = grouped[
        (grouped["landslide_catalog"] > 0) & (grouped["landslide_inventory"] > 0)
    ].index

    return filtered_quadkeys

In [38]:
# list of quadkeys to remove
remove_quad_cat = get_inv_cat_dup(duplicate_grids)

# remove from grids
landslide_grids = landslide_grids[~landslide_grids["quadkey"].isin(remove_quad_cat)]

In [39]:
# remove catalog
landslide_grids = landslide_grids[landslide_grids["source"] != "landslide_catalog"]

Th remaining grids are tagged by `landslide_inventory` at this point. Check the source column to make sure.

In [40]:
landslide_grids = landslide_grids.drop_duplicates(subset=["quadkey"], keep="first")

In [41]:
landslide_grids[landslide_grids.duplicated(subset=["quadkey"], keep=False)]

Unnamed: 0,quadkey,x,y,z,geometry,index_right,OBJECTID,MOV_TYPE,MOV_DATE,LAT,LON,source


In [42]:
landslide_grids.OBJECTID.nunique()

2538

In [43]:
landslide_grids.drop(
    columns=[
        "index_right",
        "OBJECTID",
        "MOV_DATE",
        "LAT",
        "LON",
        "geometry",
    ],
    inplace=True,
)

In [44]:
landslide_grids["MOV_TYPE"].value_counts()

landslide    2153
flows         385
Name: MOV_TYPE, dtype: int64

In [45]:
if REDUCE_LANDSLIDES:
    # reduce landslide types
    # choose only quadkeys
    landslide_type_samples = landslide_grids[landslide_grids["MOV_TYPE"] == "landslide"]
    reduce_samples = int(landslide_grids.shape[0] * 0.30)
    selected_landslide_samples = landslide_type_samples.sample(
        n=reduce_samples, random_state=1
    )
    selected_landslide_quadkeys = selected_landslide_samples["quadkey"].tolist()
    # retain only flows and chosen quadkeys
    is_in_reduced_landslide = (
        landslide_grids["quadkey"].isin(selected_landslide_quadkeys)
    ) & (landslide_grids["MOV_TYPE"] == "landslide")
    is_flows = landslide_grids["MOV_TYPE"] == "flows"

    filtered_landslide_grids = landslide_grids[is_in_reduced_landslide | is_flows]
    landslide_grids = filtered_landslide_grids

In [46]:
landslide_grids["MOV_TYPE"].value_counts()

landslide    2153
flows         385
Name: MOV_TYPE, dtype: int64

## Negative Labels

In [48]:
neg_labels_gdf.shape

(2920, 2)

Further reduce the negative samples (non_landslide) to match the positive labels count as much as possible.

In [49]:
# reduce sample negative labels
reduce_samples = int(landslide_grids.shape[0] * 0.30)
neg_labels_gdf = neg_labels_gdf.sample(n=reduce_samples, random_state=1)

In [50]:
neg_labels_gdf = neg_labels_gdf.merge(
    simple_aoi_grids.drop(columns=["geometry"]), how="left"
)
neg_labels_gdf

Unnamed: 0,quadkey,geometry,x,y,z
0,032232102213300010,"POLYGON ((-75.62714 4.43267, -75.62714 4.43404...",76002,127840,18
1,032230331121002203,"POLYGON ((-73.38730 6.82008, -73.38730 6.82144...",77633,126093,18
2,032232231022331200,"POLYGON ((-76.56372 1.06973, -76.56372 1.07110...",75320,130292,18
3,032230331120123220,"POLYGON ((-73.42163 6.79690, -73.42163 6.79826...",77608,126110,18
4,032232210133321230,"POLYGON ((-76.67084 2.47253, -76.67084 2.47390...",75242,129270,18
...,...,...,...,...,...
756,032232221022032213,"POLYGON ((-78.02078 1.10131, -78.02078 1.10268...",74259,130269,18
757,032232110220333333,"POLYGON ((-74.44473 5.00339, -74.44473 5.00476...",76863,127423,18
758,032230233220020311,"POLYGON ((-76.63101 5.76220, -76.63101 5.76357...",75271,126868,18
759,032233000121001103,"POLYGON ((-72.66769 5.43829, -72.66769 5.43966...",78157,127105,18


In [51]:
neg_labels_gdf = neg_labels_gdf.reset_index()
neg_labels_gdf

Unnamed: 0,index,quadkey,geometry,x,y,z
0,0,032232102213300010,"POLYGON ((-75.62714 4.43267, -75.62714 4.43404...",76002,127840,18
1,1,032230331121002203,"POLYGON ((-73.38730 6.82008, -73.38730 6.82144...",77633,126093,18
2,2,032232231022331200,"POLYGON ((-76.56372 1.06973, -76.56372 1.07110...",75320,130292,18
3,3,032230331120123220,"POLYGON ((-73.42163 6.79690, -73.42163 6.79826...",77608,126110,18
4,4,032232210133321230,"POLYGON ((-76.67084 2.47253, -76.67084 2.47390...",75242,129270,18
...,...,...,...,...,...,...
756,756,032232221022032213,"POLYGON ((-78.02078 1.10131, -78.02078 1.10268...",74259,130269,18
757,757,032232110220333333,"POLYGON ((-74.44473 5.00339, -74.44473 5.00476...",76863,127423,18
758,758,032230233220020311,"POLYGON ((-76.63101 5.76220, -76.63101 5.76357...",75271,126868,18
759,759,032233000121001103,"POLYGON ((-72.66769 5.43829, -72.66769 5.43966...",78157,127105,18


In [52]:
neg_labels_gdf["MOV_TYPE"] = "non_landslide"

In [53]:
neg_labels_gdf["OBJECTID"] = neg_labels_gdf["quadkey"]

In [54]:
neg_labels_gdf.drop(columns=["geometry", "index"], inplace=True)

In [55]:
neg_labels_gdf.head(2)

Unnamed: 0,quadkey,x,y,z,MOV_TYPE,OBJECTID
0,32232102213300010,76002,127840,18,non_landslide,32232102213300010
1,32230331121002203,77633,126093,18,non_landslide,32230331121002203


## Combine positive and negative labels into one dataframe

In [56]:
train_labels = pd.concat([landslide_grids, neg_labels_gdf], ignore_index=True)

In [57]:
train_labels["MOV_TYPE"].value_counts()

landslide        2153
non_landslide     761
flows             385
Name: MOV_TYPE, dtype: int64

In [58]:
quadkey_dup = train_labels["quadkey"].duplicated()

train_labels[(quadkey_dup)]

Unnamed: 0,quadkey,x,y,z,MOV_TYPE,source,OBJECTID


# Append features to be used for training

In [59]:
train_data = train_labels.merge(features_df)

In [60]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3299 entries, 0 to 3298
Data columns (total 56 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   quadkey              3299 non-null   object 
 1   x                    3299 non-null   int64  
 2   y                    3299 non-null   int64  
 3   z                    3299 non-null   int64  
 4   MOV_TYPE             3299 non-null   object 
 5   source               2538 non-null   object 
 6   OBJECTID             761 non-null    object 
 7   MPIO_CCNCT           3299 non-null   int64  
 8   MPIO_CNMBR           3299 non-null   object 
 9   MPIO_CNMBR_EN        3299 non-null   object 
 10  DPTO_CNMBR           3299 non-null   object 
 11  DPTO_CNMBR_EN        3299 non-null   object 
 12  Municipio            3299 non-null   object 
 13  Municipio_EN         3299 non-null   object 
 14  DPTO_CCDGO           3299 non-null   int64  
 15  MPIO_CCDGO           3299 non-null   i

In [61]:
train_data["OBJECTID"] = train_data["OBJECTID"].astype(str)

Drop the unnecessary administrative boundary columns.

In [62]:
train_data = train_data.drop(
    columns=[
        "MPIO_CRSLC",
        "MPIO_NAREA",
        "MPIO_NANO",
        "SHAPE_AREA",
        "SHAPE_LEN",
    ]
)

In [63]:
train_data

Unnamed: 0,quadkey,x,y,z,MOV_TYPE,source,OBJECTID,MPIO_CCNCT,MPIO_CNMBR,MPIO_CNMBR_EN,...,silt_5-15cm_mean,silt_100-200cm_mean,clay_5-15cm_mean,clay_100-200cm_mean,hillshade_min,hillshade_max,hillshade_count,hillshade_median,distance_m_roads,distance_m_rivers
0,032232230002112132,74806,130123,18,landslide,landslide_inventory,,52001,PASTO,PASTO,...,261.50,264.25,233.50,221.50,0.0,205.0,272,0.0,0.000000,4713.335684
1,032232230002213300,74780,130156,18,landslide,landslide_inventory,,52001,PASTO,PASTO,...,362.00,348.00,281.75,310.75,229.0,255.0,272,255.0,0.000000,200.900180
2,032232230002213223,74777,130159,18,landslide,landslide_inventory,,52001,PASTO,PASTO,...,363.75,352.00,273.00,306.00,175.0,238.0,256,219.0,5.646712,74.413568
3,032232230002330203,74801,130165,18,landslide,landslide_inventory,,52001,PASTO,PASTO,...,378.00,344.00,293.50,325.00,122.0,195.0,256,154.0,9.693275,1182.975557
4,032232230021030203,74833,130197,18,landslide,landslide_inventory,,52001,PASTO,PASTO,...,365.00,320.50,290.50,307.50,155.0,215.0,256,186.0,27.917916,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3294,032232221022032213,74259,130269,18,non_landslide,,032232221022032213,52612,RICAURTE,RICAURTE,...,292.00,275.00,374.50,384.00,33.0,166.0,272,109.0,5690.720254,2679.600653
3295,032232110220333333,76863,127423,18,non_landslide,,032232110220333333,25875,VILLETA,VILLETA,...,308.00,298.75,358.00,424.00,159.0,209.0,256,186.0,351.360550,2162.333911
3296,032230233220020311,75271,126868,18,non_landslide,,032230233220020311,27001,QUIBDÓ,QUIBDO,...,307.25,298.00,325.75,357.75,160.0,204.0,256,186.0,2989.125710,2856.010364
3297,032233000121001103,78157,127105,18,non_landslide,,032233000121001103,15047,AQUITANIA,AQUITANIA,...,241.25,237.25,236.25,235.00,0.0,255.0,272,0.0,4953.063466,3061.765584


In [65]:
train_data.to_csv(OUT_CSV_FPATH, index=False)

In [66]:
train_data.to_parquet(OUT_PARQUET_FPATH, index=False)