# Model Experimentation

This notebook runs model experiments and evaluates performance using a train/test split and k-fold cross-validation. This is where you can experiment with the impact of the types of features you include in model training and also test whether the generated training dataset is effective for our use case.

### Input
- training data
- model parameters

### Output
- metrics per cross-validation fold (saved as Parquet)
- predictions on the held-out validation folds (saved as Parquet)

# Imports and Set-up

*DO NOT SKIP THIS SECTION.* This section imports the packages needed to run this notebook and initializes the data file paths.

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import polars as pl
import pandas as pd

In [3]:
# Make the `src` package imported below resolvable from this notebook.
sys.path.append("../../")  # two levels up, relative to the current working directory
import src.model_utils as model_utils
from src.polars_utils import log_condition, log_duplicates

In [4]:
from src.settings import DATA_DIR

# Directories derived from the project-wide data directory.
MODEL_DIR = DATA_DIR / "models"  # the training table is looked up under this directory
OUTPUT_DIR = DATA_DIR / "output/component_1"  # prediction and CV-metric files are written here

# Date stamp (YYYYMMDD) of the day the notebook is run; used as a prefix of the output file names.
OUTPUT_VERSION = pd.to_datetime("today").strftime("%Y%m%d")

In [5]:
# Version stamp of the training table to load, and the lattice radius encoded
# in its file name. A radius of 0 selects the table without a lattice suffix.
TRAIN_VERSION = "20240504"
LATTICE_RADIUS = 3

# Choose the file name matching the lattice setting and anchor it under MODEL_DIR.
TRAIN_TABLE_FPATH = MODEL_DIR / (
    f"training_data/training_data_{TRAIN_VERSION}.parquet"
    if LATTICE_RADIUS == 0
    else f"training_data/training_data_w_lattice{LATTICE_RADIUS}_{TRAIN_VERSION}.parquet"
)

In [6]:
# Column that uniquely identifies a row (one map tile per quadkey).
KEY_COLS = ["quadkey"]

# Every label-like column; none of these may be used as a model feature.
LABEL_COLS = ["label", "label_binary", "MOV_TYPE", "label_multiclass"]

# Toggle between the multiclass and the binary experiment.
MULTICLASS = True
if MULTICLASS:
    LABEL_COL = "label_multiclass"  # choose either label_binary or label_multiclass
    AVERAGING_METHOD = "weighted"  # binary if 2 classes, use weighted if multiclass
else:
    LABEL_COL = "label_binary"  # choose either label_binary or label_multiclass
    AVERAGING_METHOD = "binary"  # binary if 2 classes, use weighted if multiclass

# Administrative-area attributes; kept out of the feature set.
AREA_COLS = [
    "MPIO_CCNCT",
    "MPIO_CNMBR",
    "MPIO_CNMBR_EN",
    "DPTO_CNMBR",
    "DPTO_CNMBR_EN",
    "Municipio",
    "Municipio_EN",
    "DPTO_CCDGO",
    "MPIO_CCDGO",
    "MPIO_CRSLC",
    "MPIO_NAREA",
    "MPIO_NANO",
    "SHAPE_AREA",
    "SHAPE_LEN",
    "OBJECTID",
    "source",
]

# Columns excluded from the feature set and dropped from the saved
# predictions: tile coordinates, per-tile min/max/count statistics, all NDVI
# statistics, and bookkeeping columns. Each name is listed exactly once.
EXC_COLS = [
    "source",
    "x",
    "y",
    "z",
    "slope_count",
    "slope_min",
    "slope_max",
    "aspect_count",
    "aspect_max",
    "aspect_min",
    "elevation_min",
    "elevation_max",
    "elevation_count",
    "rainfall_mm_count",
    "rainfall_mm_min",
    "rainfall_mm_max",
    "hillshade_count",
    "hillshade_min",
    "hillshade_max",
    "ndvi2023_min",
    "ndvi2023_max",
    "ndvi2023_count",
    "ndvi2023_median",
    "__index_level_0__",
    "geometry",
]

## Dataset

In [7]:
if TRAIN_TABLE_FPATH.exists():
    features_df = pl.read_parquet(TRAIN_TABLE_FPATH)
else:
    TRAIN_TABLE_FNAME = TRAIN_TABLE_FPATH.name
    !gsutil -m cp gs://immap-models/training_data/$TRAIN_TABLE_FNAME $TRAIN_TABLE_FPATH
    features_df = pl.read_parquet(TRAIN_TABLE_FPATH)

In [8]:
# Preview the first two rows to check the schema of the loaded table.
features_df.head(2)

quadkey,x,y,z,MOV_TYPE,source,OBJECTID,MPIO_CCNCT,MPIO_CNMBR,MPIO_CNMBR_EN,DPTO_CNMBR,DPTO_CNMBR_EN,Municipio,Municipio_EN,DPTO_CCDGO,MPIO_CCDGO,slope_min,slope_max,slope_count,slope_median,aspect_min,aspect_max,aspect_count,aspect_median,soil_class,elevation_min,elevation_max,elevation_count,elevation_median,ndvi2023_min,ndvi2023_max,ndvi2023_count,ndvi2023_median,rainfall_mm_min,rainfall_mm_max,rainfall_mm_count,rainfall_mm_median,__index_level_0__,lithology_type,sand_5-15cm_mean,sand_100-200cm_mean,silt_5-15cm_mean,silt_100-200cm_mean,clay_5-15cm_mean,clay_100-200cm_mean,hillshade_min,hillshade_max,hillshade_count,hillshade_median,distance_m_roads,distance_m_rivers,lithology_type_lattice_3,soil_class_lattice_3,elevation_median_lattice_3,slope_median_lattice_3,aspect_median_lattice_3,hillshade_median_lattice_3,rainfall_mm_median_lattice_3,sand_5-15cm_mean_lattice_3,sand_100-200cm_mean_lattice_3,silt_5-15cm_mean_lattice_3,silt_100-200cm_mean_lattice_3,clay_5-15cm_mean_lattice_3,clay_100-200cm_mean_lattice_3,distance_m_roads_lattice_3,distance_m_rivers_lattice_3
str,i64,i64,i64,str,str,str,i64,str,str,str,str,str,str,i64,i64,f64,f64,i64,f64,f64,f64,i64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""03223010120123…",76381,123199,18,"""landslide""","""landslide_inve…","""nan""",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,2.0,17.0,36,8.0,0.0,352.0,256,309.0,0.0,297.0,326.0,256,317.0,0.114671,0.494776,240.0,0.317509,2184.866699,2184.866699,1.0,2184.866699,1138318,168.0,342.0,289.0,319.0,302.5,338.5,408.5,163.0,223.0,256,183.0,0.0,1493.422583,168.0,0.0,317.0,8.0,309.0,183.0,2184.866699,342.0,289.0,319.0,302.5,338.5,408.5,0.0,1493.422583
"""03223010120133…",76410,123189,18,"""non_landslide""",,"""03223010120133…",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,1.0,14.0,36,6.5,0.0,344.0,272,270.0,6.0,125.0,146.0,272,141.0,0.404474,0.562933,225.0,0.512943,1664.335571,1664.335571,1.0,1664.335571,1136886,168.0,339.5,311.0,304.0,276.0,356.5,412.5,166.0,215.0,272,186.0,784.422821,1003.273334,168.0,6.0,141.0,6.5,270.0,186.0,1664.335571,339.5,311.0,304.0,276.0,356.5,412.5,784.422821,1003.273334


In [9]:
# (rows, columns) before de-duplicating on quadkey.
features_df.shape

(3299, 66)

In [10]:
# Keep one row per quadkey. `keep="first"` together with `maintain_order=True`
# makes both the surviving row and the row order deterministic; with polars'
# defaults (`keep="any"`, order not maintained) they may differ between runs,
# which can change what the seeded splits further down receive.
features_df = features_df.unique(
    subset=["quadkey"], keep="first", maintain_order=True
)

In [11]:
# (rows, columns) after de-duplication; an unchanged row count means no
# quadkey appeared more than once.
features_df.shape

(3299, 66)

In [12]:
# Distinct movement types present in the table; the multiclass label encoding
# is derived from this column.
features_df.select("MOV_TYPE").unique()

MOV_TYPE
str
"""flows"""
"""non_landslide"""
"""landslide"""


In [13]:
# Encode the target as integers and store it in the column named by LABEL_COL.

if MULTICLASS:
    LABEL_COL = "label_multiclass"
    # 1 = landslide, 2 = flows, 0 = every other movement type
    movement_type = pl.col("MOV_TYPE")
    label_expr = (
        pl.when(movement_type == "landslide")
        .then(1)
        .when(movement_type == "flows")
        .then(2)
        .otherwise(0)
    )
else:
    LABEL_COL = "label_binary"
    # 1 = landslide, 0 = anything else
    # NOTE(review): this branch reads a "label" column, which is not among the
    # columns shown in the table preview above -- confirm it exists in the
    # training table before switching MULTICLASS off.
    label_expr = (
        pl.when(pl.col("label") == "landslide")
        .then(pl.lit(1))
        .otherwise(pl.lit(0))
    )

features_df = features_df.with_columns(label_expr.alias(LABEL_COL))

In [14]:
# Check which encoded label values are actually present.
features_df[LABEL_COL].unique()

label_multiclass
i32
0
1
2


In [15]:
# A column is a model feature unless it is a key, an excluded column, an
# administrative-area attribute, or a label.
non_feature_cols = set(KEY_COLS) | set(EXC_COLS) | set(AREA_COLS) | set(LABEL_COLS)
feature_cols = [name for name in features_df.columns if name not in non_feature_cols]
len(feature_cols), feature_cols

(30,
 ['slope_median',
  'aspect_median',
  'soil_class',
  'elevation_median',
  'rainfall_mm_median',
  'lithology_type',
  'sand_5-15cm_mean',
  'sand_100-200cm_mean',
  'silt_5-15cm_mean',
  'silt_100-200cm_mean',
  'clay_5-15cm_mean',
  'clay_100-200cm_mean',
  'hillshade_median',
  'distance_m_roads',
  'distance_m_rivers',
  'lithology_type_lattice_3',
  'soil_class_lattice_3',
  'elevation_median_lattice_3',
  'slope_median_lattice_3',
  'aspect_median_lattice_3',
  'hillshade_median_lattice_3',
  'rainfall_mm_median_lattice_3',
  'sand_5-15cm_mean_lattice_3',
  'sand_100-200cm_mean_lattice_3',
  'silt_5-15cm_mean_lattice_3',
  'silt_100-200cm_mean_lattice_3',
  'clay_5-15cm_mean_lattice_3',
  'clay_100-200cm_mean_lattice_3',
  'distance_m_roads_lattice_3',
  'distance_m_rivers_lattice_3'])

In [16]:
# (rows, columns) after the label column was added.
features_df.shape

(3299, 67)

In [17]:
# Number of missing values per feature column.
features_df.select(feature_cols).null_count()

slope_median,aspect_median,soil_class,elevation_median,rainfall_mm_median,lithology_type,sand_5-15cm_mean,sand_100-200cm_mean,silt_5-15cm_mean,silt_100-200cm_mean,clay_5-15cm_mean,clay_100-200cm_mean,hillshade_median,distance_m_roads,distance_m_rivers,lithology_type_lattice_3,soil_class_lattice_3,elevation_median_lattice_3,slope_median_lattice_3,aspect_median_lattice_3,hillshade_median_lattice_3,rainfall_mm_median_lattice_3,sand_5-15cm_mean_lattice_3,sand_100-200cm_mean_lattice_3,silt_5-15cm_mean_lattice_3,silt_100-200cm_mean_lattice_3,clay_5-15cm_mean_lattice_3,clay_100-200cm_mean_lattice_3,distance_m_roads_lattice_3,distance_m_rivers_lattice_3
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,4,0,0,0,0,0,0,0,0,68,30,0,0,0,0,0,0,4,0,0,0,0,0,0,68,30


In [18]:
# Summary statistics of the feature columns.
features_df.select(feature_cols).describe()

statistic,slope_median,aspect_median,soil_class,elevation_median,rainfall_mm_median,lithology_type,sand_5-15cm_mean,sand_100-200cm_mean,silt_5-15cm_mean,silt_100-200cm_mean,clay_5-15cm_mean,clay_100-200cm_mean,hillshade_median,distance_m_roads,distance_m_rivers,lithology_type_lattice_3,soil_class_lattice_3,elevation_median_lattice_3,slope_median_lattice_3,aspect_median_lattice_3,hillshade_median_lattice_3,rainfall_mm_median_lattice_3,sand_5-15cm_mean_lattice_3,sand_100-200cm_mean_lattice_3,silt_5-15cm_mean_lattice_3,silt_100-200cm_mean_lattice_3,clay_5-15cm_mean_lattice_3,clay_100-200cm_mean_lattice_3,distance_m_roads_lattice_3,distance_m_rivers_lattice_3
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",3299.0,3299.0,3299.0,3299.0,3295.0,3299.0,3299.0,3299.0,3299.0,3299.0,3299.0,3299.0,3299.0,3231.0,3269.0,3299.0,3299.0,3299.0,3299.0,3299.0,3299.0,3295.0,3299.0,3299.0,3299.0,3299.0,3299.0,3299.0,3231.0,3269.0
"""null_count""",0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,68.0,30.0
"""mean""",18.777054,172.782207,5.206729,1576.055168,2161.849488,106.150045,332.744998,297.236056,321.862155,306.267505,328.641103,379.754092,174.35147,518.093301,1786.32768,106.12913,5.233404,1575.86458,18.770518,172.737065,174.316408,2161.84088,332.75045,297.232341,321.902756,306.299681,328.674273,379.806199,518.006583,1785.541821
"""std""",8.897397,92.936857,1.697482,781.69836,1096.411884,47.800985,53.766788,59.149344,49.037256,47.848541,52.301527,67.979602,45.336041,1297.388703,1821.785008,47.894254,1.663131,781.175386,8.466544,83.55679,41.776732,1096.084236,52.108572,57.934516,47.209123,46.025765,50.506822,65.874321,1295.840885,1819.687169
"""min""",0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",12.5,94.0,3.0,1041.0,1526.689209,75.0,310.25,264.5,302.25,286.0,309.5,352.0,143.0,0.0,400.481442,75.0,6.0,1041.0,13.333333,106.0,147.0,1526.414795,310.0,264.5,302.0,286.0,309.5,352.5,0.0,398.172823
"""50%""",18.0,165.5,6.0,1526.0,1892.342041,104.0,334.5,293.25,323.0,308.0,333.5,391.75,177.0,47.77716,1266.440849,104.0,6.0,1523.0,18.0,168.0,176.833333,1893.949707,334.0,292.75,323.0,308.0,334.0,392.1875,67.076199,1262.528984
"""75%""",25.0,254.0,6.0,2060.0,2392.361572,139.0,362.0,332.5,348.5,333.0,357.0,420.5,206.0,403.895198,2514.154954,139.0,6.0,2054.5,24.5,240.0,203.0,2392.55835,361.5,332.2,347.708333,331.208333,357.0,419.8,390.925401,2518.834341
"""max""",56.0,347.0,29.0,4460.0,8623.12207,193.0,481.0,522.0,453.0,440.5,481.75,561.0,255.0,9997.204725,9931.370429,193.0,29.0,4460.0,51.0,347.0,255.0,8623.12207,481.0,522.0,453.0,440.5,473.5,530.5625,9997.204725,9931.370429


In [82]:
# Number of rows per movement type (class balance).
# NOTE(review): the saved output of this cell carries an out-of-order execution
# count (82) and its counts add up to 1481, not to the 3299 rows reported by
# the shape cells -- it looks stale; re-run the notebook from a fresh kernel.
features_df["MOV_TYPE"].value_counts()

MOV_TYPE,count
str,u32
"""non_landslide""",353
"""flows""",377
"""landslide""",751


# Experiment with only an 80-20 train-test split

In the following code block, we set the needed function arguments to conduct our model training and validation. These arguments will also be the same for the k-fold validation approach and will be used in the next section.

In [19]:
# Switches and model settings used by both the single-split and the k-fold
# experiment.
EVAL_SIMPLE_TRAIN_TEST_SPLIT = True
RANDOM_SEED = 47
MODEL_TYPE = "classification"
MODEL_NAME = "xgboost"
APPLY_LOG_TRANSFORM = False
REVERSE_LOG_TRANSFORM = False
TRAIN_WITH_PREDICTION_INTERVALS = False

# Second "_"-separated token of the label column name,
# e.g. "label_multiclass" -> "multiclass".
label_strip = LABEL_COL.split("_")[1]

# Both output files share one name stem and differ only in their suffix.
output_stem = f"{OUTPUT_VERSION}_{MODEL_TYPE}_{MODEL_NAME}_{label_strip}"
OUTPUT_PATH = OUTPUT_DIR / f"{output_stem}.parquet"
CV_METRICS_OUTPUT_PATH = OUTPUT_DIR / f"{output_stem}_cv_metrics.parquet"

In [21]:
# Single train/test experiment; `eval_metrics` stays None when it is disabled.
eval_metrics = None
if EVAL_SIMPLE_TRAIN_TEST_SPLIT:
    # 80/20 split keyed on quadkey, seeded for repeatability.
    train_df, test_df = model_utils.key_based_train_test_split(
        df=features_df,
        key_column="quadkey",
        train_proportion=0.8,
        shuffle=True,
        random_seed=RANDOM_SEED,
    )

    # Actual-vs-predicted plots are requested for regression only. Deriving the
    # flag from a single expression keeps it defined for every MODEL_TYPE
    # instead of relying on a value left over from an earlier run.
    plot_actual_vs_pred = MODEL_TYPE == "regression"

    kwargs = {
        "feature_cols": feature_cols,
        "label_col": LABEL_COL,
        "model_name": MODEL_NAME,
        "model_type": MODEL_TYPE,
        "train_df": train_df,
        "val_df": None,
        "test_df": test_df,
        "apply_log_transform": APPLY_LOG_TRANSFORM,
        "reverse_log_transform": REVERSE_LOG_TRANSFORM,
        "train_with_prediction_intervals": TRAIN_WITH_PREDICTION_INTERVALS,
        "plot_actual_vs_pred": plot_actual_vs_pred,
        "averaging_method": AVERAGING_METHOD,
        "random_seed": RANDOM_SEED,
    }
    model, eval_metrics = model_utils.train_and_eval_model(**kwargs)
eval_metrics

{'train_accuracy': 1.0,
 'train_precision': 1.0,
 'train_recall': 1.0,
 'train_f1': 1.0,
 'train_confusion_matrix': array([[ 623,    0,    0],
        [   0, 1715,    0],
        [   0,    0,  301]]),
 'test_accuracy': 0.8045454545454546,
 'test_precision': 0.7876390066922325,
 'test_recall': 0.8045454545454546,
 'test_f1': 0.7864235938861096,
 'test_confusion_matrix': array([[104,  32,   2],
        [ 18, 405,  15],
        [  3,  59,  22]])}

# Train and validate the model using K-fold Split approach

`GROUP_COL` is the parameter that defines how the data is split for the k-fold validation. In our case below, we use the department codes as the split. Make sure to choose the split that you want to experiment with and validate on.

In [22]:
# Column whose values define the cross-validation groups; the number of folds
# equals the number of distinct values in that column.
GROUP_COL = "DPTO_CCDGO"
NUM_CV_FOLDS = features_df.get_column(GROUP_COL).n_unique()

In [23]:
# Number of cross-validation folds (one per distinct GROUP_COL value).
NUM_CV_FOLDS

20

In [24]:
%%time
kfolds = model_utils.key_based_k_fold_cross_validation(
    df=features_df, key_column="quadkey", group_column=GROUP_COL, n_splits=NUM_CV_FOLDS
)

cv_metrics = []
output_preds_df = []

for i, (train_df, val_df) in enumerate(kfolds, start=1):
    kwargs = {
        "feature_cols": feature_cols,
        "label_col": LABEL_COL,
        "model_name": MODEL_NAME,
        "model_type": MODEL_TYPE,
        "train_df": train_df,
        "val_df": val_df,
        "test_df": None,
        "apply_log_transform": APPLY_LOG_TRANSFORM,
        "reverse_log_transform": REVERSE_LOG_TRANSFORM,
        "train_with_prediction_intervals": TRAIN_WITH_PREDICTION_INTERVALS,
        "plot_actual_vs_pred": False,
        "averaging_method": AVERAGING_METHOD,
        "random_seed": RANDOM_SEED,
        "return_val_pred": True,
    }
    model, eval_metrics, output_val_pred = model_utils.train_and_eval_model(**kwargs)
    eval_metrics["val_fold"] = i
    # eval_metrics.insert(1, "val_group", eval_metrics.pop("val_group"))
    cv_metrics.append(eval_metrics)
    output_val_pred = output_val_pred.drop(EXC_COLS)
    output_preds_df.append(output_val_pred)

predictions_kfolds_df = pl.concat(output_preds_df)

cv_metrics = pl.from_dicts(cv_metrics)
reordered_cols = cv_metrics.columns[-2:] + cv_metrics.columns[:-2]
cv_metrics = cv_metrics.select(reordered_cols)
cv_metrics.head()

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
A single label was found in 'y_true' and 'y_pred'. For the confusion matrix to have the correct shape, use the 'labels' parameter to pass all known labels.
Precision 

CPU times: user 4min 41s, sys: 1.12 s, total: 4min 42s
Wall time: 24.2 s


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Recall is ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.


val_group,val_fold,train_accuracy,train_precision,train_recall,train_f1,train_confusion_matrix,val_accuracy,val_precision,val_recall,val_f1,val_confusion_matrix
str,i64,f64,f64,f64,f64,object,f64,f64,f64,f64,object
"""BOYACA""",1,1.0,1.0,1.0,1.0,[[ 567 0 0]  [ 0 1687 0]  [ 0 0 343]],0.764957,0.721868,0.764957,0.742674,[[133 59 2]  [ 60 404 2]  [ 4 38 0]]
"""NORTE DE SANTA…",2,1.0,1.0,1.0,1.0,[[ 710 0 0]  [ 0 1752 0]  [ 0 0 284]],0.703436,0.643783,0.703436,0.657469,[[ 34 15 2]  [ 38 352 11]  [ 22 76 3]]
"""CAUCA""",3,1.0,1.0,1.0,1.0,[[ 710 0 0]  [ 0 1743 0]  [ 0 0 372]],0.852321,0.852261,0.852321,0.849794,[[ 34 17 0]  [ 40 370 0]  [ 2 11 0]]
"""CUNDINAMARCA""",4,1.0,1.0,1.0,1.0,[[ 719 0 0]  [ 0 1936 0]  [ 0 0 233]],0.552311,0.349688,0.552311,0.41695,[[ 15 27 0]  [ 5 212 0]  [ 6 146 0]]
"""VALLE DEL CAUC…",5,1.0,1.0,1.0,1.0,[[ 736 0 0]  [ 0 1999 0]  [ 0 0 383]],0.889503,0.882923,0.889503,0.882483,[[ 13 12 0]  [ 4 148 2]  [ 0 2 0]]


In [25]:
# Unweighted mean of every remaining column across folds (each row is one
# fold). The averaged `val_fold` value is just the mean fold index and carries
# no meaning.
cv_metrics.drop(["val_confusion_matrix", "train_confusion_matrix", "val_group"]).mean()

val_fold,train_accuracy,train_precision,train_recall,train_f1,val_accuracy,val_precision,val_recall,val_f1
f64,f64,f64,f64,f64,f64,f64,f64,f64
10.5,1.0,1.0,1.0,1.0,0.780312,0.756173,0.780312,0.752443


In [26]:
# Save the per-fold metrics. The confusion-matrix columns have dtype `object`
# in the metrics frame and are left out of the file (presumably because such
# columns cannot be written to Parquet).
output_cv_metrics = cv_metrics.drop(["val_confusion_matrix", "train_confusion_matrix"])
# Create the output directory first so the write does not fail on a fresh checkout.
CV_METRICS_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
output_cv_metrics.write_parquet(CV_METRICS_OUTPUT_PATH)

In [27]:
# Inspect the stacked validation predictions of all folds.
predictions_kfolds_df

quadkey,MOV_TYPE,OBJECTID,MPIO_CCNCT,MPIO_CNMBR,MPIO_CNMBR_EN,DPTO_CNMBR,DPTO_CNMBR_EN,Municipio,Municipio_EN,DPTO_CCDGO,MPIO_CCDGO,slope_median,aspect_median,soil_class,elevation_median,rainfall_mm_median,lithology_type,sand_5-15cm_mean,sand_100-200cm_mean,silt_5-15cm_mean,silt_100-200cm_mean,clay_5-15cm_mean,clay_100-200cm_mean,hillshade_median,distance_m_roads,distance_m_rivers,lithology_type_lattice_3,soil_class_lattice_3,elevation_median_lattice_3,slope_median_lattice_3,aspect_median_lattice_3,hillshade_median_lattice_3,rainfall_mm_median_lattice_3,sand_5-15cm_mean_lattice_3,sand_100-200cm_mean_lattice_3,silt_5-15cm_mean_lattice_3,silt_100-200cm_mean_lattice_3,clay_5-15cm_mean_lattice_3,clay_100-200cm_mean_lattice_3,distance_m_roads_lattice_3,distance_m_rivers_lattice_3,label_multiclass,val_group,department_name,label_multiclass_pred_class,pred_proba_0,pred_proba_1,pred_proba_2
str,str,str,i64,str,str,str,str,str,str,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i32,i32,str,i64,f32,f32,f32
"""03223211130321…","""landslide""","""nan""",15299,"""GARAGOA""","""GARAGOA""","""BOYACÁ""","""BOYACA""","""Garagoa""","""Garagoa""",15,299,10.0,287.0,6.0,1526.0,1538.036865,75.0,325.5,265.25,350.5,316.5,324.25,418.5,201.0,0.0,364.285824,75.0,6.0,1522.5,11.25,281.5,208.0,1538.036865,325.75,265.125,350.5,315.5,323.875,419.5,0.0,328.741391,1,15,"""BOYACA""",1,0.016925,0.875729,0.107346
"""03223122031302…","""landslide""","""nan""",15180,"""CHISCAS""","""CHISCAS""","""BOYACÁ""","""BOYACA""","""Chiscas""","""Chiscas""",15,180,17.5,145.0,6.0,2249.5,1886.560181,106.0,294.75,262.5,359.5,323.0,346.0,414.25,154.0,0.0,516.289099,106.0,6.0,2249.833333,15.666667,143.0,154.0,1886.560181,293.333333,265.833333,359.25,323.25,347.5,410.916667,0.0,467.426799,1,15,"""BOYACA""",1,0.001139,0.997489,0.001371
"""03223211132332…","""landslide""","""nan""",15425,"""MACANAL""","""MACANAL""","""BOYACÁ""","""BOYACA""","""Macanal""","""Macanal""",15,425,17.5,81.0,6.0,1861.0,2244.408936,75.0,336.75,273.25,346.5,329.75,316.5,397.25,127.0,307.723415,1567.062822,75.0,6.0,1863.5,19.5,83.0,121.333333,2244.408936,337.416667,272.75,345.666667,327.916667,316.666667,399.416667,343.472478,1615.522839,1,15,"""BOYACA""",1,0.001431,0.996372,0.002197
"""03223122220313…","""non_landslide""","""03223122220313…",15693,"""SANTA ROSA DE …","""SANTA ROSA DE …","""BOYACÁ""","""BOYACA""","""Santa Rosa De …","""Santa Rosa De …",15,693,32.5,297.0,3.0,2828.0,981.300232,24.0,294.5,308.0,369.0,335.0,336.25,356.75,253.0,562.543144,4226.667207,24.0,3.0,2828.0,32.5,297.0,253.0,981.300232,294.5,308.0,369.0,335.0,336.25,356.75,562.543144,4226.667207,0,15,"""BOYACA""",0,0.893177,0.075941,0.030882
"""03223211112233…","""landslide""","""nan""",15804,"""TIBANÁ""","""TIBANA""","""BOYACÁ""","""BOYACA""","""Tibaná""","""Tibana""",15,804,17.0,65.0,6.0,2029.5,1263.649048,113.0,281.5,256.0,390.75,352.0,328.0,392.0,134.0,152.577564,286.562947,113.0,3.0,2012.25,19.5,89.0,128.0,1263.649048,280.375,255.0,369.375,330.75,350.5,414.375,97.47562,283.914126,1,15,"""BOYACA""",1,0.042016,0.947338,0.010646
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""03223010120133…","""non_landslide""","""03223010120133…",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,6.5,270.0,6.0,141.0,1664.335571,168.0,339.5,311.0,304.0,276.0,356.5,412.5,186.0,784.422821,1003.273334,168.0,6.0,141.0,6.5,270.0,186.0,1664.335571,339.5,311.0,304.0,276.0,356.5,412.5,784.422821,1003.273334,0,8,"""ATLANTICO""",0,0.900895,0.00872,0.090385
"""03223010120123…","""landslide""","""nan""",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,8.0,309.0,0.0,317.0,2184.866699,168.0,342.0,289.0,319.0,302.5,338.5,408.5,183.0,0.0,1493.422583,168.0,0.0,317.0,8.0,309.0,183.0,2184.866699,342.0,289.0,319.0,302.5,338.5,408.5,0.0,1493.422583,1,8,"""ATLANTICO""",0,0.586861,0.367986,0.045153
"""03223010121202…","""non_landslide""","""03223010121202…",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,3.0,180.0,6.0,60.0,1478.869629,157.0,361.0,322.0,323.0,298.0,315.0,380.0,180.0,1224.548501,466.116468,157.0,6.0,60.0,3.0,180.0,180.0,1478.869629,361.0,322.0,323.0,298.0,315.0,380.0,1224.548501,466.116468,0,8,"""ATLANTICO""",0,0.970593,0.00605,0.023358
"""03223010120201…","""non_landslide""","""03223010120201…",8549,"""PIOJÓ""","""PIOJO""","""ATLÁNTICO""","""ATLANTICO""","""Piojó""","""Piojo""",8,549,2.0,134.0,29.0,9.0,1318.163574,189.0,313.25,331.5,279.25,265.0,407.5,403.75,177.0,307.870608,1304.524066,189.0,29.0,9.0,2.0,134.0,177.0,1318.163574,313.25,331.5,279.25,265.0,407.5,403.75,307.870608,1304.524066,0,8,"""ATLANTICO""",0,0.99626,0.000614,0.003126


In [28]:
# Save the validation predictions of all folds. The output directory is created
# first so the write does not fail on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
predictions_kfolds_df.write_parquet(OUTPUT_PATH)