# Task2

## Setup & Load

In [1]:
# Setup & load
from pathlib import Path
import pandas as pd
import numpy as np

# Project directories, resolved from the folder above the current working directory
project_root    = Path.cwd().parent
TASK1_CLEAN_DIR = project_root.joinpath("reports", "task1", "cleaned")
TASK2_DIR       = project_root.joinpath("reports", "task2")
TASK2_DIR.mkdir(parents=True, exist_ok=True)

# Cleaned Task 1 CSV files, keyed by a short dataset label
csv_paths = dict(
    dh_orig=TASK1_CLEAN_DIR / "drillhole_original_clean.csv",
    dh_dnn=TASK1_CLEAN_DIR / "drillhole_dnn_clean.csv",
    surf_orig=TASK1_CLEAN_DIR / "surface_original_clean.csv",
    surf_dnn=TASK1_CLEAN_DIR / "surface_dnn_clean.csv",
)

# Read each CSV into a DataFrame stored under the same label (order preserved)
dfs = dict(zip(csv_paths, map(pd.read_csv, csv_paths.values())))

# Individual handles for the four frames
df_dh_orig_clean, df_dh_dl_clean, df_surf_orig_clean, df_surf_dl_clean = (
    dfs[label] for label in ("dh_orig", "dh_dnn", "surf_orig", "surf_dnn")
)

# Report the (rows, columns) shape of every loaded frame
for k, df in dfs.items():
    print(f"{k:10s}: shape={df.shape}")

dh_orig   : shape=(2466130, 9)
dh_dnn    : shape=(920225, 8)
surf_orig : shape=(981318, 6)
surf_dnn  : shape=(544886, 5)


### Summarize Latitude/Longitude Extent

In [2]:
# Latitude/Longitude coverage summary

def extent_for(df: pd.DataFrame, lat_col: str, lon_col: str) -> dict:
    """Summarise the latitude/longitude extent of a DataFrame.

    Each coordinate column is coerced to numeric (values that cannot be
    parsed become NaN) and NaNs are dropped before the minimum and maximum
    are taken. The two columns are handled independently of each other.

    Args:
        df: Frame holding the coordinate columns.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.

    Returns:
        Dict with keys ``LatMin``, ``LatMax``, ``LonMin``, ``LonMax`` (each
        ``None`` when the column has no numeric values) and ``Count``, the
        total number of rows in ``df`` (not only rows with valid coordinates).
    """
    def _min_max(column: str) -> tuple:
        # Numeric values only; report (None, None) when nothing usable remains
        values = pd.to_numeric(df[column], errors="coerce").dropna()
        if values.empty:
            return None, None
        return values.min(), values.max()

    lat_lo, lat_hi = _min_max(lat_col)
    lon_lo, lon_hi = _min_max(lon_col)
    return {
        "LatMin": lat_lo,
        "LatMax": lat_hi,
        "LonMin": lon_lo,
        "LonMax": lon_hi,
        "Count": len(df),
    }

# One extent record per dataset is collected here, then assembled into a table
rows = []

# Display name -> (frame, latitude column, longitude column).
# Drillhole and surface files use different coordinate column names.
datasets = {
    "Drillhole Original": (df_dh_orig_clean, "LATITUDE", "LONGITUDE"),
    "Drillhole DNN":      (df_dh_dl_clean,   "LATITUDE", "LONGITUDE"),
    "Surface Original":   (df_surf_orig_clean, "DLAT", "DLONG"),
    "Surface DNN":        (df_surf_dl_clean,   "DLAT", "DLONG"),
}

for name, (df, lat_col, lon_col) in datasets.items():
    missing = sorted({lat_col, lon_col} - set(df.columns))
    if missing:
        # Report the omission instead of silently dropping the dataset
        print(f"[SKIP] {name:18s} -> missing column(s): {', '.join(missing)}")
        continue
    extent = extent_for(df, lat_col, lon_col)
    extent["Dataset"] = name
    rows.append(extent)

# Passing `columns=` fixes the column order and still produces an (empty) table,
# rather than a KeyError, when no dataset had usable coordinate columns.
extents_df = pd.DataFrame(
    rows,
    columns=["Dataset", "Count", "LatMin", "LatMax", "LonMin", "LonMax"],
)
display(extents_df)

Unnamed: 0,Dataset,Count,LatMin,LatMax,LonMin,LonMax
0,Drillhole Original,2466130,-34.945354,-27.025372,114.67557,122.131294
1,Drillhole DNN,920225,-34.945354,-27.025372,114.67638,122.10708
2,Surface Original,981318,-34.990154,-26.915041,114.51993,122.13507
3,Surface DNN,544886,-34.927277,-27.102,114.547104,122.122314


## Samples

This section extracts a subset of each dataset for testing, using the same region as in the paper.
You can either use this smaller dataset to focus only on the study area, or use the full cleaned datasets for broader analysis.

In [3]:
# Define a bounding box (latitude/longitude region)
lat_min, lat_max = -30.0, -27.5
lon_min, lon_max = 120.0, 121.5

def filter_by_bbox(
    df: pd.DataFrame,
    lat_col: str,
    lon_col: str,
    bbox: "tuple[float, float, float, float] | None" = None,
) -> pd.DataFrame:
    """Return a copy of the rows of ``df`` whose coordinates fall inside a bounding box.

    Coordinates are coerced to numeric first; rows whose latitude or longitude
    is missing or cannot be parsed are excluded. All four bounds are inclusive.

    Args:
        df: Frame holding the coordinate columns.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        bbox: Optional ``(lat_min, lat_max, lon_min, lon_max)`` tuple. When
            omitted, the module-level ``lat_min``, ``lat_max``, ``lon_min`` and
            ``lon_max`` values are read at call time.

    Returns:
        A new DataFrame (independent copy) with the original index labels of
        the rows inside the box.
    """
    if bbox is None:
        # Fall back to the notebook-level box so existing calls keep working
        bbox = (lat_min, lat_max, lon_min, lon_max)
    south, north, west, east = bbox

    lat = pd.to_numeric(df[lat_col], errors="coerce")
    lon = pd.to_numeric(df[lon_col], errors="coerce")

    # `between` is inclusive on both ends and evaluates to False for NaN,
    # so rows with unparseable coordinates drop out without a separate check.
    inside = lat.between(south, north) & lon.between(west, east)
    return df.loc[inside].copy()

# Output folder for the sample CSVs, given relative to the working directory (as in Task1)
SAMPLE_DIR = Path("../reports/task2/sample_csv")
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)

# Clip every dataset to the bounding box, keep the subset in memory, and write it to CSV
samples = {}
for name, (df, lat_col, lon_col) in datasets.items():
    # Datasets lacking either coordinate column are left out
    if not {lat_col, lon_col}.issubset(df.columns):
        continue
    samples[name] = filter_by_bbox(df, lat_col, lon_col)
    sub = samples[name]
    # File name is the dataset name lower-cased with spaces turned into underscores
    out_path = SAMPLE_DIR.joinpath(f"{name.lower().replace(' ', '_')}_sample.csv")
    sub.to_csv(out_path, index=False)
    print(f"[SAVE] {name:18s} -> rows={sub.shape[0]}")

print("Saved Task2 sample datasets to:", SAMPLE_DIR)

[SAVE] Drillhole Original -> rows=403
[SAVE] Drillhole DNN      -> rows=294
[SAVE] Surface Original   -> rows=210
[SAVE] Surface DNN        -> rows=531
Saved Task2 sample datasets to: ../reports/task2/sample_csv


## Test