# Task2

## Setup & Load

In [84]:
# Setup & load
from pathlib import Path
import pandas as pd
import numpy as np

# Project directories, resolved from the parent of the current working directory
project_root      = Path.cwd().parent
TASK1_CLEAN_DIR   = project_root / "reports" / "task1" / "cleaned"
TASK2_DIR         = project_root / "reports" / "task2"
TASK2_DIR.mkdir(parents=True, exist_ok=True)

# Cleaned Task1 CSV files, keyed by a short dataset tag
csv_paths = {
    "dh_orig":   TASK1_CLEAN_DIR / "drillhole_original_clean.csv",
    "dh_dnn":    TASK1_CLEAN_DIR / "drillhole_dnn_clean.csv",
    "surf_orig": TASK1_CLEAN_DIR / "surface_original_clean.csv",
    "surf_dnn":  TASK1_CLEAN_DIR / "surface_dnn_clean.csv",
}

# Read every CSV once, keeping the same tags (and order) as csv_paths
dfs = dict(zip(csv_paths.keys(), map(pd.read_csv, csv_paths.values())))

# Named handles used by the rest of the notebook
(
    df_dh_orig_clean,
    df_dh_dl_clean,
    df_surf_orig_clean,
    df_surf_dl_clean,
) = (dfs[tag] for tag in ("dh_orig", "dh_dnn", "surf_orig", "surf_dnn"))

# Quick sanity check: one line per dataset with its (rows, columns) shape
for k, df in dfs.items():
    print(f"{k:10s}: shape={df.shape}")

dh_orig   : shape=(2466130, 9)
dh_dnn    : shape=(920225, 8)
surf_orig : shape=(981318, 6)
surf_dnn  : shape=(544886, 5)


### Summarize Latitude/Longitude Extent

In [85]:
# Latitude/Longitude coverage summary

def extent_for(df: pd.DataFrame, lat_col: str, lon_col: str) -> dict:
    """Summarize the latitude/longitude extent of a DataFrame.

    Both columns are coerced to numeric (unparseable entries are ignored).

    Returns a dict with keys ``LatMin``, ``LatMax``, ``LonMin``, ``LonMax``
    (each ``None`` when the column has no numeric values) and ``Count``
    (total number of rows in ``df``, including rows with invalid coordinates).
    """
    def _span(column: str):
        # Min/max of the numeric values in one column, or (None, None) if none exist.
        values = pd.to_numeric(df[column], errors="coerce").dropna()
        if values.empty:
            return None, None
        return values.min(), values.max()

    lat_lo, lat_hi = _span(lat_col)
    lon_lo, lon_hi = _span(lon_col)
    return {
        "LatMin": lat_lo,
        "LatMax": lat_hi,
        "LonMin": lon_lo,
        "LonMax": lon_hi,
        "Count": int(df.shape[0]),
    }

# Dataset label -> (frame, latitude column, longitude column)
datasets = {
    "Drillhole Original": (df_dh_orig_clean, "LATITUDE", "LONGITUDE"),
    "Drillhole DNN":      (df_dh_dl_clean,   "LATITUDE", "LONGITUDE"),
    "Surface Original":   (df_surf_orig_clean, "DLAT", "DLONG"),
    "Surface DNN":        (df_surf_dl_clean,   "DLAT", "DLONG"),
}

# One extent record per dataset; datasets missing either coordinate column are skipped
rows = [
    {**extent_for(df, lat_col, lon_col), "Dataset": name}
    for name, (df, lat_col, lon_col) in datasets.items()
    if {lat_col, lon_col}.issubset(df.columns)
]

extents_df = pd.DataFrame(rows)[["Dataset","Count","LatMin","LatMax","LonMin","LonMax"]]
display(extents_df)

Unnamed: 0,Dataset,Count,LatMin,LatMax,LonMin,LonMax
0,Drillhole Original,2466130,-34.945354,-27.025372,114.67557,122.131294
1,Drillhole DNN,920225,-34.945354,-27.025372,114.67638,122.10708
2,Surface Original,981318,-34.990154,-26.915041,114.51993,122.13507
3,Surface DNN,544886,-34.927277,-27.102,114.547104,122.122314


## Samples

A subset for testing (the bounding box matches the study region used in the paper).
You can either use this smaller dataset to focus on the study area only, or use the full cleaned datasets for a broader analysis.

In [86]:
# Define a bounding box (latitude/longitude region)
# Values are in the same units as the coordinate columns (decimal degrees);
# filter_by_bbox treats all four bounds as inclusive.
lat_min, lat_max = -30.0, -27.5
lon_min, lon_max = 120.0, 121.5

def filter_by_bbox(df: pd.DataFrame, lat_col: str, lon_col: str, bbox=None) -> pd.DataFrame:
    """Return a copy of the rows whose coordinates fall inside a bounding box.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is not modified.
    lat_col, lon_col : str
        Names of the latitude and longitude columns. Values are coerced to
        numeric; rows where either coordinate cannot be parsed are excluded.
    bbox : tuple of 4 floats, optional
        ``(lat_min, lat_max, lon_min, lon_max)``, all bounds inclusive.
        When omitted, the notebook-level ``lat_min``, ``lat_max``, ``lon_min``
        and ``lon_max`` variables are used (the original behaviour), so
        existing calls keep working. Passing it explicitly makes the function
        independent of notebook state.

    Returns
    -------
    pd.DataFrame
        Independent copy of the matching rows, original index preserved.
    """
    if bbox is None:
        # Backward-compatible default: fall back to the notebook-level box.
        bbox = (lat_min, lat_max, lon_min, lon_max)
    lat_lo, lat_hi, lon_lo, lon_hi = bbox

    lat = pd.to_numeric(df[lat_col], errors="coerce")
    lon = pd.to_numeric(df[lon_col], errors="coerce")

    # Series.between is inclusive on both ends and evaluates to False for NaN,
    # so rows with missing/unparseable coordinates drop out automatically.
    mask = lat.between(lat_lo, lat_hi) & lon.between(lon_lo, lon_hi)
    return df.loc[mask].copy()

# Sample output folder, given relative to the notebook directory (same convention as Task1)
SAMPLE_DIR = Path("../reports/task2/sample_csv")
SAMPLE_DIR.mkdir(parents=True, exist_ok=True)

# Clip every dataset to the bounding box, keep it in memory and write it to CSV
samples = {}
for name, (df, lat_col, lon_col) in datasets.items():
    # Skip datasets that lack either coordinate column
    if not {lat_col, lon_col}.issubset(df.columns):
        continue

    sub = filter_by_bbox(df, lat_col, lon_col)
    samples[name] = sub

    # File name is derived from the dataset label, e.g. "Surface DNN" -> surface_dnn_sample.csv
    out_path = SAMPLE_DIR / f"{name.lower().replace(' ', '_')}_sample.csv"
    sub.to_csv(out_path, index=False)
    print(f"[SAVE] {name:18s} -> rows={sub.shape[0]}")

print("Saved Task2 sample datasets to:", SAMPLE_DIR)

[SAVE] Drillhole Original -> rows=403
[SAVE] Drillhole DNN      -> rows=294
[SAVE] Surface Original   -> rows=210
[SAVE] Surface DNN        -> rows=531
Saved Task2 sample datasets to: ../reports/task2/sample_csv


## Test

## Drillhole Comparison: ORIG vs DL

This notebook processes two drillhole datasets:
- **ORIG**: original assay values  
- **DL**: values predicted by the deep learning model  

We align them on both:
1. Longitude/latitude grid cells  
2. Overlapping depth intervals  

**Diff logic**
diff = DL - ORIG (default)  

**Outputs**
1. `*_overlaps.csv`: detailed overlapping intervals  
2. `*_points.csv`: representative points (depth midpoints), used for 3D visualization in Streamlit.

### Environment & Parameters

In [110]:
# Input: cleaned DataFrames (loaded in the "Setup & Load" cell)
# - df_dh_orig_clean
# - df_dh_dl_clean

# Output directory (relative to the notebook's working directory)
OUT_DIR = Path("../reports/task2")
# Prefix of the generated files: <BASE_NAME>_overlaps.csv and <BASE_NAME>_points.csv
BASE_NAME = "drillhole"

# Key parameters
GRID_STEP_DEG = 1e-4   # grid size in degrees (~10 m)
Z_STEP_M      = 1.0    # depth bin size in meters
# Sign convention of the diff column: "dl_minus_orig" gives DL - ORIG,
# the alternative "orig_minus_dl" gives ORIG - DL.
DIFF_MODE     = "dl_minus_orig"  # or "orig_minus_dl"

In [111]:
# Helper Functions 
def to_numeric(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Return a copy of ``df`` with the listed columns coerced to numeric.

    Values that cannot be parsed become NaN; columns not listed in ``cols``
    and the input frame itself are left untouched.
    """
    # Convert each requested column up front, then write the results into a copy.
    coerced = {name: pd.to_numeric(df[name], errors="coerce") for name in cols}
    result = df.copy()
    for name, series in coerced.items():
        result[name] = series
    return result

def snap_xy_to_grid(df: pd.DataFrame,
                    lon_col="LONGITUDE",
                    lat_col="LATITUDE",
                    step=1e-4) -> pd.DataFrame:
    """Attach integer grid-cell indices for longitude/latitude.

    Returns a copy of ``df`` with two extra int64 columns: ``_gx`` is
    ``floor(longitude / step)`` and ``_gy`` is ``floor(latitude / step)``.
    The coordinate columns must already be numeric and free of NaN, since
    NaN cannot be cast to int64.
    """
    def _cell_index(column):
        # Deferred so that assign() evaluates it against the frame being built.
        return lambda frame: np.floor(frame[column] / step).astype(np.int64)

    return df.copy().assign(_gx=_cell_index(lon_col), _gy=_cell_index(lat_col))

In [112]:
# Align by XY & Depth Interval
def align_by_xy_and_depth_interval(
    df_orig, df_dl,
    lon_col="LONGITUDE", lat_col="LATITUDE",
    from_col="FROMDEPTH", to_col="TODEPTH",
    cu_col="Cu_ppm",
    grid_step=1e-4,
    diff_mode="dl_minus_orig"
) -> pd.DataFrame:
    """
    Align ORIG and DL by (lon,lat) grid and overlapping depth intervals.

    Records from both frames are snapped to a regular lon/lat grid of size
    ``grid_step``. Within each grid cell, every ORIG interval is paired with
    every DL interval that strictly overlaps it in depth; the intersection of
    the two intervals is kept together with both Cu values and their diff.

    Parameters
    ----------
    df_orig, df_dl : pd.DataFrame
        Original and deep-learning drillhole tables. Both must contain the
        five columns named by ``lon_col``, ``lat_col``, ``from_col``,
        ``to_col`` and ``cu_col``.
    lon_col, lat_col, from_col, to_col, cu_col : str
        Column names for coordinates, interval bounds and the Cu value.
    grid_step : float
        Grid cell size, in the same units as the coordinate columns.
    diff_mode : {"dl_minus_orig", "orig_minus_dl"}
        Sign convention of the ``diff`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``from, to, Cu_orig, Cu_dl, diff, lonA, latA, lonB, latB,
        _gx, _gy`` (A = ORIG, B = DL). Rows are ordered by grid cell. When
        nothing overlaps, an empty frame with these same columns is returned
        so downstream CSV export still writes a header.

    Raises
    ------
    ValueError
        If ``diff_mode`` is not one of the two supported values (previously
        any unknown string was silently treated as "orig_minus_dl").
    """
    if diff_mode not in ("dl_minus_orig", "orig_minus_dl"):
        raise ValueError(
            f"diff_mode must be 'dl_minus_orig' or 'orig_minus_dl', got {diff_mode!r}"
        )

    out_cols = ["from", "to", "Cu_orig", "Cu_dl", "diff",
                "lonA", "latA", "lonB", "latB", "_gx", "_gy"]
    need = [lon_col, lat_col, from_col, to_col, cu_col]
    cell = ["_gx", "_gy"]

    def _prepare(df, side, cu_name):
        # Coerce FIRST, then drop NaN: values that only become NaN through
        # coercion (e.g. stray text) must not reach the int64 cast in
        # snap_xy_to_grid or end up as NaN Cu values in the output.
        num = to_numeric(df[need], need).dropna()
        # Keep only well-formed intervals (from strictly below to).
        num = num.loc[num[from_col] < num[to_col]]
        num = snap_xy_to_grid(num, lon_col, lat_col, step=grid_step)
        # Rename once here instead of once per grid cell inside the loop.
        return num.rename(columns={
            from_col: f"from{side}", to_col: f"to{side}", cu_col: cu_name,
            lon_col: f"lon{side}", lat_col: f"lat{side}",
        })

    A = _prepare(df_orig, "A", "Cu_orig").assign(_xkey=1)
    B = _prepare(df_dl, "B", "Cu_dl")

    # Row positions of B per grid cell, computed once. This replaces a full
    # boolean scan of B for every cell of A.
    b_rows_by_cell = B.groupby(cell, sort=False).indices
    # The cell columns already come from the A side of the merge.
    B = B.drop(columns=cell).assign(_xkey=1)

    parts = []
    for key, gA in A.groupby(cell):
        b_rows = b_rows_by_cell.get(key)
        if b_rows is None:
            continue

        # Cross join of the intervals sharing this cell (constant-key merge).
        M = gA.merge(B.iloc[b_rows], on="_xkey")

        # Strict overlap test: intervals that merely touch are not paired.
        M = M.loc[(M["fromA"] < M["toB"]) & (M["fromB"] < M["toA"])]
        if not M.empty:
            parts.append(M)

    if not parts:
        return pd.DataFrame(columns=out_cols)

    out = pd.concat(parts, ignore_index=True)

    # Intersection of the two intervals. It is guaranteed non-empty because
    # both inputs are well-formed and passed the strict overlap test above.
    out["from"] = np.maximum(out["fromA"], out["fromB"])
    out["to"] = np.minimum(out["toA"], out["toB"])

    if diff_mode == "dl_minus_orig":
        out["diff"] = out["Cu_dl"] - out["Cu_orig"]
    else:
        out["diff"] = out["Cu_orig"] - out["Cu_dl"]

    return out[out_cols]

In [113]:
# Convert Overlaps to Points
def overlaps_to_points(aligned, grid_step=1e-4, z_step=1.0, agg="mean"):
    """
    Collapse overlap intervals into representative 3D points.

    Each overlap becomes a point at the interval's depth midpoint and at the
    average of the two sides' longitude/latitude. Points are then binned into
    (lon, lat, depth) cells of size ``grid_step`` / ``z_step`` and every
    output column is reduced per cell with ``agg``.

    Returns a frame with columns LONGITUDE, LATITUDE, DEPTH, CU_ORIG, CU_DL,
    DIFF; an empty frame with those columns if ``aligned`` is None or empty.
    """
    point_cols = ["LONGITUDE","LATITUDE","DEPTH","CU_ORIG","CU_DL","DIFF"]
    if aligned is None or len(aligned) == 0:
        return pd.DataFrame(columns=point_cols)

    def _bin(column, size):
        # Integer bin index of a column; evaluated lazily inside assign().
        return lambda frame: np.floor(frame[column] / size).astype(np.int64)

    pts = (
        aligned
        .rename(columns={"Cu_orig":"CU_ORIG","Cu_dl":"CU_DL","diff":"DIFF"})
        .assign(
            DEPTH=lambda frame: (frame["from"] + frame["to"]) / 2.0,
            LONGITUDE=lambda frame: (frame["lonA"] + frame["lonB"]) / 2.0,
            LATITUDE=lambda frame: (frame["latA"] + frame["latB"]) / 2.0,
            # Bin indices are recomputed from the averaged coordinates.
            _gx=_bin("LONGITUDE", grid_step),
            _gy=_bin("LATITUDE", grid_step),
            _gz=_bin("DEPTH", z_step),
        )
    )

    reducers = {column: agg for column in point_cols}
    binned = pts.groupby(["_gx","_gy","_gz"], as_index=False).agg(reducers)
    return binned[point_cols]

In [114]:
# Pipeline Function
def process_drillhole_pair_csv(
    df_orig, df_dl,
    out_dir="reports/task2",
    base_name="drillhole",
    grid_step=1e-4,
    z_step=1.0,
    diff_mode="dl_minus_orig"
):
    """Align the two drillhole tables, derive points, and write both to CSV.

    Writes ``<base_name>_overlaps.csv`` and ``<base_name>_points.csv`` into
    ``out_dir`` (created if missing), printing row counts and file paths.

    Returns a tuple ``(overlaps_csv, points_csv)`` of the written paths as strings.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: interval-level alignment of ORIG vs DL
    aligned = align_by_xy_and_depth_interval(
        df_orig,
        df_dl,
        grid_step=grid_step,
        diff_mode=diff_mode,
    )
    print(f"[INFO] overlaps: {len(aligned)}")

    # Step 2: reduce the overlaps to binned 3D points
    points = overlaps_to_points(
        aligned,
        grid_step=grid_step,
        z_step=z_step,
        agg="mean",
    )
    print(f"[INFO] points: {len(points)}")

    # Step 3: persist both tables, then report where they went
    overlaps_csv = target_dir / f"{base_name}_overlaps.csv"
    points_csv   = target_dir / f"{base_name}_points.csv"
    for table, destination in ((aligned, overlaps_csv), (points, points_csv)):
        table.to_csv(destination, index=False)

    print(f"[SAVE] {overlaps_csv}")
    print(f"[SAVE] {points_csv}")
    return str(overlaps_csv), str(points_csv)


In [115]:
# Run the ORIG-vs-DL drillhole comparison on the full cleaned datasets using the
# parameters from the "Parameters" cell; the two returned values are the paths of
# the written overlaps and points CSV files.
overlap_path, points_path = process_drillhole_pair_csv(
    df_dh_orig_clean, df_dh_dl_clean,
    out_dir=OUT_DIR,
    base_name=BASE_NAME,
    grid_step=GRID_STEP_DEG,
    z_step=Z_STEP_M,
    diff_mode=DIFF_MODE
)

[INFO] overlaps: 260797
[INFO] points: 90439
[SAVE] ../reports/task2/drillhole_overlaps.csv
[SAVE] ../reports/task2/drillhole_points.csv


**Method**

**Drillhole Diff Method**
1. Grid snapping (XY):

Longitude & latitude are snapped to grid cells of size grid_step.

2. Depth intervals (Z):
Each record is an interval [FROM, TO] rather than a single depth.

3. Overlap rule:
Two intervals are compared only if they overlap in the same XY cell.

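Intersection only:
If [fromA,toA] and [fromB,toB] overlap, keep only the intersecting part, i.e. [max(fromA,fromB), min(toA,toB)]. Intervals that merely touch at an endpoint are not treated as overlapping.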
- Diff calculation:
	diff = Cu_dl – Cu_orig (or the reverse, depending on mode).

Outputs:
- Overlap intervals with both Cu values and diff
- Point cloud representation (midpoints of overlaps) for 3D mapping