In [13]:
import pandas as pd

# Read the formatted training table produced by the previous step.
FORMATTED_CSV = "../../data/processed/train_dataset_formatted.csv"
df = pd.read_csv(FORMATTED_CSV)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,longitude,latitude,label,gravity_iso_residual,gravity_cscba,gravity_cscba_1vd,gravity_iso_residual_stddev3x3,gravity_cscba_stddev3x3,mag_uc_1_2km,mag_uc_2_4km,...,mag_uc_12_16km,mag_uc_2_4km_1vd,mag_uc_2_4km_thd,mag_uc_2_4km_stddev3x3,radio_k_pct,radio_th_ppm,radio_u_ppm,radio_th_k_ratio,radio_u_k_ratio,radio_u_th_ratio
0,145.277695,-34.107134,0,52.223591,349.392853,-40.59006,0.752113,37.011612,0.584576,0.99297,...,1.204403,-0.00078,0.001489,0.038044,1.104598,10.030174,1.367533,9.082856,1.238717,0.136452
1,125.714108,-23.943929,0,-195.990967,-534.08844,-556.97266,2.474874,47.704494,3.931087,5.403521,...,-0.07108,-0.013591,0.021547,0.549094,0.158148,15.495625,1.302552,77.478119,6.512759,0.084177
2,148.821727,-21.846854,1,376.498871,-531.526184,198.01955,0.918559,46.13804,-15.808146,0.000513,...,-2.160016,-0.091868,0.096375,2.462595,0.327474,3.212502,0.507891,9.933004,1.592497,0.159617
3,134.851273,-19.145144,0,10.045643,-709.562378,-201.86922,1.362985,60.54647,-12.336065,-17.813541,...,-10.113371,0.02599,0.028927,0.738085,0.281605,7.375026,0.806524,26.118765,2.877724,0.111031
4,142.587968,-28.984738,0,-34.105846,114.804726,-124.66106,2.21875,87.28493,-2.577452,-4.013026,...,-2.26458,-0.006811,0.014678,0.374207,0.442369,3.28747,0.641557,7.432631,1.465061,0.198236


## 1. Missing Values Summary
The radiometric features (radio_k_pct, radio_th_ppm, radio_u_ppm, and the derived ratios) each have ~4.6% missing values (229 rows). Two gravity features have a small number of gaps: gravity_iso_residual is missing ~0.12% (6 rows) and gravity_cscba ~0.02% (1 row). All other geophysical features are complete.

In [14]:
# Per-column null counts, computed once and reused for both views below.
null_counts = df.isna().sum()

# Absolute counts: only columns that actually have gaps, largest first.
missing_summary = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
print("Missing Values Summary:")
display(missing_summary)

# Same information as a share of all rows, which is easier to interpret.
missing_pct = null_counts.div(len(df)).mul(100)
print("Missing Value Percentage (%):")
display(missing_pct.loc[missing_pct.gt(0)].sort_values(ascending=False))

Missing Values Summary:


radio_k_pct             229
radio_th_ppm            229
radio_u_ppm             229
radio_th_k_ratio        229
radio_u_k_ratio         229
radio_u_th_ratio        229
gravity_iso_residual      6
gravity_cscba             1
dtype: int64

Missing Value Percentage (%):


radio_k_pct             4.645973
radio_th_ppm            4.645973
radio_u_ppm             4.645973
radio_th_k_ratio        4.645973
radio_u_k_ratio         4.645973
radio_u_th_ratio        4.645973
gravity_iso_residual    0.121729
gravity_cscba           0.020288
dtype: float64

## 2. Missing Value Imputation Strategy

In [16]:
# Columns passed through the imputation step, grouped by survey type.
radiometric_features = [
    'radio_k_pct', 'radio_th_ppm', 'radio_u_ppm',
    'radio_th_k_ratio', 'radio_u_k_ratio', 'radio_u_th_ratio'
]

gravity_features = ['gravity_iso_residual', 'gravity_cscba']

# The missing-value summary above reports no gaps in these columns, so imputing
# them is a no-op on this dataset; they stay in the list as a safeguard.
magnetic_features = ['mag_uc_1_2km', 'mag_uc_2_4km', 'mag_uc_4_8km', 'mag_uc_8_12km', 'mag_uc_12_16km']

# Fill each gap with the median of the rows that share the same `label`.
# NOTE(review): the fill value depends on the target column, which can leak label
# information into the features and cannot be reproduced for unlabeled data.
# Confirm this is intended before using the output for model evaluation.
for col in radiometric_features + gravity_features + magnetic_features:
    # Built-in 'median' transform broadcasts each group's median back to its rows.
    group_median = df.groupby('label')[col].transform('median')
    # fillna only writes into missing cells, so observed values are never replaced
    # by the transform output (e.g. for rows whose label is itself missing).
    # If a whole group has no observed value, fall back to the overall column median.
    df[col] = df[col].fillna(group_median).fillna(df[col].median())
print("Applied group-wise median imputation by label.")


Applied group-wise median imputation by source.


In [17]:
# Validation: count nulls per column again now that imputation has run.
remaining_missing = df.isna().sum()
print("Remaining missing values:")
# An empty Series here means every gap was filled.
display(remaining_missing.loc[remaining_missing.gt(0)])

Remaining missing values:


Series([], dtype: int64)

## 3. Delete Duplicate Rows

In [18]:
# Report how many fully identical rows exist, remove them, then confirm none remain.
n_duplicates_before = df.duplicated().sum()
print(f"Duplicate rows before drop: {n_duplicates_before}")

df = df.drop_duplicates()

n_duplicates_after = df.duplicated().sum()
print(f"Duplicate rows after drop: {n_duplicates_after}")

Duplicate rows before drop: 10
Duplicate rows after drop: 0


In [19]:
# Persist the imputed, de-duplicated table for the next pipeline step.
# index=False keeps the DataFrame index out of the written file.
CLEANED_CSV = "../../data/processed/train_dataset_formatted_no_missing.csv"
df.to_csv(CLEANED_CSV, index=False)
print("Cleaned dataset saved to 'train_dataset_formatted_no_missing.csv'")

Cleaned dataset saved to 'train_dataset_formatted_no_missing.csv'


## Missing Value Handling Summary

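- **Columns affected**: Radiometric element concentrations and ratios (229 rows each), `gravity_iso_residual` (6 rows), and `gravity_cscba` (1 row). The magnetic features were passed through the same imputation step but had no missing values.
- **Imputation method used**: Group-wise median imputation, with the median computed separately for each `label` class.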
  - Chosen because these features are highly skewed and contain physical/geochemical measurement values.
  - Median is robust to outliers and preserves distributional integrity.
- **Validation**: All missing values resolved after imputation.
- **Domain Justification**: Radiometric values are spatially interpolated from field measurements. Median preserves range while avoiding overfitting to extreme anomalies.
