In [1]:
import pandas as pd

# Load the processed model-input table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../../data/processed/model_input_dataset.csv")
df.head(n=5)

Unnamed: 0,LONGITUDE,LATITUDE,LABEL,gravity_iso_residual,gravity_cscba,gravity_cscba_1vd,gravity_iso_residual_stddev3x3,gravity_cscba_stddev3x3,mag_uc_1_2km,mag_uc_2_4km,...,mag_uc_12_16km,mag_uc_2_4km_1vd,mag_uc_2_4km_thd,mag_uc_2_4km_stddev3x3,radio_K_pct,radio_Th_ppm,radio_U_ppm,radio_Th_K_ratio,radio_U_K_ratio,radio_U_Th_ratio
0,145.277695,-34.107134,0,52.223591,349.392853,-40.59006,0.752113,37.011612,0.584576,0.99297,...,1.204403,-0.00078,0.001489,0.038044,1.104598,10.030174,1.367533,9.082856,1.238717,0.136452
1,125.714108,-23.943929,0,-195.990967,-534.08844,-556.97266,2.474874,47.704494,3.931087,5.403521,...,-0.07108,-0.013591,0.021547,0.549094,0.158148,15.495625,1.302552,77.478119,6.512759,0.084177
2,148.821727,-21.846854,1,376.498871,-531.526184,198.01955,0.918559,46.13804,-15.808146,0.000513,...,-2.160016,-0.091868,0.096375,2.462595,0.327474,3.212502,0.507891,9.933004,1.592497,0.159617
3,134.851273,-19.145144,0,10.045643,-709.562378,-201.86922,1.362985,60.54647,-12.336065,-17.813541,...,-10.113371,0.02599,0.028927,0.738085,0.281605,7.375026,0.806524,26.118765,2.877724,0.111031
4,142.587968,-28.984738,0,-34.105846,114.804726,-124.66106,2.21875,87.28493,-2.577452,-4.013026,...,-2.26458,-0.006811,0.014678,0.374207,0.442369,3.28747,0.641557,7.432631,1.465061,0.198236


### Formatting and Column Standardization

- Ensured all column names use lowercase with underscores (`snake_case`) for consistency.
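- Categorical variables (e.g., `source`), when present, are converted to the Pandas `category` dtype; the columns in this dataset are all numeric, so no such conversion is performed here.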
- Ensured the target column `label` is of integer type.
- No one-hot encoding was applied at this stage, as most ML models used (e.g., XGBoost, RF) handle categorical variables natively or do not require dummy variables.

In [2]:
# Step 1: Standardize Column Names
# Trim surrounding whitespace, lowercase, then swap spaces for underscores.
trimmed = df.columns.str.strip()
lowered = trimmed.str.lower()
df.columns = lowered.str.replace(" ", "_")
print("Standardized columns:", list(df.columns))

Standardized columns: ['longitude', 'latitude', 'label', 'gravity_iso_residual', 'gravity_cscba', 'gravity_cscba_1vd', 'gravity_iso_residual_stddev3x3', 'gravity_cscba_stddev3x3', 'mag_uc_1_2km', 'mag_uc_2_4km', 'mag_uc_4_8km', 'mag_uc_8_12km', 'mag_uc_12_16km', 'mag_uc_2_4km_1vd', 'mag_uc_2_4km_thd', 'mag_uc_2_4km_stddev3x3', 'radio_k_pct', 'radio_th_ppm', 'radio_u_ppm', 'radio_th_k_ratio', 'radio_u_k_ratio', 'radio_u_th_ratio']


In [4]:
# Step 2: Check and Correct Data Types
# sep="" keeps the first dtype row aligned with the rest; print's default
# separator would insert a stray space right after the newline.
print("Before type check:\n", df.dtypes, sep="")

# Convert label to a fixed-width integer. Plain `int` follows the platform's
# default integer width (32-bit on Windows with NumPy < 2.0), so the width is
# spelled out to get the same dtype on every machine.
df['label'] = df['label'].astype("int64")

print("\nAfter type check:\n", df.dtypes, sep="")


Before type check:
 longitude                         float64
latitude                          float64
label                               int32
gravity_iso_residual              float64
gravity_cscba                     float64
gravity_cscba_1vd                 float64
gravity_iso_residual_stddev3x3    float64
gravity_cscba_stddev3x3           float64
mag_uc_1_2km                      float64
mag_uc_2_4km                      float64
mag_uc_4_8km                      float64
mag_uc_8_12km                     float64
mag_uc_12_16km                    float64
mag_uc_2_4km_1vd                  float64
mag_uc_2_4km_thd                  float64
mag_uc_2_4km_stddev3x3            float64
radio_k_pct                       float64
radio_th_ppm                      float64
radio_u_ppm                       float64
radio_th_k_ratio                  float64
radio_u_k_ratio                   float64
radio_u_th_ratio                  float64
dtype: object

After type check:
 longitude             

In [5]:
# Step 3: Save Cleaned Dataset
# Write the formatted frame to CSV without the index column, then confirm on stdout.
df.to_csv(path_or_buf="../../data/processed/train_dataset_formatted.csv", index=False)
print("Saved cleaned dataset to 'train_dataset_formatted.csv'")

Saved cleaned dataset to 'train_dataset_formatted.csv'
