In [None]:
import pandas as pd
from sklearn.utils import resample

In [None]:
# Label columns that the balancing steps below operate on
label_cols = ["ILS", "RNAV", "RNP", "VISUAL"]
# Read the semicolon-separated dataset (already preprocessed to weather-only, etc)
df = pd.read_csv("atis_dataset_converted.csv", sep=";")

In [None]:
# Tally the positive rows of every label column.
# Summing the column counts the positives, assuming labels are 0/1 encoded
# (TODO confirm against the dataset).
positive_counts = dict(
    (name, df[name].sum()) for name in label_cols
)
print("Positives per label:", positive_counts)

In [None]:
# Size of the most frequent label; used as the oversampling target for the others
max_count = max(count for count in positive_counts.values())

In [None]:
# Oversample each label up to the size of the largest one
frames = []

for label in label_cols:
    # All rows where this label is positive (1)
    df_pos = df[df[label] == 1]
    if df_pos.empty:
        # Nothing to draw from: resample cannot pick rows out of an empty
        # frame, so skip this label instead of failing the whole loop.
        continue

    # Keep every original positive row and draw only the shortfall with
    # replacement. Bootstrapping the whole group would randomly drop original
    # rows -- most visibly for the largest label, which needs no extra rows.
    n_extra = max_count - len(df_pos)
    df_pos_upsampled = df_pos
    if n_extra > 0:
        df_extra = resample(
            df_pos,
            replace=True,
            n_samples=n_extra,
            random_state=42
        )
        df_pos_upsampled = pd.concat([df_pos, df_extra])
    frames.append(df_pos_upsampled)

In [None]:
# Concatenate the per-label samples into one frame.
# Duplicates are kept on purpose: the repeated rows ARE the oversampling.
# Calling drop_duplicates() here would collapse the frame back to its unique
# rows and undo the balancing. As a consequence, a row that is positive for
# more than one label may appear once per label it was sampled for.
df_balanced = pd.concat(frames, ignore_index=True)

In [None]:
# Shuffle all rows (fixed seed for reproducibility) so that the per-label
# groups are not stored back-to-back, then write the result to disk.
# The shuffled index is not written out (index=False).
df_balanced = df_balanced.sample(
    frac=1,
    random_state=42,
)
df_balanced.to_csv(
    "atis_dataset_balanced.csv",
    sep=";",
    index=False,
)