In [4]:
import pandas as pd

# --- Step 1: load the original (cleaned, de-duplicated) dataset ---
df = pd.read_csv(r"C:\Users\pc\Desktop\DM\datasets\merged_final_clean_no_duplicates.csv")

print("=== Aperçu du dataset original ===")
print(df.head())

# --- Steps 2-4: feature engineering expressed as a single chain ---
#   * drop REF_BULK_log (considered an unneeded feature),
#   * derive tmean (midpoint of tmin/tmax) and trange (tmax - tmin),
#   * drop tmin/tmax, which the two derived columns now replace.
# The new columns are appended at the end of the frame, in that order.
df = (
    df.drop(columns=["REF_BULK_log"])
    .assign(
        tmean=lambda frame: (frame["tmin"] + frame["tmax"]) / 2,
        trange=lambda frame: frame["tmax"] - frame["tmin"],
    )
    .drop(columns=["tmin", "tmax"])
)

# --- Step 5: persist the engineered dataset (row index not written) ---
output_file = r"C:\Users\pc\Desktop\DM\datasets\merged_feature_final.csv"
df.to_csv(output_file, index=False)

print(f"✔ Dataset final optimisé sauvegardé sous : {output_file}")
print("=== Nouvelle forme du dataset ===")
print(df.shape)

=== Aperçu du dataset original ===
   latitude  longitude   TEB_log  REF_BULK_log  CEC_CLAY  GYPSUM_log  \
0  36.74886    6.25409  2.995732      0.970779      53.0    1.960095   
1  35.87978    4.44782  3.806662      1.040277      48.0    2.208274   
2  35.70751    5.53337  3.806662      1.040277      48.0    2.208274   
3  32.27667    3.98647  3.806662      0.996949      71.0    2.028148   
4  32.40079    4.00642  3.806662      0.996949      71.0    2.028148   

   TEXTURE_SOTER_encoded  ORG_CARBON_log  LCCCODE_encoded  ELEC_COND_log  \
0                    3.0        2.205193              1.0       1.945910   
1                    3.0        2.099734              3.0       2.079442   
2                    3.0        2.099734             16.0       2.079442   
3                    3.0        2.040571             18.0       2.197225   
4                    3.0        2.040571              3.0       2.197225   

   COARSE  TEXTURE_USDA_encoded  class      tmin      tmax      prec  \
0  

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from collections import Counter

# --- Step 1: read back the dataset produced by the feature-engineering step ---
df = pd.read_csv(r"C:\Users\pc\Desktop\DM\datasets\merged_feature_final.csv")

# --- Step 2: separate the target column from the predictors ---
y = df['class']
X = df.drop(columns=['class'])

# --- Step 3: standardise every predictor column ---
# The scaler is fitted on the full frame, then its output is wrapped back
# into a DataFrame so the original column labels are kept.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns)

print("✅ Normalisation terminée. Aperçu :")
print(X_scaled.head())

# --- Step 4: oversample with SMOTE (seeded for repeatability) ---
# NOTE(review): the *_encoded columns look like encoded categoricals; SMOTE
# treats them as continuous when interpolating — confirm this is intended.
# NOTE(review): resampling is applied to the whole dataset; if a train/test
# split happens later, confirm synthetic rows cannot leak into evaluation.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_scaled, y)
smote_distribution = Counter(y_smote)
print("✅ SMOTE appliqué :")
print("Distribution des classes après SMOTE :", smote_distribution)

# --- Step 5: clean the oversampled data with Tomek links (undersampling) ---
tomek = TomekLinks()
X_tomek, y_tomek = tomek.fit_resample(X_smote, y_smote)
tomek_distribution = Counter(y_tomek)
print("✅ TOMEK Links appliqué :")
print("Distribution des classes après SMOTE + TOMEK :", tomek_distribution)

# --- Step 6: reassemble predictors and target into one balanced frame ---
df_final = pd.DataFrame(X_tomek, columns=X.columns)
df_final['class'] = y_tomek

# --- Step 7: write the balanced dataset to disk (row index not written) ---
output_file = r"C:\Users\pc\Desktop\DM\datasets\merged_feature_balanced.csv"
df_final.to_csv(output_file, index=False)
print(f"✔ Dataset équilibré sauvegardé sous : {output_file}")

✅ Normalisation terminée. Aperçu :
   latitude  longitude   TEB_log  CEC_CLAY  GYPSUM_log  TEXTURE_SOTER_encoded  \
0  1.599941   0.424514 -0.873085 -0.186051   -0.359351               0.414534   
1  1.008210  -0.100769  0.326187 -0.472797    0.205265               0.414534   
2  0.890917   0.214921  0.326187 -0.472797    0.205265               0.414534   
3 -1.445041  -0.234935  0.326187  0.846235   -0.204527               0.414534   
4 -1.360532  -0.229133  0.326187  0.846235   -0.204527               0.414534   

   ORG_CARBON_log  LCCCODE_encoded  ELEC_COND_log    COARSE  \
0        0.923640        -1.239120      -0.436378  0.537499   
1        0.430136        -1.002590       0.023074 -0.631298   
2        0.430136         0.534857       0.023074 -0.631298   
3        0.153275         0.771388       0.428338 -1.020897   
4        0.153275        -1.002590       0.428338 -1.020897   

   TEXTURE_USDA_encoded      prec  elevation_scaled     tmean    trange  
0              0.737545  

In [7]:
import pandas as pd

# Load the balanced dataset written by the resampling step.
df = pd.read_csv(r"C:\Users\pc\Desktop\DM\datasets\merged_feature_balanced.csv")

# Per-class occurrence counts and the overall number of rows.
class_counts = df['class'].value_counts()
total = len(df)

# Report each class with its absolute count and its share of the dataset.
print("=== Distribution des classes ===")
for cls, count in class_counts.items():
    # Class value 1 is reported as "fire"; any other value as "non-fire".
    if cls == 1:
        label = "fire"
    else:
        label = "non-fire"
    percent = (count / total) * 100
    print(f"{label}: {count} instances ({percent:.2f}%)")

=== Distribution des classes ===
non-fire: 112132 instances (50.11%)
fire: 111619 instances (49.89%)
