# Partie 2 : Nettoyage avance avec Pandas (3-4h)

## Etape 2.1 : Nettoyage des donnees meteo

In [94]:
import pandas as pd
import numpy as np
from datetime import datetime

# chargement des données via pandas
df_meteo_raw = pd.read_csv("../data/meteo_raw.csv", sep=",")
df_meteo_raw.head(10)


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,09/15/2024 15:00:00,17.1,143.3,244.9,14.3,0.0
1,Bordeaux,21/07/2023 15:00,19.6,50.6,414.9,3.2,0.0
2,Montpellier,2023-09-18 20:00:00,18.3,65.7,218.4,13.6,0.0
3,Le Havre,01/03/2024 22:00:00,3.7,94.9,6.8,18.6,11.6
4,Lille,29/10/2024 20:00,14.0,42.9,781.8,4.0,0.0
5,Bordeaux,22/12/2023 13:00,4.4,36.9,796.4,6.1,0.0
6,Marseille,09/15/2023 21:00:00,22.5,86.8,5.8,32.6,0.0
7,Toulouse,30/05/2023 00:00,8.3,66.3,26.4,31.4,7.2
8,Bordeaux,2024-10-05T09:00:00,11.5,69.8,71.4,34.4,0.0
9,Toulon,2024-09-28T21:00:00,19.2,79.0,13.1,31.8,0.0


In [95]:
def quality_report(df, name="DataFrame"):
    print(f"RAPPORT QUALITE - {name}")
    print(f"Lignes: {len(df):,}")
    print(f"Colonnes: {len(df.columns)}")
    
    report = []
    for col in df.columns:
        total = len(df)
        missing = df[col].isna().sum() + (df[col] == '').sum() if df[col].dtype == 'object' else df[col].isna().sum()
        completude = (1 - missing / total) * 100
        unique = df[col].nunique()
        dtype = df[col].dtype
        
        report.append({
            'Colonne': col,
            'Type': str(dtype),
            'Manquants': missing,
            'Completude %': round(completude, 2),
            'Uniques': unique
        })
    
    return pd.DataFrame(report)

quality_report(df_meteo_raw, "Weather Raw")

RAPPORT QUALITE - Weather Raw
Lignes: 252,612
Colonnes: 7


Unnamed: 0,Colonne,Type,Manquants,Completude %,Uniques
0,commune,str,0,100.0,15
1,timestamp,str,0,100.0,69066
2,temperature_c,str,1229,99.51,810
3,humidite_pct,float64,0,100.0,1094
4,rayonnement_solaire_wm2,float64,0,100.0,8685
5,vitesse_vent_kmh,float64,0,100.0,401
6,precipitation_mm,float64,0,100.0,151


#### Standardiser les formats de dates

In [96]:
def parse_timestamp(ts):
    """Parse les timestamps multi-formats."""
    if pd.isna(ts):
        return pd.NaT
    
    formats = [
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M",
        "%m/%d/%Y %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
    ]
    
    for fmt in formats:
        try:
            return datetime.strptime(str(ts), fmt)
        except ValueError:
            continue
    
    return pd.NaT

df_weather = df_meteo_raw.copy()

# 1. Parser les timestamps
print("[1/5] Parsing des timestamps...")
df_weather['timestamp'] = df_weather['timestamp'].apply(parse_timestamp)
invalid_ts = df_weather['timestamp'].isna().sum()
print(f"  Timestamps invalides: {invalid_ts}")

# Supprimer les lignes sans timestamp valide
df_weather = df_weather.dropna(subset=['timestamp'])

df_weather.head(10)

[1/5] Parsing des timestamps...
  Timestamps invalides: 0


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm
0,Saint-Etienne,2024-09-15 15:00:00,17.1,143.3,244.9,14.3,0.0
1,Bordeaux,2023-07-21 15:00:00,19.6,50.6,414.9,3.2,0.0
2,Montpellier,2023-09-18 20:00:00,18.3,65.7,218.4,13.6,0.0
3,Le Havre,2024-01-03 22:00:00,3.7,94.9,6.8,18.6,11.6
4,Lille,2024-10-29 20:00:00,14.0,42.9,781.8,4.0,0.0
5,Bordeaux,2023-12-22 13:00:00,4.4,36.9,796.4,6.1,0.0
6,Marseille,2023-09-15 21:00:00,22.5,86.8,5.8,32.6,0.0
7,Toulouse,2023-05-30 00:00:00,8.3,66.3,26.4,31.4,7.2
8,Bordeaux,2024-10-05 09:00:00,11.5,69.8,71.4,34.4,0.0
9,Toulon,2024-09-28 21:00:00,19.2,79.0,13.1,31.8,0.0


#### Convertir les colonnes numeriques en gerant les erreurs

In [104]:
# Examiner les valeurs problematiques dans temperature_c
print("Valeurs uniques non numeriques dans temperature_c:")
temp_non_numeric = df_meteo_raw[
    df_meteo_raw['temperature_c'].astype(str).str.match(r'^-?[0-9]+[.,]?[0-9]*$', na=False)
]['temperature_c'].unique()
print(temp_non_numeric)

Valeurs uniques non numeriques dans temperature_c:
<StringArray>
['17.1', '19.6', '18.3',  '3.7', '14.0',  '4.4', '22.5',  '8.3', '11.5',
 '19.2',
 ...
 '-2,1', '32,1', '34,3', '32,9', '34,0', '-2,9', '32,8', '34,4', '32,6',
 '-3,9']
Length: 810, dtype: str


In [105]:
df_weather['temperature_c'] = (
    df_weather['temperature_c']
    .astype(str)
    .str.strip()                                
    .str.replace('"', '', regex=False)          
    .str.replace("'", '', regex=False)           
    .str.replace(',', '.', regex=False)          
    .astype(float)
)


df_weather['temperature_c'] = pd.to_numeric(
    df_weather['temperature_c'],
    errors='coerce'
)


print("Distribution temperature:")
print(df_weather['temperature_c'].describe())


Distribution temperature:
count    252612.000000
mean         62.833935
std          18.942379
min          30.000000
25%          46.500000
50%          62.800000
75%          79.100000
max         100.000000
Name: temperature_c, dtype: float64


### Valeurs aberrantes

#### Température hors [-40, 50]

In [99]:
def quality_report(df, name="DataFrame"):
    print(f"RAPPORT QUALITE - {name}")
    print(f"Lignes: {len(df):,}")
    print(f"Colonnes: {len(df.columns)}")
    
    report = []
    for col in df.columns:
        total = len(df)
        missing = df[col].isna().sum() + (df[col] == '').sum() if df[col].dtype == 'object' else df[col].isna().sum()
        completude = (1 - missing / total) * 100
        unique = df[col].nunique()
        dtype = df[col].dtype
        
        report.append({
            'Colonne': col,
            'Type': str(dtype),
            'Manquants': missing,
            'Completude %': round(completude, 2),
            'Uniques': unique
        })
    
    return pd.DataFrame(report)

quality_report(df_weather, "Weather Raw")

RAPPORT QUALITE - Weather Raw
Lignes: 252,612
Colonnes: 7


Unnamed: 0,Colonne,Type,Manquants,Completude %,Uniques
0,commune,str,0,100.0,15
1,timestamp,datetime64[us],0,100.0,17544
2,temperature_c,float64,1229,99.51,404
3,humidite_pct,float64,0,100.0,1094
4,rayonnement_solaire_wm2,float64,0,100.0,8685
5,vitesse_vent_kmh,float64,0,100.0,401
6,precipitation_mm,float64,0,100.0,151


In [100]:

print(f"\nTemperatures < -40: {(df_weather['temperature_c'] < -40).sum()}")
print(f"Temperatures > 50: {(df_weather['temperature_c'] > 50).sum()}")

mask = (df_weather['temperature_c'] < -40) | (df_weather['temperature_c'] > 50)

# ajout de Nan sur les valeurs absurdes
df_weather.loc[mask, 'temperature_c'] = np.nan

# interpolation
df_weather['temperature_c'] = df_weather['temperature_c'].interpolate(method='linear')


#clipping
df_weather['temperature_c'] = df_weather['humidite_pct'].clip(0, 100)


Temperatures < -40: 695
Temperatures > 50: 1282


#### Traiter les valeurs manquantes

In [108]:
df_weather['temperature_c'].interpolate(method='linear')
df_weather['humidite_pct'].interpolate(method='linear')


#   - Forward fill pour precipitation
df_weather['precipitation_mm'] = df_weather['precipitation_mm'].ffill()

# - Ajouter des colonnes temporelles (jour, mois, saison, jour de semaine)

df_weather['day'] = df_weather['timestamp'].dt.day
df_weather['month'] = df_weather['timestamp'].dt.month
df_weather['season'] = df_weather['timestamp'].dt.quarter
df_weather['weekday'] = df_weather['timestamp'].dt.weekday

df_weather['temperature_c'].isna().sum()

np.int64(0)

In [109]:
# convertir le dataframe en csv
df_weather.to_csv("../data/output/meteo_clean.csv", index=False)

####  Rapport

In [107]:
# Comptes de base
total_lignes = len(df_meteo_raw)
total_lignes_apres = len(df_weather)
lignes_supprimees = total_lignes - total_lignes_apres

# Colonnes importantes
colonnes = ['temperature_c', 'humidite_pct', 'precipitation_mm']

# Statistiques sur aberrations / NaN
aberrations = {}
for col in colonnes:
    n_nan = df_weather[col].isna().sum()
    min_val = df_weather[col].min()
    max_val = df_weather[col].max()
    aberrations[col] = {'NaN': n_nan, 'Min': min_val, 'Max': max_val}

# Affichage rapport
print("========== RAPPORT DE NETTOYAGE METEO ==========")
print(f"Lignes en entrée :        {total_lignes}")
print(f"Lignes supprimées :       {lignes_supprimees}")
print(f"Lignes en sortie :        {total_lignes_apres}\n")

print("Colonnes principales après nettoyage :")
for col, stats in aberrations.items():
    print(f" - {col}: NaN={stats['NaN']}, Min={stats['Min']}, Max={stats['Max']}")

print("===============================================")


Lignes en entrée :        252612
Lignes supprimées :       0
Lignes en sortie :        252612

Colonnes principales après nettoyage :
 - temperature_c: NaN=0, Min=30.0, Max=100.0
 - humidite_pct: NaN=0, Min=30.0, Max=150.0
 - precipitation_mm: NaN=0, Min=0.0, Max=15.0
