# EDA complète (Polars-first) — KOI & TOI

Ce notebook regroupe:
- Structure, types, valeurs manquantes, distributions
- Problèmes : valeurs aberrantes (outliers, méthode IQR), doublons, déséquilibre des classes
- Relations: corrélations, paires très corrélées

Toutes les analyses sont réalisées séparément pour KOI et TOI.


In [None]:
# Imports et configuration
from __future__ import annotations
import os
import importlib
import matplotlib
matplotlib.use("Agg")  # images sauvegardées; on affichera via IPython.display
import matplotlib.pyplot as plt
from IPython.display import display, Markdown, Image

pl = importlib.import_module("polars")
pd = importlib.import_module("pandas")

# Utilitaires pour l'I/O
from analyses_hugo.analyse_explo_1 import (
    read_csv_polars,
    read_csv_pandas_then_to_polars,
    schema_and_missing,
    get_numeric_columns,
    get_categorical_columns,
    numeric_distributions,
    outliers_iqr,
    duplicates_summary,
    class_imbalance,
    correlations,
    high_correlation_pairs,
    save_histograms,
    save_barplots,
    save_corr_heatmap,
)

# Everything the notebook writes goes under analyses_hugo/outputs, split into
# one sub-directory for figures and one for tables.
OUT_DIR = os.path.join("analyses_hugo", "outputs")
FIGS_DIR, TABLES_DIR = (os.path.join(OUT_DIR, leaf) for leaf in ("figures", "tables"))

# Create both leaves up front; exist_ok makes re-running the cell harmless.
os.makedirs(FIGS_DIR, exist_ok=True)
os.makedirs(TABLES_DIR, exist_ok=True)

def read_df(path: str):
    """Load the CSV at ``path``, trying the Polars reader first.

    ``read_csv_polars`` is attempted first. If it raises, the failure is shown
    in the notebook and ``read_csv_pandas_then_to_polars`` is used instead.
    The metadata returned by whichever reader succeeded is then displayed.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The dataframe returned by the reader that succeeded.

    Raises:
        Exception: Whatever the fallback reader raises, if it fails too.
    """
    try:
        df, meta = read_csv_polars(path)
    except Exception as exc:
        # Deliberate best-effort fallback, but report why the primary reader
        # failed instead of hiding it: a silent fallback makes parsing
        # problems in the source file invisible.
        display(Markdown(f"**Repli sur pandas** `{path}`: `{type(exc).__name__}: {exc}`"))
        df, meta = read_csv_pandas_then_to_polars(path)
    display(Markdown(f"**Lecture** `{path}`: `{meta}`"))
    return df



: 

In [None]:
# Charger KOI et TOI
KOI_PATH = os.path.join("datasets", "KOI.csv")
TOI_PATH = os.path.join("datasets", "TOI.csv")

df_koi = read_df(KOI_PATH)
df_toi = read_df(TOI_PATH)


def _shape_line(label: str, df) -> str:
    """Return the ``<label>: <rows> lignes × <cols> colonnes`` summary of ``df``."""
    # Polars frames expose the row count as ``height``; any other frame type
    # is measured with len().
    n_rows = df.height if isinstance(df, pl.DataFrame) else len(df)
    return f"{label}: {n_rows} lignes × {len(df.columns)} colonnes"


display(Markdown("### Dimensions"))
# Each frame is inspected on its own: the type of df_koi says nothing about
# the type of df_toi, since each file goes through read_df independently.
display(Markdown(_shape_line("KOI", df_koi)))
display(Markdown(_shape_line("TOI", df_toi)))


In [None]:
# 1) Comprendre les données — structure, manquants, distributions

def show_structure_and_missing(df, name: str):
    """Display the structure/missing-value table and the numeric summary of ``df``.

    Both tables are truncated to their first 20 rows. A table exposing
    ``to_pandas`` is converted with it before being displayed; otherwise it is
    displayed as is.

    Args:
        df: Dataframe to describe.
        name: Dataset label inserted in the section headings.
    """
    structure = schema_and_missing(df)
    display(Markdown(f"#### {name} — Structure & manquants"))
    if hasattr(structure, 'to_pandas'):
        display(structure.head(20).to_pandas())
    else:
        display(structure.head(20))

    numeric_names = get_numeric_columns(df)
    summary = numeric_distributions(df, numeric_names)
    display(Markdown(f"#### {name} — Distributions numériques (résumé)"))
    if hasattr(summary, 'to_pandas'):
        display(summary.head(20).to_pandas())
    else:
        display(summary.head(20))

# Run the structure/missing-value report once per dataset, KOI first.
show_structure_and_missing(df=df_koi, name="KOI")
show_structure_and_missing(df=df_toi, name="TOI")


In [None]:
# 2) Détection de problèmes — outliers, duplicats, déséquilibre

def _top_desc(table, column: str, n: int = 20):
    """Return the ``n`` rows of ``table`` with the largest ``column`` values."""
    if not hasattr(table, 'to_pandas'):
        # No ``to_pandas``: handled through the pandas ``sort_values`` API.
        return table.sort_values(column, ascending=False).head(n)
    try:
        ordered = table.sort(by=[column], descending=True)
    except TypeError:
        # Older Polars releases spelled this keyword ``reverse``.
        ordered = table.sort(by=[column], reverse=True)
    return ordered.head(n).to_pandas()


def detect_problems(df, name: str):
    """Display the outlier, duplicate and class-imbalance reports for ``df``.

    Three sections are shown, in this order:

    1. The IQR outlier table, 20 rows with the highest ``outlier_pct``.
    2. The duplicate counts (``total`` and ``duplicates``), followed by the
       sample of duplicated rows when the summary provides one.
    3. The class-imbalance table for categorical columns with at most 20
       distinct values, 20 rows with the highest ``top_pct``.

    Args:
        df: Dataframe to inspect.
        name: Dataset label inserted in the section headings.
    """
    num_cols = get_numeric_columns(df)
    out_iqr = outliers_iqr(df, num_cols)
    display(Markdown(f"#### {name} — Outliers (IQR)"))
    display(_top_desc(out_iqr, "outlier_pct"))

    dups = duplicates_summary(df)
    display(Markdown(f"#### {name} — Duplicats"))
    display({"total": dups["total"], "duplicates": dups["duplicates"]})
    sample = dups.get("sample")
    if sample is not None:
        display(Markdown("Échantillon de duplicats (max 50):"))
        display(sample.to_pandas() if hasattr(sample, 'to_pandas') else sample)

    cat_cols = get_categorical_columns(df, max_unique=20)
    class_imb = class_imbalance(df, cat_cols)
    display(Markdown(f"#### {name} — Déséquilibre de classes (top catégories)"))
    display(_top_desc(class_imb, "top_pct"))

# Run the problem-detection report once per dataset, KOI first.
detect_problems(df=df_koi, name="KOI")
detect_problems(df=df_toi, name="TOI")


In [None]:
# 3) Relations — corrélations, paires très corrélées, figures

def relations_and_figures(df, name: str):
    """Display the correlation tables for ``df`` and show its saved figures inline.

    Steps, in order:

    1. Compute correlations over the numeric columns and show the first 10 rows.
    2. Show up to 30 pairs returned by ``high_correlation_pairs`` with a 0.9
       threshold.
    3. Save histograms and the correlation heatmap under ``FIGS_DIR``, then
       display the heatmap (when a path is returned) and the first 6 histograms.
    4. Save bar plots for categorical columns with at most 20 distinct values
       and display the first 6.

    Args:
        df: Dataframe to analyse.
        name: Dataset label used in headings and passed as ``name_prefix`` to
            the figure helpers.
    """
    num_cols = get_numeric_columns(df)
    corr_df, _ = correlations(df, num_cols)
    display(Markdown(f"#### {name} — Corrélations (aperçu)"))
    if hasattr(corr_df, 'to_pandas'):
        display(corr_df.to_pandas().head(10))
    else:
        display(corr_df.head(10))

    high_pairs = high_correlation_pairs(corr_df, threshold=0.9)
    display(Markdown(f"#### {name} — Paires corrélées (|r|≥0.9)"))
    display(high_pairs.head(30).to_pandas() if hasattr(high_pairs, 'to_pandas') else high_pairs.head(30))

    # Generate the figures, then show them inline.
    figs_dir = FIGS_DIR
    hist_paths = save_histograms(df, num_cols, figs_dir, max_plots=12, name_prefix=name)
    # NOTE(review): the heatmap target filename is the same for every dataset;
    # presumably save_corr_heatmap uses name_prefix to keep them apart — confirm.
    heatmap_path = save_corr_heatmap(corr_df, os.path.join(figs_dir, "corr_heatmap.png"), name_prefix=name)

    # `display` is the module-level import. Importing it again inside this
    # function would make the name local to the whole function body, and the
    # display() calls above would raise UnboundLocalError.
    for p in ([heatmap_path] if heatmap_path else []) + hist_paths[:6]:
        display(Image(filename=p))

    # Bar plots for the categorical columns.
    cat_cols = get_categorical_columns(df, max_unique=20)
    bar_paths = save_barplots(df, cat_cols, figs_dir, top_k=12, name_prefix=name)
    for p in bar_paths[:6]:
        display(Image(filename=p))

# Run the correlation/figure report once per dataset, KOI first.
relations_and_figures(df=df_koi, name="KOI")
relations_and_figures(df=df_toi, name="TOI")
