# EDA Polars — KOI & TOI (Notebook Kaggle autonome)

Ce notebook:
- installe les dépendances nécessaires (Polars, Matplotlib, Seaborn)
- charge les CSV KOI/TOI (à déposer dans l'environnement Kaggle ou via un Dataset Kaggle)
- réalise: structure/manquants/distributions, détection de problèmes (outliers IQR, duplicats, déséquilibre), corrélations et figures inline

Exécutez les cellules dans l'ordre; adaptez les chemins des fichiers si vous utilisez un Dataset Kaggle (`/kaggle/input/...`).


In [None]:
# Install the dependencies (Kaggle)
# - polars: fast, columnar data processing
# - matplotlib/seaborn: visualisations
# The leading "!" runs the line as a shell command from the notebook.
!pip -q install polars matplotlib seaborn --upgrade



In [None]:
# Imports et configuration globale
# - pl/pd: manipulation de données (Polars prioritaire)
# - matplotlib/seaborn: graphiques
# - display/Markdown: affichage propre dans le notebook
import os
import polars as pl
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

# Default seaborn look for the figures.
sns.set_theme(style="whitegrid")

# Output directories, created up front when missing.
OUT_DIR = "./outputs"
FIGS_DIR = os.path.join(OUT_DIR, "figures")
TABLES_DIR = os.path.join(OUT_DIR, "tables")
for _out_subdir in (FIGS_DIR, TABLES_DIR):
    os.makedirs(_out_subdir, exist_ok=True)



In [None]:
# Fonctions utilitaires (autonomes)
# - sniff_delimiter: détection robuste du séparateur CSV
# - read_csv_pl: lecture KOI/TOI avec Polars (fallback Pandas)
# - get_*_columns: identification colonnes numériques/catégorielles
# - schema_and_missing: synthèse types et valeurs manquantes
# - numeric_distributions: statistiques + quantiles
# - outliers_iqr: détection outliers par règle IQR
# - duplicates_summary: comptage + échantillon de doublons
# - class_imbalance: top catégorie dominante et part
# - correlations/high_correlation_pairs: matrice de corrélation et paires |r|≥seuil
# - save_*: génération des figures (histogrammes, barplots, heatmap)
import csv

def sniff_delimiter(path: str) -> str:
    """Guess the field separator of the delimited text file at ``path``.

    Only the first 64 KiB are inspected. The candidates are comma,
    semicolon, tab and pipe; a comma is returned when sniffing fails.
    """
    candidates = ",;\t|"
    # Undecodable bytes are dropped: only the delimiter matters here.
    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        sample = handle.read(64 * 1024)
    try:
        return csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
    except Exception:
        return ","

def read_csv_pl(path: str) -> pl.DataFrame:
    """Load the delimited file at ``path`` into a Polars DataFrame.

    The separator comes from ``sniff_delimiter``. Polars is tried with the
    ``utf8`` encoding and then ``utf8-lossy``; if both attempts raise, the
    file is read with pandas (malformed lines skipped) and converted.
    """
    delimiter = sniff_delimiter(path)
    for encoding in ("utf8", "utf8-lossy"):
        try:
            frame = pl.read_csv(
                path,
                separator=delimiter,
                encoding=encoding,
                infer_schema_length=5000,
                ignore_errors=True,
            )
        except Exception:
            # Best effort: fall through to the next reading strategy.
            continue
        return frame
    # Last resort: pandas reader, then conversion back to Polars.
    pandas_frame = pd.read_csv(path, sep=delimiter, on_bad_lines="skip")
    return pl.from_pandas(pandas_frame)

def get_numeric_columns(df: pl.DataFrame):
    """Return the names of the numeric columns of ``df``, in column order.

    Args:
        df: frame whose schema is inspected.

    Returns:
        A list of column names whose dtype reports itself as numeric.
    """
    # The dtype-level ``is_numeric()`` check replaces the module-level
    # ``pl.datatypes.is_numeric`` helper, which current Polars does not expose.
    return [name for name, dtype in zip(df.columns, df.dtypes) if dtype.is_numeric()]

def get_categorical_columns(df: pl.DataFrame, max_unique: int = 50):
    """Return string/boolean columns with at most ``max_unique`` distinct values.

    Args:
        df: frame whose columns are inspected.
        max_unique: upper bound on the number of distinct values for a
            column to be kept (filters out high-cardinality columns).

    Returns:
        A list of column names, in column order.
    """
    cats = []
    for name, dtype in zip(df.columns, df.dtypes):
        # Direct dtype comparison replaces the ``pl.datatypes.is_utf8`` /
        # ``is_boolean`` helpers, which current Polars does not expose.
        if dtype == pl.Utf8 or dtype == pl.Boolean:
            n_unique = int(df.get_column(name).n_unique())
            if n_unique <= max_unique:
                cats.append(name)
    return cats

def schema_and_missing(df: pl.DataFrame) -> pl.DataFrame:
    """Summarise each column of ``df``: dtype, null count and null share.

    The result has one row per input column with the keys ``column``,
    ``dtype``, ``null_count`` and ``null_pct`` (0.0 when ``df`` has no rows).
    """
    total_rows = df.height
    records = []
    for name, dtype in zip(df.columns, df.dtypes):
        missing = int(df.get_column(name).is_null().sum())
        share = (missing / total_rows) if total_rows else 0.0
        records.append(
            {
                "column": name,
                "dtype": str(dtype),
                "null_count": missing,
                "null_pct": share,
            }
        )
    return pl.DataFrame(records)

def numeric_distributions(df: pl.DataFrame, numeric_cols):
    """Compute summary statistics and quantiles for the given columns.

    Args:
        df: source frame.
        numeric_cols: names of the columns to summarise.

    Returns:
        A frame with one row per column and the keys ``column``, ``count``,
        ``mean``, ``std``, ``min``, ``max`` followed by ``q00``, ``q25``,
        ``q50``, ``q75``, ``q90``, ``q95`` and ``q100``.
    """
    qs = [0.0, 0.25, 0.5, 0.75, 0.9, 0.95, 1.0]
    out_rows = []
    for c in numeric_cols:
        col = pl.col(c)
        exprs = [
            col.count().alias("count"),
            col.mean().alias("mean"),
            col.std().alias("std"),
            col.min().alias("min"),
            col.max().alias("max"),
        ]
        # One scalar expression per quantile: passing the whole list to a
        # single ``quantile`` call does not reliably yield one value per
        # quantile, which the previous zip-based pairing depended on.
        # ``round`` keeps the label exact even if ``q * 100`` is off by a ulp.
        exprs.extend(col.quantile(q).alias(f"q{round(q * 100):02d}") for q in qs)
        stats = df.select(exprs).to_dicts()[0]
        out_rows.append({"column": c, **stats})
    return pl.DataFrame(out_rows)

def outliers_iqr(df: pl.DataFrame, numeric_cols):
    """Count outliers per column with the 1.5 * IQR rule.

    Args:
        df: source frame.
        numeric_cols: names of the columns to analyse.

    Returns:
        A frame with one row per column and the keys ``column``, ``lower``,
        ``upper``, ``outliers`` and ``outlier_pct``. Columns whose quartiles
        are undefined get null bounds and zero outliers. When
        ``numeric_cols`` is empty the frame is empty but still carries these
        columns, so callers can sort or select on them.
    """
    rows = []
    n = df.height
    for c in numeric_cols:
        col = pl.col(c)
        # Both quartiles in a single pass over the column.
        q1, q3 = df.select([
            col.quantile(0.25).alias("q1"),
            col.quantile(0.75).alias("q3"),
        ]).row(0)
        if q1 is None or q3 is None:
            rows.append({"column": c, "lower": None, "upper": None, "outliers": 0, "outlier_pct": 0.0})
            continue
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        cnt = int(df.select(((col < lower) | (col > upper)).sum()).item())
        rows.append({"column": c, "lower": lower, "upper": upper, "outliers": cnt, "outlier_pct": (cnt / n) if n else 0.0})
    if not rows:
        # An empty list would produce a frame without any column.
        return pl.DataFrame(schema={
            "column": pl.Utf8,
            "lower": pl.Float64,
            "upper": pl.Float64,
            "outliers": pl.Int64,
            "outlier_pct": pl.Float64,
        })
    return pl.DataFrame(rows)

def duplicates_summary(df: pl.DataFrame):
    """Count fully duplicated rows and return a sample of them.

    Args:
        df: frame to inspect.

    Returns:
        A dict with ``total`` (row count), ``duplicates`` (rows minus
        distinct rows) and ``sample``: ``None`` when there is no duplicate,
        otherwise up to 30 distinct duplicated rows with their multiplicity
        in a ``len`` column.
    """
    n = df.height
    dup_count = n - int(df.unique().height)
    sample = None
    if dup_count > 0:
        # ``group_by`` is the spelling used everywhere else in this file
        # (``groupby`` is gone from current Polars). The row index that used
        # to be added first was never read, so it is dropped.
        sample = (
            df.group_by(df.columns)
              .len()
              .filter(pl.col("len") > 1)
              .limit(30)
        )
    return {"total": n, "duplicates": dup_count, "sample": sample}

def class_imbalance(df: pl.DataFrame, categorical_cols):
    """Report the dominant value of each categorical column and its share.

    Args:
        df: source frame.
        categorical_cols: names of the columns to analyse.

    Returns:
        A frame with the keys ``column``, ``top_value`` (as a string),
        ``top_count`` and ``top_pct``. Columns with no rows are skipped.
        When nothing is reported the frame is empty but still carries these
        columns, so callers can sort or select on them.
    """
    rows = []
    for c in categorical_cols:
        vc = df.group_by(c).len().sort("len", descending=True)
        total = int(vc.select(pl.col("len").sum()).item())
        if total == 0:
            continue
        top_value, top_count = vc.row(0)
        top_count = int(top_count)
        rows.append({"column": c, "top_value": str(top_value), "top_count": top_count, "top_pct": top_count / total})
    if not rows:
        # An empty list would produce a frame without any column.
        return pl.DataFrame(schema={
            "column": pl.Utf8,
            "top_value": pl.Utf8,
            "top_count": pl.Int64,
            "top_pct": pl.Float64,
        })
    return pl.DataFrame(rows)

def correlations(df: pl.DataFrame, numeric_cols):
    """Return the Pearson correlation matrix of ``numeric_cols`` as a Polars frame.

    The computation is delegated to pandas; the resulting matrix is
    converted back with ``pl.from_pandas``.
    """
    numeric_pd = df.select(numeric_cols).to_pandas()
    pearson = numeric_pd.corr(method="pearson")
    return pl.from_pandas(pearson)

def high_correlation_pairs(corr_df: pl.DataFrame, threshold: float = 0.9):
    """List the column pairs whose absolute correlation reaches ``threshold``.

    Only cells above the diagonal of ``corr_df`` are read, so each pair
    appears once. The result has the keys ``var1``, ``var2`` and ``corr``.
    """
    names = list(corr_df.columns)
    matrix = corr_df.to_pandas()
    pairs = []
    for i, left in enumerate(names):
        for j in range(i + 1, len(names)):
            coeff = matrix.iloc[i, j]
            # Written as a negated ">=" so that NaN cells are skipped too.
            if coeff is None or not abs(coeff) >= threshold:
                continue
            pairs.append({"var1": left, "var2": names[j], "corr": float(coeff)})
    return pl.DataFrame(pairs)

def save_histograms(df: pl.DataFrame, numeric_cols, out_dir: str, max_plots: int = 12, name_prefix: str = ""):
    """Save one histogram PNG per numeric column, highest variance first.

    Args:
        df: source frame.
        numeric_cols: candidate column names.
        out_dir: directory receiving the PNG files (created when missing).
        max_plots: maximum number of histograms to write.
        name_prefix: optional prefix; when set, files are named
            ``<prefix>_hist_<column>.png``.

    Returns:
        The list of written file paths.
    """
    import math  # local import: only needed for the NaN guard below

    os.makedirs(out_dir, exist_ok=True)
    paths = []
    variances = []
    for c in numeric_cols:
        try:
            var_val = df.select(pl.col(c).var()).item()
            if var_val is None:
                continue
            var_val = float(var_val)
        except Exception:
            # Columns whose variance cannot be computed are not plotted.
            continue
        # A NaN variance cannot be ordered by the sort below; it also signals
        # NaN values in the column, which histogram binning is expected to
        # reject, so such columns are skipped.
        if math.isnan(var_val):
            continue
        variances.append((c, var_val))
    variances.sort(key=lambda kv: kv[1], reverse=True)
    prefix = f"{name_prefix}_" if name_prefix else ""
    for c, _ in variances[:max_plots]:
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            data = df[c].drop_nulls().to_numpy()
            ax.hist(data, bins=50, color="#4a90e2", alpha=0.85)
            ax.set_title(f"Histogramme: {c}")
            ax.set_xlabel(c)
            ax.set_ylabel("Fréquence")
            fig.tight_layout()
            # Same sanitising as save_barplots: a path separator in the
            # column name must not end up in the file name.
            safe_c = c.replace("/", "_").replace("\\", "_")
            out_path = os.path.join(out_dir, f"{prefix}hist_{safe_c}.png")
            fig.savefig(out_path, dpi=140)
            paths.append(out_path)
        finally:
            plt.close(fig)
    return paths

def save_barplots(df: pl.DataFrame, categorical_cols, out_dir: str, top_k: int = 15, name_prefix: str = ""):
    """Save a horizontal bar chart of the most frequent values of each column.

    Args:
        df: source frame.
        categorical_cols: names of the columns to plot.
        out_dir: directory receiving the PNG files (created when missing).
        top_k: number of most frequent values shown per column.
        name_prefix: optional prefix; when set, files are named
            ``<prefix>_bar_<column>.png``.

    Returns:
        The list of written file paths.
    """
    os.makedirs(out_dir, exist_ok=True)
    paths = []
    prefix = f"{name_prefix}_" if name_prefix else ""
    for c in categorical_cols:
        vc = df.group_by(c).len().sort("len", descending=True).limit(top_k)
        # Materialise the rows once instead of iterating the frame twice.
        rows = vc.rows()
        labels = [str(row[0]) for row in rows]
        values = [int(row[1]) for row in rows]
        fig, ax = plt.subplots(figsize=(8, 4.5))
        try:
            ax.barh(range(len(values)), values, color="#7b9acc")
            ax.set_yticks(range(len(labels)))
            ax.set_yticklabels(labels, fontsize=8)
            ax.invert_yaxis()  # most frequent value at the top
            ax.set_title(f"Top catégories: {c}")
            ax.set_xlabel("Comptes")
            fig.tight_layout()
            # Path separators in the column name must not end up in the file name.
            safe_c = c.replace("/", "_").replace("\\", "_")
            out_path = os.path.join(out_dir, f"{prefix}bar_{safe_c}.png")
            fig.savefig(out_path, dpi=140)
            paths.append(out_path)
        finally:
            # Close the figure even when plotting or saving fails.
            plt.close(fig)
    return paths

def save_corr_heatmap(corr_df: pl.DataFrame, out_path: str, name_prefix: str = ""):
    """Render a correlation matrix as a heatmap and save it as an image.

    Args:
        corr_df: correlation matrix, one column per variable.
        out_path: target file path; its directory is created when missing.
        name_prefix: optional prefix; when set, ``<prefix>_`` is prepended
            to the file name of ``out_path``.

    Returns:
        The path the figure was actually written to.
    """
    base_dir = os.path.dirname(out_path)
    # A bare file name has an empty dirname, which ``os.makedirs`` rejects.
    if base_dir:
        os.makedirs(base_dir, exist_ok=True)
    base_name = os.path.basename(out_path)
    if name_prefix:
        base_name = f"{name_prefix}_" + base_name
    final_path = os.path.join(base_dir, base_name)

    corr_pd = corr_df.to_pandas()
    col_labels = list(corr_pd.columns)
    # The pandas frame built from Polars has a positional index, so for a
    # square matrix the rows are labelled with the column names instead.
    if len(corr_pd.index) == len(col_labels):
        row_labels = col_labels
    else:
        row_labels = list(corr_pd.index)

    fig, ax = plt.subplots(figsize=(9, 7))
    try:
        im = ax.imshow(corr_pd.values, cmap="coolwarm", vmin=-1, vmax=1)
        ax.set_xticks(range(len(col_labels)))
        ax.set_yticks(range(len(row_labels)))
        ax.set_xticklabels(col_labels, rotation=90, fontsize=7)
        ax.set_yticklabels(row_labels, fontsize=7)
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        fig.tight_layout()
        fig.savefig(final_path, dpi=150)
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)
    return final_path



In [None]:
# Input paths (adjust on Kaggle if needed)
KOI_PATH = "./KOI.csv"  # or '/kaggle/input/<dataset>/KOI.csv'
TOI_PATH = "./TOI.csv"  # or '/kaggle/input/<dataset>/TOI.csv'

# Loading: each dataset is read with read_csv_pl; on failure the error is
# shown as Markdown and then re-raised so the cell still fails.
try:
    df_koi = read_csv_pl(KOI_PATH)
    display(Markdown(f"**Lecture KOI**: `{KOI_PATH}` — {df_koi.height} lignes, {len(df_koi.columns)} colonnes"))
except Exception as e:
    display(Markdown(f"**Erreur de lecture KOI**: {e}"))
    raise

try:
    df_toi = read_csv_pl(TOI_PATH)
    display(Markdown(f"**Lecture TOI**: `{TOI_PATH}` — {df_toi.height} lignes, {len(df_toi.columns)} colonnes"))
except Exception as e:
    display(Markdown(f"**Erreur de lecture TOI**: {e}"))
    raise


In [None]:
# 1) Comprendre les données — structure & distributions

def overview(df: pl.DataFrame, name: str):
    """Display the schema/missing-value table and the numeric summary of ``df``.

    ``name`` is the dataset label used in the section headings. Each table
    is limited to its first 20 rows.
    """
    display(Markdown(f"### {name} — Structure & manquants"))
    schema_preview = schema_and_missing(df).to_pandas().head(20)
    display(schema_preview)

    numeric_summary = numeric_distributions(df, get_numeric_columns(df))
    display(Markdown(f"### {name} — Distributions numériques (résumé)"))
    summary_preview = numeric_summary.to_pandas().head(20)
    display(summary_preview)


# Run the overview on both datasets.
overview(df_koi, "KOI")
overview(df_toi, "TOI")


In [None]:
# 2) Problèmes — outliers, duplicats, déséquilibres

def _sort_desc_if_present(frame: pl.DataFrame, column: str) -> pl.DataFrame:
    """Sort ``frame`` by ``column`` in descending order, if that column exists."""
    # A result frame built from zero rows may carry no columns at all.
    if column in frame.columns:
        return frame.sort(by=[column], descending=True)
    return frame


def problems(df: pl.DataFrame, name: str):
    """Display data-quality diagnostics for ``df``.

    Shows, in order: IQR outliers per numeric column (highest share first),
    the duplicate-row count with an optional sample, and the dominant value
    of each low-cardinality categorical column (highest share first).

    Args:
        df: frame to analyse.
        name: dataset label used in the section headings.
    """
    num_cols = get_numeric_columns(df)
    out_iqr = outliers_iqr(df, num_cols)
    display(Markdown(f"### {name} — Outliers (IQR)"))
    # ``descending`` is the keyword used by the other sorts in this file;
    # ``reverse`` is not accepted by current Polars.
    display(_sort_desc_if_present(out_iqr, "outlier_pct").to_pandas().head(20))

    dups = duplicates_summary(df)
    display(Markdown(f"### {name} — Duplicats"))
    display({"total": dups["total"], "duplicates": dups["duplicates"]})
    if dups.get("sample") is not None:
        display(Markdown("Échantillon duplicats (max 30):"))
        display(dups["sample"].to_pandas())

    cat_cols = get_categorical_columns(df, max_unique=20)
    class_imb = class_imbalance(df, cat_cols)
    display(Markdown(f"### {name} — Déséquilibre de classes"))
    display(_sort_desc_if_present(class_imb, "top_pct").to_pandas().head(20))

problems(df_koi, "KOI")
problems(df_toi, "TOI")


In [None]:
# 3) Relations — corrélations & figures

def relations(df: pl.DataFrame, name: str):
    """Display correlation tables for ``df`` and save its figures.

    Shows the first rows of the correlation matrix and the pairs with
    |r| >= 0.9, then writes histograms, the correlation heatmap and bar
    plots to ``FIGS_DIR`` (file names prefixed with ``name``) and displays
    the saved paths.
    """
    numeric_names = get_numeric_columns(df)
    corr_matrix = correlations(df, numeric_names)
    display(Markdown(f"### {name} — Corrélations (aperçu)"))
    display(corr_matrix.to_pandas().head(10))

    strong_pairs = high_correlation_pairs(corr_matrix, threshold=0.9)
    display(Markdown(f"### {name} — Paires très corrélées (|r|≥0.9)"))
    display(strong_pairs.to_pandas().head(30))

    # Figures are written in this order: histograms, heatmap, bar plots.
    heatmap_target = os.path.join(FIGS_DIR, "corr_heatmap.png")
    histogram_files = save_histograms(df, numeric_names, FIGS_DIR, max_plots=12, name_prefix=name)
    heatmap_file = save_corr_heatmap(corr_matrix, heatmap_target, name_prefix=name)
    categorical_names = get_categorical_columns(df, 20)
    bar_files = save_barplots(df, categorical_names, FIGS_DIR, top_k=12, name_prefix=name)

    display(Markdown("#### Figures enregistrées"))
    saved = {
        "heatmap": heatmap_file,
        "hist": histogram_files[:5],
        "bar": bar_files[:5],
    }
    display(saved)


# Run the analysis on both datasets.
relations(df_koi, "KOI")
relations(df_toi, "TOI")


In [None]:
# Affichages inline — helpers (hist, barplots, heatmap)
import math
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown

# Apply the seaborn "whitegrid" theme to the figures drawn by the helpers below.
sns.set_theme(style="whitegrid")

def show_hist_inline(df: pl.DataFrame, cols, bins: int = 50, max_plots: int = 9):
    """Show histograms of up to ``max_plots`` columns in a three-wide grid.

    Nulls are dropped before plotting. Grid cells left without a column are
    removed from the figure. A Markdown notice is shown instead when
    ``cols`` is empty.
    """
    selected = list(cols)[:max_plots]
    if not selected:
        display(Markdown("Aucune colonne numérique à tracer."))
        return

    per_row = 3
    n_rows = math.ceil(len(selected) / per_row)
    fig, grid = plt.subplots(n_rows, per_row, figsize=(15, 4 * n_rows))
    try:
        panels = grid.flatten()
    except Exception:
        # A lone Axes object has no ``flatten``.
        panels = [grid]

    for position, panel in enumerate(panels):
        if position >= len(selected):
            # Unused cell of the grid.
            fig.delaxes(panel)
            continue
        column = selected[position]
        values = df[column].drop_nulls().to_numpy()
        panel.hist(values, bins=bins, color="#4a90e2", alpha=0.85)
        panel.set_title(column, fontsize=9)

    plt.tight_layout()
    plt.show()


def show_barplots_inline(df: pl.DataFrame, cat_cols, top_k: int = 12, max_plots: int = 6):
    """Show one horizontal bar chart per column, for up to ``max_plots`` columns.

    Each chart lists the ``top_k`` most frequent values of the column, most
    frequent at the top.
    """
    for col in list(cat_cols)[:max_plots]:
        top_values = df.group_by(col).len().sort("len", descending=True).limit(top_k)
        labels = []
        counts = []
        for row in top_values.iter_rows():
            labels.append(str(row[0]))
            counts.append(int(row[1]))
        plt.figure(figsize=(8, 4))
        plt.barh(labels, counts, color="#7b9acc")
        plt.gca().invert_yaxis()
        plt.title(col)
        plt.tight_layout()
        plt.show()


def show_corr_heatmap_inline(df: pl.DataFrame, num_cols):
    """Show the Pearson correlation heatmap of ``num_cols``.

    A Markdown notice is shown instead when ``num_cols`` is empty.
    """
    if not num_cols:
        display(Markdown("Pas de colonnes numériques pour la corrélation."))
        return

    numeric_pd = df.select(num_cols).to_pandas()
    pearson = numeric_pd.corr(method="pearson")
    plt.figure(figsize=(10, 8))
    # Colour scale pinned to the full [-1, 1] correlation range.
    sns.heatmap(pearson, cmap="coolwarm", vmin=-1, vmax=1)
    plt.title("Matrice de corrélation")
    plt.tight_layout()
    plt.show()


In [None]:
# Inline display — KOI
# Column lists are computed once and reused by the three displays below.
num_cols_koi = get_numeric_columns(df_koi)
cat_cols_koi = get_categorical_columns(df_koi, 20)

# Histograms of the first 9 numeric columns.
display(Markdown("## KOI — Histogrammes"))
show_hist_inline(df_koi, num_cols_koi, bins=50, max_plots=9)

# Bar charts of the first 6 categorical columns (top 12 values each).
display(Markdown("## KOI — Barplots"))
show_barplots_inline(df_koi, cat_cols_koi, top_k=12, max_plots=6)

# Correlation heatmap over all numeric columns.
display(Markdown("## KOI — Heatmap corrélation"))
show_corr_heatmap_inline(df_koi, num_cols_koi)


In [None]:
# Inline display — TOI
# Column lists are computed once and reused by the three displays below.
num_cols_toi = get_numeric_columns(df_toi)
cat_cols_toi = get_categorical_columns(df_toi, 20)

# Histograms of the first 9 numeric columns.
display(Markdown("## TOI — Histogrammes"))
show_hist_inline(df_toi, num_cols_toi, bins=50, max_plots=9)

# Bar charts of the first 6 categorical columns (top 12 values each).
display(Markdown("## TOI — Barplots"))
show_barplots_inline(df_toi, cat_cols_toi, top_k=12, max_plots=6)

# Correlation heatmap over all numeric columns.
display(Markdown("## TOI — Heatmap corrélation"))
show_corr_heatmap_inline(df_toi, num_cols_toi)


In [None]:
# Afficher les images sauvegardées (si vous avez utilisé les fonctions save_*)
from IPython.display import Image
import glob, os

# Saved figures to show, in display order: (glob pattern, maximum number of
# files). ``None`` means no cap: every correlation heatmap is shown, while
# histograms and bar plots are limited to the first 10 each.
_GALLERY = (
    ("*corr_heatmap*.png", None),
    ("*hist_*.png", 10),
    ("*bar_*.png", 10),
)

for _pattern, _cap in _GALLERY:
    for p in sorted(glob.glob(os.path.join(FIGS_DIR, _pattern)))[:_cap]:
        display(Markdown(f"#### {os.path.basename(p)}"))
        display(Image(filename=p))

