In [1]:
import os
os.getcwd()

'/Users/khlil/Documents/GitHub/projet_election'

In [2]:
import pandas as pd 
import numpy as np 
import warnings
from src.model_utils import train_test_model_rf, train_test_model_HGR, train_test_model_oblique_rf_regressor
from src.data_utils import augment_df, from_years_to_delta
warnings.filterwarnings('ignore')

In [3]:
path_data_full = r"data/data_merged_20250922.parquet"

In [4]:
cols_explicatives = [
    "annee",
    "type",
    "codecommune",
    "pvotepreviouspvoteD",
    "pvotepvoteD",
    'lat',
    'long',


    "popcommunes/pop",
    "popcommunesvbbm/vbbm",
    "agesexcommunes/prop014",
    "agesexcommunes/prop60p",
    "agesexcommunes/perage",
    "diplomescommunes/pbac",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "cspcommunes/pouvr",
    "cspcommunes/pcadr",
    "cspcommunes/pchom",
    "revcommunes/revratio",
]

In [None]:
sample = pd.read_parquet(path_data_full, ).sample(10)

In [None]:
data = pd.read_parquet(path_data_full, columns=cols_explicatives)
data.dropna(inplace=True)
data = data[data["type"]==0]
data.shape 

In [None]:
data.to_pickle(r"data/data_eco_soc_no_nan.pkl")

In [5]:
data = pd.read_pickle(r"data/data_eco_soc_no_nan.pkl")

In [6]:
years = np.unique(data.annee)

## Import dep

In [7]:
import requests
from io import BytesIO

def commune_to_departement_dict(year: int = 2025) -> dict[str, str]:
    """
    Retourne un dict {code_commune_INSEE (5 chars): code_departement (2-3 chars)} à partir du COG Insee.
    """
    # Page COG 2025: https://www.insee.fr/fr/information/8377162 (et fichier v_commune_2025.csv)
    url = f"https://www.insee.fr/fr/statistiques/fichier/8377162/v_commune_{year}.csv"

    # Téléchargement
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # Détection simple du séparateur (Insee est souvent en ';', mais on sécurise)
    head = r.content[:2048].decode("utf-8", errors="replace")
    sep = ";" if head.count(";") > head.count(",") else ","

    df = pd.read_csv(BytesIO(r.content), sep=sep, dtype=str)

    # Colonnes attendues (COG) : COM, DEP, TYPECOM, etc.
    # On garde uniquement les lignes où COM et DEP sont bien renseignés
    df = df.dropna(subset=["COM", "DEP"])
    df["COM"] = df["COM"].str.zfill(5)  # sécurité

    # Si tu veux seulement les "vraies" communes (pas arrondissements municipaux, communes déléguées...),
    # décommente la ligne suivante :
    # df = df[df["TYPECOM"] == "COM"]

    return dict(zip(df["COM"], df["DEP"]))

d = commune_to_departement_dict(2025)

# Exemples (Paris, Marseille)
print(d["75056"], d["13055"])
data["dep"] = data["codecommune"].apply(lambda x: d.get(str(x), 'None'))

75 13


## RF


### Model 1: base

In [None]:
y = data["pvotepvoteD"]
X = data.drop(columns=["pvotepvoteD", "annee", "codecommune", "pvotepreviouspvoteD"])

In [None]:
train_test_model_rf(
    X, y
)

### Model 2: 3 steps stretch

In [None]:
# Last 3 years
df_17, df_22, df_12 = data[data["annee"]==2017], data[data["annee"]==2022], data[data["annee"]==2012]
df_17.shape, df_22.shape, df_12.shape


In [None]:
cols_to_track = [
    "popcommunes/pop",
    "popcommunesvbbm/vbbm",
    "agesexcommunes/prop014",
    "agesexcommunes/prop60p",
    "agesexcommunes/perage",
    "diplomescommunes/pbac",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "cspcommunes/pouvr",
    "cspcommunes/pcadr",
    "cspcommunes/pchom",
    "revcommunes/revratio"
    ]
delta_12_17 = augment_df(df_12, df_17, cols_to_track)
delta_17_22 = augment_df(df_17, df_22, cols_to_track)

In [None]:
histories_delta = delta_12_17.merge(delta_17_22, on="codecommune", how="inner", suffixes=("17_22_", "12_17_"))
histories_delta.shape 

In [None]:
df = pd.merge(df_22, histories_delta, on="codecommune", how="inner")

In [None]:
y = df["pvotepvoteD"]
X = df.drop(columns=["pvotepvoteD", "annee", "codecommune", "pvotepreviouspvoteD"])

In [None]:
rf, pi = train_test_model_rf(
    X, y
)

### Model 3: Less features - Time series depth increased

In [None]:
cols_to_track = [
    "popcommunes/pop",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "revcommunes/revratio",
    "cspcommunes/pcadr",
    "pvotepreviouspvoteD"
    ]

In [None]:
data_no_22 = data[data["annee"]!=2022]
data_22 = data[data["annee"]==2022]

In [None]:
delta_df = from_years_to_delta(years[:-1], data_no_22, cols_to_track)
delta_df.head()

In [None]:
new_df = pd.merge(
    delta_df,
    data_22[["lat", "long", "codecommune"]+cols_to_track],
    on="codecommune",
    how="right"
)

In [None]:
X = new_df.drop(columns=["codecommune"])
y = data_22["pvotepvoteD"] - data_22["pvotepreviouspvoteD"]

In [None]:
rf, pi = train_test_model_rf(
    X, y, n_estimators=1000
)

In [None]:
years

In [None]:
rf, pi = train_test_model_HGR(
    X, y
)

**Comments**
#### Importance des features


- La position géo apparaît comme le facteur principal lors de la prédiction

## New approach: 

Features: résultat chaque élection -> ajouter l’année 
- Fit avec les données d’avant
- Stationnarise les cdonnes (rangs, proportion, relatives)
- Regharder où les dfeatures apparaiesent dans l’arbre (profondeur)
- Oblique RF (model)

In [8]:
cols_to_track = [

    # Main cols
    "annee", 
    "codecommune", 
    "pvotepreviouspvoteD", 
    "pvotepvoteD", 
    "lat", 
    "long",

    # Socio - Eco
    "popcommunes/pop",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "revcommunes/revratio",
    "cspcommunes/pcadr",
    "pvotepreviouspvoteD"
    ]

cols_socio_eco = [
    "popcommunes/pop",
    "diplomescommunes/psup",
    "diplomescommunes/nodip",
    "revcommunes/revratio",
    "cspcommunes/pcadr",
    "pvotepreviouspvoteD"
]
years

array([1965, 1969, 1974, 1981, 1988, 1995, 2002, 2007, 2012, 2017, 2022])

#### 2017 -> 2022

In [10]:
X = augment_df(
    data[data["annee"]==2017],
    data[data["annee"]==2022],
    cols_socio_eco,
)

X = X.merge(
        data[data["annee"]==2022][cols_to_track],
        on="codecommune",
        how="right",
    ).merge(
        data[data["annee"]==2017][["codecommune"] + cols_socio_eco],
        on="codecommune",
        how="left",
        suffixes=("_22", "_17")
    )
X.dropna(subset=X.columns, inplace=True)
y = X[X["annee"]==2022]["pvotepvoteD"]

X.drop(columns=[
        "codecommune",
        "annee",
        "pvotepvoteD"
        ]
    , inplace=True
    )


In [11]:
X.shape, y.shape 

((34637, 21), (34637,))

In [12]:
rf_22, pi_22 = train_test_model_rf(X, y)
fi = (
    pd.Series(rf_22.feature_importances_, index=X.columns)
      .sort_values(ascending=False)
)
print(fi)

R2: 0.7271737636869361
RMSE: 0.05187587525136693
Std de y: 0.099 --- Moyenne de y: 0.399
Permutation Importance:                           feature  importance_mean  importance_std
14        pvotepreviouspvoteD_22         1.023127        0.012186
6         pvotepreviouspvoteD_22         1.023127        0.012186
20        pvotepreviouspvoteD_17         0.052245        0.002093
16      diplomescommunes/psup_17         0.004066        0.000716
8                           long         0.001375        0.000202
5      delta_pvotepreviouspvoteD         0.000651        0.000133
9             popcommunes/pop_22         0.000634        0.000155
19          cspcommunes/pcadr_17         0.000416        0.000072
15            popcommunes/pop_17         0.000289        0.000155
7                            lat         0.000259        0.000076
17     diplomescommunes/nodip_17         0.000214        0.000101
12       revcommunes/revratio_22         0.000204        0.000069
11     diplomescommunes/nodi

In [42]:
rf_22, pi_22 = train_test_model_oblique_rf_regressor(X, y)
fi = (
    pd.Series(rf_22.feature_importances_, index=X.columns)
      .sort_values(ascending=False)
)
print(fi)

R2: 0.7326931378405119
RMSE: 0.05134846024527382
Std de y: 0.099 --- Moyenne de y: 0.399




Permutation Importance:
                          feature  importance_mean  importance_std
14        pvotepreviouspvoteD_22         0.908949        0.013071
6         pvotepreviouspvoteD_22         0.908949        0.013071
20        pvotepreviouspvoteD_17         0.059208        0.001702
8                           long         0.006508        0.000587
16      diplomescommunes/psup_17         0.003306        0.000388
9             popcommunes/pop_22         0.001944        0.000167
7                            lat         0.001486        0.000234
15            popcommunes/pop_17         0.001257        0.000126
5      delta_pvotepreviouspvoteD         0.001023        0.000189
19          cspcommunes/pcadr_17         0.000675        0.000125
10      diplomescommunes/psup_22         0.000638        0.000109
18       revcommunes/revratio_17         0.000551        0.000087
17     diplomescommunes/nodip_17         0.000367        0.000067
12       revcommunes/revratio_22         0.000366  

In [13]:
from src.oblique_rf import ObliqueRF

model = ObliqueRF()
model.tune_cv_hyperparams(X, y)



Best params: {'feature_combinations': None, 'max_depth': None, 'n_estimators': 900}
Best CV score: 0.7401479483288455


(ObliqueRandomForestRegressor(min_samples_leaf=2, min_samples_split=4,
                              n_estimators=900, n_jobs=-1, random_state=42),
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      289.746951      1.101154         2.095576        0.113607   
 1      196.853808      0.631920         1.631610        0.181417   
 2      208.616776     35.834588         1.434742        0.711304   
 3      102.384449      0.874857         0.587801        0.082257   
 4      164.497006      1.488960         1.359710        0.072865   
 5       83.445171      0.281168         0.623786        0.087994   
 6      185.578837      0.789869         0.849419        0.051770   
 7      123.587942      0.734133         0.500187        0.086581   
 8       62.368387      1.153420         0.144632        0.055442   
 9       72.996393     28.979432         0.273384        0.088490   
 10      51.397191      0.318427         0.172740        0.055719   
 11      84.331382     1

#### 2012 -> 2017


In [None]:
X = augment_df(
    data[data["annee"]==2012],
    data[data["annee"]==2017],
    cols_socio_eco,
)

X = X.merge(
        data[data["annee"]==2017][cols_to_track],
        on="codecommune",
        how="right",
    ).merge(
        data[data["annee"]==2012][["codecommune"] + cols_socio_eco],
        on="codecommune",
        how="left",
        suffixes=("_17", "_12")
    ).drop(columns=[
        "codecommune",
        "annee",
        "pvotepvoteD"
        ]
    )

y = data[data["annee"]==2017]["pvotepvoteD"]

In [None]:
rf_17, pi_17 = train_test_model_rf(X, y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits




Best params: {'feature_combinations': None, 'max_depth': None, 'n_estimators': 900}
Best CV score: 0.7401479483288455


(ObliqueRandomForestRegressor(min_samples_leaf=2, min_samples_split=4,
                              n_estimators=900, n_jobs=-1, random_state=42),
     mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      307.293339      4.424411         2.565110        0.154713   
 1      213.041602      0.841695         1.978431        0.210275   
 2      193.191145     49.326996         1.488050        0.961388   
 3      110.256007      0.545905         0.846177        0.116524   
 4      163.256188      1.524431         1.676079        0.111278   
 5       82.339935      1.569068         0.628911        0.055154   
 6      181.611546      1.278785         0.961419        0.067752   
 7      121.153594      0.405167         0.640437        0.013324   
 8       63.642939      0.451105         0.190256        0.094390   
 9      110.059973     12.659241         0.502139        0.103301   
 10      51.023336      0.326819         0.193558        0.055157   
 11     102.164892      

#### 2007 -> 2012


In [None]:
X = augment_df(
    data[data["annee"]==2012],
    data[data["annee"]==2007],
    cols_socio_eco,
)

X = X.merge(
        data[data["annee"]==2012][cols_to_track],
        on="codecommune",
        how="right",
    ).merge(
        data[data["annee"]==2007][["codecommune"] + cols_socio_eco],
        on="codecommune",
        how="left",
        suffixes=("_12", "_07")
    ).drop(columns=[
        "codecommune",
        "annee",
        "pvotepvoteD"
        ]
    )
y = data[data["annee"]==2012]["pvotepvoteD"]


In [None]:
rf_12, pi_12 = train_test_model_rf(X, y)


## First pres

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

metrics = pd.DataFrame({
    "period": ["2012", "2017", "2022"],
    "r2": [0.6487262892331658, 0.7271526710001446, 0.711679691527578],
    "rmse": [0.04424747641949413, 0.05059977795386046, 0.05176082803652073],
    "y_std": [0.075, 0.097, 0.096],
    "y_mean": [0.235, 0.335, 0.398],
})

plt.figure()
plt.plot(metrics["period"], metrics["r2"], marker="o")
plt.ylabel("R²")
plt.xlabel("Election pair")
plt.title("Model performance across election pairs")
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

plt.figure()
plt.plot(metrics["period"], metrics["rmse"], marker="o")
plt.ylabel("RMSE (vote share)")
plt.xlabel("Election pair")
plt.title("RMSE across election pairs")
plt.tight_layout()
plt.show()


In [None]:
def plot_perm_importance(df, title, topn=12, drop_contains=None, logx=False):
    d = df.copy()
    if drop_contains:
        mask = np.ones(len(d), dtype=bool)
        for s in drop_contains:
            mask &= ~d["feature"].str.contains(s, regex=False)
        d = d[mask]
    d = d.sort_values("importance_mean", ascending=False).head(topn)

    plt.figure()
    plt.barh(d["feature"][::-1], d["importance_mean"][::-1], xerr=d["importance_std"][::-1])
    if logx:
        plt.xscale("log")
    plt.title(title)
    plt.xlabel("Permutation importance (mean)")
    plt.tight_layout()
    plt.show()

# usage:
plot_perm_importance(pi, "2007→2012 Top importances (raw)")
plot_perm_importance(pi, "2007→2012 Importances (excluding previous vote)", drop_contains=["pvoteprevious"], topn=12)
plot_perm_importance(pi_22, "2017→2022 Importances (log scale)", topn=10, logx=True)
plot_perm_importance(pi_17, "2012→2017 Importances (log scale)", topn=10, logx=True)
plot_perm_importance(pi_12, "2007→2012 Importances (log scale)", topn=10, logx=True)
