In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("data/licenses_by_year_region_fed.csv")

df

Unnamed: 0,annee,region,nom_fed,total_license,total_f,total_h,h_1_9,h_10_19,h_20_29,h_30_59,h_60_74,h_75,f_1_9,f_10_19,f_20_29,f_30_59,f_60_74,f_75
0,2012,Auvergne-Rh√¥ne-Alpes,AEROMODELISME,2494.0,61.0,2433.0,20.0,298.0,142.0,1251.0,622.0,100.0,2.0,8.0,5.0,29.0,14.0,3.0
1,2012,Auvergne-Rh√¥ne-Alpes,AEROSTATION,90.0,20.0,70.0,0.0,2.0,3.0,55.0,9.0,1.0,0.0,0.0,3.0,13.0,4.0,0.0
2,2012,Auvergne-Rh√¥ne-Alpes,AIKIDO AIKIBUDO ET AFFINITAIRES,2433.0,664.0,1769.0,114.0,477.0,139.0,935.0,93.0,11.0,62.0,173.0,80.0,317.0,31.0,1.0
3,2012,Auvergne-Rh√¥ne-Alpes,AIKIDO ET BUDO,2486.0,600.0,1875.0,114.0,502.0,139.0,955.0,125.0,7.0,63.0,177.0,76.0,239.0,32.0,1.0
4,2012,Auvergne-Rh√¥ne-Alpes,ATHLETISME,19275.0,8745.0,10510.0,906.0,3628.0,1019.0,4331.0,559.0,64.0,640.0,3646.0,741.0,3334.0,364.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18779,2023,√éle-de-France,VOILE,52809.0,20431.0,32378.0,8804.0,12161.0,2364.0,6710.0,2068.0,186.0,6456.0,8664.0,1723.0,3096.0,439.0,41.0
18780,2023,√éle-de-France,VOITURES RADIO COMMANDEES,1019.0,65.0,954.0,31.0,99.0,54.0,658.0,107.0,2.0,6.0,11.0,1.0,38.0,8.0,1.0
18781,2023,√éle-de-France,VOL EN PLANEUR,1669.0,287.0,1382.0,3.0,370.0,250.0,440.0,259.0,39.0,0.0,107.0,63.0,87.0,29.0,0.0
18782,2023,√éle-de-France,VOL LIBRE,2547.0,405.0,2142.0,2.0,72.0,336.0,1373.0,332.0,21.0,0.0,36.0,121.0,232.0,13.0,3.0


In [3]:
df["annee"] = df['annee'].astype(str)

numeric_cols = ["total_license", "total_f", "total_h","f_1_9", "f_10_19", "f_20_29","f_30_59","f_60_74","f_75","h_1_9", "h_10_19", "h_20_29","h_30_59","h_60_74","h_75"]
df[numeric_cols] = df[numeric_cols].astype(int)

In [4]:
df

Unnamed: 0,annee,region,nom_fed,total_license,total_f,total_h,h_1_9,h_10_19,h_20_29,h_30_59,h_60_74,h_75,f_1_9,f_10_19,f_20_29,f_30_59,f_60_74,f_75
0,2012,Auvergne-Rh√¥ne-Alpes,AEROMODELISME,2494,61,2433,20,298,142,1251,622,100,2,8,5,29,14,3
1,2012,Auvergne-Rh√¥ne-Alpes,AEROSTATION,90,20,70,0,2,3,55,9,1,0,0,3,13,4,0
2,2012,Auvergne-Rh√¥ne-Alpes,AIKIDO AIKIBUDO ET AFFINITAIRES,2433,664,1769,114,477,139,935,93,11,62,173,80,317,31,1
3,2012,Auvergne-Rh√¥ne-Alpes,AIKIDO ET BUDO,2486,600,1875,114,502,139,955,125,7,63,177,76,239,32,1
4,2012,Auvergne-Rh√¥ne-Alpes,ATHLETISME,19275,8745,10510,906,3628,1019,4331,559,64,640,3646,741,3334,364,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18779,2023,√éle-de-France,VOILE,52809,20431,32378,8804,12161,2364,6710,2068,186,6456,8664,1723,3096,439,41
18780,2023,√éle-de-France,VOITURES RADIO COMMANDEES,1019,65,954,31,99,54,658,107,2,6,11,1,38,8,1
18781,2023,√éle-de-France,VOL EN PLANEUR,1669,287,1382,3,370,250,440,259,39,0,107,63,87,29,0
18782,2023,√éle-de-France,VOL LIBRE,2547,405,2142,2,72,336,1373,332,21,0,36,121,232,13,3


In [5]:
display(100 * df.isnull().sum() / df.shape[0])

annee            0.0
region           0.0
nom_fed          0.0
total_license    0.0
total_f          0.0
total_h          0.0
h_1_9            0.0
h_10_19          0.0
h_20_29          0.0
h_30_59          0.0
h_60_74          0.0
h_75             0.0
f_1_9            0.0
f_10_19          0.0
f_20_29          0.0
f_30_59          0.0
f_60_74          0.0
f_75             0.0
dtype: float64

In [6]:
df = df[df["total_license"] > 0]
df["part_f"] = df["total_f"] / df["total_license"]

target = "part_f"

Y = df.loc[:,target]
X = df.drop(target, axis=1)

print(X)
print(Y)

      annee                region                          nom_fed  \
0      2012  Auvergne-Rh√¥ne-Alpes                    AEROMODELISME   
1      2012  Auvergne-Rh√¥ne-Alpes                      AEROSTATION   
2      2012  Auvergne-Rh√¥ne-Alpes  AIKIDO AIKIBUDO ET AFFINITAIRES   
3      2012  Auvergne-Rh√¥ne-Alpes                   AIKIDO ET BUDO   
4      2012  Auvergne-Rh√¥ne-Alpes                       ATHLETISME   
...     ...                   ...                              ...   
18779  2023         √éle-de-France                            VOILE   
18780  2023         √éle-de-France        VOITURES RADIO COMMANDEES   
18781  2023         √éle-de-France                   VOL EN PLANEUR   
18782  2023         √éle-de-France                        VOL LIBRE   
18783  2023         √éle-de-France                           VOLLEY   

       total_license  total_f  total_h  h_1_9  h_10_19  h_20_29  h_30_59  \
0               2494       61     2433     20      298      142     1251 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["part_f"] = df["total_f"] / df["total_license"]


In [7]:
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.2, random_state=42)

In [8]:
numeric_features = ["h_1_9", "h_10_19", "h_20_29","h_30_59","h_60_74","h_75"]
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ]
)

In [9]:
categorical_features = ["annee","region","nom_fed"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ( "encoder", OneHotEncoder(drop="first"))
    ]
)

In [10]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

In [None]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression())
    ]
)

model.fit(X_train, y_train)
print("...Done.")


Performing preprocessings on train set...
      annee                     region  \
12675  2020                  Occitanie   
6392   2016           Pays de la Loire   
8049   2017  Provence-Alpes-C√¥te dAzur   
1489   2013        Centre-Val de Loire   
7261   2017                  Grand Est   

                                          nom_fed  total_license  total_f  \
12675                                         SKI          10121     3823   
6392                                 PARACHUTISME            524       77   
8049                                  GYMNASTIQUE          20555    16887   
1489                                        DANSE           2728     2573   
7261   UNION SPORTIVE LENSEIGNEMENT PREMIER DEGRE          90908    45345   

       total_h  h_1_9  h_10_19  h_20_29  h_30_59  h_60_74  h_75  f_1_9  \
12675     6298   1207     1986      412     1885      703    82   1027   
6392       447      0       43      130      249       24     1      0   
8049      3668   18

In [13]:
print("Train R2:",model.score(X_train, y_train))
print("Test R2:",model.score(X_test, y_test))


Train R2: 0.21624473798327482
Test R2: 0.03282825752690943


üëâ Ton mod√®le explique 21% de la variance en train \
üëâ Mais quasiment rien en test

On est face √† un probl√®me de g√©n√©ralisation.

üëâ La part f√©minine est probablement d√©termin√©e par :\
	‚Ä¢	culture sportive propre √† chaque f√©d√©ration\
	‚Ä¢	dynamique historique \
	‚Ä¢	inertie forte \
	‚Ä¢	facteurs socio-√©conomiques absents de ton dataset\
\
Donc ton mod√®le lin√©aire capte peu d‚Äôinformation structurelle.

In [17]:
from sklearn.metrics import mean_absolute_error

y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("Moyenne part_f:", y_test.mean())

MAE: 0.13082180301331411
Moyenne part_f: 0.3450304611546037


üëâ L‚Äôerreur moyenne est d‚Äôenviron 10 points de pourcentage \
üëâ La part moyenne est 36% \
üëâ Donc il n‚Äôest pas exploitable pour de la d√©cision publique.

Et √ßa confirme le R¬≤ test = 0.03 :
il n‚Äôexplique quasiment rien.

In [15]:
df = df.sort_values(["nom_fed","region","annee"])
df["part_f_lag1"] = df.groupby(["nom_fed","region"])["part_f"].shift(1)

df_model = df.dropna(subset=["part_f_lag1"])

Y = df_model["part_f"]
X = df_model[["part_f_lag1"]]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

print("Train R2:", model.score(X_train, y_train))
print("Test R2:", model.score(X_test, y_test))

Train R2: 0.1934810168898231
Test R2: 0.05004241502163287


In [16]:
df_model = df_model.sort_values("annee")

train = df_model[df_model["annee"] < "2022"]
test  = df_model[df_model["annee"] >= "2022"]

X_train = train[["part_f_lag1"]]
y_train = train["part_f"]

X_test = test[["part_f_lag1"]]
y_test = test["part_f"]

model = LinearRegression()
model.fit(X_train, y_train)

print("Train R2:", model.score(X_train, y_train))
print("Test R2:", model.score(X_test, y_test))

Train R2: 0.0784588670405928
Test R2: 0.42875850800818616


Un R¬≤ test beaucoup plus √©lev√© que le train, ce n‚Äôest pas ‚Äúmagique‚Äù ‚Äî √ßa veut dire qu‚Äôon a chang√© la structure du probl√®me.


üéØ Interpr√©tation correcte

Tu as maintenant :
	‚Ä¢	Split temporel (donc coh√©rent)
	‚Ä¢	Mod√®le tr√®s simple (probablement part_f_lag1 seul)

Ce que √ßa dit :

üëâ La f√©minisation est principalement inertielle
üëâ La meilleure pr√©diction de l‚Äôann√©e t est l‚Äôann√©e t-1
üëâ Le mod√®le capte une vraie dynamique temporelle

‚∏ª

üß† Pourquoi le R¬≤ train est plus faible ?

Deux raisons possibles :

1Ô∏è‚É£ Variance plus forte dans le train

Peut-√™tre que les ann√©es anciennes ont plus de volatilit√©.

2Ô∏è‚É£ Moins d‚Äôann√©es en test

Le R¬≤ d√©pend de la variance du y.
Si la variance est plus forte dans le test, le R¬≤ peut m√©caniquement √™tre plus √©lev√©.