# 1. Initialisation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import warnings
warnings.filterwarnings('ignore')

In [None]:
%load_ext pycodestyle_magic

In [None]:
%flake8_on

In [None]:
df = pd.read_parquet('2 - Données retraitées/dataset_metro.parquet')

Les données de qualité de l'air extérieur ne sont disponibles qu'à partir de fin septembre 2017. On restreint donc l'étude à la période suivant ce moment :

In [None]:
dfd = df[df.date > pd.Timestamp('2017-10-01')].copy()

In [None]:
dfd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35451 entries, 41638 to 77088
Data columns (total 31 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   date/heure         35451 non-null  datetime64[ns]
 1   date               35451 non-null  object        
 2   heure              35451 non-null  object        
 3   jour               35451 non-null  object        
 4   ferie              35451 non-null  int64         
 5   vacances           35451 non-null  object        
 6   semestre           35451 non-null  object        
 7   CAT_JOUR           35451 non-null  object        
 8   NO                 31293 non-null  float64       
 9   NO2                31864 non-null  float64       
 10  PM10               33992 non-null  float64       
 11  CO2                34799 non-null  float64       
 12  TEMP               34798 non-null  float64       
 13  HUMI               31671 non-null  float64       
 14  te

In [None]:
# Suppression des colonnes peu fournies
dfd.drop(columns={'ressenti_ext',
                  'radiation_ext',
                  'O3_ext',
                  'PM25_ext'},
         inplace=True)

# 2. Centrage et normalisation des données

In [15]:
# Sélection des variables chiffrées des différents paramètres
dfw = dfd[dfd.columns[8:]].copy()
# Suppression des lignes contenant des NaN
dfw.dropna(inplace=True)
dfw.reset_index(inplace=True)
dfw.drop(columns={'index'}, inplace=True)
print("Il y a", str(dfw.shape[0]), "valeurs à étudier")

Il y a 17555 valeurs à étudier


In [16]:
# Split du DataFrame
X = dfw[dfw.columns[6:]]
y = dfw[dfw.columns[:6]]
# Centrage/réduction des colonnes
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
y = pd.DataFrame(scaler.fit_transform(y))
# Renommage des colonnes en vue du LASSO
for i in range(X.shape[1]):
    X.rename(columns={X.columns[i]: i}, inplace=True)
# Vérification du résultat
print("Espérances ≈ 0 :", str(np.allclose(X.mean(), 0)))
print("Variances ≈ 1 :", str(np.allclose(X.var(), 1, atol=1e-2)))

Espérances ≈ 0 : True
Variances ≈ 1 : True


<div class="alert alert-block alert-success">
    <b>
        <li>Les variables sont centrées car leur espérance est proche de 0</li>
        <li>Les variables sont normalisées car leur variance est proche de 1</li>
    </b>
</div>

# 3. Etude de corrélation via une régression linéaire simple, un régresseur KNN et un perceptron multi-couche

In [17]:
# Initialisation du DataFrame de résultat
df_res = pd.DataFrame()

# Initialisation des listes de résultats
R2_lr = []
R2_mlp = []
R2_knn = []

# Choix aléatoire de valeurs de random_state en vue de reproduire l'analyse
rs = [494, 115, 421, 354, 67, 310]

# Initialisation des modèles
reg_lss = linear_model.LinearRegression()
mlp = MLPRegressor()

# Prédictions des 3 modèles pour chaque variable à expliquer
for i in range(y.shape[1]):
    # Data split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y[y.columns[i]],
                                                        random_state=rs[i],
                                                        test_size=0.25)
    # LASSO and Linear Regression
    skl_lss = LassoCV().fit(X_train, y_train)
    lss_features = np.nonzero(skl_lss.coef_)[0].tolist()
    reg_lss.fit(X_train[lss_features].values, y_train)
    R2_reg_lss = reg_lss.score(X_test[lss_features].values, y_test)
    R2_lr.append(round(R2_reg_lss, 2))

    # KNN regressor
    shuf_split = ShuffleSplit(5, test_size=0.33)

    def score_func(k):
        knn = KNeighborsRegressor(n_neighbors=k)
        scores = cross_val_score(estimator=knn,
                                 X=X_train,
                                 y=y_train,
                                 cv=shuf_split)
        return np.mean(scores)

    list_k = list(range(1, 80, 3))
    scores_per_k = list(map(score_func, list_k))
    best_k = list_k[np.argmax(scores_per_k)]
    knn = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)
    R2_knn.append(round(knn.score(X_test, y_test), 2))

    # Multi layer Perceptron
    mlp.fit(X_train, y_train)
    R2_mlp.append(round(mlp.score(X_test, y_test), 2))

df_res['r2_linear_regression'] = R2_lr
df_res['r2_knn'] = R2_knn
df_res['r2_perceptron'] = R2_mlp
df_res.index = dfw.columns.tolist()[:6]
df_res

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.37,0.53,0.57
NO2,0.37,0.54,0.57
PM10,0.12,0.19,0.27
CO2,0.82,0.87,0.89
TEMP,0.64,0.7,0.73
HUMI,0.32,0.46,0.51


# 4. Industrialisation de l'analyse

### 4.1. Fonction regroupant les calculs précédents

In [15]:
def predictions(df):
    df.drop(columns={'ressenti_ext',
                     'radiation_ext',
                     'O3_ext',
                     'PM25_ext'},
            inplace=True)
    dfw = df[df.columns[8:]].copy()
    dfw.dropna(inplace=True)
    dfw.reset_index(inplace=True)
    dfw.drop(columns={'index'}, inplace=True)
    X = dfw[dfw.columns[6:]]
    y = dfw[dfw.columns[:6]]
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X))
    y = pd.DataFrame(scaler.fit_transform(y))
    for i in range(X.shape[1]):
        X.rename(columns={X.columns[i]: i}, inplace=True)
    df_res = pd.DataFrame()
    rs = [494, 115, 421, 354, 67, 310]
    R2_lr = []
    R2_mlp = []
    R2_knn = []
    reg_lss = linear_model.LinearRegression()
    mlp = MLPRegressor()
    for i in range(y.shape[1]):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y[y.columns[i]],
                                                            random_state=rs[i],
                                                            test_size=0.25)
        skl_lss = LassoCV().fit(X_train, y_train)
        lss_features = np.nonzero(skl_lss.coef_)[0].tolist()
        reg_lss.fit(X_train[lss_features].values, y_train)
        R2_reg_lss = reg_lss.score(X_test[lss_features].values, y_test)
        R2_lr.append(round(R2_reg_lss, 2))

        shuf_split = ShuffleSplit(5, test_size=0.33)

        def score_func(k):
            knn = KNeighborsRegressor(n_neighbors=k)
            scores = cross_val_score(estimator=knn,
                                     X=X_train,
                                     y=y_train,
                                     cv=shuf_split)
            return np.mean(scores)

        list_k = list(range(1, 80, 3))
        scores_per_k = list(map(score_func, list_k))
        best_k = list_k[np.argmax(scores_per_k)]
        knn = KNeighborsRegressor(n_neighbors=best_k).fit(X_train, y_train)
        R2_knn.append(round(knn.score(X_test, y_test), 2))
        mlp.fit(X_train, y_train)
        R2_mlp.append(round(mlp.score(X_test, y_test), 2))
    df_res['r2_linear_regression'] = R2_lr
    df_res['r2_knn'] = R2_knn
    df_res['r2_perceptron'] = R2_mlp
    df_res.index = dfw.columns.tolist()[:6]
    return df_res

### 4.2. Analyse par semestre

In [16]:
list_sem = df.semestre.unique().tolist()
dict_var = {}
results = []
liste_sem_ok = []
i = 0
j = 0
for sem in list_sem:
    try:
        pred = predictions(df[df.semestre == sem])
        results.append(pred)
        dict_var[sem] = j
        liste_sem_ok.append(sem)
        j += 1
        print(str(i) + "/" + str(len(list_sem)-1))
    except:
        pass
    i += 1

9/17
10/17
11/17
12/17
15/17


15:5: E722 do not use bare 'except'


16/17


In [17]:
liste_sem_ok

['2017S2', '2018S1', '2018S2', '2019S1', '2020S2', '2021S1']

In [29]:
results[dict_var['2017S2']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.62,0.7,0.68
NO2,0.59,0.65,0.71
PM10,0.39,0.45,0.5
CO2,0.85,0.9,0.93
TEMP,0.69,0.73,0.79
HUMI,0.71,0.75,0.79


In [19]:
results[dict_var['2018S1']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.2,0.44,0.49
NO2,0.3,0.51,0.54
PM10,0.09,0.25,0.35
CO2,0.88,0.91,0.93
TEMP,0.75,0.78,0.82
HUMI,0.29,0.48,0.59


In [20]:
results[dict_var['2018S2']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.2,0.34,0.42
NO2,0.32,0.5,0.55
PM10,0.18,0.24,0.26
CO2,0.85,0.91,0.93
TEMP,0.65,0.74,0.78
HUMI,0.44,0.57,0.62


In [21]:
results[dict_var['2019S1']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.35,0.61,0.58
NO2,0.4,0.62,0.61
PM10,0.13,0.36,0.22
CO2,0.79,0.88,0.88
TEMP,0.67,0.72,0.76
HUMI,0.39,0.53,0.56


In [22]:
results[dict_var['2020S2']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.14,0.33,0.36
NO2,0.28,0.47,0.55
PM10,0.15,0.19,0.17
CO2,0.84,0.87,0.91
TEMP,0.5,0.61,0.65
HUMI,0.33,0.54,0.57


In [23]:
results[dict_var['2021S1']]

Unnamed: 0,r2_linear_regression,r2_knn,r2_perceptron
NO,0.38,0.67,0.7
NO2,0.41,0.65,0.65
PM10,0.3,0.39,0.56
CO2,0.74,0.85,0.89
TEMP,0.75,0.78,0.82
HUMI,0.32,0.57,0.52


In [48]:
df_ecarts = pd.DataFrame()
dict_ec = {}
for i, res in enumerate(results):
    dict_ec[liste_sem_ok[i]] = round(results[i].r2_perceptron[:3].mean(), 2)
dict_ec

{'2017S2': 0.63,
 '2018S1': 0.46,
 '2018S2': 0.41,
 '2019S1': 0.47,
 '2020S2': 0.36,
 '2021S1': 0.64}