#### To do
Analyse des données manquantes :
- etrangers : Français de l'étranger => moyenne des étrangers ailleurs ?
- part des ménages imposés et quartiles de revenu : Français de l'étranger et DOM (à part Réunion et Martinique) => utiliser les données de la Réunion et de la Martinique pour les autres DOM ? et moyenne nationale pour les français de l'étranger ?
- chômage : Français de l'étranger => utiliser moyenne nationale ?
- diplômes et CSP : Français de l'étranger, petits DOM (Wallis et Futuna, Nouvelle Calédonie, etc...) => utiliser moyenne des autres DOM ? et moyenne nationale pour les français de l'étranger ?

Nouvelles features :
- Données géo par Dept (dummy sur position Littoral, métropole, péri-urbain à établir par la densité de pop, est-ouest, Nord-Sud)

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
raw = pd.read_excel('data/dataframe_elections.xlsx')
print(raw.shape)

(36919, 68)


#### Retrait des colonnes et lignes inutiles / sélection de features

In [3]:
# drop colonnes
drop_list = ['c_dep', 'dep', 'circo', 'code', 'inscrits'] # données d'identification
drop_list.extend(['circo_parti', 'circo_nuance', 'nom',
                  'prenom', 'etiquette', 'nuance', 'voix', 'second_tour'])
raw.drop(drop_list, axis=1, inplace=True)

# drop lignes
raw = raw[raw['an'] > 1997]

print(raw.columns)
print(raw.shape)

Index(['an', 'etrangers', 'part_impose', 'revenus_q1', 'revenus_med',
       'revenus_q3', 'ecart_revenus', 'chom_tot', 'chom_tot_evol_5',
       'chom_jeunes', 'chom_jeunes_evol_5', 'chom_adultes',
       'chom_adultes_evol_5', 'chom_seniors', 'chom_seniors_evol_5', 'p_agri',
       'p_commercants', 'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers',
       'd_brevet', 'd_bep', 'd_bac', 'd_sup', 'circo_nuance_groupe',
       'circo_bloc', 'circo_leg_meme_nuance', 'nb_candidats_meme_bloc',
       'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'taux_vote_leg',
       'score_leg_exg', 'score_leg_g', 'score_leg_c', 'score_leg_d',
       'score_leg_exd', 'score_leg_div', 'circo_nuance_groupe_pres',
       'circo_pres_meme_nuance', 'score_nuance_groupe_pres', 'score_bloc_pres',
       'taux_vote_pres', 'score_pres_exg', 'score_pres_g', 'score_pres_c',
       'score_pres_d', 'score_pres_exd', 'score_pres_div',
       'score_candidat_prec_leg', 'depute_sortant', 'sexe', 'nuance_grou

### Feature engineering

#### Gestion des NaN

In [4]:
# Score du candidat aux législatives précédentes
raw['score_candidat_prec_leg'] = raw['score_candidat_prec_leg'].fillna(0.05)
#raw['score_candidat_prec_leg'] = raw['score_candidat_prec_leg'].fillna(value=raw['score_candidat_prec_leg'].mean())

In [5]:
print(raw.shape)
print(raw.isnull().sum())

(30565, 55)
an                                 0
etrangers                        360
part_impose                     1089
revenus_q1                      1089
revenus_med                     1089
revenus_q3                      1089
ecart_revenus                   1089
chom_tot                         360
chom_tot_evol_5                  360
chom_jeunes                      360
chom_jeunes_evol_5               360
chom_adultes                     360
chom_adultes_evol_5              360
chom_seniors                     360
chom_seniors_evol_5              360
p_agri                           680
p_commercants                    680
p_cadres                         680
p_intermed                       680
p_employes                       680
p_ouvriers                       680
d_brevet                         680
d_bep                            680
d_bac                            680
d_sup                            680
circo_nuance_groupe                0
circo_bloc                

#### Création des ensembles de train/validation/test

In [6]:
def train_val_splits(df, year_for_validation):
    df_train = df[(df.an != 2017) & (df.an != year_for_validation)].dropna(how='any')
    y_train = df_train.p_voix
    X_train = df_train.drop(['an', 'p_voix'], axis=1)
    
    df_val = df[df.an == year_for_validation].dropna(how='any')
    y_val = df_val.p_voix
    X_val = df_val.drop(['an', 'p_voix'], axis=1)
    
    return X_train, X_val, y_train, y_val

In [7]:
# Transformation des variables dummies
dummies_list = ['circo_nuance_groupe','circo_bloc', 'sexe','nuance_groupe',
                'circo_nuance_groupe_pres', 'bloc', 'an', 'p_voix']
scale_list = [col for col in raw.columns if col not in dummies_list]
print(scale_list)

raw_dummified = pd.get_dummies(raw, drop_first=True)
X_train, X_val, y_train, y_val = train_val_splits(raw_dummified, 2012)

# Normalisation du dataframe
scaler = StandardScaler()
X_train[scale_list] = scaler.fit_transform(X_train[scale_list])
X_val[scale_list] = scaler.transform(X_val[scale_list])

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

['etrangers', 'part_impose', 'revenus_q1', 'revenus_med', 'revenus_q3', 'ecart_revenus', 'chom_tot', 'chom_tot_evol_5', 'chom_jeunes', 'chom_jeunes_evol_5', 'chom_adultes', 'chom_adultes_evol_5', 'chom_seniors', 'chom_seniors_evol_5', 'p_agri', 'p_commercants', 'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers', 'd_brevet', 'd_bep', 'd_bac', 'd_sup', 'circo_leg_meme_nuance', 'nb_candidats_meme_bloc', 'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'taux_vote_leg', 'score_leg_exg', 'score_leg_g', 'score_leg_c', 'score_leg_d', 'score_leg_exd', 'score_leg_div', 'circo_pres_meme_nuance', 'score_nuance_groupe_pres', 'score_bloc_pres', 'taux_vote_pres', 'score_pres_exg', 'score_pres_g', 'score_pres_c', 'score_pres_d', 'score_pres_exd', 'score_pres_div', 'score_candidat_prec_leg', 'depute_sortant']
(15704, 93) (6276, 93)
(15704,) (6276,)


#### Exploration

In [8]:
X_train.head(5)

Unnamed: 0,etrangers,part_impose,revenus_q1,revenus_med,revenus_q3,ecart_revenus,chom_tot,chom_tot_evol_5,chom_jeunes,chom_jeunes_evol_5,...,nuance_groupe_MDM,nuance_groupe_RDG,nuance_groupe_REM,nuance_groupe_SOC,nuance_groupe_UDI,bloc_Divers,bloc_Droite,bloc_Exdroite,bloc_Exgauche,bloc_Gauche
14488,0.410384,0.135974,0.423524,0.55709,0.535236,0.494874,-1.195229,1.22362,-0.716497,0.926675,...,0,0,0,0,0,0,0,0,1,0
14489,0.410384,0.135974,0.423524,0.55709,0.535236,0.494874,-1.195229,1.22362,-0.716497,0.926675,...,0,0,0,0,0,0,1,0,0,0
14490,0.410384,0.135974,0.423524,0.55709,0.535236,0.494874,-1.195229,1.22362,-0.716497,0.926675,...,0,0,0,0,0,0,0,1,0,0
14491,0.410384,0.135974,0.423524,0.55709,0.535236,0.494874,-1.195229,1.22362,-0.716497,0.926675,...,0,0,0,1,0,0,0,0,0,1
14492,0.410384,0.135974,0.423524,0.55709,0.535236,0.494874,-1.195229,1.22362,-0.716497,0.926675,...,0,0,0,0,0,0,0,0,0,1


In [9]:
X_val.head(5)

Unnamed: 0,etrangers,part_impose,revenus_q1,revenus_med,revenus_q3,ecart_revenus,chom_tot,chom_tot_evol_5,chom_jeunes,chom_jeunes_evol_5,...,nuance_groupe_MDM,nuance_groupe_RDG,nuance_groupe_REM,nuance_groupe_SOC,nuance_groupe_UDI,bloc_Divers,bloc_Droite,bloc_Exdroite,bloc_Exgauche,bloc_Gauche
7881,0.453903,0.121356,1.204968,0.849285,0.475729,-0.021573,-0.797027,0.925056,-0.394528,0.371543,...,0,0,0,0,0,0,0,0,0,1
7882,0.453903,0.121356,1.204968,0.849285,0.475729,-0.021573,-0.797027,0.925056,-0.394528,0.371543,...,0,0,0,0,0,0,1,0,0,0
7883,0.453903,0.121356,1.204968,0.849285,0.475729,-0.021573,-0.797027,0.925056,-0.394528,0.371543,...,0,0,0,0,0,0,0,0,1,0
7884,0.453903,0.121356,1.204968,0.849285,0.475729,-0.021573,-0.797027,0.925056,-0.394528,0.371543,...,0,0,0,0,0,0,1,0,0,0
7885,0.453903,0.121356,1.204968,0.849285,0.475729,-0.021573,-0.797027,0.925056,-0.394528,0.371543,...,0,0,0,1,0,0,0,0,0,1


In [55]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_train)# Impute our data, then train
# X_train_imp = imp.transform(X_train)

In [56]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_test)# Impute our data, then train
# X_test_imp = imp.transform(X_test)

In [10]:
rf = RandomForestRegressor(n_jobs=-1, n_estimators=100)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [11]:
y_pred = rf.predict(X_val)
mean_absolute_error(y_val, y_pred)

0.026114582104281959

In [12]:
sorted(list(zip(rf.feature_importances_*100, X_train.columns)), reverse=True)

[(34.859212944767947, 'score_candidat_prec_leg'),
 (19.225046332362393, 'score_nuance_groupe_pres'),
 (14.630188900097851, 'nuance_groupe_LR'),
 (6.0371014973552368, 'nuance_groupe_SOC'),
 (4.4285044866398957, 'score_bloc_prec_leg'),
 (3.3508303747520984, 'bloc_Droite'),
 (2.9412161677119517, 'score_nuance_groupe_prec_leg'),
 (1.7281204167575128, 'depute_sortant'),
 (0.84657157730254917, 'score_bloc_pres'),
 (0.6561059990428596, 'nuance_groupe_MDM'),
 (0.59195080841233072, 'nuance_groupe_RDG'),
 (0.5199726562636432, 'score_leg_g'),
 (0.45396257228714731, 'score_pres_d'),
 (0.44786126551191713, 'score_pres_exg'),
 (0.37783043843723441, 'nb_candidats_meme_bloc'),
 (0.35920271619358718, 'taux_vote_leg'),
 (0.35491779334346479, 'circo_pres_meme_nuance'),
 (0.35144581467039793, 'score_leg_exg'),
 (0.34578809942037486, 'score_pres_g'),
 (0.33322387493955258, 'sexe_M'),
 (0.32409648159744503, 'score_leg_d'),
 (0.31156971048235627, 'score_pres_exd'),
 (0.29641720997673821, 'score_leg_exd'),
 (

In [70]:
print(y_pred[:10], y_val[:10])

[ 0.01335725  0.01090169  0.025709    0.29174825  0.16173332  0.0219576
  0.01080922  0.01080922  0.04448367  0.01335725] 7881    0.0147
7882    0.0037
7883    0.0192
7884    0.2241
7885    0.2316
7886    0.0028
7887    0.0022
7888    0.0018
7889    0.0922
7890    0.0029
Name: p_voix, dtype: float64


In [None]:
#TODO : concatener l'identifiant de la ligne (année, circo, nom), la vraie valeur et la prédiction
# repérer les grands écarts
def print_prediction(y_pred, y_true):
    return 