#### To do
Analyse des données manquantes :
- etrangers : Français de l'étranger => moyenne des étrangers ailleurs ?
- part des ménages imposés et quartiles de revenu : Français de l'étranger et DOM (à part Réunion et Martinique) => utiliser les données de la Réunion et de la Martinique pour les autres DOM ? et moyenne nationale pour les français de l'étranger ?
- chômage : Français de l'étranger => utiliser moyenne nationale ?
- diplômes et CSP : Français de l'étranger, petits DOM (Wallis et Futuna, Nouvelle Calédonie, etc...) => utiliser moyenne des autres DOM ? et moyenne nationale pour les français de l'étranger ?

Nouvelles features :
- Données géo par Dept (dummy sur position Littoral, métropole, péri-urbain à établir par la densité de pop, est-ouest, Nord-Sud)

In [39]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error

In [40]:
raw = pd.read_excel('data/dataframe_elections.xlsx')
print(raw.shape)

(36919, 64)


#### Retrait des colonnes et lignes inutiles

In [41]:
# drop colonnes
drop_list = ['c_dep', 'dep', 'circo', 'code', 'inscrits'] # données d'identification
drop_list.extend(['circo_parti', 'circo_nuance', 'nom',
                  'prenom', 'etiquette', 'nuance', 'voix', 'second_tour'])
raw.drop(drop_list, axis=1, inplace=True)

# drop lignes
raw = raw[raw['an'] > 1997]

print(raw.columns)
print(raw.shape)

Index(['an', 'etrangers', 'part_impose', 'revenus_q1', 'revenus_med',
       'revenus_q3', 'ecart_revenus', 'chom_tot', 'chom_tot_evol_5',
       'chom_jeunes', 'chom_jeunes_evol_5', 'chom_adultes',
       'chom_adultes_evol_5', 'chom_seniors', 'chom_seniors_evol_5', 'p_agri',
       'p_commercants', 'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers',
       'd_brevet', 'd_bep', 'd_bac', 'd_sup', 'circo_nuance_groupe',
       'circo_bloc', 'circo_leg_meme_nuance', 'nb_candidats_meme_bloc',
       'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'score_leg_exg',
       'score_leg_g', 'score_leg_c', 'score_leg_d', 'score_leg_exd',
       'circo_nuance_groupe_pres', 'circo_pres_meme_nuance',
       'score_nuance_groupe_pres', 'score_bloc_pres', 'score_pres_exg',
       'score_pres_g', 'score_pres_c', 'score_pres_d', 'score_pres_exd',
       'score_candidat_prec_leg', 'depute_sortant', 'sexe', 'nuance_groupe',
       'bloc', 'p_voix'],
      dtype='object')
(30565, 51)


In [42]:
raw.head(5)

Unnamed: 0,an,etrangers,part_impose,revenus_q1,revenus_med,revenus_q3,ecart_revenus,chom_tot,chom_tot_evol_5,chom_jeunes,...,score_pres_g,score_pres_c,score_pres_d,score_pres_exd,score_candidat_prec_leg,depute_sortant,sexe,nuance_groupe,bloc,p_voix
0,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,12536.579812,0.063,0.009,0.159,...,0.168572,0.174259,0.212527,0.19963,,0.0,F,SOC,Gauche,
1,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,12536.579812,0.063,0.009,0.159,...,0.168572,0.174259,0.212527,0.19963,,0.0,M,DIV,Divers,
2,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,12536.579812,0.063,0.009,0.159,...,0.168572,0.174259,0.212527,0.19963,0.224066,1.0,M,LR,Droite,
3,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,12536.579812,0.063,0.009,0.159,...,0.168572,0.174259,0.212527,0.19963,,0.0,M,FN,Exdroite,
4,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,12536.579812,0.063,0.009,0.159,...,0.168572,0.174259,0.212527,0.19963,,0.0,F,DIV,Divers,


#### Feature engineering

#### Gestion des NaN

In [43]:
# Score du candidat aux législatives précédentes
raw['score_candidat_prec_leg'] = raw['score_candidat_prec_leg'].fillna(0)

In [44]:
print(raw.shape)
print(raw.isnull().sum())

(30565, 51)
an                                 0
etrangers                        360
part_impose                     1089
revenus_q1                      1089
revenus_med                     1089
revenus_q3                      1089
ecart_revenus                   1089
chom_tot                         360
chom_tot_evol_5                  360
chom_jeunes                      360
chom_jeunes_evol_5               360
chom_adultes                     360
chom_adultes_evol_5              360
chom_seniors                     360
chom_seniors_evol_5              360
p_agri                           680
p_commercants                    680
p_cadres                         680
p_intermed                       680
p_employes                       680
p_ouvriers                       680
d_brevet                         680
d_bep                            680
d_bac                            680
d_sup                            680
circo_nuance_groupe                0
circo_bloc                

#### Dummies

In [45]:
raw_dummified = pd.get_dummies(
    raw, columns=['circo_nuance_groupe','circo_bloc', 'sexe','nuance_groupe',
                  'circo_nuance_groupe_pres', 'bloc'])

#### Création des ensembles de train/validation/test

In [46]:
def train_val_test(df, year_for_validation):
    return (df[(df.an != 2017) & (df.an != year_for_validation)].drop('an', axis=1),
            df[df.an == year_for_validation].drop('an', axis=1),
            df[df.an == 2017].drop('an', axis=1))

In [53]:
X_train, X_val, X_test = train_val_test(raw_dummified, 2012)

# retrait des lignes contenant les N/A
X_train = X_train.dropna(how='any')
X_val = X_val.dropna(how='any')

# Séparation labels et features
y_train, y_val, y_test = X_train.p_voix, X_val.p_voix, X_test.p_voix
X_train = X_train.drop('p_voix', axis=1)
X_val = X_val.drop('p_voix', axis=1)
X_test = X_test.drop('p_voix', axis=1)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

(15704, 95) (6276, 95) (7881, 95)
(15704,) (6276,) (7881,)


In [54]:
X_train.columns

Index(['etrangers', 'part_impose', 'revenus_q1', 'revenus_med', 'revenus_q3',
       'ecart_revenus', 'chom_tot', 'chom_tot_evol_5', 'chom_jeunes',
       'chom_jeunes_evol_5', 'chom_adultes', 'chom_adultes_evol_5',
       'chom_seniors', 'chom_seniors_evol_5', 'p_agri', 'p_commercants',
       'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers', 'd_brevet',
       'd_bep', 'd_bac', 'd_sup', 'circo_leg_meme_nuance',
       'nb_candidats_meme_bloc', 'score_nuance_groupe_prec_leg',
       'score_bloc_prec_leg', 'score_leg_exg', 'score_leg_g', 'score_leg_c',
       'score_leg_d', 'score_leg_exd', 'circo_pres_meme_nuance',
       'score_nuance_groupe_pres', 'score_bloc_pres', 'score_pres_exg',
       'score_pres_g', 'score_pres_c', 'score_pres_d', 'score_pres_exd',
       'score_candidat_prec_leg', 'depute_sortant', 'circo_nuance_groupe_COM',
       'circo_nuance_groupe_DIV', 'circo_nuance_groupe_DLF',
       'circo_nuance_groupe_DVD', 'circo_nuance_groupe_DVG',
       'circo_nuance_group

In [55]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_train)# Impute our data, then train
# X_train_imp = imp.transform(X_train)

In [56]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_test)# Impute our data, then train
# X_test_imp = imp.transform(X_test)

In [60]:
rf = RandomForestRegressor(n_jobs=-1, n_estimators=500)
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [61]:
y_pred = rf.predict(X_val)
mean_absolute_error(y_val, y_pred)

0.028195012336291729

In [62]:
sorted(list(zip(rf.feature_importances_*100, X_train.columns)),reverse=True)

[(33.627537032907611, 'score_candidat_prec_leg'),
 (21.700574523561329, 'score_nuance_groupe_pres'),
 (13.352540611560363, 'nuance_groupe_LR'),
 (5.2977284001199223, 'nuance_groupe_SOC'),
 (5.0304768354251364, 'score_bloc_prec_leg'),
 (3.6623374238290007, 'bloc_Droite'),
 (2.8866840571659438, 'score_nuance_groupe_prec_leg'),
 (1.7488126274541993, 'depute_sortant'),
 (0.85765676093044996, 'score_bloc_pres'),
 (0.57623091847726959, 'nuance_groupe_RDG'),
 (0.52030246425326676, 'score_leg_g'),
 (0.50223459789529468, 'score_pres_d'),
 (0.49617750343659328, 'score_pres_exg'),
 (0.39548873413124896, 'nb_candidats_meme_bloc'),
 (0.38449399545601037, 'score_pres_g'),
 (0.34596282768875003, 'bloc_Centre'),
 (0.34364983113172837, 'score_leg_d'),
 (0.34274968522555105, 'score_leg_exg'),
 (0.34248801934126055, 'score_pres_exd'),
 (0.31663691233924601, 'circo_pres_meme_nuance'),
 (0.31566561510993796, 'nuance_groupe_MDM'),
 (0.29979285481756585, 'score_pres_c'),
 (0.29772732204164021, 'score_leg_exd

In [63]:
print(y_pred[:10], y_val[:10])

[ 0.02331032  0.02787863  0.01341656  0.27024979  0.17158898  0.04539962
  0.00770905  0.00770905  0.04311651  0.02331032] 7881    0.014684
7882    0.003744
7883    0.019224
7884    0.224066
7885    0.231604
7886    0.002782
7887    0.002163
7888    0.001783
7889    0.092201
7890    0.002858
Name: p_voix, dtype: float64
