In [66]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error

In [50]:
# Load the elections workbook into a DataFrame and report its (rows, columns) size.
raw = pd.read_excel(io='data/dataframe_elections.xlsx')
print(raw.shape)

(36919, 60)


#### Retrait des colonnes et lignes inutiles

In [51]:
# Columns removed before modelling.
drop_list = ['c_dep', 'dep', 'circo', 'code', 'inscrits']  # identification data
drop_list.extend(['circo_parti', 'circo_nuance', 'nom',
                  'prenom', 'etiquette', 'nuance', 'voix', 'second_tour'])
# Rebind `raw` instead of dropping in place, and ignore labels that are already
# gone, so that re-running this cell does not raise on the second execution.
raw = raw.drop(drop_list, axis=1, errors='ignore')
print(raw.shape)

(36919, 47)


In [52]:
# Preview the first five rows (5 is the default for head()).
raw.head()

Unnamed: 0,an,etrangers,part_impose,revenus_q1,revenus_med,revenus_q3,chom_tot,chom_tot_evol_5,chom_jeunes,chom_jeunes_evol_5,...,score_leg_exg,score_leg_g,score_leg_c,score_leg_d,score_leg_exd,nb_candidats_bloc,sexe,nuance_groupe,bloc,p_voix
0,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,0.063,0.009,0.159,0.02,...,0.023171,0.249146,0.0,0.230592,0.092201,2,F,SOC,Gauche,
1,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,0.063,0.009,0.159,0.02,...,0.023171,0.249146,0.0,0.230592,0.092201,2,M,DIV,-,
2,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,0.063,0.009,0.159,0.02,...,0.023171,0.249146,0.0,0.230592,0.092201,1,M,LR,Droite,
3,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,0.063,0.009,0.159,0.02,...,0.023171,0.249146,0.0,0.230592,0.092201,1,M,FN,Exdroite,
4,2017,0.079252,0.622351,13515.653227,19274.362957,26052.233039,0.063,0.009,0.159,0.02,...,0.023171,0.249146,0.0,0.230592,0.092201,2,F,DIV,-,


#### Feature engineering

Analyse des données manquantes :
- etrangers : Français de l'étranger => moyenne des étrangers ailleurs ?
- part des ménages imposés et quartiles de revenu : Français de l'étranger et DOM (à part Réunion et Martinique) => utiliser les données de la Réunion et de la Martinique pour les autres DOM ?
- chômage : Français de l'étranger => utiliser moyenne nationale ?
- diplômes et CSP : Français de l'étranger, petits DOM (Wallis et Futuna, Nouvelle Calédonie, etc...) => utiliser moyenne des autres DOM ?
- score du candidat aux législatives précédentes => remplacer les NA par un 0
- couleur de la circonscription avant les élections : il manque les données pour les circonscriptions nouvellement créées et utilisées pour la 1ère fois en 2012 => pour la métropole, appliquer la couleur dominante du département. Pour les Français de l'étranger : règle à déterminer (à faire directement sur le fichier Excel)
- résultats des présidentielles par bloc : Nouvelle-Calédonie et Polynésie française => trouver sur internet des résultats au niveau du DOM en question.


In [53]:
# Missing-value handling: a missing previous-legislative-election score is replaced by 0.
raw['score_candidat_prec_leg'] = raw['score_candidat_prec_leg'].fillna(0)

# Report how many missing values remain in each column.
missing_per_column = raw.isnull().sum()
print(missing_per_column)

an                             0
etrangers                   6714
part_impose                 7443
revenus_q1                  7443
revenus_med                 7443
revenus_q3                  7443
chom_tot                    7768
chom_tot_evol_5             7768
chom_jeunes                 7768
chom_jeunes_evol_5          7768
chom_adultes                7768
chom_adultes_evol_5         7768
chom_seniors                7768
chom_seniors_evol_5         7768
p_agri                      7034
p_commercants               7034
p_cadres                    7034
p_intermed                  7034
p_employes                  7034
p_ouvriers                  7034
d_brevet                    7034
d_bep                       7034
d_bac                       7034
d_sup                       7034
score_candidat_prec_leg        0
depute_sortant              6354
circo_nuance_groupe         6813
circo_bloc                  6813
circo_meme_nuance            459
score_nuance_pres            192
score_pres

In [54]:
# One-hot encode the categorical columns; each dummy column is prefixed with
# the name of the column it was derived from.
categorical_cols = ['circo_nuance_groupe', 'circo_bloc', 'sexe', 'nuance_groupe', 'bloc']
raw_dummised = pd.get_dummies(
    raw,
    columns=categorical_cols,
    prefix=categorical_cols,
)

#### Création des ensembles de train/validation/test

In [60]:
# Split by election year: 2002 and 2007 are used for training, 2012 for testing.
# 'p_voix' is the target; every other column is a feature.
is_test_year = raw_dummised.an == 2012
is_train_year = raw_dummised.an.isin([2002, 2007])

test_rows = raw_dummised[is_test_year]
train_rows = raw_dummised[is_train_year]

X_test = test_rows.drop('p_voix', axis=1)
X_train = train_rows.drop('p_voix', axis=1)

y_test = test_rows['p_voix']
y_train = train_rows['p_voix']

In [62]:
# Learn per-column means on the training features and use them to fill the
# missing values of the training set. `imp` stays fitted on the training data.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_train_imp = imp.fit_transform(X_train)

In [63]:
# Fill missing test values with the column means learned on the TRAINING set.
# The imputer must not be re-fitted on X_test: doing so leaks test-set
# statistics into the evaluation and can make the train and test matrices
# inconsistent (e.g. a column that is entirely NaN in only one of the two sets
# would be dropped from that set's output only).
# Relies on `imp` having been fitted on X_train in the previous cell.
X_test_imp = imp.transform(X_test)

In [None]:
# Disabled cell (kept commented out): would fit a 100-tree random forest
# regressor on the imputed training features, using all available cores.
# NOTE(review): no random_state is set, so results would vary between runs.
# rf = RandomForestRegressor(n_jobs=-1, n_estimators=100)
# rf.fit(X_train_imp, y_train)

In [None]:
# Disabled cell (kept commented out): would predict on the imputed test
# features and report the mean absolute error against y_test.
# y_pred = rf.predict(X_test_imp)
# mean_absolute_error(y_test, y_pred)

In [None]:
# Disabled cell (kept commented out): would list feature importances (in
# percent) paired with the training column names, largest first.
# NOTE(review): the pairing assumes the imputer kept every column of X_train
# in the same order — confirm before relying on it.
# sorted(list(zip(rf.feature_importances_*100, X_train.columns)),reverse=True)