#### To do
Analyse des données manquantes :
- etrangers : Français de l'étranger => moyenne des étrangers ailleurs ?
- part des ménages imposés et quartiles de revenu : Français de l'étranger et DOM (hormis la Réunion et la Martinique) => utiliser les données de la Réunion et de la Martinique pour les autres DOM ? et la moyenne nationale pour les Français de l'étranger ?
- chômage : Français de l'étranger => utiliser moyenne nationale ?
- diplômes et CSP : Français de l'étranger, petits DOM (Wallis-et-Futuna, Nouvelle-Calédonie, etc.) => utiliser la moyenne des autres DOM ? et la moyenne nationale pour les Français de l'étranger ?

Nouvelles features :
- Données géographiques par département (dummies de position : littoral, métropole, péri-urbain à établir par la densité de population, est-ouest, nord-sud)

In [70]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

#### Retrait des colonnes et lignes inutiles / sélection de features

In [94]:
# Read the elections dataset from the Excel export and report its size.
raw = pd.read_excel('data/dataframe_elections.xlsx')
print('Taille initiale du DF :', raw.shape)

# Keep only the rows whose 'an' value is strictly greater than 1997.
raw = raw.loc[raw['an'] > 1997]

# Identification columns, set aside so predictions can be labelled afterwards.
id_list = ['an', 'code', 'nom', 'prenom', 'nuance_groupe', 'second_tour']
df_id = raw[id_list]

# Columns removed from the modelling frame.
drop_list = [
    'c_dep', 'dep', 'circo', 'code', 'inscrits', 'circo_parti', 'circo_nuance',
    'nom', 'prenom', 'etiquette', 'nuance', 'voix', 'second_tour',
]
raw = raw.drop(drop_list, axis=1)

print(raw.columns)
print('Taille du DF après retrait de 1997 et des colonnes inutiles :', raw.shape)

Taille initiale du DF : (36919, 68)
Index(['an', 'etrangers', 'part_impose', 'revenus_q1', 'revenus_med',
       'revenus_q3', 'ecart_revenus', 'chom_tot', 'chom_tot_evol_5',
       'chom_jeunes', 'chom_jeunes_evol_5', 'chom_adultes',
       'chom_adultes_evol_5', 'chom_seniors', 'chom_seniors_evol_5', 'p_agri',
       'p_commercants', 'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers',
       'd_brevet', 'd_bep', 'd_bac', 'd_sup', 'circo_nuance_groupe',
       'circo_bloc', 'circo_leg_meme_nuance', 'nb_candidats_meme_bloc',
       'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'taux_vote_leg',
       'score_leg_exg', 'score_leg_g', 'score_leg_c', 'score_leg_d',
       'score_leg_exd', 'score_leg_div', 'circo_nuance_groupe_pres',
       'circo_pres_meme_nuance', 'score_nuance_groupe_pres', 'score_bloc_pres',
       'taux_vote_pres', 'score_pres_exg', 'score_pres_g', 'score_pres_c',
       'score_pres_d', 'score_pres_exd', 'score_pres_div',
       'score_candidat_prec_leg', 'd

### Feature engineering

#### Retrait des features qui dégradent le modèle

In [95]:
# 'an' and 'p_voix' are mandatory: they identify the rows and provide the labels.
# The remaining entries are the features retained for the model.
keep_list = ['an', 'p_voix'] + [
    'part_impose', 'chom_tot', 'p_agri', 'p_ouvriers', 'd_brevet', 'd_sup',
    'circo_leg_meme_nuance', 'nb_candidats_meme_bloc', 'score_nuance_groupe_prec_leg',
    'score_bloc_prec_leg', 'taux_vote_leg', 'score_nuance_groupe_pres', 'score_bloc_pres',
    'taux_vote_pres', 'score_candidat_prec_leg', 'depute_sortant',
]

# Drop every column of `raw` that is not listed in keep_list.
discarded_cols = [name for name in raw.columns if name not in keep_list]
df = raw.drop(discarded_cols, axis=1)

In [96]:
# Show the columns remaining in df after feature selection.
print(df.columns)

Index(['an', 'part_impose', 'chom_tot', 'p_agri', 'p_ouvriers', 'd_brevet',
       'd_sup', 'circo_leg_meme_nuance', 'nb_candidats_meme_bloc',
       'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'taux_vote_leg',
       'score_nuance_groupe_pres', 'score_bloc_pres', 'taux_vote_pres',
       'score_candidat_prec_leg', 'depute_sortant', 'p_voix'],
      dtype='object')


#### Gestion des NaN

In [97]:
# Candidate's score at the previous legislative election: missing values are
# replaced by the constant 0.05.
# NOTE(review): the rationale for 0.05 is not documented in this notebook.
prev_score_col = 'score_candidat_prec_leg'
df[prev_score_col] = df[prev_score_col].fillna(0.05)
# Disabled alternative (fill with the column mean):
#raw['score_candidat_prec_leg'] = raw['score_candidat_prec_leg'].fillna(value=raw['score_candidat_prec_leg'].mean())

In [98]:
# Report the frame size and the number of missing values per column.
print(df.shape)
print(df.isnull().sum())

(30565, 18)
an                                 0
part_impose                     1089
chom_tot                         360
p_agri                           680
p_ouvriers                       680
d_brevet                         680
d_sup                            680
circo_leg_meme_nuance              0
nb_candidats_meme_bloc             0
score_nuance_groupe_prec_leg     185
score_bloc_prec_leg              185
taux_vote_leg                    185
score_nuance_groupe_pres           0
score_bloc_pres                    0
taux_vote_pres                     0
score_candidat_prec_leg            0
depute_sortant                     0
p_voix                          7881
dtype: int64


#### Création des ensembles de train/validation/test

In [99]:
def train_val_splits(df, year_for_validation, test_year=2017):
    """Split `df` by election year into training and validation sets.

    Rows whose `an` equals `year_for_validation` form the validation set.
    Rows whose `an` equals `test_year` are held out entirely. All remaining
    rows form the training set. In both sets, rows containing any missing
    value are dropped, `p_voix` is used as the label and the columns `an`
    and `p_voix` are removed from the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `an` and `p_voix`.
    year_for_validation : value comparable to `df.an`
        Year whose rows are used for validation.
    test_year : value comparable to `df.an`, default 2017
        Year excluded from training (previously hard-coded to 2017).

    Returns
    -------
    tuple
        `(X_train, X_val, y_train, y_val)`.
    """
    def _features_and_labels(rows):
        # Drop incomplete rows first so that features and labels stay aligned.
        complete = rows.dropna(how='any')
        return complete.drop(['an', 'p_voix'], axis=1), complete.p_voix

    is_validation = df.an == year_for_validation
    is_test = df.an == test_year

    X_train, y_train = _features_and_labels(df[~is_test & ~is_validation])
    X_val, y_val = _features_and_labels(df[is_validation])

    return X_train, X_val, y_train, y_val

In [100]:
# Columns that are excluded from standardisation (categorical / dummy-like
# columns, the year and the label). Names absent from df are simply ignored.
dummies_list = ['circo_nuance_groupe','circo_bloc', 'sexe','nuance_groupe',
                'circo_nuance_groupe_pres', 'bloc', 'an', 'p_voix', 'depute_sortant',
               'nb_candidats_meme_bloc']
not_scaled = set(dummies_list)
scale_list = [name for name in df.columns if name not in not_scaled]
print(scale_list)

# One-hot encode, then split with 2012 as the validation year.
df_dummified = pd.get_dummies(df, drop_first=True)
X_train, X_val, y_train, y_val = train_val_splits(df_dummified, 2012)

# Standardise: the scaler is fitted on the training set only and then applied
# to both the training and the validation features.
scaler = StandardScaler().fit(X_train[scale_list])
X_train[scale_list] = scaler.transform(X_train[scale_list])
X_val[scale_list] = scaler.transform(X_val[scale_list])

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

['part_impose', 'chom_tot', 'p_agri', 'p_ouvriers', 'd_brevet', 'd_sup', 'circo_leg_meme_nuance', 'score_nuance_groupe_prec_leg', 'score_bloc_prec_leg', 'taux_vote_leg', 'score_nuance_groupe_pres', 'score_bloc_pres', 'taux_vote_pres', 'score_candidat_prec_leg']
(15704, 16) (6276, 16)
(15704,) (6276,)


#### Exploration

In [101]:
# Preview the first five rows of the scaled training features.
X_train.head()

Unnamed: 0,part_impose,chom_tot,p_agri,p_ouvriers,d_brevet,d_sup,circo_leg_meme_nuance,nb_candidats_meme_bloc,score_nuance_groupe_prec_leg,score_bloc_prec_leg,taux_vote_leg,score_nuance_groupe_pres,score_bloc_pres,taux_vote_pres,score_candidat_prec_leg,depute_sortant
14488,0.135974,-1.195229,-0.396878,0.202611,-0.775392,0.188107,-0.251611,3.0,-0.5568,-0.877061,-0.428999,-0.714429,-0.952695,1.205046,-0.188453,0.0
14489,0.135974,-1.195229,-0.396878,0.202611,-0.775392,0.188107,3.974383,2.0,3.281535,1.492936,-0.428999,3.426466,1.897607,1.205046,-0.188453,0.0
14490,0.135974,-1.195229,-0.396878,0.202611,-0.775392,0.188107,-0.251611,1.0,0.351531,-0.421217,-0.428999,0.53038,-0.502589,1.205046,-0.188453,0.0
14491,0.135974,-1.195229,-0.396878,0.202611,-0.775392,0.188107,-0.251611,3.0,2.415104,0.999024,-0.428999,2.039692,0.90427,1.205046,-0.188453,0.0
14492,0.135974,-1.195229,-0.396878,0.202611,-0.775392,0.188107,-0.251611,3.0,-0.537346,0.999024,-0.428999,-0.494757,0.90427,1.205046,-0.188453,0.0


In [102]:
# Preview the first five rows of the scaled validation features.
X_val.head()

Unnamed: 0,part_impose,chom_tot,p_agri,p_ouvriers,d_brevet,d_sup,circo_leg_meme_nuance,nb_candidats_meme_bloc,score_nuance_groupe_prec_leg,score_bloc_prec_leg,taux_vote_leg,score_nuance_groupe_pres,score_bloc_pres,taux_vote_pres,score_candidat_prec_leg,depute_sortant
7881,0.121356,-0.797027,-0.489147,0.010294,-1.424594,0.671551,-0.251611,3.0,-0.359271,0.870177,-1.263185,-0.638216,0.829991,0.692942,-0.188453,0.0
7882,0.121356,-0.797027,-0.489147,0.010294,-1.424594,0.671551,-0.251611,3.0,-0.508914,1.655946,-1.263185,-0.881798,1.298944,0.692942,-1.122242,0.0
7883,0.121356,-0.797027,-0.489147,0.010294,-1.424594,0.671551,-0.251611,3.0,-0.694471,-0.944412,-1.263185,0.259899,-0.549151,0.692942,-0.188453,0.0
7884,0.121356,-0.797027,-0.489147,0.010294,-1.424594,0.671551,3.974383,3.0,3.494028,1.655946,-1.263185,2.682271,1.298944,0.692942,4.274708,1.0
7885,0.121356,-0.797027,-0.489147,0.010294,-1.424594,0.671551,-0.251611,3.0,2.141258,0.870177,-1.263185,2.083029,0.829991,0.692942,2.519729,0.0


In [103]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_train)# Impute our data, then train
# X_train_imp = imp.transform(X_train)

In [104]:
# imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# imp = imp.fit(X_test)# Impute our data, then train
# X_test_imp = imp.transform(X_test)

In [105]:
# Random forest regressor with 100 trees, fitted on the training set.
# NOTE(review): no random_state is passed, so the fitted model (and the scores
# computed from it) may differ from one run to the next.
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
rf.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [106]:
# Predict on the validation features and display the mean absolute error.
y_pred = rf.predict(X_val)
mean_absolute_error(y_true=y_val, y_pred=y_pred)

0.02267941285992315

In [107]:
# Feature importances (as percentages) paired with their column names,
# listed from most to least important.
sorted(zip(rf.feature_importances_ * 100, X_train.columns), reverse=True)

[(36.642684789778684, 'score_candidat_prec_leg'),
 (34.168038094960977, 'score_nuance_groupe_pres'),
 (10.484556928638465, 'score_bloc_prec_leg'),
 (4.4072714951170822, 'score_nuance_groupe_prec_leg'),
 (3.3688967966145449, 'score_bloc_pres'),
 (2.3455852242476403, 'taux_vote_pres'),
 (2.1970749166121966, 'depute_sortant'),
 (1.1560311882009366, 'nb_candidats_meme_bloc'),
 (1.0169174518508997, 'taux_vote_leg'),
 (0.89866096833246278, 'part_impose'),
 (0.82332377721972916, 'chom_tot'),
 (0.6303756377067028, 'd_brevet'),
 (0.59723231342542293, 'p_agri'),
 (0.57372265293731406, 'p_ouvriers'),
 (0.4984009543970363, 'd_sup'),
 (0.19122680995990179, 'circo_leg_meme_nuance')]

### Analyser les prédictions

In [108]:
def get_predictions(y_pred, y_val, id_val):
    """Join predictions and true labels with the identification columns.

    The predictions are attached to the index of `y_val` *before* joining with
    `id_val`, so each prediction stays on its own row even when `id_val` is
    ordered differently from `y_val` or lacks some of its rows. (Assigning
    `y_pred` positionally after the join would misalign or fail in those cases.)

    Parameters
    ----------
    y_pred : array-like
        Predicted values, in the same order as the rows of `y_val`.
    y_val : pandas.Series
        True values; must be named `p_voix`.
    id_val : pandas.DataFrame
        Identification columns, sharing index labels with `y_val`.

    Returns
    -------
    pandas.DataFrame
        Columns of `id_val`, followed by `p_voix`, `prediction` and `ecart`
        (prediction minus `p_voix`), restricted to index labels present in
        both `id_val` and `y_val`.
    """
    # Copy so that adding columns can never touch the caller's data.
    scores = pd.DataFrame(y_val).copy()
    # y_pred is positionally aligned with y_val, so attach it here, pre-join.
    scores['prediction'] = np.asarray(y_pred)
    scores['ecart'] = scores['prediction'] - scores['p_voix']
    res = pd.concat([id_val, scores], axis=1, join='inner')
    return res

In [109]:
# Combine validation predictions with the candidates' identification columns.
df_predictions = get_predictions(y_pred=y_pred, y_val=y_val, id_val=df_id)

In [110]:
# Ten rows with the highest 'ecart' values (largest over-predictions).
# DataFrame.sort is deprecated and removed from later pandas releases;
# sort_values is the supported equivalent.
df_predictions.sort_values('ecart').tail(10)

  if __name__ == '__main__':


Unnamed: 0,an,code,nom,prenom,nuance_groupe,second_tour,p_voix,prediction,ecart
11680,2012,68|6,BASCHUNG,REGIS,MDM,N,0.009,0.139737,0.130737
8096,2012,6|4,GUIBAL,JEAN-CLAUDE,LR,O,0.1857,0.318824,0.133124
12601,2012,77|10,THOMAS,CLAUDINE,LR,O,0.1294,0.273665,0.144265
12815,2012,80|2,BOUZY,OLIVIER,MDM,N,0.0161,0.165137,0.149037
9732,2012,35|7,CLEMENT,PASCAL,FN,N,0.0447,0.194634,0.149934
13536,2012,92|9,GUEANT,CLAUDE,LR,O,0.1799,0.336332,0.156432
10986,2012,59|10,VANNESTE,CHRISTIAN,DVD,N,0.0612,0.221716,0.160516
9729,2012,35|7,BELLOIR,NICOLAS,LR,N,0.1065,0.273915,0.167415
13369,2012,91|8,BETEILLE,LAURENT,LR,N,0.0558,0.228246,0.172446
9578,2012,34|3,GRAND,JEAN-PIERRE,LR,O,0.1076,0.282324,0.174724


In [111]:
# Ten rows with the lowest 'ecart' values (largest under-predictions).
# DataFrame.sort is deprecated and removed from later pandas releases;
# sort_values is the supported equivalent.
df_predictions.sort_values('ecart').head(10)

  if __name__ == '__main__':


Unnamed: 0,an,code,nom,prenom,nuance_groupe,second_tour,p_voix,prediction,ecart
12187,2012,75|6,DUFLOT,CECILE,ECO,O,0.289,0.021455,-0.267545
12261,2012,75|10,BAUPIN,DENIS,ECO,O,0.2502,0.020818,-0.229382
8912,2012,25|2,ALAUZET,ERIC,ECO,O,0.2248,0.016326,-0.208474
12421,2012,76|2,MOINET,VERONIQUE,ECO,O,0.2138,0.012161,-0.201639
11357,2012,63|3,AUROI,DANIELLE,ECO,O,0.2183,0.01919,-0.19911
9942,2012,38|9,BONNETON,MICHELE,ECO,O,0.2116,0.013644,-0.197956
13917,2012,95|2,VUILLETET,GUILLAUME,ECO,O,0.207,0.014185,-0.192815
12709,2012,78|7,RICHARD,ARNAUD,UDI,O,0.1834,0.003854,-0.179546
10559,2012,52|1,ANDRIOT,PATRICIA,ECO,O,0.1935,0.014084,-0.179416
10652,2012,54|4,HOUCHARD,MARIE-NEIGE,ECO,O,0.1888,0.01093,-0.17787
