### To do
- passage en classification et implémentation métriques de score
- export résultats en JSON et CSV
- faire une PCA

In [150]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as pltz
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, auc, confusion_matrix

#### Retrait des colonnes et lignes inutiles / sélection de features

In [52]:
# load data
raw = pd.read_excel('data/dataframe_elections.xlsx')
print('Taille initiale du DF :', raw.shape)

# Données d'identification
id_list = ['an', 'code', 'nom', 'prenom', 'nuance_groupe', 'taux_vote_leg', 'second_tour']
df_id = raw[id_list]

# drop colonnes
drop_list = ['dep', 'circo', 'code', 'inscrits', 'circo_parti', 'circo_nuance',
             'nom', 'prenom', 'etiquette', 'nuance', 'voix', 'second_tour']
raw = raw.drop(drop_list, axis=1)

print('Taille du DF après retrait de 1997 et des colonnes inutiles :', raw.shape)

Taille initiale du DF : (36919, 76)
Taille du DF après retrait de 1997 et des colonnes inutiles : (36919, 64)


### Feature engineering

In [53]:
# à garder obligatoirement pour identifier les lignes et créer les labels
keep_list = ['an', 'c_dep', 'nuance_groupe', 'p_voix']

# Sélection des features
keep_list.extend([#'geo_frontalier', 'geo_dom', 'geo_idf',
                  'part_impose', 'chom_tot',
                  #'p_agri', 'p_ouvriers', 'd_brevet','d_sup',
                  'circo_leg_meme_nuance', 'circo_meme_nuance_president', 'circo_pres_meme_nuance',
                  'nb_candidats_meme_bloc', 'score_nuance_groupe_pres', 'score_candidat_prec_leg',
                  #'score_nuance_groupe_prec_leg', #(un peu mauvais)
                  #'score_bloc_pres', 'score_bloc_prec_leg', #(mauvais pour le score)
                  #'nuance_groupe',
                  'taux_vote_pres', 'taux_vote_leg',
                  'depute_sortant', 'ancien_depute', 'au_gouvernement', 'ancien_ministre',
                  'membre_majorite', 'sexe'
    ])

df = raw.drop([col for col in raw.columns if col not in keep_list], axis=1)
print(df.columns)

# Gestion des NaN au niveau du score du candidat aux législatives précédentes
years = [2002, 2007, 2012, 2017]

# TODO : POUR LES CANDIDATS "EN MARCHE", PAR QUOI REMPLIR LE SCORE DES CANDIDATS A LA PREC LEG ?

partis = ['EXG', 'COM', 'FI', 'DVG', 'ECO', 'SOC', 'RDG', 'MDM', 'UDI', 'DVD', 'LR', 'DLF', 'FN', 'EXD', 'DIV']
for parti in partis:
    print('\nRemplissage des valeurs manquantes pour {} :'.format(parti))
    for year in years:
        mask = (df['an']==year) & (df['nuance_groupe']==parti)
        mask_2 = (df['an']==year - 5) & (df['nuance_groupe']==parti)
        mean = np.mean(df['p_voix'][mask_2]) # moyenne des voix du parti 5 ans avant
        df.loc[mask, 'score_candidat_prec_leg'] = df['score_candidat_prec_leg'][mask].fillna(value=mean)
        print('La moyenne du parti en {} est {:.2f}%'.format(year - 5, mean * 100))

mean = np.mean(df.p_voix)
print('\nRemplissage des dernières valeurs manquantes pour la législative précédente avec {:.2f}%'.format(mean * 100))
df.score_candidat_prec_leg = df.score_candidat_prec_leg.fillna(value=mean)

# Gestion des Nan : Autres variables. Remplacement des valeurs manquantes par la moyenne nationale
features_a_completer = ['chom_tot', 'taux_vote_leg']
for feature in features_a_completer:
    print('\nRemplissage des valeurs manquantes pour {} :'.format(feature))
    for year in years:
        mask = (df['an']==year)
        mean = np.mean(df[feature][mask])
        print('La moyenne de la feature pour {} est {:.2f}%'.format(year, mean * 100))
        df.loc[mask, feature] = df[feature][mask].fillna(value=mean)
        
# Catégorisation
# df['score_candidat_prec_leg'] = pd.cut(df['score_candidat_prec_leg'], bins=[-1, -.01, .05, .1, .15, .2, 1],
#                                       labels=['N/A', 'A', 'B', 'C', 'D', 'E'])

# On retire les lignes de 1997
df = df[df.an > 1997]
df = df[df.nuance_groupe!='DIV']
df = df.drop(['c_dep', 'nuance_groupe'], axis=1)

Index(['an', 'c_dep', 'part_impose', 'chom_tot', 'circo_leg_meme_nuance',
       'nb_candidats_meme_bloc', 'taux_vote_leg', 'circo_pres_meme_nuance',
       'circo_meme_nuance_president', 'score_nuance_groupe_pres',
       'taux_vote_pres', 'score_candidat_prec_leg', 'depute_sortant',
       'ancien_depute', 'au_gouvernement', 'ancien_ministre',
       'membre_majorite', 'sexe', 'nuance_groupe', 'p_voix'],
      dtype='object')

Remplissage des valeurs manquantes pour EXG :
La moyenne du parti en 1997 est 1.37%
La moyenne du parti en 2002 est 0.79%
La moyenne du parti en 2007 est 0.86%
La moyenne du parti en 2012 est 0.31%

Remplissage des valeurs manquantes pour COM :
La moyenne du parti en 1997 est 6.78%
La moyenne du parti en 2002 est 3.67%
La moyenne du parti en 2007 est 3.03%
La moyenne du parti en 2012 est 1.16%

Remplissage des valeurs manquantes pour FI :
La moyenne du parti en 1997 est nan%
La moyenne du parti en 2002 est nan%
La moyenne du parti en 2007 est nan%
La moyenne du

In [54]:
print(df.shape)
print(df.isnull().sum())

(25974, 18)
an                                0
part_impose                       0
chom_tot                          0
circo_leg_meme_nuance             0
nb_candidats_meme_bloc            0
taux_vote_leg                     0
circo_pres_meme_nuance            0
circo_meme_nuance_president       0
score_nuance_groupe_pres          0
taux_vote_pres                    0
score_candidat_prec_leg           0
depute_sortant                    0
ancien_depute                     0
au_gouvernement                   0
ancien_ministre                   0
membre_majorite                   0
sexe                              0
p_voix                         6314
dtype: int64


#### Création des ensembles de train/validation/test

In [55]:
def train_val_splits(df, year_for_validation):
    drop_list = ['an', 'p_voix']
    df_train = df[(df.an != 2017) & (df.an != year_for_validation)].dropna(how='any')
    y_train = df_train.p_voix
    X_train = df_train.drop(drop_list, axis=1)
    
    df_val = df[df.an == year_for_validation].dropna(how='any')
    y_val = df_val.p_voix
    X_val = df_val.drop(drop_list, axis=1)
    
    return X_train, X_val, y_train, y_val

In [56]:
# Liste des variables dummies et binaires pour les exclure de la normalisation
dummies_list = ['bloc', 'an', 'p_voix', 'sexe',
                #'geo_frontalier', 'geo_dom', 'geo_idf',
                'circo_nuance_groupe', 'circo_bloc', 'nuance_groupe',
                'circo_nuance_groupe_pres', 'circo_pres_meme_nuance', 'circo_meme_nuance_president',
                'circo_leg_meme_nuance',
                'depute_sortant', 'ancien_depute', 'au_gouvernement', 'ancien_ministre', 'membre_majorite',
                'nb_candidats_meme_bloc', 'score_candidat_prec_leg']
scale_list = [col for col in df.columns if col not in dummies_list]

print('Variables à normaliser', scale_list)
df_dummified = pd.get_dummies(df, drop_first=True)
X_train, X_val, y_train, y_val = train_val_splits(df_dummified, 2012)

# Normalisation du dataframe
scaler = StandardScaler()
X_train[scale_list] = scaler.fit_transform(X_train[scale_list])
X_val[scale_list] = scaler.transform(X_val[scale_list])

print(X_train.shape, X_val.shape)
print(y_train.shape, y_val.shape)

Variables à normaliser ['part_impose', 'chom_tot', 'taux_vote_leg', 'score_nuance_groupe_pres', 'taux_vote_pres']
(13570, 16) (6090, 16)
(13570,) (6090,)


#### Exploration

In [57]:
X_train.head(5)

Unnamed: 0,part_impose,chom_tot,circo_leg_meme_nuance,nb_candidats_meme_bloc,taux_vote_leg,circo_pres_meme_nuance,circo_meme_nuance_president,score_nuance_groupe_pres,taux_vote_pres,score_candidat_prec_leg,depute_sortant,ancien_depute,au_gouvernement,ancien_ministre,membre_majorite,sexe_M
14488,0.184895,-0.92759,0.0,3.0,-0.247783,0.0,0.0,-0.835451,1.128834,0.036679,0.0,0.0,0.0,0.0,0.0,1
14489,0.184895,-0.92759,1.0,2.0,-0.247783,1.0,1.0,3.214634,1.128834,0.223836,0.0,0.0,0.0,0.0,1.0,1
14490,0.184895,-0.92759,0.0,1.0,-0.247783,0.0,0.0,0.382059,1.128834,0.070266,0.0,0.0,0.0,0.0,0.0,0
14491,0.184895,-0.92759,0.0,3.0,-0.247783,0.0,0.0,1.858272,1.128834,0.184033,0.0,0.0,0.0,0.0,0.0,1
14492,0.184895,-0.92759,0.0,3.0,-0.247783,0.0,0.0,-0.620596,1.128834,0.014068,0.0,0.0,0.0,0.0,0.0,0


#### Choix de modèle

In [160]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs=-1, n_estimators=150)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)

In [163]:
from sklearn.svm import SVR
reg = SVR()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_val)

### Analyser les prédictions

In [156]:
def get_predictions(y_pred, y_val, id_val):
    from scipy.stats import rankdata
    res = pd.concat([id_val, pd.DataFrame(y_val)], axis=1, join='inner')
    res['prediction'] = y_pred
    res['ecart'] = y_pred - res['p_voix']
    res['classement'] = 0
    res['qualif'] = 'N'
    
    for circo in set(res.code):
        # classement des candidats par circonscription
        res.loc[res.code==circo, 'classement'] = rankdata(-res.prediction[res.code==circo], method='ordinal')
        taux_participation = np.float(res.loc[(res.code==circo) & (res.classement==1), 'taux_vote_leg'])
        
        # simulation des qualifiés pour le second tour
        i = 1
        q = 0
        while True:
            if np.float(res.loc[(res.code==circo) & (res.classement==i), 'prediction']) / taux_participation >= .5:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'E'
                break
            elif np.float(res.loc[(res.code==circo) & (res.classement==i), 'prediction']) > .125:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'O'
                q += 1
            elif q < 2:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'O'
                q += 1
            else:
                break

            i += 1
    return res

In [157]:
sorted(list(zip(rf.feature_importances_*100, X_train.columns)), reverse=True)

[(77.906687295617118, 'score_candidat_prec_leg'),
 (6.3709851223534413, 'score_nuance_groupe_pres'),
 (3.0305127231099522, 'depute_sortant'),
 (2.0772624190282212, 'taux_vote_leg'),
 (1.9762975594353169, 'taux_vote_pres'),
 (1.6860396079427422, 'part_impose'),
 (1.5055620418198374, 'chom_tot'),
 (1.1598771489051736, 'nb_candidats_meme_bloc'),
 (0.922724630020615, 'membre_majorite'),
 (0.91578660376105026, 'circo_pres_meme_nuance'),
 (0.87328966235729055, 'circo_meme_nuance_president'),
 (0.79892900118047461, 'circo_leg_meme_nuance'),
 (0.39871462045172734, 'sexe_M'),
 (0.34804624492973379, 'ancien_depute'),
 (0.02283095983749394, 'ancien_ministre'),
 (0.0064543592498488515, 'au_gouvernement')]

In [158]:
df_predictions = get_predictions(y_pred, y_val, df_id)
n = df_predictions.shape[0]
print('Prédictions pour {} candidats.'.format(n))

ecarts_absolus = abs(df_predictions['ecart'])
max_ecart = np.max(ecarts_absolus)
grid = np.arange(max_ecart, step=.01)
ecarts_grid = []

for point in grid:
    ecarts_grid.append(np.sum(ecarts_absolus < point))

print('La MAE est : {:.2f}%'.format(mean_absolute_error(y_val, y_pred) * 100))
print('La plus grosse erreur est : {:.2f}%'.format(max_ecart * 100))
print('Le score AUC est : {:.2f}%'.format(auc(grid, ecarts_grid) * 100 / max_ecart / n))

point_list = [.01, .02, .05, .1]
ecart_list = [np.sum(ecarts_absolus < point) for point in point_list]

plt.figure(1, figsize=(14, 3))
plt.subplot(121)
plt.plot(grid, ecarts_grid)
for i in range(len(point_list)):
    plt.plot(point_list[i], ecart_list[i], 'ro')
    plt.text(point_list[i]+.005, ecart_list[i]-400, '{:.2f}%'.format(ecart_list[i]/n))

plt.title('Nb d\'obs en dessous d\'un seuil d\'erreur')
plt.xlabel('Seuils d\'erreur')
    
plt.subplot(122)
plt.hist(df_predictions['ecart'], bins=50)
plt.title('Répartition globale des erreurs')

# comparaison de la distribution des erreurs, parti par parti
from scipy.stats import gaussian_kde
plt.figure(2, figsize=(15,6))
x_grid = np.arange(min(df_predictions['ecart']), max(df_predictions['ecart']), step=.01)

nuances = {'FI': 'red','ECO': 'green', 'SOC': 'pink', 'MDM': 'orange', 'UDI': 'yellow',
           'LR': 'blue', 'DLF': 'darkblue', 'FN': 'black'}
for nuance in nuances:
    n_nuance = np.sum(df_predictions['nuance_groupe']==nuance)
    pdf = gaussian_kde(df_predictions['ecart'][df_predictions['nuance_groupe']==nuance],
                       bw_method=.9).evaluate(x_grid)
    plt.plot(x_grid, pdf, color=nuances[nuance], label=nuance, lw=2)

plt.title('Répartition des erreurs selon le parti')
plt.legend(loc='best')

plt.show()

TypeError: cannot convert the series to <class 'float'>

In [154]:
print('{} prédictions correctes sur {}'.format(np.sum(df_predictions.second_tour == df_predictions.qualif), n))
print('Matrice de confusion. En colonne les classes prédites, en ligne les classes réelles.')
confusion_matrix(df_predictions.second_tour, df_predictions.qualif, labels=['E', 'O', 'N'])

5692 prédictions correctes sur 6090
Matrice de confusion. En colonne les classes prédites, en ligne les classes réelles.


array([[   4,   31,    1],
       [   2,  937,  158],
       [   2,  204, 4751]])

In [146]:
df_predictions.sort_values(by='ecart').tail(5)

Unnamed: 0,an,code,nom,prenom,nuance_groupe,taux_vote_leg,second_tour,p_voix,prediction,ecart,classement,qualif
11167,2012,44|6,MADOUAS,HERVE,MDM,0.6293,N,0.0092,0.170853,0.161653,3,O
9547,2012,99|8,HOFFENBERG,VALERIE,LR,,O,0.0293,0.194825,0.165525,1,O
11334,2012,47|1,BONFANTI-DOSSAT,CHRISTINE,LR,0.6291,N,0.039,0.209495,0.170495,1,O
10518,2012,34|3,GRAND,JEAN-PIERRE,LR,0.6241,O,0.1076,0.286796,0.179196,1,O
9492,2012,99|4,MONTCHAMP,MARIE-ANNE,LR,,O,0.0504,0.2437,0.1933,1,O


In [147]:
df_predictions.sort_values(by='ecart').head(5)

Unnamed: 0,an,code,nom,prenom,nuance_groupe,taux_vote_leg,second_tour,p_voix,prediction,ecart,classement,qualif
13113,2012,975|1,GIRARDIN,ANNICK,RDG,0.6316,E,0.3398,0.051669,-0.288131,4,N
12340,2012,75|6,DUFLOT,CECILE,ECO,0.6073,O,0.289,0.018017,-0.270983,8,N
12414,2012,75|10,BAUPIN,DENIS,ECO,0.6603,O,0.2502,0.016284,-0.233916,9,N
10205,2012,65|2,DUBIE,JEANINE,RDG,0.6514,O,0.2555,0.045282,-0.210218,2,O
12759,2012,63|3,AUROI,DANIELLE,ECO,0.617,O,0.2183,0.013041,-0.205259,6,N


In [148]:
df_predictions[df_predictions.nuance_groupe=='DLF']

Unnamed: 0,an,code,nom,prenom,nuance_groupe,taux_vote_leg,second_tour,p_voix,prediction,ecart,classement,qualif
7882,2012,1|1,BAUDOUIN,GREGORY,DLF,0.5926,N,0.0037,0.006338,0.002638,10,N
7904,2012,1|3,ROCHET,DELPHINE,DLF,0.5474,N,0.0083,0.004705,-0.003595,8,N
7913,2012,1|4,VIANES,MICHELE,DLF,0.5793,N,0.0075,0.005109,-0.002391,9,N
7927,2012,1|5,SANSANO,BRIGITTE,DLF,0.5614,N,0.0041,0.005639,0.001539,13,N
7933,2012,2|1,GOSSET,MARIE-PAULE,DLF,0.6021,N,0.0042,0.005470,0.001270,10,N
7956,2012,2|3,PAGNIEZ,JEAN-CLAUDE,DLF,0.6071,N,0.0047,0.005940,0.001240,9,N
7969,2012,2|4,SAPORI,MICHELLE,DLF,0.5690,N,0.0045,0.004940,0.000440,11,N
7976,2012,2|5,POIRET,JEAN-CLAUDE,DLF,0.5974,N,0.0058,0.004676,-0.001124,7,N
7994,2012,3|2,TAILLANDIER,BERNARD,DLF,0.6202,N,0.0093,0.004414,-0.004886,10,N
8013,2012,4|1,EVANO,LUDIVINE,DLF,0.6530,N,0.0000,0.008723,0.008723,11,N


In [None]:
#df_predictions.to_excel('predictions.xlsx')