### To do


In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import Imputer, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, r2_score
from sklearn.metrics import auc, confusion_matrix, make_scorer
from sklearn.pipeline import make_pipeline

### Loading the data

In [2]:
raw = pd.read_excel('data/dataframe_elections.xlsx')
print('Taille initiale du DF :', raw.shape)

# Données d'identification
id_list = ['an', 'dep', 'circo', 'code', 'nom', 'prenom', 'nuance', 'nuance_groupe', 'bloc',
           'taux_vote_leg', 'second_tour']
df_id = raw[id_list]

# drop colonnes
drop_list = ['dep', 'circo', 'code', 'inscrits', 'circo_parti', 'circo_nuance',
             'nom', 'prenom', 'etiquette', 'nuance', 'voix', 'second_tour']
raw = raw.drop(drop_list, axis=1)

print('Taille du DF après retrait des colonnes inutiles :', raw.shape)

Taille initiale du DF : (36919, 81)
Taille du DF après retrait des colonnes inutiles : (36919, 69)


In [131]:
def generate_df(raw_data, keep_list, penalty=.8, filter_div=False):
    # à garder obligatoirement pour identifier les lignes et créer les labels
    keep_list.extend(['an', 'c_dep', 'nuance_groupe', 'p_inscrits', 'p_exprimes'])

    df = raw_data.drop([col for col in raw_data.columns if col not in keep_list], axis=1)
    #print(df.columns)

    # Gestion des NaN au niveau du score du candidat aux législatives précédentes
    years = [2002, 2007, 2012, 2017]
    partis = ['EXG', 'COM', 'FI', 'DVG', 'ECO', 'SOC', 'RDG', 'MDM',
              'UDI', 'DVD', 'LR', 'DLF', 'FN', 'EXD', 'REG', 'DIV']
    
    # TODO : POUR LES CANDIDATS "EN MARCHE", PAR QUOI REMPLIR LE SCORE DES CANDIDATS A LA PREC LEG ?
    
    if 'score_candidat_prec_leg_ins' in keep_list:
        for parti in partis:
            #print('\nRemplissage des valeurs manquantes pour {} :'.format(parti))
            for year in years:
                mask = (df['an']==year) & (df['nuance_groupe']==parti)
                mask_2 = (df['an']==year - 5) & (df['nuance_groupe']==parti)
                mean = np.mean(df['p_inscrits'][mask_2]) * penalty # moyenne des voix du parti 5 ans avant (% d'inscrits)
                df.loc[mask, 'score_candidat_prec_leg_ins'] = df['score_candidat_prec_leg_ins'][mask].fillna(value=mean)
                #print('La moyenne du parti en {} est {:.2f}%'.format(year - 5, mean * 100))

        mean = np.mean(df.p_inscrits) * penalty
        df.score_candidat_prec_leg_ins = df.score_candidat_prec_leg_ins.fillna(value=mean)
        
    if 'score_candidat_prec_leg_expr' in keep_list:
        for parti in partis:
            #print('\nRemplissage des valeurs manquantes pour {} :'.format(parti))
            for year in years:
                mask = (df['an']==year) & (df['nuance_groupe']==parti)
                mask_2 = (df['an']==year - 5) & (df['nuance_groupe']==parti)
                mean = np.mean(df['p_exprimes'][mask_2]) * penalty # moyenne des voix du parti 5 ans avant (% exprimés)
                df.loc[mask, 'score_candidat_prec_leg_expr'] = df['score_candidat_prec_leg_expr'][mask].fillna(value=mean)
                #print('La moyenne du parti en {} est {:.2f}%'.format(year - 5, mean * 100))

        mean = np.mean(df.p_exprimes)
        df.score_candidat_prec_leg_expr = df.score_candidat_prec_leg_expr.fillna(value=mean)

    # Gestion des NaN : Autres variables. Remplacement des valeurs manquantes par la moyenne nationale
    features_a_completer = ['chom_tot', 'chom_jeunes', 'chom_adultes', 'chom_seniors',
                           'p_agri', 'p_commercants', 'p_cadres', 'p_intermed', 'p_employes',
                           'p_ouvriers', 'd_brevet', 'd_bep', 'd_bac', 'd_sup']
    for feature in features_a_completer:
        if feature in keep_list:
            #print('\nRemplissage des valeurs manquantes pour {} :'.format(feature))
            for year in years:
                mask = (df['an']==year)
                mean = np.mean(df[feature][mask])
                #print('La moyenne de la feature pour {} est {:.2f}%'.format(year, mean * 100))
                df.loc[mask, feature] = df[feature][mask].fillna(value=mean)

    # Catégorisation
    if 'score_candidat_prec_leg_cat' in keep_list:
        if 'score_candidat_prec_leg_expr' in keep_list:
            df['score_candidat_prec_leg_cat'] = pd.cut(df['score_candidat_prec_leg_expr'],
                                                       bins=[-1, -.01, .05, .1, .15, .25, 1],
                                                       labels=['N/A', 'A', 'B', 'C', 'D', 'E'])
        elif 'score_candidat_prec_leg_ins' in keep_list:
            df['score_candidat_prec_leg_cat'] = pd.cut(df['score_candidat_prec_leg_ins'],
                                                       bins=[-1, -.01, .05, .1, .15, .25, 1],
                                                       labels=['N/A', 'A', 'B', 'C', 'D', 'E'])

    df = df[df.an > 1997]
    df = df[df.c_dep!=99] # Retrait des Français de l'Etranger
    if filter_div: # On retire les candidats "Divers" (pour l'entraînement ET la prédiction)
        df = df[df.nuance_groupe!='DIV']
    
    
    df = df.drop(['c_dep', 'nuance_groupe'], axis=1)
    
    print('Format du dataframe :', df.shape)
    print(df.isnull().sum())
    
    return df

In [69]:
def train_val_splits(df, year_for_validation, target, scaler=MinMaxScaler()):
    # Liste des variables dummies et binaires pour les exclure de la normalisation
    dummies_list = ['bloc', 'sexe', 'an', 'p_inscrits', 'p_exprimes', 'dep', 'circo',
                    'geo_frontalier', 'geo_dom', 'geo_idf',
                    'circo_nuance_groupe', 'circo_bloc', 'nuance_groupe',
                    'circo_nuance_groupe_pres', 'circo_pres_meme_nuance', 'circo_meme_nuance_president',
                    'circo_leg_meme_nuance',
                    'depute_sortant', 'ancien_depute', 'au_gouvernement', 'ancien_ministre', 'membre_majorite',
                    #'nb_candidats_meme_bloc', 'nb_candidats_circo',
                    'score_candidat_prec_leg_cat',
                    'score_candidat_prec_leg_expr', 'score_candidat_prec_leg_ins',
                    #'nb_acces_second_tour'
                   ]
    scale_list = [col for col in df.columns if col not in dummies_list]

    df = pd.get_dummies(df, drop_first=True)
    
    drop_list = ['an', 'p_inscrits', 'p_exprimes']
    df_train = df[(df.an != 2017) & (df.an != year_for_validation)].dropna(how='any')
    y_train = df_train[target]
    X_train = df_train.drop(drop_list, axis=1)
    
    if year_for_validation != 2017:
        df_val = df[df.an == year_for_validation].dropna(how='any')
    else: # on ne drop pas car pour 2017 toutes les valeurs cibles sont des N/A
        df_val = df[df.an == year_for_validation]
    
    y_val = df_val[target]
    X_val = df_val.drop(drop_list, axis=1)
    
    # Normalisation du dataframe
    if scaler:
        print('Variables à normaliser', scale_list)
        X_train[scale_list] = scaler.fit_transform(X_train[scale_list])
        X_val[scale_list] = scaler.transform(X_val[scale_list])

    print('Format des ensembles de train et de validation :')
    print(X_train.shape, X_val.shape)
    
    return X_train, X_val, y_train, y_val

In [70]:
def get_classif(y_pred, y_val, id_val, target, strategy):
    from scipy.stats import rankdata
    
    res = pd.concat([id_val, pd.DataFrame(y_val)], axis=1, join='inner')
    res['prediction'] = y_pred
    res['classement'] = 0
    res['qualif'] = 'N'
    
    for circo in set(res.code):
        # classement des candidats par circonscription
        res.loc[res.code==circo, 'classement'] = rankdata(-res.prediction[res.code==circo], method='ordinal')
        
        # normalisation des votes pour que la somme soit égale à 100%
        res.loc[res.code==circo, 'prediction'] /= np.sum(res.loc[res.code==circo, 'prediction'])
        
        if target=='p_exprimes':
            taux_E = 1
            taux_O = np.float(res.loc[(res.code==circo) & (res.classement==1), 'taux_vote_leg'])
        elif target=='p_inscrits':
            taux_E = 1 / np.float(res.loc[(res.code==circo) & (res.classement==1), 'taux_vote_leg'])
            taux_O = 1
            
        # simulation des qualifiés pour le second tour
        i = 1
        q = 0
        while True:
            #print(res.loc[(res.code==circo) & (res.classement==i), 'prediction'].values)
            if (res.loc[(res.code==circo) & (res.classement==i), 'prediction'].values) * taux_E >= .5:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'E'
                if strategy == 'O':
                    q += 1
                elif strategy == 'E':
                    break
            elif (res.loc[(res.code==circo) & (res.classement==i), 'prediction'].values) * taux_O >= .125:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'O'
                q += 1
            elif q < 2:
                res.loc[(res.code==circo) & (res.classement==i), 'qualif'] = 'O'
                q += 1
            else:
                break

            i += 1
        
    res['ecart'] = res['prediction'] - res[target]
    
    return res

In [71]:
def analyze_predictions(df_predictions, target):
    n = df_predictions.shape[0]
    print('Prédictions pour {} candidats.'.format(n))

    ecarts_absolus = abs(df_predictions.ecart)
    max_ecart = np.max(ecarts_absolus)
    grid = np.arange(max_ecart, step=.01)
    ecarts_grid = []

    for point in grid:
        ecarts_grid.append(np.sum(ecarts_absolus < point))

    print('La MAE est : {:.2f}%'.format(
            mean_absolute_error(df_predictions[target], df_predictions.prediction) * 100))
    print('La plus grosse erreur est : {:.2f}%'.format(max_ecart * 100))
    print('Le score r2 est : {:.2f}'.format(r2_score(df_predictions[target], df_predictions.prediction)))

    point_list = [.01, .02, .05, .1]
    ecart_list = [np.sum(ecarts_absolus < point) for point in point_list]

    plt.figure(1, figsize=(13, 3))
    plt.subplot(121)
    plt.plot(grid, ecarts_grid)
    for i in range(len(point_list)):
        plt.plot(point_list[i], ecart_list[i], 'ro')
        plt.text(point_list[i]+.005, ecart_list[i]-400, '{:.2f}%'.format(ecart_list[i]/n))

    plt.title('Nb d\'obs en dessous d\'un seuil d\'erreur')
    plt.xlabel('Seuils d\'erreur')

    plt.subplot(122)
    plt.hist(df_predictions['ecart'], bins=50)
    plt.title('Répartition globale des erreurs')

    # comparaison de la distribution des erreurs, parti par parti
    from scipy.stats import gaussian_kde
    plt.figure(2, figsize=(15,4))
    x_grid = np.arange(min(df_predictions['ecart']), max(df_predictions['ecart']), step=.01)
    
    nuances = {'FI': 'red','ECO': 'green', 'SOC': 'pink', 'RDG': 'grey', 'MDM': 'orange', 'UDI': 'yellow',
               'LR': 'blue', 'DLF': 'darkblue', 'FN': 'black', 'REG': 'teal'}
    for nuance in nuances:
        n_pred = df_predictions[(df_predictions['nuance_groupe']==nuance)
                                & ((df_predictions['qualif']=='E')
                               | (df_predictions['qualif']=='O'))].shape[0]
        n_reel = df_predictions[(df_predictions['nuance_groupe']==nuance)
                                & ((df_predictions['second_tour']=='E')
                               | (df_predictions['second_tour']=='O'))].shape[0]
        print('{}: {} prédits | Réel {}'.format(nuance, n_pred, n_reel))
        try: # cas où le parti n'est pas représenté à l'élection prédite
            pdf = gaussian_kde(df_predictions['ecart'][df_predictions['nuance_groupe']==nuance],
                               bw_method=.9).evaluate(x_grid)
            plt.plot(x_grid, pdf, color=nuances[nuance], label=nuance, lw=2)
        except:
            pass

    plt.title('Répartition des erreurs selon le parti')
    plt.legend(loc='best')

    n_correct = np.sum(df_predictions.second_tour == df_predictions.qualif)
    print('\n{} prédictions correctes sur {} (soit {:.1f}%)'.format(n_correct, n, 100 * n_correct / n))

    print('\nMatrice de confusion. En colonne les classes prédites, en ligne les classes réelles.')
    cm = confusion_matrix(df_predictions.second_tour, df_predictions.qualif, labels=['E', 'O', 'N'])
    print(cm)

    f1_E = f1_score(df_predictions.second_tour, df_predictions.qualif, labels='E', average='micro')
    print('Score F1 pour la classe E : {:.1f}%'.format(f1_E * 100))
    f1_O = f1_score(df_predictions.second_tour, df_predictions.qualif, labels='O', average='micro')
    print('Score F1 pour la classe O : {:.1f}%'.format(f1_O * 100))
    f1_N = f1_score(df_predictions.second_tour, df_predictions.qualif, labels='N', average='micro')
    print('Score F1 pour la classe N : {:.1f}%'.format(f1_N * 100))
    plt.show()
    
    return

### Choix des paramètres

In [369]:
# Sélection des features
keep_list = [#'geo_frontalier', 'geo_dom', 'geo_idf',
    'part_impose', 'chom_tot',
    'chom_jeunes', 'chom_adultes', 'chom_seniors',
    'revenus_q1',
    'revenus_med',
    'revenus_q3',
    #'ecart_revenus',
    'p_agri', 'p_commercants', 'p_cadres', 'p_intermed', 'p_employes',
    'p_ouvriers', 'd_brevet', 'd_bep', 'd_bac', 'd_sup',
    #'taux_vote_pres',
    'sexe',
    'circo_leg_meme_nuance', 'circo_pres_meme_nuance',
    #'circo_meme_nuance_president',
    'nb_candidats_meme_bloc',
    'nb_candidats_circo',
    'score_nuance_groupe_pres', 'score_bloc_pres',
    #'score_pres_exg', 'score_pres_g', 'score_pres_c', 'score_pres_d', 'score_pres_exd', 'score_pres_div',
    #'score_candidat_prec_leg_ins',
    #'score_candidat_prec_leg_expr',
    #'score_candidat_prec_leg_cat',
    #'score_nuance_groupe_prec_leg', #(un peu mauvais)
    #'score_bloc_prec_leg', #(mauvais pour le score)
    #'nuance_groupe',
    'depute_sortant', 'ancien_depute', 'au_gouvernement', 'ancien_ministre', 'membre_majorite',
    'nb_acces_second_tour'
]

scaler = MinMaxScaler()
#scaler = StandardScaler()

# Prédire les score en % des voix exprimées ou % des inscrits sur les listes électorales
target_values = 'p_exprimes' # 'p_inscrits'
target_year = 2017

In [370]:
# Taux par lequel on multiplie les score à la législative précédente qui étaient à la base des N/A
df = generate_df(raw, keep_list, penalty=.9, filter_div=False)
X_train, X_val, y_train, y_val = train_val_splits(df, target_year, target_values, scaler)

Format du dataframe : (30205, 34)
an                             0
part_impose                    0
revenus_q1                     0
revenus_med                    0
revenus_q3                     0
chom_tot                       0
chom_jeunes                    0
chom_adultes                   0
chom_seniors                   0
p_agri                         0
p_commercants                  0
p_cadres                       0
p_intermed                     0
p_employes                     0
p_ouvriers                     0
d_brevet                       0
d_bep                          0
d_bac                          0
d_sup                          0
circo_leg_meme_nuance          0
circo_pres_meme_nuance         0
score_nuance_groupe_pres       0
score_bloc_pres                0
depute_sortant                 0
ancien_depute                  0
au_gouvernement                0
ancien_ministre                0
membre_majorite                0
nb_acces_second_tour           0
nb_candid

In [371]:
X_train.columns

Index(['part_impose', 'revenus_q1', 'revenus_med', 'revenus_q3', 'chom_tot',
       'chom_jeunes', 'chom_adultes', 'chom_seniors', 'p_agri',
       'p_commercants', 'p_cadres', 'p_intermed', 'p_employes', 'p_ouvriers',
       'd_brevet', 'd_bep', 'd_bac', 'd_sup', 'circo_leg_meme_nuance',
       'circo_pres_meme_nuance', 'score_nuance_groupe_pres', 'score_bloc_pres',
       'depute_sortant', 'ancien_depute', 'au_gouvernement', 'ancien_ministre',
       'membre_majorite', 'nb_acces_second_tour', 'nb_candidats_meme_bloc',
       'nb_candidats_circo', 'sexe_M'],
      dtype='object')

### Choix de modèle

In [372]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs=-1, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
sorted(list(zip(rf.feature_importances_*100, X_train.columns)), reverse=True)

[(55.583048394024345, 'score_nuance_groupe_pres'),
 (14.923681170925649, 'ancien_depute'),
 (6.3518388017963971, 'membre_majorite'),
 (5.6728303725751594, 'score_bloc_pres'),
 (4.4499016785494883, 'nb_acces_second_tour'),
 (1.0266256449033224, 'nb_candidats_meme_bloc'),
 (0.8907342190158245, 'depute_sortant'),
 (0.88603782634228867, 'nb_candidats_circo'),
 (0.71994412603818159, 'revenus_q1'),
 (0.7150911750031349, 'revenus_q3'),
 (0.71406082331041965, 'part_impose'),
 (0.66764542230964152, 'chom_jeunes'),
 (0.66674843997368582, 'd_brevet'),
 (0.56576804536622904, 'd_bac'),
 (0.52237692332292274, 'revenus_med'),
 (0.51789181715213084, 'sexe_M'),
 (0.50518224930232603, 'circo_leg_meme_nuance'),
 (0.45633801798861562, 'd_bep'),
 (0.45331423468593202, 'p_commercants'),
 (0.44755900643814739, 'p_employes'),
 (0.43480495309438144, 'chom_adultes'),
 (0.42265417750849121, 'chom_seniors'),
 (0.41675004566139939, 'p_agri'),
 (0.40333573038672393, 'd_sup'),
 (0.39472345184897262, 'chom_tot'),
 (0

In [374]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

clf = XGBRegressor(n_estimators=80, learning_rate=9e-2, max_depth=2)
# param_grid = {'n_estimators': [75, 100, 125], #140, 150, 160: 160
#               'max_depth' : [2, 3, 5], #70
#               'learning_rate' : [0.001, 0.01, 0.1], #0.05, 0.1, 0.2 : 0.05
#               'min_child_weight' : [1], #4,5,6 :4
#               'reg_alpha' : [0, 1, 2], #8, 10, 12 : 8
#               'reg_lambda' : [0, 1, 2] #0.7
#              }

# rs = RandomizedSearchCV(clf, param_grid, cv=3, n_jobs=-1, verbose=10)
# rs.fit(X_train, y_train)
# clf = rs.best_estimator_
clf.fit(X_train, y_train)
y_pred_xgb = clf.predict(X_val)
if target_year != 2017:
    mean_absolute_error(y_pred_xgb, y_val) * 100

In [375]:
from sklearn.neural_network import MLPRegressor
reg = MLPRegressor((100, 100, 100), activation='relu', solver='adam', alpha=1e-4, batch_size=200,
                   learning_rate='constant', learning_rate_init=5e-3, max_iter=500, tol=1e-6, verbose=True,
                  power_t=.5, early_stopping=True)
reg.fit(X_train, y_train)
y_pred_mlp = reg.predict(X_val)

Iteration 1, loss = 0.00342305
Validation score: 0.791894
Iteration 2, loss = 0.00139441
Validation score: 0.829323
Iteration 3, loss = 0.00134981
Validation score: 0.820424
Iteration 4, loss = 0.00131613
Validation score: 0.828551
Iteration 5, loss = 0.00131537
Validation score: 0.831842
Iteration 6, loss = 0.00125924
Validation score: 0.833783
Iteration 7, loss = 0.00127000
Validation score: 0.826490
Iteration 8, loss = 0.00122499
Validation score: 0.837085
Iteration 9, loss = 0.00119247
Validation score: 0.846304
Iteration 10, loss = 0.00122248
Validation score: 0.836551
Iteration 11, loss = 0.00121233
Validation score: 0.839322
Iteration 12, loss = 0.00123660
Validation score: 0.829586
Validation score did not improve more than tol=0.000001 for two consecutive epochs. Stopping.


In [376]:
# Mix des prédictions des modèles
y_pred = np.mean(np.array([y_pred_rf,
                           y_pred_mlp,
                           y_pred_xgb
                          ]), axis=0)

In [377]:
# si 'O' : on note les potentiels qualifiés dès le 1er tour mais on indique aussi le candidat arrivé 2ème
# si 'E' : si un candidat est classé 'E', ses concurrents sont automatiquement tous classés 'N'
df_predictions = get_classif(y_pred, y_val, df_id, target_values, strategy='O')
if target_year!=2017:
    analyze_predictions(df_predictions, target_values)

In [378]:
print('Parti:  1er |  2e | 3e | 4e')
print('---------------------------')

# création JSON pour histogramme et affichage résultats
summary = []
for parti in ['EXG', 'COM', 'FI', 'DVG', 'ECO', 'SOC', 'RDG', 'MDM', 'REM',
              'UDI', 'DVD', 'LR', 'DLF', 'FN', 'EXD', 'REG', 'DIV']:
    a = (df_predictions.classement==1) & (df_predictions.nuance == parti)
    b = (df_predictions.classement==2) & (df_predictions.qualif == 'O') & (df_predictions.nuance == parti)
    c = (df_predictions.classement==3) & (df_predictions.qualif == 'O') & (df_predictions.nuance == parti)
    d = (df_predictions.classement==4) & (df_predictions.qualif == 'O') & (df_predictions.nuance == parti)
    
    print('{:4} :  {:3} | {:3} | {:2} | {:1}'.format(parti, df_predictions[a].shape[0], df_predictions[b].shape[0],
                                   df_predictions[c].shape[0], df_predictions[d].shape[0]))
    
    if parti in ['FI', 'SOC', 'MDM', 'REM', 'UDI', 'LR', 'DLF', 'FN']:
        ligne_df = {'Parti': parti, '1er': df_predictions[a].shape[0], '2e': df_predictions[b].shape[0],
                   '3e': df_predictions[c].shape[0]}
        if df_predictions[d].shape[0]: # cas de quadrangulaire
            ligne_df['4e'] = df_predictions[d].shape[0]
            
        summary.append(ligne_df)
    
print(summary)
df_histo = pd.DataFrame(summary).set_index('Parti')
df_histo.to_json('dataviz/data/histogramme_partis_second_tour.json', orient='split')

Parti:  1er |  2e | 3e | 4e
---------------------------
EXG  :    0 |   0 |  0 | 0
COM  :    3 |   1 |  0 | 0
FI   :    3 |  87 |  0 | 0
DVG  :   16 |  12 |  1 | 0
ECO  :    5 |   4 |  0 | 0
SOC  :   84 |  54 |  0 | 0
RDG  :    3 |   2 |  0 | 0
MDM  :   37 |  29 |  0 | 0
REM  :  248 | 181 |  5 | 0
UDI  :   18 |  10 |  0 | 0
DVD  :    3 |   7 |  0 | 0
LR   :  124 |  96 |  4 | 0
DLF  :    1 |   0 |  0 | 0
FN   :   18 |  83 | 14 | 0
EXD  :    1 |   0 |  0 | 0
REG  :    1 |   0 |  0 | 0
DIV  :    1 |   0 |  0 | 0
[{'Parti': 'FI', '1er': 3, '3e': 0, '2e': 87}, {'Parti': 'SOC', '1er': 84, '3e': 0, '2e': 54}, {'Parti': 'MDM', '1er': 37, '3e': 0, '2e': 29}, {'Parti': 'REM', '1er': 248, '3e': 5, '2e': 181}, {'Parti': 'UDI', '1er': 18, '3e': 0, '2e': 10}, {'Parti': 'LR', '1er': 124, '3e': 4, '2e': 96}, {'Parti': 'DLF', '1er': 1, '3e': 0, '2e': 0}, {'Parti': 'FN', '1er': 18, '3e': 14, '2e': 83}]


In [384]:
df_predictions.sort_values(by='prediction').head(3)

Unnamed: 0,an,dep,circo,code,nom,prenom,nuance,nuance_groupe,bloc,taux_vote_leg,second_tour,p_exprimes,prediction,classement,qualif,ecart
5379,2017,PARIS,5,75|5,WELGRYN,LOU,DIV,DIV,Divers,0.5961,,,0.005399,20,N,
5368,2017,PARIS,5,75|5,EDERICH RIGAUDIERE,CARINE,DIV,DIV,Divers,0.5961,,,0.005399,19,N,
5323,2017,PARIS,2,75|2,ZANGHELLINI,ANNE,DIV,DIV,Divers,0.6362,,,0.005598,24,N,


In [383]:
df_predictions.sort_values(by='prediction').tail(3)

Unnamed: 0,an,dep,circo,code,nom,prenom,nuance,nuance_groupe,bloc,taux_vote_leg,second_tour,p_exprimes,prediction,classement,qualif,ecart
4259,2017,MAINE-ET-LOIRE,5,49|5,MASSEGLIA,DENIS,REM,REM,Centre,0.5683,,,0.487124,1,O,
7086,2017,VAL-DE-MARNE,7,94|7,BRIDEY,JEAN-JACQUES,REM,REM,Centre,0.5695,,,0.488532,1,O,
3106,2017,HAUTS-DE-SEINE,9,92|9,SOLERE,THIERRY,LR,LR,Droite,0.5916,,,0.608598,1,E,


#### Save predictions

In [385]:
from datetime import datetime
now = datetime.now()
stamp = '{}_{}_{}h{}'.format(now.day, now.month, now.hour, now.minute)

# fichier pour simulation second tour
liste_id = ['an', 'dep', 'circo', 'code', 'nom', 'prenom', 'nuance', 'bloc']
liste_features = ['circo_leg_meme_nuance', 'circo_pres_meme_nuance', 'score_bloc_pres', 'depute_sortant',
                  'ancien_depute', 'au_gouvernement', 'ancien_ministre', 'membre_majorite']
df_second_tour = pd.concat([df_id[liste_id], X_val[liste_features],
                            df_predictions[['prediction', 'qualif']]], axis=1, join='inner')
df_predictions.to_excel('data/predictions_du_{}.xlsx'.format(stamp))
df_second_tour.to_excel('data/pred_format_second_tour_du_{}.xlsx'.format(stamp))

In [382]:
exportCsv = df_predictions[['dep','circo','code','prenom','nom','nuance','prediction','qualif']]
exportCsv['qualif'] = exportCsv['qualif'].apply(lambda x : x.replace('E','O'))
exportCsv['code'] = exportCsv['code'].apply(str.lower)
exportCsv['code'] = exportCsv['code'].apply(str).apply(lambda x : x.replace('|', '-'))
exportCsv['candidat'] = exportCsv['prenom'].apply(str.title) + ' ' + exportCsv['nom'].apply(str.title)
exportCsv['nom circo'] = exportCsv['dep'].apply(
    lambda x : x.replace('-', ' ')).apply(str.title) + ' - ' + exportCsv['circo'].apply(str)
exportCsv = exportCsv.drop(['dep', 'circo', 'prenom', 'nom'], axis=1)
exportCsv1 = exportCsv[exportCsv['qualif'] == 'O']
exportCsv1 = exportCsv1.drop(['qualif'], axis=1)
exportCsv1 = exportCsv1.sort_values(['code', 'prediction'], ascending=[True, False])
#print(exportCsv1.head())

circos = exportCsv1['code'].unique()
temp = pd.DataFrame()
d = {}
for c in circos:
    temp = exportCsv1[exportCsv1['code'] == c]
    l = []
    for index, rows in temp.iterrows():
        l.extend(rows[1:].values)
    d[c] = l

    
duels = {k: v for k, v in d.items() if len(v) < 9}
triangulaires = {k: v for k, v in d.items() if len(v) > 9}
#quadrangulaires = {k: v for k, v in d.items() if len(v) > 10}

duels = pd.DataFrame(duels)
duels.index = ['color1','score1','candidat1','nom circo','color2','score2','candidat2','nom circo']
duels = duels.T
duels = duels.iloc[:,:7]
#print(duels.head())

triangulaires = pd.DataFrame(triangulaires)
triangulaires.index = ['color1','score1','candidat1','nom circo','color2','score2','candidat2','nom circo','color3','score3','candidat3','nom circo']
triangulaires = triangulaires.T
nomCirco = triangulaires['nom circo'].iloc[:,0]
triangulaires = triangulaires.drop('nom circo', axis=1)
triangulaires['nom circo'] = nomCirco
#print(triangulaires.head())

# quadrangulaires = pd.DataFrame(quadrangulaires)
# print(quadrangulaires.head())

# quadrangulaires.index = ['color1','score1','candidat1','color2','score2','candidat2','color3',
#                          'score3','candidat3','color4','score4','candidat4']
# quadrangulaires = quadrangulaires.T
# print(quadrangulaires.head())

# vainqueurs1er.index = vainqueurs1er.code
# vainqueurs1er = vainqueurs1er.drop(['code','prenom','nom','qualif'], axis=1)
# vainqueurs1er['color1'], vainqueurs1er['score1'], vainqueurs1er['candidat1'] = vainqueurs1er.nuance_groupe,\
# vainqueurs1er.prediction, vainqueurs1er.candidat
# vainqueurs1er = vainqueurs1er.drop(['nuance_groupe', 'prediction', 'candidat'], axis=1)
# print(vainqueurs1er.head())


final = pd.concat([duels, triangulaires])#, quadrangulaires])
final = final.fillna(value='')
#final = pd.concat([vainqueurs1er, duels, triangulaires])#, quadrangulaires])

final.index.name = 'circo'
print(final.head())
final.to_csv('dataviz/data/resultats1_du_{}.csv'.format(stamp), index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

                       candidat1            candidat2 candidat3 color1 color2  \
circo                                                                           
1-1                Xavier Breton       Laurent Mallet               LR    MDM   
1-2    Charles De La Verpilliere  Marie-Jeanne Beguet               LR    MDM   
1-3     Stephanie Pernod Beaudon        Olga Givernet               LR    REM   
1-4           Stephane Trompille        Guy Billoudet              REM     LR   
1-5                  Damien Abad      Helene De Meire               LR    REM   

      color3 nom circo    score1    score2 score3  
circo                                              
1-1            Ain - 1  0.338142  0.272217         
1-2            Ain - 2  0.334827  0.265386         
1-3            Ain - 3  0.305308  0.282511         
1-4            Ain - 4  0.287454  0.261619         
1-5            Ain - 5  0.317913  0.240510         
