In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFECV, SelectFromModel
from collections import Counter

In [4]:
df = pd.read_excel(io='data/dataframe_elections.xlsx')
df_gagnant = pd.read_excel(io='data/histo_elus.xlsx')

### Parenthèse (

In [3]:
depart_frontaliers = ["PAS-DE-CALAIS","NORD", "AISNE", "ARDENNES", "MEUSE",
                      "MEURTHE-ET-MOSELLE", "MOSELLE", "BAS-RHIN", "HAUT-RHIN","TERRITOIRE DE BELFORT", "DOUBS", "JURA", "AIN", "HAUTE-SAVOIE", \
                      "SAVOIE", "HAUTES-ALPES", "ALPES-DE-HAUTE-PROVENCE", "ALPES-MARITIMES", \
                      "PYRENEES-ATLANTIQUES", "HAUTES-PYRENEES", "HAUTE-GARONNE",\
                      "ARIEGE", "PYRENEES-ORIENTALES", "AUDE", "HERAULT", "GARD",\
                     "BOUCHES-DU-RHONE", "VAR"]
depart_OM = ["GUADELOUPE", "GUYANE", "MARTINIQUE", "LA REUNION", "MAYOTTE"]
depart_corse = ["CORSE", "HAUTE-CORSE", "CORSE-DU-SUD"]
region_parisienne = ["PARIS","SEINE-ET-MARNE", "YVELINES", "ESSONNE", "HAUTS-DE-SEINE", "SEINE-SAINT-DENIS",\
                    "VAL-DE-MARNE", "VAL-D'OISE"]



In [4]:
df["dep"] = df["dep"].map(lambda x : x.upper())

In [5]:
df["depart_frontalier"] = [1 if dep in depart_frontaliers else 0 for dep in df["dep"]]
df["depart_OM"] = [1 if dep in depart_OM else 0 for dep in df["dep"]]
df["depart_CORSE"] = [1 if dep in depart_corse else 0 for dep in df["dep"]]
df["region_parisienne"] = [1 if dep in region_parisienne else 0 for dep in df["dep"]]

In [6]:
df[["dep", "depart_frontalier", "depart_OM", "depart_CORSE", "region_parisienne"]].drop_duplicates().to_excel("data/geographie_depart.xlsx")

### )

In [5]:
df_gagnant["gagnant"] = 1
df = df.merge(df_gagnant[["Nom", "Prénom", "gagnant", "Année", "Circo"]], left_on=["nom", "prenom", "an", "circo"], \
         right_on=["Nom", "Prénom", "Année", "Circo"], how="left").drop(["Nom", "Prénom", "Année", "Circo"], axis=1)
df["gagnant"] = df["gagnant"].fillna(0)

In [6]:
to_dummifiy = ["c_dep", "circo", "circo_bloc", "circo_leg_meme_nuance", "nb_candidats_meme_bloc", \
              "sexe", "bloc"]
to_drop = ["dep", "code", "circo_parti", "circo_nuance", "circo_nuance_groupe", "circo_nuance_groupe_pres", \
          "nom", "prenom", "etiquette", "nuance", "nuance_groupe", "voix", "p_voix", "second_tour"]

to_standardize = ["inscrits", "revenus_q1", "revenus_med", "revenus_q3", \
               "ecart_revenus"]
proba_var = ["etrangers", "part_impose", "chom_tot", "chom_tot_evol_5", \
             "chom_jeunes", "chom_jeunes_evol_5", "chom_adultes", "chom_adultes_evol_5",\
             "chom_seniors", "chom_seniors_evol_5", "p_agri", "p_commercants", "p_cadres",\
             "p_intermed", "p_employes", "p_ouvriers", "d_brevet", "d_bep", "d_bac", "d_sup",\
            "score_nuance_groupe_prec_leg", "score_bloc_prec_leg", "taux_vote_leg", "score_leg_exg", \
             "score_leg_g", "score_leg_c", "score_leg_d", "score_leg_exd", "score_leg_div", \
            "score_nuance_groupe_pres", "score_bloc_pres", "taux_vote_pres", "score_pres_exg",\
             "score_pres_g", "score_pres_c", "score_pres_d", "score_pres_exd", "score_pres_div"]

to_keep = ["circo_pres_meme_nuance", "circo_meme_nuance_president", "depute_sortant", "depart_frontalier", "depart_OM", "depart_CORSE", "region_parisienne"]

a_travailler = ["an", "score_candidat_prec_leg"]
cible = "gagnant"

In [7]:
for col in df :
    if col not in to_drop+to_dummifiy+to_standardize+to_keep+proba_var+a_travailler:
        print(col)

gagnant


In [8]:
df.drop(to_drop, axis=1, inplace=True)

In [9]:
df = df[df.an!=1997]

In [10]:
df1 = df.copy()

In [11]:
df1["score_candidat_prec_leg_<5"] = df1["score_candidat_prec_leg"].map(lambda x : 1 if x<0.05 else 0)
df1["score_candidat_prec_leg_>5<15"] = df1["score_candidat_prec_leg"].map(lambda x : 1 if (x>0.05) & (x<0.15) else 0)
df1["score_candidat_prec_leg_>15"] = df1["score_candidat_prec_leg"].map(lambda x : 1 if x>0.15 else 0)

In [12]:
df1.drop("score_candidat_prec_leg", axis=1, inplace=True)

In [13]:
df1.ix[:, :-10].isnull().sum()

an                                 0
c_dep                              0
circo                              0
inscrits                         182
etrangers                        360
part_impose                     1089
revenus_q1                      1089
revenus_med                     1089
revenus_q3                      1089
ecart_revenus                   1089
chom_tot                         360
chom_tot_evol_5                  360
chom_jeunes                      360
chom_jeunes_evol_5               360
chom_adultes                     360
chom_adultes_evol_5              360
chom_seniors                     360
chom_seniors_evol_5              360
p_agri                           680
p_commercants                    680
p_cadres                         680
p_intermed                       680
p_employes                       680
p_ouvriers                       680
d_brevet                         680
d_bep                            680
d_bac                            680
d

In [14]:
#df1 = df1.groupby(["an"]).transform(lambda x: x.fillna(x.mean()))


In [15]:
df1.isnull().sum()["an"]

0

In [16]:
for col in df1.columns:
    if df1.isnull().sum()[col]>0:
        try:
            df1[col] = df1[col].fillna(df[col].mean())
        except: 
            continue

In [17]:
df1 = pd.get_dummies(df1, columns=to_dummifiy)

In [21]:
to_dummifiy

['c_dep',
 'circo',
 'circo_bloc',
 'circo_leg_meme_nuance',
 'nb_candidats_meme_bloc',
 'sexe',
 'bloc']

In [23]:
del df1["circo_1"]
del df1["circo_bloc_Divers"]
del df1["circo_leg_meme_nuance_0.0"]
del df1["nb_candidats_meme_bloc_1.0"]
del df1["sexe_F"]
del df1["bloc_Divers"]
del df1["c_dep_1"]

### Modélisation

In [24]:
df_2017 = df1[df1["an"]==2017]
df_train = df1[df1["an"]!=2017]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop("gagnant", axis=1), df_train["gagnant"], test_size=0.33, random_state=42)

#### Dealing with the imbalanced dataset

Taux de 1 dans le dataset : 

In [26]:
np.sum(df1["gagnant"]==1)*100/(np.sum(df1["gagnant"]==1)+np.sum(df1["gagnant"]==0))

4.1223621789628657

Poids à donner aux 1 :

In [27]:
100/4

25.0

In [28]:
sample_weight = np.array([25 if i == 1 else 1 for i in y_train])

In [31]:
clf = RandomForestClassifier(n_estimators=20, max_depth=20)
clf.fit(X_train, y_train, sample_weight=sample_weight)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [32]:
print(classification_report(clf.predict(X_test), y_test))

             precision    recall  f1-score   support

        0.0       0.99      0.98      0.98      7104
        1.0       0.67      0.73      0.70       382

avg / total       0.97      0.97      0.97      7486



In [37]:
sfm = SelectFromModel(estimator=clf, prefit=True)
temp_tr = sfm.transform(X_train)
temp_te = sfm.transform(X_test)


In [40]:
print(temp_tr.shape, temp_te.shape)

(15198, 41) (7486, 41)


In [50]:
clf = RandomForestClassifier(n_estimators=20, max_depth=20)
clf.fit(temp_tr, y_train, sample_weight=sample_weight)
print(classification_report(clf.predict(temp_te), y_test))

             precision    recall  f1-score   support

        0.0       0.99      0.98      0.98      7105
        1.0       0.68      0.75      0.71       381

avg / total       0.97      0.97      0.97      7486



** Feat importance **

In [51]:
pd.DataFrame(clf.feature_importances_, list(df1.drop("gagnant", axis=1).columns), columns=["col"]).sort_values("col", ascending=False).head(20)

ValueError: Shape of passed values is (1, 41), indices imply (1, 202)

### SVC

In [55]:
svc = SVC(class_weight="balanced")
svc.fit(temp_tr, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
print(classification_report(svc.predict(temp_te), y_test))

             precision    recall  f1-score   support

        0.0       1.00      0.94      0.97      7486
        1.0       0.00      0.00      0.00         0

avg / total       1.00      0.94      0.97      7486



  'recall', 'true', average, warn_for)


### Logistic 

In [57]:
from sklearn.linear_model import LogisticRegression

In [96]:
clf = LogisticRegression(class_weight={1: 25})
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight={1: 25}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [97]:
print(classification_report(clf.predict(X_test), y_test))

             precision    recall  f1-score   support

        0.0       0.93      0.99      0.96      6587
        1.0       0.89      0.42      0.57       899

avg / total       0.92      0.92      0.91      7486

