## IMPORT DES LIBRAIRIES

pip install pandas
pip install scikit-learn
pip install matplotlib
pip install seaborn
pip install plotly
pip install scikit-plot
pip install xlrd

In [1]:
# Packages de manipulation des tableaux et dataframes
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Package pour analyse statistique
import scipy.stats as ss

# Package pour représentation graphique
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import scikitplot as skplt

# Affichage Image (png, jpeg, html ...)
from IPython.display import Image, HTML

# Typing des fonctions
from typing import List, Optional

Décision 1 : importer chaque fichier séparément dans son propre DataFrame, car la mise en forme diffère d'un fichier à l'autre.

## DECLARATION DES FONCTIONS

In [2]:
# ---------------------------------------Fonction-------------------
# Build a distribution dataframe (percentage of rows per distinct value)
def ditrib_calc(var1, var2, var3):
    """Return the percentage distribution of the values of a Series.

    Parameters
    ----------
    var1 : pandas.Series
        Values whose relative frequencies are computed.
    var2 : str
        Name given to the output column holding the distinct values.
    var3 : str
        Name of the input Series. Kept so existing callers keep working; it is
        no longer used because the output columns are named explicitly.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``var2`` (the distinct values, most frequent first) and
        ``"count"`` (the share of each value, in percent), on a fresh
        0..n-1 index.
    """
    percentages = var1.value_counts(normalize=True) * 100
    # Name both columns explicitly instead of renaming the labels produced by
    # value_counts().reset_index(): those labels differ between pandas 1.x
    # ("index" / <series name>) and pandas 2.x (<series name> / "proportion"),
    # so a rename-based version yields wrong column names on one of them.
    df_temp = percentages.rename_axis(var2).reset_index(name="count")
    return df_temp

# Graphical representation: bar plot of the target distribution

def ditrib_graph(var1, var2, var3):
    """Display a bar chart of a distribution dataframe.

    Parameters
    ----------
    var1 : pandas.DataFrame
        Data to plot; the chart reads its "target" column for the y axis and
        its "count" column for the x axis.
    var2 : str
        Figure title.
    var3 : str
        Title of the y axis.

    Returns
    -------
    Whatever ``fig.show()`` returns (the figure is rendered as a side effect).
    """
    layout_options = {
        "title": var2,
        "xaxis_title": "Pourcentage par classe",
        "yaxis_title": var3,
        "margin": dict(l=0, r=0, t=30, b=50),
        "width": 500,
        "height": 300,
    }
    # Hover labels show the "count" value with two decimals.
    bar_chart = px.bar(var1, x="count", y="target", hover_data={'count': ':.2f'})
    bar_chart.update_layout(**layout_options)
    return bar_chart.show()

def del_top_lines(nb, df_init):
    """Drop the leading rows of a table and promote row ``nb`` to header.

    The function:
    - discards the rows located before position ``nb``,
    - uses the values of the row at position ``nb`` as the column names,
    - rebuilds a fresh index while keeping the previous one in an
      "old_index" column.

    Parameters
    ----------
    nb : int
        Position of the row that holds the real column headers.
    df_init : pandas.DataFrame
        Dataframe to reshape.

    Returns
    -------
    pandas.DataFrame
        The rows located after the header row, with the new column names.
    """
    print("....index new head number : ", nb)
    from_header = df_init.iloc[nb:]
    # The first remaining row carries the column labels.
    header_values = from_header.values[0]
    from_header.columns = header_values
    data_rows = from_header.iloc[1:]
    reindexed = data_rows.reset_index()
    return reindexed.rename(columns={"index": "old_index"})

## IMPORT et MISE EN FORME DES DONNEES

In [4]:
#---------------------------------------DATA IMPORT----------------------------------------------------------
# Path to the folder that holds the raw data files
folder_path = "./data/"

# Load the main election results plus the companion files
df_pres_result = pd.read_csv(folder_path + "2020_US_County_Level_Presidential_Results.csv", sep = ",")
df_pres_result_old = pd.read_csv(folder_path + "US_County_Level_Presidential_Results_08-16.csv", sep = ",")
df_education = pd.read_excel(folder_path + "Education.xls")
df_pop_est = pd.read_excel(folder_path + "PopulationEstimates.xls")
df_pov_est = pd.read_excel(folder_path + "PovertyEstimates.xls")
df_unemp = pd.read_excel(folder_path + "Unemployment.xls")

#-----------------------------------------FEATURE TYPOLOGY--------------------------------------------------------


# Reshape the Excel sheets: drop the leading rows and promote the row at the
# given position to column headers (the position differs from file to file)
df_education = del_top_lines(3,df_education)
df_pop_est = del_top_lines(1,df_pop_est)
df_pov_est = del_top_lines(3,df_pov_est)
df_unemp = del_top_lines(3,df_unemp)

# List of the dataframes to inspect
df_xls_list = [df_pres_result,df_education,df_pop_est,df_pov_est,df_unemp]
# Print the structure summary of each dataframe
for d in df_xls_list:
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only added a stray "None" line after every summary.
    d.info()
    #display(d.head())

....index new head number :  3
....index new head number :  1
....index new head number :  3
....index new head number :  3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3152 entries, 0 to 3151
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state_name      3152 non-null   object 
 1   county_fips     3152 non-null   int64  
 2   county_name     3152 non-null   object 
 3   votes_gop       3152 non-null   int64  
 4   votes_dem       3152 non-null   int64  
 5   total_votes     3152 non-null   int64  
 6   diff            3152 non-null   int64  
 7   per_gop         3152 non-null   float64
 8   per_dem         3152 non-null   float64
 9   per_point_diff  3152 non-null   float64
dtypes: float64(3), int64(5), object(2)
memory usage: 246.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3283 entries, 0 to 3282
Data columns (total 48 columns):
 #   Column                                          

In [None]:
#----------------------------------------------CREATION OF THE TARGET 'party' ---------------------------------------------------
# Conditions are evaluated in order: the first one is a Democratic lead
# (per_gop < per_dem), the second one a GOP lead (per_gop > per_dem).
conditionlist = [
    (df_pres_result['per_gop'] < df_pres_result['per_dem']) ,
    (df_pres_result['per_gop'] > df_pres_result['per_dem'])]
# The labels are strings on purpose: the default ('Not Specified', used when the
# two shares are equal) is a string, and np.select needs a common dtype for the
# choices and the default. Integer choices were silently cast to '0'/'1' by old
# NumPy releases and raise a TypeError on recent ones.
choicelist = ['0', '1']
df_pres_result['party'] = np.select(conditionlist, choicelist, default='Not Specified')

#print(df_pres_result['party']) # Show the party column

# -------------------------------Overall view of the 'party' target-------------------

# Percentage of rows per class over the whole dataset (before any split)
target_distribution_bef_split = ditrib_calc(
    var1=df_pres_result['party'],
    var2="target",
    var3="party",
)
#print(target_distribution_bef_split)
ditrib_graph(
    var1=target_distribution_bef_split,
    var2="Repartition de la target global",
    var3="Target (GOP/DEM)",
)

In [None]:
#-------------------------------------------- GROUPING OF THE SOCIO-ECONOMIC DATA -----------------------

#-------------------------------------------- DATASET PREPARATION -----------------------
# Separate the target from the explanatory variables
y = df_pres_result["party"]
X = df_pres_result.drop(columns=["party"])

# Split the data into train and test sets (20% held out, fixed seed)
set_seed = 1204
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=set_seed,
)

# Lists of variables by category: all, numeric, non-numeric
all_cols = X_train.columns.tolist()
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()


# ------------------------------------DISTRIBUTION OF THE 'party' TARGET (train set)-----------------------------------------------
target_distribution = ditrib_calc(
    var1=y_train,
    var2="target",
    var3="party",
)
#print(target_distribution)
ditrib_graph(
    var1=target_distribution,
    var2="Repartition de la target sur le jeu y train",
    var3="Target (GOP/DEM)",
)