## Sommaire 
1. [Sommaire](#ch1) <a class="anchor" id="ch1"></a>
1. [Contexte Python](#ch2) 


<a class="anchor" id="ch2"></a>
## [Contexte Python](#ch1) 
[Chapitre préc.](#ch1) - [Chapitre suiv.](#ch3)


### Import des librairies


In [2]:
#### builtin :

# System
import os, sys, gc, psutil 

# Gestion des dates
from datetime import datetime, date, time, timedelta, timezone
from dateutil import tz

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import math
import random, re
#import random, re, psutil, string, math, tkinter
#from os import listdir
from collections import Counter
#from glob import glob
from itertools import compress
import pickle



In [3]:
# Capture the current local time in `startime` and print it formatted as
# day/month/year, hours:minutes:seconds.
startime = datetime.now()
print(f'Début des traitements à {startime.strftime("%d/%m/%Y, %H:%M:%S")}')

Début des traitements à 05/09/2023, 12:20:14


In [4]:
#### for notebook process :
import dummy as dummy # dummy chargé pour vérifier que le scrapping de la version sur pypi.org fonctionne sur le projet data-dummy

import IPython as ipy
from IPython.display import display, Markdown, Latex, HTML

import ipywidgets as ipw
from ipywidgets import FloatProgress  # affichage de la barre de progession des traitements


#### numpy and pandas for data manipulation
import numpy as np
import pandas as pd 
import scipy as sci
from scipy.stats import norm, skew

import featuretools as ft
import woodwork.logical_types as ft_lt
from featuretools import selection
#from featuretools.variable_types import list_variable_types


#### machine learning :
import sklearn as skl  # Pour afficher la version du package

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import KFold, cross_validate, cross_val_score, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score, adjusted_rand_score, accuracy_score, auc, roc_auc_score, roc_curve, make_scorer, make_scorer, classification_report, fbeta_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE


import lightgbm as lgb

import xgboost as xgb
from xgboost import XGBClassifier


#### For plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.close("all")

import seaborn as sns

import missingno as msno  # Affichage des données vides des fichiers


#### Scrapping :
import requests as req  # Pour afficher la version du package
from requests import get  # On importe la fonction 'get' (téléchargement) de 'requests'

import lxml as lxml  # Pour afficher la version du package
from lxml import html

import lime
import lime.lime_tabular

import shap

#try:
#except Exception as e: pass
#raise Exception()

### Définitions des méthodes

#### Description des données

In [5]:
# Récupération d'une liste triée de valeurs unique sur une colonne d'un df
def getDistinctValuesOfColonne(df, colonne, sort=False):
    """Return the distinct non-missing values of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    colonne : hashable
        Label of the column to inspect.
    sort : bool, default False
        When true the values are returned in ascending order; otherwise
        they keep their order of first appearance.

    Returns
    -------
    list
        Distinct values, with entries flagged by ``pd.isna`` removed.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    valeurs = [v for v in dict.fromkeys(df[colonne].values) if not pd.isna(v)]
    return sorted(valeurs) if sort else valeurs


# Récupération du nombre d'outlier
def fGetNombreOutliers(colonne_values):
    """Count the values lying more than three standard deviations from the mean.

    Parameters
    ----------
    colonne_values : pandas.Series
        Values to inspect.

    Returns
    -------
    int or str
        Number of values whose absolute distance to the mean exceeds
        ``3 * std`` when the dtype is ``int64`` or ``float64``; the string
        ``'N/A'`` for any other dtype.
    """
    # Only these two dtypes are analysed; everything else is reported as not applicable.
    if colonne_values.dtypes not in ['int64', 'float64']:
        return 'N/A'

    ecart_absolu = (colonne_values - colonne_values.mean()).abs()
    seuil = 3 * colonne_values.std()
    return (ecart_absolu > seuil).sum()


# Récupération de données informatives sur une colonne d'un df
def getInfoOfOneDataFrameColonne(df, colonne, max_char=100, log=False):
    """Build a one-row summary of a DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    colonne : hashable
        Label of the column to summarise.
    max_char : int, default 100
        Maximum length of the ``'Liste valeurs'`` string (it is truncated).
    log : bool, default False
        When true, every computed indicator except the column name is printed.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with the column name, dtype, null ratio and
        count, non-null count, distinct ratio and count, the result of
        ``fGetNombreOutliers`` on the non-null values, and the distinct
        values joined with ``'|'``.
    """
    serie = df[colonne]
    valeurs_non_nulles = df.loc[~serie.isna(), colonne]
    # Order comes from a set, so it is not guaranteed for arbitrary values.
    list_colonne_value = list(set(valeurs_non_nulles.unique()))

    nb_lignes = df.shape[0]
    nb_null = serie.isnull().sum()
    nb_distinct = len(list_colonne_value)

    infos = {
        'Colonne': colonne,
        'type': serie.dtypes,
        '% Null': nb_null / nb_lignes,
        'Nb valeurs Null': nb_null,
        'Nb valeurs': serie.count(),
        '% Distinctes': nb_distinct / nb_lignes,
        'Nb valeurs distinctes': nb_distinct,
        'Nombre d\'outliers': fGetNombreOutliers(valeurs_non_nulles),
        'Liste valeurs': '|'.join([str(_) for _ in list_colonne_value])[0:max_char:1],
    }

    if log:
        # Same indicators, same order as the returned row (column name excluded).
        for cle, valeur in infos.items():
            if cle != 'Colonne':
                print(valeur)

    return pd.DataFrame(data=infos, index=[0])


# Soulignement du texte
def fSouligneTitle(title, decalage = 0):
    """Return the underline string matching a (possibly multi-line) title.

    Parameters
    ----------
    title : str
        Title text; it may contain several lines separated by ``"\\n"``.
    decalage : int, default 0
        Number of leading spaces placed before the dashes.

    Returns
    -------
    str
        ``decalage`` spaces followed by as many ``'-'`` as the longest
        stripped line of ``title``.
    """
    largeur = max(len(ligne.strip()) for ligne in title.split("\n"))
    return ' ' * decalage + '-' * largeur


# Imprime un texte souligné
def fPrintTitleSouligne(title, decalage = 0):
    """Print a title followed by its underline.

    Parameters
    ----------
    title : str
        Text to print.
    decalage : int, default 0
        Number of spaces used to indent both the title and the underline.
    """
    print(' ' * decalage + title)
    print(fSouligneTitle(title, decalage))

# Imprime un texte 
def fPrintTitle(title, decalage = 0, souligne=False):
    """Print a title, optionally followed by its underline.

    Parameters
    ----------
    title : str
        Text to print.
    decalage : int, default 0
        Number of spaces used to indent the title (and the underline).
    souligne : bool, default False
        When true, the underline built by ``fSouligneTitle`` is printed too.
    """
    print(' ' * decalage + title)
    if souligne:
        print(fSouligneTitle(title, decalage))

# Récupération de données informatives d'un df
def getInfoOfDataFrame(df, file, log=False):
    """Print and display a full description of a DataFrame.

    The report contains, in order: the frame dimensions, a per-column
    summary (built with ``getInfoOfOneDataFrameColonne``), the number of
    duplicated rows, the ``missingno`` fill matrix, a missing-value bar plot
    (only when at least one column has missing values), ``describe()``
    tables for non-object and object columns, and the first five rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    file : str
        Name shown in the report title and in the dimensions table.
    log : bool, default False
        When true, intermediate values are printed while the per-column
        summary is being built.

    Notes
    -----
    The function assigns ``pd.options.display.float_format`` and leaves it
    set to ``'{:.2f}'`` on exit (global side effect, unchanged from the
    original behaviour).
    """
    title_part = f'Description du fichier {file} :'
    soulignement = fSouligneTitle(title_part)
    print(soulignement)
    print(title_part)
    print(soulignement)

    print()

    # DataFrame dimensions
    print('=> Taille du DataFrame')
    nb_lignes, nb_colonnes = df.shape
    df_caracteristiques_fichier = pd.DataFrame(
        data={
            'Nom du fichier': file,
            'Colonnes': nb_colonnes,
            'Lignes': nb_lignes
        },
        index=[0])
    display(df_caracteristiques_fichier.set_index('Nom du fichier'))

    if nb_lignes != 0 and nb_colonnes != 0:
        # Collect the one-row summaries and concatenate them once: growing a
        # DataFrame with concat inside the loop is quadratic, and starting
        # from an empty frame triggers deprecation warnings in recent pandas.
        if log: print(list(df))
        details_colonnes = []
        for colonne in list(df):
            if log: print(colonne)
            df_detail_colonne_courant = getInfoOfOneDataFrameColonne(
                df, colonne, log=log)
            if log: print(df_detail_colonne_courant)
            details_colonnes.append(df_detail_colonne_courant)
            if log: print("Passage colonne suivante.\n")
        df_detail_colonnes = pd.concat(details_colonnes, ignore_index=True)

        # Ratios of the summary table are rendered as percentages.
        pd.options.display.float_format = '{:.2%}'.format
        print('=> Description des colonnes')
        display(HTML(df_detail_colonnes.to_html()))
        print()

        print(f'=> Nombre de lignes en double : {df.duplicated().sum()}')
        print()

        # Fill matrix of the dataset
        print('=> Représentation graphique du remplissage du jeu de données')
        msno.matrix(df, color=(0, 100 / 255, 170 / 255))
        plt.show()
        print()

        # Bar plot of the missing values, only when there are some
        if df.isna().any().sum() > 0:
            print('=> Représentation graphique des données manquantes')
            plot_null_prop(df)
            print()

        # NOTE: this format stays active after the function returns.
        pd.options.display.float_format = '{:.2f}'.format

        df_types_num = [k for k, v in df.dtypes.to_dict().items() if str(v) != 'object']
        if len(df_types_num) > 0:
            print('=> Statistique sur les données numériques')
            display(HTML(df[df_types_num].describe().T.to_html()))
            print()

        df_types_cat = [k for k, v in df.dtypes.to_dict().items() if str(v) == 'object']
        if len(df_types_cat) > 0:
            print('=> Statistique sur les données catégorielles')
            display(HTML(df[df_types_cat].describe().T.to_html()))
            print()

        print('=> Présentation succinctes des données')
        display(df.head(5).T)
        print()


# Ajouter le label pour les valeurs 
def addlabels(x):
    """Write each value as a percentage label to the right of its horizontal bar.

    Parameters
    ----------
    x : iterable of float
        Bar lengths, in drawing order (bar ``i`` is drawn at ``y = i``).
        Can be a list, an array or a pandas Series with any index.
    """
    # Iterate over the values by position: `x[i]` on a Series is a label
    # lookup, which fails on integer-labelled indexes and relies on a
    # deprecated positional fallback on other indexes.
    for position, valeur in enumerate(x):
        plt.text(valeur + 0.01, position, f'{valeur:.1%}', ha='left', va='center')


# Affichage des valeurs manquantes en barplot
def plot_null_prop(df, commentaire=""):
    """
    Draw a horizontal bar chart of the share of missing values per column.

    Input : dataframe, commentaire (optional text appended under the title)
    Output : the figure is shown with ``plt.show()``; nothing is returned.
    Only columns with at least one missing value get a bar.
    """
    # Share of missing values per column, smallest first
    null_counts = df.isnull().mean(axis=0).sort_values(ascending=True)
    colonnes_incompletes = null_counts[null_counts != 0]
    positions = np.arange(len(colonnes_incompletes))

    # Figure height is derived from the total number of columns
    fig, ax = plt.subplots(figsize=(8, null_counts.shape[0] * 0.3))
    plt.barh(positions, colonnes_incompletes, color=(0, 100 / 255, 170 / 255))
    plt.yticks(positions, colonnes_incompletes.index, rotation=0, fontsize=10)

    addlabels(colonnes_incompletes)

    # x axis in percent: major tick every 25 %, minor tick every 5 %
    axe_x = ax.xaxis
    axe_x.set_major_locator(ticker.MultipleLocator(0.25))
    axe_x.set_minor_locator(ticker.MultipleLocator(0.05))
    axe_x.set_major_formatter(ticker.PercentFormatter(xmax=1))

    # Reference lines at 100 % and 50 %
    plt.axvline(x = 1, color = 'grey')
    plt.axvline(x = 0.50, color = 'grey', linestyle=':')

    sns.despine()
    plt.margins(y=0)
    plt.tight_layout()
    plt.title('Proportion de données manquantes dans les colonnes concernées\n'+commentaire)
    plt.show()




In [6]:
def fGetDescribeDf(df, nb_ligne=60):
    """Display descriptive statistics of a DataFrame in blocks of limited height.

    Shown in order: ``describe()`` of the non-object columns, ``describe()``
    of the object columns, the count and percentage of NaN per column, and
    the head of the frame. Tables longer than ``nb_ligne`` rows are displayed
    as consecutive slices of at most ``nb_ligne`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    nb_ligne : int, default 60
        Maximum number of rows per displayed table.
    """
    df_types_num = [k for k, v in df.dtypes.to_dict().items() if str(v) != 'object']
    if len(df_types_num) > 0:
        print('=> Statistique sur les données numériques')
        _display_by_chunks(df[df_types_num].describe().T, nb_ligne)
        print()

    df_types_cat = [k for k, v in df.dtypes.to_dict().items() if str(v) == 'object']
    if len(df_types_cat) > 0:
        print('=> Statistique sur les données catégorielles')
        # The original sized the last slice with the *numeric* column count,
        # so trailing categorical rows were dropped or shown twice.
        _display_by_chunks(df[df_types_cat].describe().T, nb_ligne)
        print()

    print('=> Variables à NaN :')
    df_nan = pd.DataFrame(df.isna().sum(), columns=['Valeurs à NaN'])
    df_nan['% de NaN'] = 100 * df_nan['Valeurs à NaN'] / df.shape[0]
    _display_by_chunks(df_nan, nb_ligne)
    print()

    print('=> Entête du DataFrame :')
    display(df.head())


def _display_by_chunks(frame, nb_ligne):
    """Display `frame` whole when it fits in `nb_ligne` rows, else in consecutive slices."""
    if len(frame) <= nb_ligne:
        display(frame)
        return
    # Slicing on the frame's own length avoids an empty trailing table when
    # the row count is an exact multiple of nb_ligne.
    for debut in range(0, len(frame), nb_ligne):
        display(frame.iloc[debut:debut + nb_ligne])
 

In [7]:
def fPrintFullBigDf(df, nb_ligne=60):
    """Display a whole DataFrame as consecutive tables of at most `nb_ligne` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to display.
    nb_ligne : int, default 60
        Maximum number of rows per displayed table.
    """
    if df.shape[0] <= nb_ligne:
        display(df)
    else:
        # Positional slicing: no empty trailing table when the row count is
        # an exact multiple of nb_ligne (the original displayed `tail(0)`).
        for debut in range(0, df.shape[0], nb_ligne):
            display(df.iloc[debut:debut + nb_ligne])
    print()
    

#### Scrapping

In [8]:
# Récupération de la version courante du package sur pypi.org
def getCurrentVersionPipProject(project, uri='https://pypi.org/project'):
    """Scrape the header of a project's page on the package index.

    Parameters
    ----------
    project : str
        Import name of the package. ``'sklearn'``, ``'PIL'`` and ``'cv2'``
        are mapped to their distribution names before the lookup.
    uri : str, default 'https://pypi.org/project'
        Base URL to which the project name is appended.

    Returns
    -------
    str
        Stripped first text node of ``<h1 class="package-header__name">``
        (presumably "name version" -- confirm against the live page), or the
        stripped placeholder returned by ``scrapingDataFromHtml`` on failure.
    """
    # Import names that differ from the distribution name on the index
    alias_distribution = {
        'sklearn': 'scikit-learn',
        'PIL': 'pillow',
        'cv2': 'opencv-python',
    }
    nom_distribution = alias_distribution.get(project, project)

    page_projet = '/'.join([uri, nom_distribution])
    entetes = scrapingDataFromHtml(page_projet, '//h1[@class="package-header__name"]/text()')
    return entetes[0].strip()


# Scraping générique
def scrapingDataFromHtml(url, xpath, timeout=10):
    """Download a page and evaluate an XPath expression on it (best effort).

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    xpath : str
        XPath expression evaluated on the parsed document.
    timeout : float, default 10
        Seconds passed as ``timeout`` to the HTTP ``get`` call so that an
        unreachable host cannot block the notebook indefinitely.

    Returns
    -------
    list
        Result of the XPath query when the response status is 200;
        ``[' Page_non_trouvée_<last url segment>']`` for any other status;
        ``[' Internet_non_disponible']`` when any exception is raised while
        downloading or parsing.
    """
    html_value = None
    try:
        response = get(url, timeout=timeout)
        if response.status_code == 200:
            # The request succeeded: parse the page and query it
            tree = html.fromstring(response.content)
            html_value = tree.xpath(xpath)
        else:
            html_value = [f' Page_non_trouvée_{url.split("/")[-1]}']
    except Exception:
        # Deliberately broad (best effort), but KeyboardInterrupt and
        # SystemExit are no longer swallowed as they were by a bare `except:`.
        html_value = [' Internet_non_disponible']

    return html_value

# Récupération de la version courante de Python sur python.org
def getCurrentVersionPython():
    """Scrape python.org and return the second word of the targeted link text.

    Returns
    -------
    str
        Second space-separated token of the first text node matched by the
        XPath below (presumably the version number of a "Python x.y.z"
        link -- confirm against the live page). On scraping failure this is
        the placeholder text returned by ``scrapingDataFromHtml``.
    """
    xpath_lien = '//*[@id="content"]/div/section/div[1]/div[2]/p[2]/a/text()'
    textes = scrapingDataFromHtml('https://www.python.org/', xpath_lien)
    mots = textes[0].split(' ')
    return mots[1]
    


#### Analyse de données

In [9]:
def fRepresentationDistributionVariableCategorielle(df, key, hue=None, hue_labels=None, max_values=60, log_scale=None,
                                                    list_forcage_varaible_categorielle=None, table_name=None, dict_description_colonne=None):
    """Describe and plot the value counts of every categorical column.

    For each ``object``/``bool`` column (plus the forced ones) that has at
    least one repeated value, the function prints the column name, displays
    a ``describe()`` table of the counts completed with the number of NaN,
    and draws a horizontal bar chart of the counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse.
    key : hashable
        Column that is counted inside each group.
    hue : hashable, optional
        Column used to split the counts into one bar series per value.
    hue_labels : dict, optional
        Mapping used to rename hue values (or, without hue, the category
        values). When omitted no renaming is done.
    max_values : int, default 60
        Maximum number of categories plotted (the first rows after the
        ascending sort are kept).
    log_scale : bool, optional
        Force a logarithmic axis. When falsy, the scale is chosen per column:
        logarithmic as soon as ``min / max`` of a plotted series is below 0.05.
    list_forcage_varaible_categorielle : list, optional
        Extra columns to treat as categorical whatever their dtype.
    table_name : str, optional
    dict_description_colonne : dict, optional
        When both are given, the entry ``"<table_name> - <column>"`` of the
        dictionary is printed under the column title.
    """
    list_variable_categorielle = [k for k, v in df.dtypes.to_dict().items() if str(v) in ('object', 'bool')]

    if list_forcage_varaible_categorielle:
        list_variable_categorielle = list_variable_categorielle + list_forcage_varaible_categorielle

    for mesure in sorted(set(list_variable_categorielle)):
        # Columns where every row holds a different value are not categorical
        if len(set(df[mesure])) == df.shape[0]:
            continue

        fPrintTitleSouligne(mesure)
        # `~` on a Python bool is always truthy, so the original guard never
        # protected the `.get` call when no dictionary was supplied.
        if table_name is not None and dict_description_colonne is not None:
            print(dict_description_colonne.get(f"{table_name} - {mesure}"))

        avec_hue = hue is not None and mesure != hue
        if avec_hue:
            df_4_plot = df[[key, hue, mesure]].copy().groupby(by=[mesure, hue]).count()
            df_4_plot = pd.pivot_table(df_4_plot, values=key, index=mesure, columns=hue)
            if hue_labels is not None:  # rename(columns=None) raises TypeError
                df_4_plot.rename(columns=hue_labels, inplace=True)
            try:
                # Sort on the dominant hue of the first category
                df_4_plot = df_4_plot.sort_values(by=df_4_plot.idxmax(axis=1).iloc[0], ascending=True)
            except Exception:
                df_4_plot = df_4_plot.sort_values(by=list(df_4_plot)[0], ascending=True)

            # Number of missing values of the column, per hue value
            df_wip = df[[hue, mesure]].copy()
            df_wip[mesure] = df_wip[mesure].isna()
            if hue_labels is not None:
                df_wip[hue] = df_wip[hue].apply(lambda x: hue_labels.get(x))
            df_wip.columns = ['TARGET', 'NaN']
            # Group on the renamed column: grouping on `hue` only worked when
            # the hue column happened to be called 'TARGET'.
            df_wip = df_wip.groupby(by='TARGET').sum()

        else:
            df_4_plot = df[[key, mesure]].copy().groupby(by=mesure).count()
            if hue_labels is not None:  # rename(index=None) raises TypeError
                df_4_plot.rename(index=hue_labels, inplace=True)
            df_4_plot = df_4_plot.sort_values(by=key, ascending=True)
            df_wip = df[mesure].isna().sum()
            df_wip = pd.DataFrame([[key, df_wip]], columns=['TARGET', 'NaN'])
            df_wip.set_index('TARGET', inplace=True)

        df_describe = df_4_plot.describe().T
        colonnes = list(df_describe)
        df_describe.reset_index(inplace=True)
        df_describe.columns = ['TARGET'] + colonnes
        df_describe = df_describe.set_index('TARGET')
        df_describe = pd.concat([df_describe, df_wip], axis=1, ignore_index=True)
        df_describe.columns = colonnes + ['NaN']
        # NaN count, with its size relative to the largest category count
        df_describe['NaN'] = df_describe.apply(lambda x: f"{x['NaN']} [{100 * (x['NaN'] / x['max']):.2f}%]", axis=1)
        display(df_describe)

        nb_values = df_4_plot.shape[0]

        # Bar chart of the distribution
        complement_titre = ''
        size_h = nb_values
        if max_values:
            if max_values < nb_values:
                complement_titre = f" [{max_values}/{nb_values} valeurs]"
                size_h = max_values

            df_4_plot = df_4_plot.head(max_values)

        # Decide the scale for THIS column only; reusing `log_scale` made a
        # log axis chosen for one column stick to every following column.
        echelle_log = log_scale
        if not echelle_log:
            echelle_log = any(
                (df_4_plot[col].min() / df_4_plot[col].max()) < 0.05
                for col in list(df_4_plot))

        if avec_hue:
            ax = df_4_plot.plot.barh(color=['#2A7AB9', 'green', '#FF1A19'], log=echelle_log, figsize=(10, size_h))
            # Label every series, whatever the number of hue values
            for container in ax.containers:
                ax.bar_label(container)
        else:
            ax = df_4_plot.plot.barh(color='#2A7AB9', log=echelle_log, figsize=(10, size_h))
            ax.get_legend().remove()
            ax.bar_label(ax.containers[0])

        plt.title(f"Répartition des valeurs de {mesure}{complement_titre}")
        if echelle_log:
            plt.xlabel("Quantité (log)")
        else:
            plt.xlabel("Quantité")

        plt.ylabel(mesure)

        plt.show()
        print()


# Représentation graphique de la distribution d'une variable pour analyse 
def representationDistriVaraible(df, key, mesure) :
    """Plot the distribution of one variable: density on the left, box plot on the right.

    The density panel also shows the normal curve having the same mean and
    (population) standard deviation, and its title reports the unbiased
    skewness coefficient.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the column; the caller is expected to have removed
        missing values.
    key : hashable
        Not used by this function (kept for interface compatibility).
    mesure : hashable
        Label of the column to plot.
    """
    # Single lookup with the label itself: the original mixed `df[mesure]`
    # and `df[f'{mesure}']`, which fails for non-string column labels.
    valeurs = df[mesure]

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

    plt.suptitle(f'Répartition des valeurs : {mesure}', fontsize=16, y=1.02)
    plt.tight_layout()

    sns.kdeplot(valeurs, ax=ax1, fill=True)

    # Reference normal curve over mean +/- 3 standard deviations
    mean, std = np.mean(valeurs), np.std(valeurs)
    rvs = np.linspace(mean - 3*std, mean + 3*std, 100)
    pdf = norm.pdf(rvs, mean, std)
    ax1.plot(rvs, pdf, c="r", label="Distribution normale (PDF)")

    ax1.set_title(f'coefficient de skewness : {skew(valeurs, bias=False):.2f}', fontsize=14)
    ax1.set_ylabel('Densité')
    ax1.set_xlabel('Valeurs')
    ax1.tick_params(labelrotation=45)

    df.boxplot(column=mesure, grid=True, ax=ax2)
    ax2.set_title('Valeurs extrèmes', fontsize=12)
    ax2.set_ylabel('Valeurs')
    plt.xticks([1], [''])

    plt.show()
    

# Déscription d'une variable pour analyse 
def varaibleDescribe(df, key, mesure, list_colonne_a_afficher) :
    """Describe one variable: counts, distribution plot, statistics and IQR outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse.
    key : hashable
        Identifier column, shown next to the outliers.
    mesure : hashable
        Column to analyse.
    list_colonne_a_afficher : list
        Extra columns shown in the outlier extracts.

    Notes
    -----
    Outliers are the rows outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` of
    ``mesure``. The function changes the global pandas options
    ``display.float_format`` and ``display.min_rows``.
    """
    # NaN rows satisfy `!= 0`, so they are included in the "!= 0" count.
    df_value_zero = df.loc[df[mesure] == 0].shape[0]
    df_value_not_zero = df.loc[df[mesure] != 0].shape[0]
    df_value_nan = df.loc[df[mesure].isna()].shape[0]

    display(pd.DataFrame([[mesure, df.shape[0], df_value_nan, df_value_zero, df_value_not_zero]], columns=['Variable', 'Nb individus', 'Dont null', 'Dont = 0', 'Dont != 0']).set_index('Variable'))

    representationDistriVaraible(df.loc[~df[mesure].isna()], key, mesure)

    pd.options.display.float_format = '{:.1f}'.format
    df_describe = df[[key, mesure]].describe().T
    display(df_describe)

    # Quartiles of the analysed column itself. The original read row 0 of
    # the describe table, which is the `key` column whenever key is numeric.
    q1 = df[mesure].quantile(0.25)
    q3 = df[mesure].quantile(0.75)
    ecart_interquartile = q3 - q1
    upper = q3 + 1.5 * ecart_interquartile
    lower = q1 - 1.5 * ecart_interquartile

    pd.set_option('display.min_rows', 2000)

    print()
    print('  - Extrait de la liste des outilers :')

    colonnes_extrait = [key] + list_colonne_a_afficher + [mesure]

    # Ten largest values above the upper bound
    df_upper_outlier = df.loc[(df[mesure] > upper), colonnes_extrait].sort_values(by=mesure, ascending=False)
    upper_outlier_html = ''
    if df_upper_outlier.shape[0] > 0:
        upper_outlier_html = df_upper_outlier.head(10).to_html()

    # Ten smallest values below the lower bound (descending sort, then tail)
    df_lower_outlier = df.loc[(df[mesure] < lower), colonnes_extrait].sort_values(by=mesure, ascending=False)
    lower_outlier_html = ''
    if df_lower_outlier.shape[0] > 0:
        lower_outlier_html = df_lower_outlier.tail(10).to_html()

    display(HTML(f'<table><tr><th>{df_lower_outlier.shape[0]} Lower Outliers (< {lower:.2f})</th><th width="100">&nbsp;</th><th>{df_upper_outlier.shape[0]} Upper Outliers (> {upper:.2f})</th></tr><tr><td>{lower_outlier_html}</td><td>&nbsp;</td><td>{upper_outlier_html}</td></tr></table>'))


def testNormalisation(df, mesure):
    
    liste_methode = ['_log', '_sqrt', '_cbrt', '_1x', '_xx']
    liste_methode_nom = ['Logrithmique [log(x+1)]', 'Racine carré', 'Racine cubique', 'Inverse [1/(x+1)]', 'Carré']
    
    coef_skew = skew(df[mesure], bias=False)
    
    df_courant = df.copy()
    df_courant[mesure].fillna(0, inplace=True)
    
    df_courant[f'{mesure}_log'] = 0
    if coef_skew > 0:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_log']] = np.log(df[mesure] + 1)
    else:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_log']] = np.log(max(df[mesure]) + 1 - df[mesure])

    df_courant[f'{mesure}_sqrt'] = 0
    if coef_skew > 0:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_sqrt']] = np.sqrt(df[mesure])
    else:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_sqrt']] = np.sqrt(max(df[mesure]) + 1 - df[mesure])

    df_courant[f'{mesure}_cbrt'] = 0
    if coef_skew > 0:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_cbrt']] = np.cbrt(df[mesure])
    else:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_cbrt']] = np.cbrt(max(df[mesure]) + 1 - df[mesure])

    df_courant[f'{mesure}_1x'] = 0
    if coef_skew > 0:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_1x']] = 1 / (df[mesure] + 1)
    else:
        df_courant.loc[df_courant[mesure] != 0, [f'{mesure}_1x']] = 1 / (max(df[mesure]) + 1 - df[mesure])

    df_courant[f'{mesure}_xx'] = df[mesure] * df[mesure]
    
    #Réprésentation graphique de la distribution de la données
    nb_ligne = 2
    nb_colonnes = 3
    fig, ax = plt.subplots(nrows=nb_ligne, ncols=nb_colonnes, figsize=(18, 12))

    plt.suptitle(f'Normalisation de {mesure} - skewness : {coef_skew:.1f}', fontsize=16, y=1.02)
    
    for methode in liste_methode:
        index = liste_methode.index(methode)
        
        sns.kdeplot(df_courant[f'{mesure}{methode}'], ax=ax[int(index / nb_colonnes), index % nb_colonnes], fill=True)
        ax[int(index / nb_colonnes), index % nb_colonnes].set_title(f"{liste_methode_nom[index]} - skewness : {skew(df_courant[f'{mesure}{methode}'], bias=False):.2f}", fontsize=14)
    
        mean, std = np.mean(df_courant[f'{mesure}{methode}']), np.std(df_courant[f'{mesure}{methode}'])
        rvs = np.linspace(mean - 3*std, mean + 3*std, 100)
        pdf = norm.pdf(rvs, mean, std)
        ax[int(index / nb_colonnes), index % nb_colonnes].plot(rvs, pdf, c="r", label="Distribution normale (PDF)")
        
        ax[int(index / nb_colonnes), index % nb_colonnes].set_ylabel('Densité')
        ax[int(index / nb_colonnes), index % nb_colonnes].set_xlabel('Valeurs')
        ax[int(index / nb_colonnes), index % nb_colonnes].tick_params(labelrotation=45)
        

    ax[1, 2].remove()  # don't display empty ax

    fig.tight_layout()
    plt.show()
    
    return [[mesure,coef_skew, 
             skew(df_courant[f'{mesure}_log'], bias=False),
             skew(df_courant[f'{mesure}_sqrt'], bias=False),
             skew(df_courant[f'{mesure}_cbrt'], bias=False),
             skew(df_courant[f'{mesure}_1x'], bias=False),
             skew(df_courant[f'{mesure}_xx'], bias=False),
            ]]

    

In [10]:
def fRepresentationDistributionVariableContinues(df, key, mesure, hue=None, hue_labels=None, list_forcage_varaible_categorielle=None, table_name=None, dict_description_colonne=None):
    """Describe and plot the distribution of one continuous variable.

    Displays a ``describe()`` table (overall and, when ``hue`` is given, per
    hue value) completed with the number and share of NaN, the number of
    distinct values, a density plot with the matching normal curve and a box
    plot (skipped for timedelta columns), and the 15 smallest / 15 largest
    rows. Nothing is done when ``mesure`` is the key or the hue column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse (a copy is used internally).
    key : hashable
        Identifier column, shown in the extracts.
    mesure : hashable
        Column to analyse.
    hue : hashable, optional
        Column used to split the statistics and densities.
    hue_labels : dict, optional
        Mapping from hue values to display labels; unmapped values become
        ``None``. When omitted, raw hue values are used.
    list_forcage_varaible_categorielle : list, optional
        Not used by this function.
    table_name : str, optional
    dict_description_colonne : dict, optional
        When both are given, the entry ``"<table_name> - <mesure>"`` of the
        dictionary is printed under the title.
    """
    df = df.copy()
    if (mesure == key) or (mesure == hue):
        return

    def _libelle(valeur):
        # Map a hue value to its label; identity when no mapping was supplied
        return hue_labels.get(valeur) if hue_labels is not None else valeur

    def _resume_nan(serie):
        # One-element Series "<count> [<share as percentage>]" indexed by 'NaN'
        nb_nan = serie.isna().sum()
        return pd.Series([f"{nb_nan} [{nb_nan / len(serie):.2%}]"], index=['NaN'])

    fPrintTitleSouligne(mesure)
    # `~` on a Python bool is always truthy: the original guard never
    # protected the `.get` call when no dictionary was supplied.
    if table_name is not None and dict_description_colonne is not None:
        print(dict_description_colonne.get(f"{table_name} - {mesure}"))

    # Series + Series keeps a single 'Total' column; concatenating a
    # DataFrame here produced a second, unnamed column.
    df_describe = pd.concat([df[mesure].describe(), _resume_nan(df[mesure])])
    df_describe.name = 'Total'
    if hue:
        dff_wip = df[[mesure, hue]].copy()
        dff_wip[hue] = dff_wip[hue].apply(_libelle)
        for var in list(set(dff_wip[hue])):
            dff1_wip = dff_wip.loc[(dff_wip[hue] == var), mesure]
            dff2_wip = pd.concat([dff1_wip.describe(), _resume_nan(dff1_wip)])
            dff2_wip.name = var
            df_describe = pd.concat([df_describe, dff2_wip], axis=1)

    display(df_describe.T)
    print()
    print("Nombre de valeurs distinctes : ", len(list(set(df[mesure]))))
    print()

    # The hue column is only selected when one was requested
    colonnes = [key, mesure] + ([hue] if hue else [])
    df = df.loc[~df[mesure].isna(), colonnes].copy()
    df[f'{mesure}_num'] = df[mesure]

    if 'timedelta64' not in str(df[mesure].dtypes).lower():
        # Density and box plot of the distribution
        fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

        plt.suptitle(f'Répartition des valeurs : {mesure}', fontsize=16, y=1.02)
        plt.tight_layout()

        sns.kdeplot(df[mesure], ax=ax1, fill=True, label='Total')
        if hue:
            dff_wip = df[[mesure, hue]].copy()
            dff_wip[hue] = dff_wip[hue].apply(_libelle)
            for var in list(set(dff_wip[hue])):
                dff1_wip = dff_wip.loc[(dff_wip[hue] == var), mesure]
                sns.kdeplot(dff1_wip, ax=ax1, fill=True, label=var)

        if 'time' in str(df[mesure].dtypes).lower():
            # Datetime columns are converted to a number of days since the epoch
            df[f'{mesure}_num'] = pd.to_datetime(df[mesure]).astype('int64') / 10**9 / 3600 / 24

        # Reference normal curve over mean +/- 3 standard deviations
        mean, std = np.mean(df[f'{mesure}_num']), np.std(df[f'{mesure}_num'])
        rvs = np.linspace(mean - 3*std, mean + 3*std, 100)
        pdf = norm.pdf(rvs, mean, std)
        ax1.plot(rvs, pdf, c="r", label="Distribution normale (PDF)")
        ax1.set_title(f"coefficient de skewness : {skew(df.loc[~df[mesure].isna(), f'{mesure}_num'], bias=False):.2f}", fontsize=14)

        ax1.set_ylabel('Densité')

        ax1.set_xlabel('Valeurs')
        ax1.legend()
        ax1.tick_params(labelrotation=45)

        df.boxplot(column=f'{mesure}_num', grid=True, ax=ax2)
        ax2.set_title('Valeurs extrèmes', fontsize=12)
        ax2.set_ylabel('Valeurs')
        plt.xticks([1], [''])

        plt.show()

    if hue:
        df[hue] = df[hue].apply(_libelle)
    print()
    # 15 smallest and 15 largest rows, side by side
    apercu = df.loc[~df[mesure].isna(), colonnes + [f'{mesure}_num']].sort_values(by=mesure)
    display(HTML(f"<table><tr><td>{apercu.head(15).to_html()}</td><td>{apercu.tail(15).to_html()}</td></tr></table>"))
    print()
        print()

       

In [11]:
# Calcul des dimensions des classes de catégories pour la transformation des variables continues en variables catégorielles 
def getDimensionBins(df, startBins=None) :
    if startBins is None :
        bins = len(df)
    else :
        bins = startBins
        
    try :
        pd.qcut(df, bins)
    except ValueError:
        bins = int(0.75 * bins)
        bins = getDimensionBins(df, startBins= bins)
    
    return bins
  

#### Function to Handle Categorical Variables

To make the code more efficient, we can now write a function to handle the categorical variables for us. This will take the same form as the `agg_numeric` function in that it accepts a dataframe and a grouping variable. Then it will calculate the counts and normalized counts of each category for all categorical variables in the dataframe.

In [12]:
def count_categorical(df, group_var, df_name):
    """Computes counts and normalized counts for each observation
    of `group_var` of each unique category in every categorical variable

    Parameters
    --------
    df : dataframe
        The dataframe to calculate the value counts for.

    group_var : string
        The variable by which to group the dataframe. For each unique
        value of this variable, the final dataframe will have one row

    df_name : string
        Variable added to the front of column names to keep track of columns


    Return
    --------
    categorical : dataframe
        A dataframe with counts and normalized counts of each unique category in every categorical variable
        with one row for every unique value of the `group_var`. Columns are named
        ``<df_name>_<dummy column>_count`` (sum) and ``<df_name>_<dummy column>_count_norm`` (mean).
        Dummy columns come from ``one_hot_encoder`` with its default arguments.

    """

    # one_hot_encoder returns (encoded frame, list of new columns); only the
    # frame is needed here.
    categorical, _ = one_hot_encoder(df.select_dtypes('object'))

    # Make sure to put the identifying id on the column
    categorical[group_var] = df[group_var]

    # Groupby the group var and calculate the sum and mean
    categorical = categorical.groupby(group_var).agg(['sum', 'mean'])

    # Name every column from its own (variable, statistic) pair. Iterating
    # over `columns.levels[0]` walks the labels in sorted order, which can
    # differ from the actual column order and mislabel the result.
    stat_names = {'sum': 'count', 'mean': 'count_norm'}
    categorical.columns = [
        '%s_%s_%s' % (df_name, var, stat_names[stat])
        for var, stat in categorical.columns
    ]

    return categorical

In [13]:
# Fonction pour récupérer la liste exhaustive des valeurs d'un dataframe
def fGetDictOfCategorialValues(df):
    """List the distinct values of every ``object`` column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    dict
        ``{column label: list of distinct values}`` for the columns whose
        dtype is ``object``. Values come from a set, so their order is not
        guaranteed.
    """
    colonnes_objet = (
        nom for nom, type_colonne in df.dtypes.to_dict().items()
        if str(type_colonne) == 'object'
    )
    return {nom: list(set(df[nom])) for nom in colonnes_objet}


In [14]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode the ``object`` columns of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode (not modified).
    nan_as_category : bool, default True
        Passed as ``dummy_na`` to ``pd.get_dummies``: when true an extra
        indicator column is created for missing values.

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The encoded frame and the labels of the columns that were not
        present in the input.
    """
    colonnes_initiales = list(df.columns)
    colonnes_objet = [nom for nom in df.columns if df[nom].dtype == 'object']

    encode = pd.get_dummies(df, columns=colonnes_objet, dummy_na=nan_as_category)

    colonnes_creees = [nom for nom in encode.columns if nom not in colonnes_initiales]
    return encode, colonnes_creees


In [15]:
def fSetBoolEncoding(df, list_of_col_to_encode, values_to_encode=None):
    """Convert the given columns to ``bool``, optionally label-encoding them first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place and also returned.
    list_of_col_to_encode : iterable
        Labels of the columns to convert.
    values_to_encode : list, optional
        When provided (and non-empty), a ``LabelEncoder`` is fitted on these
        values and applied to each column before the cast to ``bool``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    encodeur = LabelEncoder()

    for nom_colonne in list_of_col_to_encode:
        if values_to_encode:
            encodeur.fit(values_to_encode)
            df[nom_colonne] = encodeur.transform(df[nom_colonne])

        df[nom_colonne] = df[nom_colonne].astype('bool')

    return df

In [16]:
def fSetEncodingCategorialFeatures(df, dict_of_values, label_encoding_uniquement=False):
    """Encode the categorical (object-dtype) columns of a dataframe.

    Columns with at most two known values are label-encoded; unless
    `label_encoding_uniquement` is set, the remaining object columns are then
    one-hot encoded with `one_hot_encoder`. Columns whose name contains
    'Data_Not_Available' are removed from the result.

    Parameters
    --------
        df (dataframe):
            the dataframe to encode. Label-encoded columns are overwritten
            in place on this object.
        dict_of_values (dict):
            for each categorical column, the list of its possible values
            (as built by `fGetDictOfCategorialValues`)
        label_encoding_uniquement (bool):
            when False (default) one-hot encoding is applied after the label
            encoding; otherwise only the label encoding is done

    Return
    --------
        df (dataframe):
            the encoded dataframe
    """
    # Create a label encoder object
    le = LabelEncoder()
    le_count = 0

    # Iterate through the columns
    for col in df:
        if df[col].dtype != 'object':
            continue

        known_values = dict_of_values.get(col)
        # A column missing from dict_of_values has an unknown cardinality:
        # do not label-encode it (it stays eligible for one-hot encoding)
        # instead of failing on len(None)
        if known_values is None or len(known_values) > 2:
            continue

        # Train on the possible values, then transform the column
        le.fit(known_values)
        df[col] = le.transform(df[col])

        # Keep track of how many columns were label encoded
        le_count += 1

    print('%d colonnes sont traité par Label Encoding.' % le_count)

    # Same test as before (`== False`) so that callers passing e.g. None keep
    # getting the label-encoding-only behaviour
    with_one_hot = (label_encoding_uniquement == False)

    new_columns = []
    if with_one_hot:
        # one-hot encoding of categorical variables
        df, new_columns = one_hot_encoder(df)

    return_colonnes = [x for x in list(df) if 'Data_Not_Available' not in x]
    df = df[return_colonnes]

    print('Features shape: ', df.shape)
    if with_one_hot:
        print("liste des nouvelle colonnes : ", [x for x in return_colonnes if x in new_columns])

    return df
    

#### Aggregating Numeric Columns


In [17]:
def agg_numeric(df, group_var, df_name):
    """Aggregates the numeric values in a dataframe. This can
    be used to create features for each instance of the grouping variable.

    Parameters
    --------
        df (dataframe):
            the dataframe to calculate the statistics on
        group_var (string):
            the variable by which to group df
        df_name (string):
            the variable used to rename the columns

    Return
    --------
        agg (dataframe):
            a dataframe with one row per value of the grouping variable and,
            for every numeric column, the statistics count, mean, max, min,
            sum and std. The first column is the grouping variable; the
            others are named '<df_name>_<column>_<stat>' to keep track of
            the features created.

    """
    # Remove id variables other than grouping variable
    id_columns = [col for col in df if col != group_var and 'SK_ID' in col]
    df = df.drop(columns = id_columns)

    # Keep the numeric columns and make sure the grouping variable is present
    # even when it is not numeric itself
    group_ids = df[group_var]
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[group_var] = group_ids

    # Group by the specified variable and calculate the statistics
    stats = ['count', 'mean', 'max', 'min', 'sum', 'std']
    agg = numeric_df.groupby(group_var).agg(stats).reset_index()

    # Build the flat names from the actual (variable, stat) column pairs rather
    # than from the MultiIndex levels, whose ordering is an implementation detail
    columns = []
    for var, stat in agg.columns:
        if var == group_var:
            # Column created by reset_index: keep the plain grouping name
            columns.append(group_var)
        else:
            columns.append('%s_%s_%s' % (df_name, var, stat))

    agg.columns = columns
    return agg

#### Analyse des traitements

In [18]:
def fGetExecuteTime(start_time, decalage=0):
    """Report the time elapsed since `start_time` and return a fresh timestamp.

    Parameters
    --------
        start_time (float or None):
            a timestamp previously returned by this function. When it is
            falsy (None, 0) nothing is reported.
        decalage (int):
            forwarded as-is to `fPrintTitle`

    Return
    --------
        float:
            the current timestamp, to be passed as `start_time` on the next call
    """
    if not start_time:
        return datetime.now().timestamp()

    elapsed_seconds = float(datetime.now().timestamp() - start_time)
    # Drop the last five characters of the timedelta text (microsecond digits)
    elapsed_text = str(timedelta(seconds=elapsed_seconds))[:-5]
    now_text = datetime.now().strftime('%d/%m/%Y, %H:%M:%S')
    fPrintTitle(f"\n{now_text}, durée d'exécution : {elapsed_text}", decalage=decalage)

    return datetime.now().timestamp()

# Reference timestamp for the first measurement
start_time = fGetExecuteTime(None)

#### Modélisation

In [19]:
def fSplitDataSetForModelingTesting(df, target_label='TARGET', train_target_values=[0, 1], ratio_sampling=1.):
    """Split a dataframe into a training set, its target and a test set.

    Rows whose target is in `train_target_values` form the training set, all
    the other rows form the test set. The input dataframe is not modified.

    Parameters
    --------
        df (dataframe):
            the full dataset, including the target column
        target_label (string):
            name of the target column
        train_target_values (list):
            target values identifying the training rows (never modified here)
        ratio_sampling (float):
            when lower than 1, each training class and the test set are
            down-sampled to this fraction of their rows (random_state=72)

    Return
    --------
        (train, target, test):
            train (dataframe): training features, without the target column
            target (series): target values aligned with `train`
            test (dataframe): test features, without the target column
            All three are returned with a fresh 0..n-1 index, so `train` and
            `target` stay aligned by label as well as by position.
    """
    is_train = df[target_label].isin(train_target_values)

    train = df.loc[is_train]
    target = train[target_label].reset_index(drop=True)
    print(f"Train shape : {train.shape}, target répartition : {Counter(target)}.")
    # drop()/reset_index() return new objects: their result must be kept
    train = train.drop(columns=target_label).reset_index(drop=True)

    test = df.loc[~is_train].drop(columns=target_label).reset_index(drop=True)
    print(f"Test shape : {test.shape}.")
    print()

    if ratio_sampling < 1.:
        print(f"Sampling ratio : {100 * ratio_sampling}%")

        # Sample every training class separately to preserve the class balance
        sampled_parts = []
        for value in train_target_values:
            class_rows = df.loc[df[target_label] == value]
            sampled_parts.append(
                class_rows.sample(n=int(class_rows.shape[0] * ratio_sampling), random_state=72)
            )
        train = pd.concat(sampled_parts) if sampled_parts else df.iloc[0:0]

        target = train[target_label].reset_index(drop=True)
        print(f"Train sample shape : {train.shape}, Sample target répartition : {Counter(target)}.")
        train = train.drop(columns=target_label).reset_index(drop=True)

        test_rows = df.loc[~is_train]
        test = test_rows.sample(n=int(test_rows.shape[0] * ratio_sampling), random_state=72)
        test = test.drop(columns=target_label).reset_index(drop=True)
        print(f"Test sample shape : {test.shape}.")
        print()

    return train, target, test
    

In [20]:
def fPlotResults(df_score, type_model=None, max_values=100):
    """Display the best scores of a modelling campaign as a table and a heatmap.

    Parameters
    --------
        df_score (dataframe):
            one row per test. The columns read here are 'Equilibrage', 'Modèle',
            'Scaler', 'Score', 'Liste features out', 'Ecat type', 'Itération',
            'Nb features in' and 'Nb features out'.
        type_model (string, optional):
            when given, only the rows whose 'Modèle' equals this value are kept
        max_values (int or None):
            maximum number of best-scoring rows kept for the table and the plot;
            None means the number of rows of `df_score`

    Return
    --------
        None. The function prints the union of the 'Liste features out' values,
        displays the first rows of the selection and shows a heatmap of the
        score (in percent) per 'Modèle - Scaler' row and 'Equilibrage' column.
    """
    # Total number of tests, reused in the plot title
    nb_test = df_score.shape[0]

    # Sort so that, within each (Equilibrage, Modèle, Scaler) combination, the highest score comes first
    df_4_plot = df_score.sort_values(by=['Equilibrage', 'Modèle', 'Scaler', 'Score'], ascending=[True, True, True, False]).copy()
    # Keep the rows with a score strictly below 1.0 (the mask comes from df_score and is aligned on the index),
    # then keep the first row of each combination
    df_4_plot = df_4_plot.loc[df_score['Score'] < 1.0].drop_duplicates(subset=['Equilibrage', 'Modèle', 'Scaler'])
    # Rank the remaining rows by decreasing score
    df_4_plot = df_4_plot.sort_values(by='Score', ascending=False).reset_index(drop=True).copy()
    
    # Optional restriction to a single model type
    if type_model:
        df_4_plot = df_4_plot.loc[df_4_plot['Modèle'] == type_model]

    # Sorted union of the features listed in 'Liste features out' over the selected rows
    features_to_keep = []
    for features_out in df_4_plot['Liste features out']:
        features_to_keep = list(sorted(set(features_to_keep + list(features_out))))
    
    print(f"Liste des {len(features_to_keep)} features les plus pertinentes:\n", features_to_keep)
    print()
    
    # No explicit limit: fall back to the number of tests
    if max_values == None:
        max_values = nb_test

    df_4_plot = df_4_plot.head(max_values)
    
    # Merge the scaler name into the model label (missing scaler shown as "No Scaler")
    df_4_plot.loc[df_4_plot['Scaler'].isna(), 'Scaler'] = "No Scaler"
    df_4_plot['Modèle'] = df_4_plot['Modèle'] + " - " + df_4_plot['Scaler']
    df_4_plot.drop(columns='Scaler', inplace=True)
    
    # Express the score as a percentage
    df_4_plot['Score'] = df_4_plot['Score'].apply(lambda x: 100 * x)
    
    display(df_4_plot[['Equilibrage', 'Modèle', 'Score', 'Ecat type', 'Itération', 'Nb features in', 'Nb features out']].head())
    
    # One row per model label, one column per 'Equilibrage' value
    df_4_plot = df_4_plot[['Equilibrage', 'Modèle', 'Score']].pivot(index = 'Modèle', columns = 'Equilibrage', values = 'Score')
    
    # Case-insensitive ordering of the model labels
    df_4_plot.sort_values(by='Modèle', inplace=True, key=lambda col: col.str.lower()) 
    
    ## Plot
        
    # Figure size follows the shape of the pivoted table
    fig = plt.figure(figsize=(df_4_plot.shape[1] + 2, df_4_plot.shape[0] + 2))    
    
    sns.heatmap(df_4_plot, annot=True, square=True, 
                            fmt='.1f', cmap="Blues", cbar=False, annot_kws={"fontsize":10}) #, yticklabels=targets_labels)

    # The title states how many of the tests are shown
    if max_values < nb_test:
        title_detail = f"{max_values} meilleurs scores/{nb_test} tests"
    else:
        title_detail = f"{nb_test} tests"

    plt.title(f"Score of the model ({title_detail})", fontsize = 18, fontweight = 'bold')
    plt.ylabel('Modèle', fontsize = 14, fontweight = 'bold')
    plt.xlabel('DataSet - Equilibrage des jeux de données', fontsize = 14, fontweight = 'bold')
    
    plt.grid(True)
    plt.tight_layout()
    
    plt.show()

In [21]:
def fGetClassWeight(target):
    """Compute a weight per class, inversely proportional to its frequency.

    Parameters
    --------
        target (sequence):
            the class label of every sample

    Return
    --------
        dict_class_weight (dict):
            for each distinct label, n_samples / (n_classes * count of the label)
    """
    occurrences = Counter(target)
    total = len(target)
    nb_labels = len(list(set(target)))

    return {
        label: total / (nb_labels * occurrences[label])
        for label in list(set(target))
    }

In [22]:
 def plot_feature_importances(df):
    """
    Plot the importances returned by a model as a horizontal bar chart.
    Works with any importance measure for which higher means more important.

    Args:
        df (dataframe): feature importances. Must have the feature names in a
        column called `feature` and the importances in a column called `importance`

    Returns:
        shows a plot of the 15 most important features

        df (dataframe): feature importances sorted by importance (highest to lowest),
        with the previous index kept as a column by `reset_index` and an extra
        `importance_normalized` column
    """

    # Rank the features from most to least important
    ranked = df.sort_values('importance', ascending = False).reset_index()

    # Scale the importances so that they add up to one
    ranked['importance_normalized'] = ranked['importance'] / ranked['importance'].sum()

    plt.figure(figsize = (10, 5))
    ax = plt.subplot()

    # Reversed positions so that the most important feature is drawn on top
    bar_positions = list(reversed(list(ranked.index[:15])))
    ax.barh(bar_positions,
            ranked['importance_normalized'].head(15),
            align = 'center', edgecolor = 'k')

    # One tick per bar, labelled with the feature name
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(ranked['feature'].head(15))

    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()

    return ranked

#### Custom scoring function

In [23]:
def custom_scoring(y_true, y_pred, beta=None):
    """F-beta score computed from the binary confusion matrix (labels 0 and 1).

    Parameters
    --------
        y_true (array-like):
            the true labels
        y_pred (array-like):
            the predicted labels
        beta (float, optional):
            weight of recall relative to precision. When None, beta is the
            ratio between the number of samples of the smaller class and of
            the larger class in `y_true` (so beta <= 1).

    Return
    --------
        float:
            (1 + beta²) * precision * recall / (beta² * precision + recall),
            or 0.0 when there is no true positive (the formula is then
            undefined and used to yield NaN)
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()

    if beta is None:
        c = Counter(y_true)
        # Ratio of the class sizes. The previous code used `>` instead of `/`,
        # which always produced False (i.e. beta = 0, plain precision).
        smaller, larger = sorted((c[0], c[1]))
        beta = smaller / larger if larger else 0

    # Without true positives precision and recall are 0 or undefined:
    # return 0.0 rather than propagating a NaN to the model selection
    if tp == 0:
        return 0.0

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    fb = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return fb

def custom_scoring_min_fp(y_true, y_pred):
    """`custom_scoring` with beta=0.5, which weights precision more than recall.

    Original author's note: penalises false positives, i.e. a loan refused to
    a client who would not have defaulted.
    """
    return custom_scoring(y_true, y_pred, beta=0.5)

def custom_scoring_ratio_classweight(y_true, y_pred):
    """`custom_scoring` with beta=None, i.e. beta derived from `y_true` by `custom_scoring`."""
    return custom_scoring(y_true, y_pred, beta=None)

def custom_scoring_min_fn(y_true, y_pred):
    """`custom_scoring` with beta=2, which weights recall more than precision.

    Original author's note: penalises false negatives, i.e. a loan granted to
    a client who is going to default.
    """
    return custom_scoring(y_true, y_pred, beta=2)

def custom_fbeta_score(y_true, y_pred):
    """scikit-learn `fbeta_score` with beta=2 (recall weighted more than precision)."""
    return fbeta_score(y_true, y_pred, beta=2)

# Scorers built on the predicted labels. `needs_proba=False` was the default and
# the argument is deprecated in recent scikit-learn releases, so it is not passed.
custom_scorer = make_scorer(custom_scoring_min_fn, greater_is_better=True)

custom_f2_scorer = make_scorer(custom_fbeta_score, greater_is_better=True)


### Vérification des versions des librairies Python

In [24]:
# Third-party packages (not shipped with Python) whose version must be checked
package = [
    ft,
    ipw, 
    ipy,
    lgb,
    lxml, 
    mpl, 
    msno, 
    np, 
    pd,
    req, 
    sci,
    skl, 
    sns,
    xgb
]

print("Python :")
print(f"    - version utilisée   : {sys.version.split(' ')[0]}")
print(f"    - version disponible : {getCurrentVersionPython().replace('_', ' ')}")
print()

# One row per package: [name, version used by the notebook, version reported for pypi.org, newer version]
# The leading space in the first name is kept as in the original row.
liste_version_package = [[f' {dummy.__name__.capitalize()}', dummy.__version__, getCurrentVersionPipProject('data-dummy').split(' ')[-1].replace('_', ' '), '']]

for lib in package:
    if lib != '':
        try:
            liste_version_package.append([lib.__name__.capitalize(), lib.__version__.split('+')[0], getCurrentVersionPipProject(lib.__name__).split(' ')[-1].replace('_', ' '), ''])
        except Exception:
            # Best effort: report the package that could not be checked and carry on.
            # `Exception` (not a bare except) so that a kernel interrupt is not swallowed.
            print(lib)

df_package = pd.DataFrame(liste_version_package, columns=['Package', 'Version notebook', 'Version pypi.org', 'Nouvelle version disponible sur pypi.org']).set_index('Package')
try:
    # Fill the last column only for the packages whose two versions differ
    df_package.loc[df_package['Version notebook'] != df_package['Version pypi.org'], 'Nouvelle version disponible sur pypi.org'] = df_package['Version pypi.org']
except Exception:
    # The original message attributes a failure here to a package listed twice
    print('Doublon de package !')
    display(df_package)

display(df_package.drop(columns='Version pypi.org').sort_values('Package'))

Python :
    - version utilisée   : 3.11.2
    - version disponible : 3.11.5



Unnamed: 0_level_0,Version notebook,Nouvelle version disponible sur pypi.org
Package,Unnamed: 1_level_1,Unnamed: 2_level_1
Dummy,1.0,1.1
Featuretools,1.27.0,
Ipython,8.14.0,8.15.0
Ipywidgets,8.1.0,
Lightgbm,4.0.0,
Lxml,4.9.3,
Matplotlib,3.7.2,
Missingno,0.5.2,
Numpy,1.24.4,1.25.2
Pandas,2.0.3,2.1.0


In [25]:
# Timestamp taken once the setup cells above have run
endtime = datetime.now()

# Disabled variant of the message below, with the duration formatted as "HHh MMmin":
#print(f'Fin des traitements à {endtime.strftime("%d/%m/%Y, %H:%M:%S")}, durée : {int(str(endtime - startime).split(":")[0]):02d}h {int(str(endtime - startime).split(":")[1]):02d}min')
# Print the time elapsed since `startime`
print(f'Durée partielle des traitements à {endtime.strftime("%d/%m/%Y, %H:%M:%S")}, durée : {endtime - startime}')


Durée partielle des traitements à 05/09/2023, 12:20:31, durée : 0:00:16.978276


### Variables d'environnement

In [26]:
# IPython magics: load the autoreload extension in mode 2 and render matplotlib figures inline
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Display options for the data tables
# NOTE(review): only `display.min_rows` is set here, `display.max_rows` is not — confirm this is the intended option
pd.set_option('display.min_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

# Floats are rendered with the '{:f}' format
pd.options.display.float_format = '{:f}'.format

# For Windows (proxy settings, to be run in a shell; kept disabled here)
#set HTTP_PROXY=http://USER:PWD@proxy.company.com:PORT
#set HTTPS_PROXY=https://USER:PWD@proxy.company.com:PORT

In [27]:
# Current working directory
print(os.getcwd())

# Folders containing the data files (paths relative to the working directory)
DATA_DIR_SOURCE = os.path.join("data", "source")
DATA_DIR_CLEANED = os.path.join("data", "cleaned")
DATA_DIR_AUTRES = os.path.join("data", "autres")

# NOTE(review): presumably switches between the full dataset and a sample; its use is not visible in this section
full_dataset = True


C:\Users\petx698\OneDrive - LA POSTE GROUPE\Documents\MyDev\Python\Projets\ocr\ocr-ds-projet7


In [28]:
# Report the number of physical and logical CPUs as returned by psutil
print("Nb CPU physique : ", psutil.cpu_count(logical=False))
print("Nb CPU logique  : ", psutil.cpu_count(logical=True))


Nb CPU physique :  4
Nb CPU logique  :  8


In [29]:
# Initialise seaborn with the "whitegrid" style
sns.set_style("whitegrid")


In [30]:
# Enable the garbage collector; the explicit clean-up of old objects below is kept disabled
#import gc
gc.enable()
#del train, bureau, bureau_balance, bureau_agg, bureau_agg_new, bureau_balance_agg, bureau_balance_counts, bureau_by_loan, bureau_balance_by_client, bureau_counts
#gc.collect()