# Importations

In [2]:
from typing import List
from enum import Enum

import numpy as np
import pandas as pd

# Dataset sur lequel nous allons travailler

In [3]:
class DataType(Enum):
    BOOLEAN = 1
    QUALITATIVE_NOMINAL = 2         # Named categories : ['FH', 'SF', 'EV']
    QUALITATIVE_ORDINAL = 3         # Categories with an implied order : ['small', 'medium', 'high']
    QUANTITATIVE_DISCRETE = 4       # Only particular numbers : [1, 2, 3, 6, 8]
    QUANTITATIVE_CONTINUOUS = 5     # Any numerical value : [1.345, 2.394, 8.345, 0.432]
    LABEL = 6                       # Label de la classification


class JobWhenMissingValue(Enum):    # Si une valeur d'un dataset est manquante
    DELETE_LINE = 1                 # - On supprime la ligne
    REPLACE_VALUE = 2               # - On remplace la valeur

class EncodingType(Enum):
    ONE_HOT_ENCODER = 1
    ORDINAL_ENCODER = 2

In [39]:
# Url à partir de la racine du dataset au format CSV
CSV_FILE_URL: str = 'breast-cancer.csv'

# Si le fichier CSV contient les noms de colonnes, mettre à True, si le fichier CSV ne contient
# que les données, et pas les titres, mettre à False
IS_CSV_FILE_CONTAINS_ROW_HEADER: bool = False

# Renommer les noms des colonnes
NAMES_ROWS_HEADER: List[str] = [
    "class",
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

# Les types des données qui ont été inséré
ROWS_DATATYPES = {
    "class": {
        "datatype": DataType.BOOLEAN,
        "possible_values": [ "recurrence-events", "no-recurrence-events" ], # TRUE, FALSE
        "job_when_missing_value": {
            "type": JobWhenMissingValue.REPLACE_VALUE,
            "replace_by": False,
        }
    },
    "age": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "possible_values": [ "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90-99" ],
        "encoding": EncodingType.ORDINAL_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "menopause": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "possible_values": [ "lt40", "premeno", "ge40" ],
        "encoding": EncodingType.ONE_HOT_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "tumor-size": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "possible_values": [ "0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59" ],
        "encoding": EncodingType.ORDINAL_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "inv-nodes": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "possible_values": [ "0-2", "3-5", "6-8", "9-11", "12-14", "15-17", "18-20", "21-23", "24-26", "27-29", "30-32", "33-35", "36-39" ],
        "encoding": EncodingType.ORDINAL_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "node-caps": {
        "datatype": DataType.BOOLEAN,
        "possible_values": ["yes", "no"],
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "deg-malig": {
        "datatype": DataType.QUANTITATIVE_DISCRETE,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "breast": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "possible_values": [ "left", "right" ],
        "encoding": EncodingType.ONE_HOT_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "breast-quad": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "possible_values": [ "left_up",  "left_low",  "right_up",  "right_low",  "central" ],
        "encoding": EncodingType.ONE_HOT_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "irradiat": {
        "datatype": DataType.LABEL,
        "possible_values": ["yes", "no"],
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
}

In [20]:

def load_dataset() -> pd.DataFrame:

    header_value = 1 if IS_CSV_FILE_CONTAINS_ROW_HEADER else None

    current_dataset = pd.read_csv(CSV_FILE_URL, header= header_value)

    if header_value is None:
        current_dataset.columns = NAMES_ROWS_HEADER

    return current_dataset

current_dataset = load_dataset()

# Pré-processing

## Nettoyage des données

Cela consiste à enlever les données manquantes ou erronées et à corriger les erreurs éventuelles.

In [30]:
def clean_dataset(dataframe: pd.DataFrame, parameters: dict) -> pd.DataFrame:
    """ Méthode qui nettoie le dataset selon les volontés définies par le client dans `parameters`
    :param dataframe: pd.DataFrame = notre dataset
    :param parameters: dict = notre dictionnaire ROWS_DATATYPES
    """
    dataframe = dataframe.replace("?", np.nan)

    for key, value in parameters.items():

        if value["job_when_missing_value"]["type"] == JobWhenMissingValue.DELETE_LINE:
            dataframe = dataframe[dataframe[key].notna()]

        if value["job_when_missing_value"]["type"] == JobWhenMissingValue.REPLACE_VALUE:
            dataframe[key] = dataframe[key].replace(np.nan, value["job_when_missing_value"]["replace_by"])

    return dataframe

current_dataset = load_dataset()
clean_dataset(current_dataset, ROWS_DATATYPES)

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


## Normalisation des données (uniquement pour les données quantitatives)

Cela consiste à mettre toutes les données sur la même échelle, afin d'éviter que certains attributs aient un poids plus important que d'autres dans l'analyse.

In [11]:
from sklearn.preprocessing import minmax_scale

def normalize_dataset(dataframe: pd.DataFrame, parameters: dict) -> pd.DataFrame:
    """
    :param dataframe:
    :param parameters:
    :return:
    """
    for key, value in parameters.items():

        if value["datatype"] == DataType.QUANTITATIVE_DISCRETE or value["datatype"] == DataType.QUANTITATIVE_CONTINUOUS:

            dataframe[[key]] = minmax_scale(dataframe[[key]])

    return dataframe

current_dataset = load_dataset()
normalize_dataset(current_dataset, ROWS_DATATYPES)

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,1.0,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,0.5,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,0.5,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,0.5,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,0.5,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,0.5,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,1.0,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,0.0,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,1.0,left,left_low,no


## Transformation des données

Cela consiste à appliquer des transformations mathématiques aux données afin de les mettre sous une forme qui convient mieux à l'analyse. Par exemple, il peut être utile de transformer des données catégorielles en données numériques.

In [40]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

def transform_data(dataframe: pd.DataFrame, parameters: dict) -> pd.DataFrame:
    """
    "datatype": DataType.QUALITATIVE_ORDINAL,
        "possible_values": [ "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80-89", "90-99" ],
        "encoding": EncodingType.ORDINAL_ENCODER,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
    }
    """

    for key, value in parameters.items():
        if value["datatype"] == DataType.BOOLEAN:
            remplacement = {
                value["possible_values"][0]: True,
                value["possible_values"][1]: False
            }

            dataframe[key] = dataframe[key].map(remplacement)

        if "encoding" in value and value["encoding"] == EncodingType.ORDINAL_ENCODER:
            # Création de l'objet OrdinalEncoder
            ordinal_encoder = OrdinalEncoder(categories=[value["possible_values"]])

            # Encodage des valeurs de la colonne
            dataframe[key] = ordinal_encoder.fit_transform(dataframe[[key]])

        if "encoding" in value and value["encoding"] == EncodingType.ONE_HOT_ENCODER:
            # Création de l'objet OneHotEncoder
            one_hot_encoder = OneHotEncoder(categories=[value["possible_values"]])

            # Encodage des valeurs de la colonne
            values_encoded = one_hot_encoder.fit_transform(dataframe[[key]]).toarray()

            dataframe = dataframe.drop(key, axis=1)

            dataframe_encoded = pd.DataFrame(values_encoded, columns=[f"{key}_{x}" for x in one_hot_encoder.categories_[0]])
            dataframe = dataframe.join(dataframe_encoded)


    return dataframe

current_dataset = load_dataset()
current_dataset = clean_dataset(current_dataset, ROWS_DATATYPES)
current_dataset = normalize_dataset(current_dataset, ROWS_DATATYPES)
transform_data(current_dataset, ROWS_DATATYPES)

Unnamed: 0,class,age,tumor-size,inv-nodes,node-caps,deg-malig,irradiat,menopause_lt40,menopause_premeno,menopause_ge40,breast_left,breast_right,breast-quad_left_up,breast-quad_left_low,breast-quad_right_up,breast-quad_right_low,breast-quad_central
0,False,2.0,6.0,0.0,False,1.0,no,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,False,3.0,4.0,0.0,False,0.5,no,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,False,3.0,4.0,0.0,False,0.5,no,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,False,5.0,3.0,0.0,False,0.5,no,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4,False,3.0,0.0,0.0,False,0.5,no,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,True,2.0,6.0,0.0,False,0.5,no,,,,,,,,,,
282,True,2.0,4.0,0.0,False,1.0,yes,,,,,,,,,,
283,True,5.0,4.0,0.0,False,0.0,no,,,,,,,,,,
284,True,3.0,6.0,1.0,False,1.0,no,,,,,,,,,,
