# Importations

In [18]:
from typing import List
from enum import Enum

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.datasets import make_classification, make_circles
from sklearn import svm, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Dataset sur lequel nous allons travailler

In [3]:
Xtemp, ytemp = make_circles(n_samples=1000, shuffle=True, noise=None, random_state=42)

In [None]:
"""
X : [
        [1, 5]
        [8, 2]
        [4, 1]
        [5, 8]
    ]
y : [
        [1],
        [0],
        [0],
        [1],
    ]
"""

In [21]:
class DataType(Enum):
    BOOLEAN = 1
    QUALITATIVE_NOMINAL = 2         # Named categories : ['FH', 'SF', 'EV']
    QUALITATIVE_ORDINAL = 3         # Categories with an implied order : ['small', 'medium', 'high']
    QUANTITATIVE_DISCRETE = 4       # Only particular numbers : [1, 2, 3, 6, 8]
    QUANTITATIVE_CONTINUOUS = 5     # Any numerical value : [1.345, 2.394, 8.345, 0.432]
    LABEL = 6                       # Label de la classification


class JobWhenMissingValue(Enum):    # Si une valeur d'un dataset est manquante
    DELETE_LINE = 1                 # - On supprime la ligne
    REPLACE_VALUE = 2               # - On remplace la valeur

In [14]:
# Url à partir de la racine du dataset au format CSV
CSV_FILE_URL: str = 'breast-cancer.csv'

# Si le fichier CSV contient les noms de colonnes, mettre à True, si le fichier CSV ne contient
# que les données, et pas les titres, mettre à False
IS_CSV_FILE_CONTAINS_ROW_HEADER: bool = False

# Renommer les noms des colonnes
NAMES_ROWS_HEADER: List[str] = [
    "class",
    "age",
    "menopause",
    "tumor-size",
    "inv-nodes",
    "node-caps",
    "deg-malig",
    "breast",
    "breast-quad",
    "irradiat",
]

# Les types des données qui ont été inséré
ROWS_DATATYPES = {
    "class": {
        "datatype": DataType.BOOLEAN,
        "matches": {
            "no-recurrence-events": False,
            "recurrence-events": True,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.REPLACE_VALUE,
            "replace_by": False,
        }
    },
    "age": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "matches": {
            "10-19": 1,
            "20-29": 2,
            "30-39": 3,
            "40-49": 4,
            "50-59": 5,
            "60-69": 6,
            "70-79": 7,
            "80-89": 8,
            "90-99": 9
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "menopause": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "matches": {
            "lt40": 1,
            "premeno": 2,
            "ge40": 3,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "tumor-size": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "matches": {
            "0-4": 1,
            "5-9": 2,
            "10-14": 3,
            "15-19": 4,
            "20-24": 5,
            "25-29": 6,
            "30-34": 7,
            "35-39": 8,
            "40-44": 9,
            "45-49": 10,
            "50-54": 11,
            "55-59": 12,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "inv-nodes": {
        "datatype": DataType.QUALITATIVE_ORDINAL,
        "matches": {
            "0-2": 1,
            "3-5": 2,
            "6-8": 3,
            "9-11": 4,
            "12-14": 5,
            "15-17": 6,
            "18-20": 7,
            "21-23": 8,
            "24-26": 9,
            "27-29": 10,
            "30-32": 11,
            "33-35": 12,
            "36-39": 13
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "node-caps": {
        "datatype": DataType.BOOLEAN,
        "matches": {
            "yes": True,
            "no": False,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "deg-malig": {
        "datatype": DataType.QUANTITATIVE_DISCRETE,
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "breast": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "matches": {
            "left": 1,
            "right": 2
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "breast-quad": {
        "datatype": DataType.QUALITATIVE_NOMINAL,
        "matches": {
            "left-up": 1,
            "left-low": 2,
            "right-up": 3,
            "right-low": 4,
            "central": 5,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
    "irradiat": {
        "datatype": DataType.LABEL,
        "matches": {
            "yes": True,
            "no": False,
        },
        "job_when_missing_value": {
            "type": JobWhenMissingValue.DELETE_LINE,
        }
    },
}

In [15]:
header_value = 1 if IS_CSV_FILE_CONTAINS_ROW_HEADER else None

res = pd.read_csv(CSV_FILE_URL, header=header_value)
res

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no
...,...,...,...,...,...,...,...,...,...,...
281,recurrence-events,30-39,premeno,30-34,0-2,no,2,left,left_up,no
282,recurrence-events,30-39,premeno,20-24,0-2,no,3,left,left_up,yes
283,recurrence-events,60-69,ge40,20-24,0-2,no,1,right,left_up,no
284,recurrence-events,40-49,ge40,30-34,3-5,no,3,left,left_low,no


# Pré-processing

## Nettoyage des données

In [6]:
def clean_dataset(dataframe: pd.DataFrame, parameters: dict) -> None:
    """ Méthode qui nettoie le dataset selon les volontés définies par le client dans `parameters`
    :param dataframe: pd.DataFrame = notre dataset
    :param parameters: dict = notre dictionnaire ROWS_DATATYPES
    """
    for key, value in parameters.items():

        if value["job_when_missing_value"]["type"] == JobWhenMissingValue.DELETE_LINE:
            dataframe = dataframe[dataframe[key].notna()]

        if value["job_when_missing_value"]["type"] == JobWhenMissingValue.REPLACE_VALUE:
            dataframe[key] = dataframe[key].replace(np.nan, value["job_when_missing_value"]["replace_by"])

numpy.ndarray

## Normalisation des données (uniquement pour les données quantitatives)

In [None]:
def normalize_dataset(dataframe: pd.DataFrame, parameters: dict) -> None:
    print("normalize")