# Análisis Discriminante - Datos de Cancer de Mama

In [1]:
# Librerías necesarias para manipulación de datos
import pandas as pd
from pathlib import Path

# Librerías de scikit learn
from sklearn.preprocessing import StandardScaler, LabelEncoder # Librerías para estandarización de datos
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis # Importa los algoritmos de análisis discriminante
from sklearn.model_selection import train_test_split # Librerías para crear conjuntos de entrenamiento y de prueba
from sklearn.metrics import accuracy_score # Librería para calcular métricas para comparar modelos

## Preparación de los Datos

Fuente de los datos: [Breast Cancer Dataset](https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset)

In [2]:
# Importa los datos en un DataFrame de Pandas
file_path = Path("../datos/breast_cancer.csv")  # Define la ruta del fichero de datos

# Crea un DataFrame con los datos del fichero de datos
data_raw = pd.read_csv(file_path, index_col=0)

# Muestra un ejemplo de los datos cargados
data_raw.head(10)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,
843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,
844359,M,18.25,19.98,119.6,1040.0,0.09463,0.109,0.1127,0.074,0.1794,...,27.66,153.2,1606.0,0.1442,0.2576,0.3784,0.1932,0.3063,0.08368,
84458202,M,13.71,20.83,90.2,577.9,0.1189,0.1645,0.09366,0.05985,0.2196,...,28.14,110.6,897.0,0.1654,0.3682,0.2678,0.1556,0.3196,0.1151,
844981,M,13.0,21.82,87.5,519.8,0.1273,0.1932,0.1859,0.09353,0.235,...,30.73,106.2,739.3,0.1703,0.5401,0.539,0.206,0.4378,0.1072,
84501001,M,12.46,24.04,83.97,475.9,0.1186,0.2396,0.2273,0.08543,0.203,...,40.68,97.65,711.4,0.1853,1.058,1.105,0.221,0.4366,0.2075,


In [3]:
# Muestra el tamaño del DataFrame con los datos originales
data_raw.shape

(569, 32)

In [4]:
# Verificar si hay valores nulos
total_nan = data_raw.isna().sum().sum()

print(f"Número total de valores nulos: {total_nan}")

Número total de valores nulos: 569


In [5]:
# Eliminar las columnas donde al menos un elemento es NaN
data_clean = data_raw.dropna(axis=1, how="any")

In [6]:
# Muestra el tamaño del DataFrame con los datos limpios
data_clean.shape

(569, 31)

In [7]:
# Verificar los tipos de datos
data_clean.dtypes

diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst

In [8]:
# Guarda una copia de los datos limpios
file_path = Path("../datos/breast_cancer_clean.csv")  # Define la ruta del fichero de datos

data_clean.to_csv(file_path)

In [9]:
data_clean.head(3)

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [10]:
# Creación de conjunto de predictores (X)
X = data_clean.drop("diagnosis", axis=1)

# Datos de muestra
X.head()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [11]:
# Creación de conjunto variable objetivo (y)
y =data_clean["diagnosis"].copy()

# Datos de muestra
y.head()

id
842302      M
842517      M
84300903    M
84348301    M
84358402    M
Name: diagnosis, dtype: object

In [12]:
# Mostrar posibles valores en y
y.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [13]:
# Codificación de valores categóricos a numéricos
y_coded = y.replace(
    {
        "B":0,
        "M":1
    }
)

# Datos de muestra
y_coded.value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

In [14]:
# Codificación de valores categóricos a numéricos usando label encoder
labelencoder= LabelEncoder() # Crea la instancia del LabelEncoder

y_labels = labelencoder.fit_transform(y) # Convierte los valores categóricos por numéricos

y_labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,

In [15]:
# Estandarización de los datos
scaler = StandardScaler() # Crea una instancia de StandardScaler

# Estandariza los datos utilizando la instancia de StandarScaler
X_standardized = scaler.fit_transform(X)

# Muestra los datos estandarizados
print(X_standardized[10])

[ 0.53755602  0.91927331  0.44201066  0.40645325 -1.01768583 -0.71354185
 -0.70068435 -0.40468555 -1.03547556 -0.82612434 -0.09265584 -0.05416438
 -0.19804156  0.00380456 -1.00403368 -0.9059213  -0.69244186 -0.68211388
 -0.71948464 -0.2847869   0.60484876  1.33577127  0.49262165  0.47361134
 -0.62547654 -0.63082823 -0.60587197 -0.22620973  0.07643089  0.03181881]


In [16]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y_coded, random_state=0)

In [17]:
# Datos de muestra
print(X_train[:5])
print(y_train[:5])

[[-0.64678318 -0.42577149 -0.67671518 -0.63192861 -0.8995512  -0.90817355
  -0.77739522 -0.67371679  0.2314018  -0.80060746 -0.71524457  0.03836716
  -0.80791928 -0.58210097  0.18468118 -0.58561835 -0.58932352 -0.52201294
  -0.31750401 -0.76062709 -0.66456714  0.01185125 -0.68242968 -0.63774107
   0.19863822 -0.49914731 -0.67447656 -0.35335182  0.32395133 -0.76893975]
 [-0.82571213  0.13272462 -0.8249999  -0.76105087  0.64331558 -0.69269522
  -1.05202266 -1.06622382  0.46871312 -0.35689739 -0.38825014  1.35920951
  -0.44902208 -0.45581116  1.94975275 -0.80694093 -0.94818198 -1.10775204
   2.65013011 -0.6929201  -0.88821628  0.01673656 -0.90403632 -0.78136254
   0.43973562 -1.00239744 -1.24178371 -1.43718102  0.63294742 -1.03770647]
 [ 1.70485436  2.08513394  1.61593137  1.72384158  0.10245823 -0.01783304
   0.69304299  1.26366923 -0.21766424 -1.0586114   1.30049923  2.26093843
   1.15685722  1.29156462 -0.42401016 -0.0697579   0.25220172  0.80843074
  -0.1891608  -0.49055563  1.5367201

## Análisis Discriminante Lineal (LDA)

In [18]:
# Crea instancia del modelo
lda = LinearDiscriminantAnalysis()

# Se entrena el modelo
lda.fit(X_train, y_train)

# Predicción utilizando datos de prueba
y_pred_lda = lda.predict(X_test)

# Medición de la exactitud
accuracy_lda = accuracy_score(y_test, y_pred_lda)
print(f'Exactitud LDA: {accuracy_lda:.4f}')

Exactitud LDA: 0.9720


## Análisis Discriminante Cuadrático (QDA)

In [19]:
# Crea instancia del modelo
qda = QuadraticDiscriminantAnalysis()

# Se entrena el modelo
qda.fit(X_train, y_train)

# Predicción utilizando datos de prueba
y_pred_qda = qda.predict(X_test)

# Medición de la exactitud
accuracy_qda = accuracy_score(y_test, y_pred_qda)
print(f'Exactitud QDA: {accuracy_qda:.4f}')

Exactitud QDA: 0.9580


## LDA Regularizado como aproximación a RDA

In [20]:
# Crear instancia del modelo
lda_rda = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')

# Se entrena el modelo
lda_rda.fit(X_train, y_train)

# Predicción utilizando datos de prueba
y_pred_lda_rda = lda_rda.predict(X_test)

# Medición de la exactitud
accuracy_lda_rda = accuracy_score(y_test, y_pred_lda_rda)
print(f'Exactitud LDA (Regularizado): {accuracy_lda_rda:.4f}')

Exactitud LDA (Regularizado): 0.9650
