Dataset:
- ✅ Outliers extremos en un 3% de los datos (valores multiplicados por 5-10x).
- ✅ Valores claramente incorrectos en columnas clave (ej. OxygenSaturation con valores de 200%).
- ✅ Errores de digitación en variables categóricas (Present → Presnt, Epithelial → Epthlial).
- ✅ Intercambio de valores entre columnas (CellSize ↔ MitosisRate para simular errores de ingreso de datos).
- ✅ Más valores nulos (10%) en distintas features.
- ✅ Datos duplicados con pequeñas variaciones para simular mediciones imprecisas.

In [1]:
# Requiere: pip install imbalanced-learn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve
)
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# --- Preprocesamiento ---

# Cargar el dataset
df_dev = pd.read_csv("data/cell_diagnosis_dev_imbalanced.csv")
df_test = pd.read_csv("data/cell_diagnosis_test_imbalanced.csv")

# Separar features y target
X = df_dev.drop(columns=["Diagnosis"])
y = df_dev["Diagnosis"]

# Split en train y val
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Imputación numérica
imputer = SimpleImputer(strategy="mean")
X_train_cont = X_train.select_dtypes(include=["float64", "int64"])
X_val_cont = X_val.select_dtypes(include=["float64", "int64"])
X_train_cont_imputed = pd.DataFrame(imputer.fit_transform(X_train_cont), columns=X_train_cont.columns)
X_val_cont_imputed = pd.DataFrame(imputer.transform(X_val_cont), columns=X_val_cont.columns)

# Codificación categórica
encoder = OneHotEncoder(drop="first", sparse_output=False)
X_train_cat = X_train.select_dtypes(include=["object"]).fillna("Missing")
X_val_cat = X_val.select_dtypes(include=["object"]).fillna("Missing")
X_train_cat_encoded = pd.DataFrame(encoder.fit_transform(X_train_cat),
                                   columns=encoder.get_feature_names_out(X_train_cat.columns))
X_val_cat_encoded = pd.DataFrame(encoder.transform(X_val_cat),
                                 columns=encoder.get_feature_names_out(X_val_cat.columns))

# Concatenar
X_train_processed = pd.concat([X_train_cont_imputed, X_train_cat_encoded], axis=1)
X_val_processed = pd.concat([X_val_cont_imputed, X_val_cat_encoded], axis=1)

# Escalado
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_val_scaled = scaler.transform(X_val_processed)

# --- Función de entrenamiento y evaluación ---
results = {}

def train_and_evaluate(X_train, y_train, X_val_scaled, y_val, method_name, class_weight=None):
    model = LogisticRegression(penalty='l2', solver='liblinear', random_state=42, class_weight=class_weight)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val_scaled)
    y_proba = model.predict_proba(X_val_scaled)[:, 1]

    results[method_name] = {
        "Accuracy": accuracy_score(y_val, y_pred),
        "Precision": precision_score(y_val, y_pred),
        "Recall": recall_score(y_val, y_pred),
        "F1-Score": f1_score(y_val, y_pred),
        "AUC-ROC": roc_auc_score(y_val, y_proba),
        "AUC-PR": average_precision_score(y_val, y_proba)
    }

# --- Aplicar distintas técnicas ---

# 1. Sin re-balanceo
train_and_evaluate(X_train_scaled, y_train, X_val_scaled, y_val, "No Rebalancing")

# 2. Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_rus, y_rus, X_val_scaled, y_val, "Random Undersampling")

# 3. Random Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_ros, y_ros, X_val_scaled, y_val, "Random Oversampling")

# 4. SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_smote, y_smote, X_val_scaled, y_val, "SMOTE")

# 5. Cost-sensitive learning
train_and_evaluate(X_train_scaled, y_train, X_val_scaled, y_val, "Cost Re-weighting", class_weight="balanced")

# --- Mostrar resultados ---
df_results = pd.DataFrame(results).T
print("Validation Metrics by Rebalancing Method:")
print(df_results.round(3))

Validation Metrics by Rebalancing Method:
                      Accuracy  Precision  Recall  F1-Score  AUC-ROC  AUC-PR
No Rebalancing           0.758      0.667   0.066     0.119    0.768   0.514
Random Undersampling     0.656      0.386   0.639     0.481    0.695   0.463
Random Oversampling      0.725      0.469   0.738     0.573    0.778   0.517
SMOTE                    0.738      0.484   0.738     0.584    0.797   0.552
Cost Re-weighting        0.725      0.468   0.721     0.568    0.782   0.530


## Version 2: Con Feature Engineering y Limpieza de Datos

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# --- Load and clean data ---
df_dev = pd.read_csv("data/cell_diagnosis_dev.csv")
df_test = pd.read_csv("data/cell_diagnosis_test.csv")

# Remove outliers using IQR
def remove_outliers(df, columns):
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    mask = ~((df[columns] < (Q1 - 1.5 * IQR)) | (df[columns] > (Q3 + 1.5 * IQR))).any(axis=1)
    return df[mask]

num_columns = df_dev.select_dtypes(include=['float64', 'int64']).columns.tolist()
df_dev = remove_outliers(df_dev, num_columns)

# Correct invalid values
df_dev['OxygenSaturation'] = df_dev['OxygenSaturation'].clip(50, 100)
df_dev['MitosisRate'] = df_dev['MitosisRate'].clip(0, 20)
df_dev['CellSize'] = df_dev['CellSize'].clip(5, 200)

# Unify categorical values
category_corrections = {
    'CellType': {'Epthlial': 'Epithelial', 'Mesnchymal': 'Mesenchymal', '???': 'Unknown'},
    'GeneticMutation': {'Presnt': 'Present', 'Absnt': 'Absent', 'Error': 'Unknown'}
}
for col, mapping in category_corrections.items():
    if col in df_dev.columns:
        df_dev[col] = df_dev[col].replace(mapping)
    if col in df_test.columns:
        df_test[col] = df_test[col].replace(mapping)

# --- Split into train and val ---
X = df_dev.drop(columns=['Diagnosis'])
y = df_dev['Diagnosis']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# --- Preprocessing pipeline ---
imputer = SimpleImputer(strategy="mean")
encoder = OneHotEncoder(drop='first', sparse_output=False)
scaler = StandardScaler()

def preprocess(X_train, X_val):
    X_train_num = X_train.select_dtypes(include=['float64', 'int64'])
    X_val_num = X_val.select_dtypes(include=['float64', 'int64'])

    X_train_num_imputed = pd.DataFrame(imputer.fit_transform(X_train_num), columns=X_train_num.columns)
    X_val_num_imputed = pd.DataFrame(imputer.transform(X_val_num), columns=X_val_num.columns)

    X_train_cat = X_train.select_dtypes(include=['object']).fillna("Missing")
    X_val_cat = X_val.select_dtypes(include=['object']).fillna("Missing")

    X_train_cat_enc = pd.DataFrame(encoder.fit_transform(X_train_cat), columns=encoder.get_feature_names_out(X_train_cat.columns))
    X_val_cat_enc = pd.DataFrame(encoder.transform(X_val_cat), columns=encoder.get_feature_names_out(X_val_cat.columns))

    X_train_final = pd.concat([X_train_num_imputed, X_train_cat_enc], axis=1)
    X_val_final = pd.concat([X_val_num_imputed, X_val_cat_enc], axis=1)

    X_train_scaled = scaler.fit_transform(X_train_final)
    X_val_scaled = scaler.transform(X_val_final)

    return X_train_scaled, X_val_scaled

X_train_scaled, X_val_scaled = preprocess(X_train, X_val)

# --- Entrenamiento y evaluación ---
results = {}

def train_and_evaluate(X_tr, y_tr, name, class_weight=None):
    model = LogisticRegression(penalty='l2', solver='liblinear', class_weight=class_weight, random_state=42)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val_scaled)
    y_proba = model.predict_proba(X_val_scaled)[:, 1]
    results[name] = {
        "Accuracy": accuracy_score(y_val, y_pred),
        "Precision": precision_score(y_val, y_pred),
        "Recall": recall_score(y_val, y_pred),
        "F1-Score": f1_score(y_val, y_pred),
        "AUC-ROC": roc_auc_score(y_val, y_proba),
        "AUC-PR": average_precision_score(y_val, y_proba)
    }

# No rebalancing
train_and_evaluate(X_train_scaled, y_train, "No Rebalancing")

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_rus, y_rus, "Random Undersampling")

# Oversampling
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_ros, y_ros, "Random Oversampling")

# SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train_scaled, y_train)
train_and_evaluate(X_smote, y_smote, "SMOTE")

# Cost re-weighting
train_and_evaluate(X_train_scaled, y_train, "Cost Re-weighting", class_weight='balanced')

# --- Resultados ---
df_results = pd.DataFrame(results).T
print("\nValidation Results:")
print(df_results.round(3))


Validation Results:
                      Accuracy  Precision  Recall  F1-Score  AUC-ROC  AUC-PR
No Rebalancing           0.946      0.910   0.976     0.942    0.968   0.931
Random Undersampling     0.953      0.911   0.992     0.950    0.969   0.934
Random Oversampling      0.953      0.911   0.992     0.950    0.969   0.935
SMOTE                    0.949      0.910   0.984     0.946    0.967   0.931
Cost Re-weighting        0.949      0.910   0.984     0.946    0.968   0.933


### Test Set

In [4]:
# --- Preprocesamiento del test set ---
X_test = df_test.drop(columns=["Diagnosis"])
y_test = df_test["Diagnosis"]

X_test_num = X_test.select_dtypes(include=['float64', 'int64'])
X_test_cat = X_test.select_dtypes(include=['object']).fillna("Missing")

X_test_num_imputed = pd.DataFrame(imputer.transform(X_test_num), columns=X_test_num.columns)
X_test_cat_encoded = pd.DataFrame(encoder.transform(X_test_cat),
                                  columns=encoder.get_feature_names_out(X_test_cat.columns))

X_test_processed = pd.concat([X_test_num_imputed, X_test_cat_encoded], axis=1)
X_test_scaled = scaler.transform(X_test_processed)

# --- Función de evaluación en test ---
test_results = {}

def evaluate_on_test(X_train_bal, y_train_bal, method_name, class_weight=None):
    model = LogisticRegression(penalty='l2', solver='liblinear', class_weight=class_weight, random_state=42)
    model.fit(X_train_bal, y_train_bal)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    test_results[method_name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_proba),
        "AUC-PR": average_precision_score(y_test, y_proba)
    }

# 1. No rebalancing
evaluate_on_test(X_train_scaled, y_train, "No Rebalancing")

# 2. Random Undersampling
evaluate_on_test(X_rus, y_rus, "Random Undersampling")

# 3. Random Oversampling
evaluate_on_test(X_ros, y_ros, "Random Oversampling")

# 4. SMOTE
evaluate_on_test(X_smote, y_smote, "SMOTE")

# 5. Cost Re-weighting
evaluate_on_test(X_train_scaled, y_train, "Cost Re-weighting", class_weight='balanced')

# --- Mostrar resultados ---
df_test_results = pd.DataFrame(test_results).T
print("Test Set Evaluation Results:")
print(df_test_results.round(3))

Test Set Evaluation Results:
                      Accuracy  Precision  Recall  F1-Score  AUC-ROC  AUC-PR
No Rebalancing           0.897      0.872   0.904     0.888    0.832   0.742
Random Undersampling     0.886      0.869   0.880     0.874    0.814   0.723
Random Oversampling      0.897      0.872   0.904     0.888    0.833   0.744
SMOTE                    0.897      0.872   0.904     0.888    0.833   0.739
Cost Re-weighting        0.897      0.872   0.904     0.888    0.832   0.742
