# In [None]:
# =======================
# IMPORTAR LIBRERÍAS
# =======================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# =======================
# LOAD DATASETS (KAGGLE PATH)
# =======================
train_path = '/kaggle/input/udea-ai-4-eng-20251-pruebas-saber-pro-colombia/train.csv'
test_path = '/kaggle/input/udea-ai-4-eng-20251-pruebas-saber-pro-colombia/test.csv'

# Parse train.csv exactly once; reading it a second time only doubled the
# load time and the memory held by the notebook.
train_df = pd.read_csv(train_path)

# `z` is kept as an alias of the training frame (not a second copy) so any
# code that still refers to it keeps working.
z = train_df
print("✅ Dataset cargado:", z.shape)

test_df = pd.read_csv(test_path)

# =======================
# FEATURE AND TARGET SELECTION
# =======================
# Column order is significant: it determines the column order of the design
# matrix built later on, so the entries must stay in this sequence.
features = [
    'FAMI_EDUCACIONMADRE',
    'ESTU_VALORMATRICULAUNIVERSIDAD',
    'FAMI_ESTRATOVIVIENDA',
    'FAMI_TIENEINTERNET',
    'FAMI_TIENECOMPUTADOR',
    'FAMI_EDUCACIONPADRE',
    'ESTU_HORASSEMANATRABAJA',
    'FAMI_TIENELAVADORA',
    'FAMI_TIENEAUTOMOVIL',
    'ESTU_PAGOMATRICULAPROPIO',
    # coef_1 .. coef_4
    *(f'coef_{idx}' for idx in range(1, 5)),
]

# Name of the label column to predict.
target = 'RENDIMIENTO_GLOBAL'

# =======================
# TUITION MAPPING
# =======================
# Converts each tuition bracket label into a single number so the column can
# be used as a numeric feature. Entries are listed from the lowest code to
# the highest; -1 is the code for the 'no info' placeholder.
matricula_map = {
    'no info': -1,
    'No pagó matrícula': 0,
    'Menos de 500 mil': 0.25,
    'Entre 500 mil y menos de 1 millón': 0.75,
    'Entre 1 millón y menos de 2.5 millones': 1.75,
    'Entre 2.5 millones y menos de 4 millones': 3.25,
    'Entre 4 millones y menos de 5.5 millones': 4.75,
    'Entre 5.5 millones y menos de 7 millones': 6.25,
    'Más de 7 millones': 7.75,
}

# =======================
# PREPROCESSING (TRAIN AND TEST)
# =======================
def _prepare_features(frame, columns):
    """Return a cleaned copy of ``frame[columns]`` ready for one-hot encoding.

    Both the training and the test frame go through this single function so
    the two splits cannot drift apart.

    Args:
        frame: Raw DataFrame as read from the CSV file.
        columns: Column names to keep, in the desired order.

    Returns:
        A new DataFrame in which missing values are replaced by the string
        'no info' and the tuition bracket column holds the numeric codes from
        ``matricula_map``.
    """
    # NOTE(review): fillna is applied to every selected column, numeric ones
    # included; a missing value in a numeric column would turn that column
    # into strings and get it one-hot encoded. Confirm coef_* are never empty.
    cleaned = frame[columns].copy().fillna('no info')

    # Series.map yields NaN for any label that is not a key of the dict.
    # Fall back to the 'no info' code so an unexpected bracket label does not
    # inject NaN into the feature matrix handed to the estimators.
    cleaned['ESTU_VALORMATRICULAUNIVERSIDAD'] = (
        cleaned['ESTU_VALORMATRICULAUNIVERSIDAD']
        .map(matricula_map)
        .fillna(matricula_map['no info'])
    )
    return cleaned


# --- Train: features plus the label, which is encoded as ordered integers ---
df = _prepare_features(train_df, features + [target])
df[target] = df[target].map({'bajo': 0, 'medio-bajo': 1, 'medio-alto': 2, 'alto': 3})
df = pd.get_dummies(df)

# --- Test: keep the IDs aside for the submission file, then same cleaning ---
test_ids = test_df['ID']
test_df_proc = pd.get_dummies(_prepare_features(test_df, features))

# =======================
# FEATURE ALIGNMENT
# =======================
# Split the encoded training frame into label vector and design matrix.
y_train = df[target]
X_train = df.drop(columns=[target])

# Force the test matrix onto the training columns, in the same order: dummy
# columns that only exist in train are added filled with 0, and columns that
# only exist in test are dropped.
train_columns = X_train.columns
X_test = test_df_proc.reindex(columns=train_columns, fill_value=0)

# =======================
# MODEL: STACKING CLASSIFIER
# =======================
# Base learners whose predictions feed the meta-learner.
# `use_label_encoder` is intentionally not passed to XGBClassifier: the
# argument is deprecated in XGBoost and recent releases only warn that it is
# unused. The labels are already integer-coded, so no encoder is needed.
estimators = [
    ('xgb', XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=42,
    )),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
]

# Logistic regression combines the base learners. passthrough=True also hands
# it the original features, and cv=3 sets the internal folds used to produce
# the base-learner predictions it is trained on.
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(max_iter=1000),
    passthrough=True,
    cv=3,
    n_jobs=-1,
)

# Fit on the full training set; this fitted model produces the final predictions.
stack_model.fit(X_train, y_train)

# =======================
# CROSS-VALIDATION
# =======================
# 5-fold accuracy estimate for the stacking model on the training data.
scores = cross_val_score(
    stack_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
mean_accuracy = scores.mean()
print(f"✅ Accuracy promedio con Stacking: {mean_accuracy:.4f}")

# =======================
# FINAL PREDICTION
# =======================
# Integer class codes predicted for the aligned test matrix.
y_pred = stack_model.predict(X_test)

# Translate the integer codes back to the original label strings.
inv_map = dict(enumerate(['bajo', 'medio-bajo', 'medio-alto', 'alto']))
y_pred_labels = list(map(inv_map.__getitem__, y_pred))

# Two-column submission: the test IDs next to their predicted label.
submission = pd.DataFrame({'ID': test_ids})
submission['RENDIMIENTO_GLOBAL'] = y_pred_labels

submission.to_csv('submission.csv', index=False)
print("✅ Archivo submission.csv generado correctamente.")
print(submission.head())
