In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# =======================
# 1. CARGA Y FEATURE ENGINEERING
# =======================
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
test_ids = test["PassengerId"]

full_data = pd.concat([train, test], sort=False)

# Title simplificado
full_data["Title"] = full_data["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)
full_data["Title"] = full_data["Title"].replace(['Mlle', 'Ms'], 'Miss')
full_data["Title"] = full_data["Title"].replace(['Mme'], 'Mrs')
full_data["Title"] = full_data["Title"].replace(
    ['Countess', 'Lady', 'Sir', 'Jonkheer', 'Don', 'Dona', 'Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Rare')

# Tamaño familiar
full_data["FamilySize"] = full_data["SibSp"] + full_data["Parch"] + 1
full_data["IsAlone"] = (full_data["FamilySize"] == 1).astype(int)

# Cabina
full_data["CabinInitial"] = full_data["Cabin"].str[0].fillna('U')
full_data["HasCabin"] = full_data["Cabin"].notnull().astype(int)

# Interacciones y derivados
full_data["FarePerPerson"] = full_data["Fare"] / full_data["FamilySize"]
full_data["Age*Class"] = full_data["Age"] * full_data["Pclass"]
full_data["TicketPrefix"] = full_data["Ticket"].str.extract(r'([A-Za-z]+)', expand=False).fillna("NONE")

# Separar nuevamente
train = full_data[~full_data["Survived"].isna()]
test = full_data[full_data["Survived"].isna()].drop(columns=["Survived"])
X = train.drop(columns=["Survived", "Name", "Ticket", "PassengerId", "Cabin"])
y = train["Survived"]
X_test_final = test.drop(columns=["Name", "Ticket", "PassengerId", "Cabin"])

# =======================
# 2. PIPELINE DE PREPROCESAMIENTO
# =======================
cat_cols = ["Sex", "Embarked", "Title", "CabinInitial", "Pclass", "TicketPrefix"]
num_cols = ["Age", "Fare", "FamilySize", "IsAlone", "FarePerPerson", "Age*Class", "HasCabin"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# =======================
# 3. MODELOS BASE Y STACKING
# =======================
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=25,
    min_child_samples=20,
    random_state=42
)

lr = LogisticRegression(max_iter=1000)

stacked_model = StackingClassifier(
    estimators=[("rf", rf), ("lgbm", lgbm)],
    final_estimator=lr,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", stacked_model)
])

# =======================
# 4. VALIDACIÓN CRUZADA ROBUSTA
# =======================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# =======================
# 5. ENTRENAMIENTO FINAL Y PREDICCIÓN
# =======================
clf.fit(X, y)
final_preds = clf.predict(X_test_final)

submission = pd.DataFrame({
    "PassengerId": test_ids,
    "Survived": final_preds.astype(int)
})
submission.to_csv("../data/submission_stacking.csv", index=False)

In [5]:
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

CV Accuracy: 0.8271 ± 0.0145


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# =======================
# 1. CARGA Y FEATURE ENGINEERING
# =======================
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
test_ids = test["PassengerId"]

full_data = pd.concat([train, test], sort=False)

# Título simplificado
full_data["Title"] = full_data["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)
full_data["Title"] = full_data["Title"].replace(['Mlle', 'Ms'], 'Miss')
full_data["Title"] = full_data["Title"].replace(['Mme'], 'Mrs')
full_data["Title"] = full_data["Title"].replace(
    ['Countess', 'Lady', 'Sir', 'Jonkheer', 'Don', 'Dona', 'Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Rare')
title_map = {"Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master"}
full_data["Title"] = full_data["Title"].map(lambda x: title_map.get(x, "Rare"))

# Family size
full_data["FamilySize"] = full_data["SibSp"] + full_data["Parch"] + 1
full_data["IsAlone"] = (full_data["FamilySize"] == 1).astype(int)

# Fare por persona
full_data["FarePerPerson"] = full_data["Fare"] / full_data["FamilySize"].replace(0, 1)
full_data["FarePerPerson"] = full_data["FarePerPerson"].fillna(full_data["FarePerPerson"].median())

# Interacción edad y clase
full_data["Age*Class"] = full_data["Age"] * full_data["Pclass"]

# Tiene cabina
full_data["HasCabin"] = full_data["Cabin"].notnull().astype(int)

# Separar train y test
train = full_data[~full_data["Survived"].isna()]
test = full_data[full_data["Survived"].isna()].drop(columns=["Survived"])
X = train.drop(columns=["Survived", "Name", "Ticket", "PassengerId", "Cabin"])
y = train["Survived"]
X_test_final = test.drop(columns=["Name", "Ticket", "PassengerId", "Cabin"])

# =======================
# 2. PIPELINE DE PREPROCESAMIENTO
# =======================
cat_cols = ["Sex", "Embarked", "Title", "Pclass"]
num_cols = ["Age", "Fare", "FamilySize", "IsAlone", "FarePerPerson", "Age*Class", "HasCabin"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# =======================
# 3. STACKING MODEL
# =======================
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42
)

lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=25,
    min_child_samples=20,
    random_state=42
)

lr = LogisticRegression(max_iter=1000)

stacked_model = StackingClassifier(
    estimators=[("rf", rf), ("lgbm", lgbm)],
    final_estimator=lr,
    passthrough=True,
    cv=5,
    n_jobs=-1
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", stacked_model)
])

# =======================
# 4. VALIDACIÓN CRUZADA ROBUSTA
# =======================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# =======================
# 5. ENTRENAMIENTO FINAL Y PREDICCIÓN
# =======================
clf.fit(X, y)
final_preds = clf.predict(X_test_final)

submission = pd.DataFrame({
    "PassengerId": test_ids,
    "Survived": final_preds.astype(int)
})
submission.to_csv("../data/submission_stacking_cleaned.csv", index=False)

  full_data["Title"] = full_data["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000542 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 424
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 218, number of negative: 351
[LightGBM] [Info] Number of positive: 218, number of negative: 351
[LightGBM] [Info] Number of positive: 218, number of negative: 352
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train 

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 421
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] Number of positive: 220, number of negative: 351
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] [LightGBM] [Info] Number of positive: 219, number of negative: 352
Number of positive: 219, number of negative: 351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001707 seconds.
You can se

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 434
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 388
[LightGBM] [Info] Number of data points in the train set: 570, number of used features: 19
[LightGBM] [Info] Number of posi

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 429
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] Number of positive: 220, number of negative: 351
[LightGBM] [Info] Number of positive: 219, number of negative: 351
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is no

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


[LightGBM] [Info] Number of positive: 273, number of negative: 440
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 426
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303
[LightGBM] [Info] Number of positive: 218, number of negative: 352
[LightGBM] [Info] Number of positive: 219, number of negative: 352
[LightGBM] [Info] Number of positive: 218, number of negative: 352
[LightGBM] [Info] Number of positive: 218, number of negative: 352
[LightGBM] [Info] Number of positive: 219, number of negative: 352
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001377 seconds.
You can se

  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


[LightGBM] [Info] Number of positive: 273, number of negative: 440
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 431
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001394 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 427
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_c





  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


In [None]:
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
#fue peor

CV Accuracy: 0.8339 ± 0.0114
