In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# =======================
# 1. CARGA Y FEATURE ENGINEERING SIMPLE
# =======================
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
test_ids = test["PassengerId"]

full_data = pd.concat([train, test], sort=False)

# Simplificación de Title
full_data["Title"] = full_data["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)
full_data["Title"] = full_data["Title"].replace(['Mlle', 'Ms'], 'Miss')
full_data["Title"] = full_data["Title"].replace(['Mme'], 'Mrs')
full_data["Title"] = full_data["Title"].replace(
    ['Countess', 'Lady', 'Sir', 'Jonkheer', 'Don', 'Dona', 'Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Rare')
title_map = {"Mr": "Mr", "Mrs": "Mrs", "Miss": "Miss", "Master": "Master"}
full_data["Title"] = full_data["Title"].map(lambda x: title_map.get(x, "Rare"))

# Tamaño familiar e indicador de estar solo
full_data["FamilySize"] = full_data["SibSp"] + full_data["Parch"] + 1
full_data["IsAlone"] = (full_data["FamilySize"] == 1).astype(int)

# Tiene cabina
full_data["HasCabin"] = full_data["Cabin"].notnull().astype(int)

# Separar train/test
train = full_data[~full_data["Survived"].isna()]
test = full_data[full_data["Survived"].isna()].drop(columns=["Survived"])
X = train.drop(columns=["Survived", "Name", "Ticket", "PassengerId", "Cabin"])
y = train["Survived"]
X_test_final = test.drop(columns=["Name", "Ticket", "PassengerId", "Cabin"])

# =======================
# 2. PIPELINE DE PREPROCESAMIENTO
# =======================
cat_cols = ["Sex", "Embarked", "Title", "Pclass"]
num_cols = ["Age", "Fare", "FamilySize", "IsAlone", "HasCabin"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols)
    ]
)

# =======================
# 3. MODELO LIGHTGBM CONSERVADOR
# =======================
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=4,
    num_leaves=15,
    min_child_samples=30,
    random_state=42
)

clf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", lgbm)
])

# =======================
# 4. VALIDACIÓN CRUZADA
# =======================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=cv, scoring="accuracy")
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# =======================
# 5. ENTRENAMIENTO Y ENVÍO A KAGGLE
# =======================
clf.fit(X, y)
final_preds = clf.predict(X_test_final)

submission = pd.DataFrame({
    "PassengerId": test_ids,
    "Survived": final_preds.astype(int)
})
submission.to_csv("../data/submission_lightgbm_simplificado.csv", index=False)

  full_data["Title"] = full_data["Name"].str.extract(' ([A-Za-z]+)\.', expand=False)


[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 216
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 220
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 17
[LightGBM] [Info] [binary:BoostF



[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000396 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 220
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371




[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 222
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.384292 -> initscore=-0.471371
[LightGBM] [Info] Start training from score -0.471371




[LightGBM] [Info] Number of positive: 273, number of negative: 440
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 223
[LightGBM] [Info] Number of data points in the train set: 713, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382889 -> initscore=-0.477303
[LightGBM] [Info] Start training from score -0.477303




CV Accuracy: 0.8283 ± 0.0264
[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288








In [2]:
print(f"CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


CV Accuracy: 0.8283 ± 0.0264


In [3]:
full_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamilySize,IsAlone,HasCabin
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Mr,2,0,0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2,0,1
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Miss,1,1,0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Mrs,2,0,1
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Mr,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,Mr,1,1,0
414,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,Rare,1,1,1
415,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,Mr,1,1,0
416,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,Mr,1,1,0
