# Titanic - XGBoost & LightGBM

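Сравнение моделей градиентного бустинга (XGBoost и LightGBM) с базовой моделью RandomForest на данных Titanic: точность (accuracy) оценивается по 5-кратной кросс-валидации.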

In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

## 1. Load & Preprocess

In [2]:
# Read the train and test splits from the local data/ directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Quick shape check of both splits.
print(f"Train: {train.shape}, Test: {test.shape}")

Train: (891, 12), Test: (418, 11)


In [3]:
def preprocess(df, reference=None):
    """Clean a raw Titanic frame and add Title and family-size features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns Name, Sex, Age, SibSp, Parch, Fare
        and Embarked. It is not modified; a processed copy is returned.
    reference : pandas.DataFrame, optional
        Frame whose Age/Fare medians and Embarked mode are used to fill
        missing values in ``df``. Defaults to ``df`` itself. Pass the training
        frame when preprocessing the test split so both splits are imputed
        with the same statistics.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing Age/Fare/Embarked filled, Sex and Embarked
        mapped to numeric codes, and Title (int code), FamilySize and IsAlone
        columns added.
    """
    result = df.copy()
    stats_source = df if reference is None else reference

    # Fill missing values
    result['Age'] = result['Age'].fillna(stats_source['Age'].median())
    result['Fare'] = result['Fare'].fillna(stats_source['Fare'].median())
    embarked_mode = stats_source['Embarked'].mode()
    # mode() is empty when the column is entirely missing; skip the fill then
    # instead of failing on the lookup of its first element.
    if not embarked_mode.empty:
        result['Embarked'] = result['Embarked'].fillna(embarked_mode.iloc[0])

    # Title: honorific between a space and a period in Name, collapsed to
    # five integer codes (spelling variants share the code of their canonical
    # form, uncommon honorifics share the Rare code).
    rare_code = 4
    title_codes = {
        'Mr': 0,
        'Miss': 1, 'Mlle': 1, 'Ms': 1,
        'Mrs': 2, 'Mme': 2,
        'Master': 3,
    }
    for rare_title in ('Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                       'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'):
        title_codes[rare_title] = rare_code
    titles = result['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Unrecognised or missing titles go to the Rare bucket rather than being
    # silently counted as 'Mr'; the cast keeps the column integer-typed.
    result['Title'] = titles.map(title_codes).fillna(rare_code).astype(int)

    # Family
    result['FamilySize'] = result['SibSp'] + result['Parch'] + 1
    result['IsAlone'] = (result['FamilySize'] == 1).astype(int)

    # Encode
    result['Sex'] = result['Sex'].map({'male': 0, 'female': 1})
    result['Embarked'] = result['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    return result

In [4]:
# Run both splits through the same preprocessing function.
train_processed, test_processed = preprocess(train), preprocess(test)

# Columns fed to the models: raw passenger fields plus the engineered
# Title / FamilySize / IsAlone features.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    'Title', 'FamilySize', 'IsAlone',
]

# Design matrices for both splits, and the training target.
X = train_processed.loc[:, features]
X_test = test_processed.loc[:, features]
y = train_processed['Survived']

print(f"X: {X.shape}, y: {y.shape}, X_test: {X_test.shape}")

X: (891, 10), y: (891,), X_test: (418, 10)


## 2. Compare Models

In [5]:
# Candidate estimators keyed by display name; each uses a fixed random_state.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    XGBoost=XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                          random_state=42, verbosity=0),
    LightGBM=LGBMClassifier(n_estimators=100, max_depth=3, learning_rate=0.1,
                            random_state=42, verbosity=-1),
)

# Score every candidate with 5-fold cross-validated accuracy, printing each
# result as it finishes and collecting one summary record per model.
results = []
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std():.4f})")
    results.append(dict(model=name, cv_mean=scores.mean(), cv_std=scores.std()))

# Summary table, best mean accuracy first.
results_df = pd.DataFrame.from_records(results).sort_values(by='cv_mean', ascending=False)
results_df

RandomForest: 0.8283 (+/- 0.0163)
XGBoost: 0.8361 (+/- 0.0213)
LightGBM: 0.8350 (+/- 0.0256)


Unnamed: 0,model,cv_mean,cv_std
1,XGBoost,0.83615,0.021348
2,LightGBM,0.835039,0.02561
0,RandomForest,0.828259,0.016341


## 3. Train Best Model & Submit

In [6]:
# Refit XGBoost on the full training set. The estimator is cloned from the
# `models` dict used for cross-validation, so the submitted model always has
# exactly the hyperparameters that were evaluated (no duplicated settings
# that can drift apart).
best_model = clone(models['XGBoost'])
best_model.fit(X, y)

# Feature importance, most influential feature first
pd.DataFrame({
    'feature': features,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
7,Title,0.512078
0,Pclass,0.180605
8,FamilySize,0.083614
5,Fare,0.053671
6,Embarked,0.042058
1,Sex,0.039422
2,Age,0.036686
3,SibSp,0.027251
4,Parch,0.024615
9,IsAlone,0.0


In [7]:
# Predicted labels for the test split from the fitted best model.
predictions = best_model.predict(X_test)

# Two-column submission frame: passenger id plus the predicted Survived label.
submission = test[['PassengerId']].assign(Survived=predictions)

# Write without the index column, then preview the first rows.
submission.to_csv('submission_xgb.csv', index=False)
print(f"Submission saved: {submission.shape}")
submission.head()

Submission saved: (418, 2)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


## 4. Try LightGBM too

In [8]:
# Refit LightGBM on the full training set. Cloning the entry from `models`
# reuses the exact hyperparameters that were cross-validated instead of
# repeating them here.
lgbm_model = clone(models['LightGBM'])
lgbm_model.fit(X, y)

predictions_lgbm = lgbm_model.predict(X_test)

# Two-column submission frame: passenger id plus the predicted Survived label.
submission_lgbm = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions_lgbm
})

submission_lgbm.to_csv('submission_lgbm.csv', index=False)
print(f"LightGBM submission saved: {submission_lgbm.shape}")

LightGBM submission saved: (418, 2)
