In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import xgboost as xgb

In [2]:
print("Loading Titanic dataset...")
train = pd.read_csv('/content/sample_data/Titanic_train.csv')
test = pd.read_csv('/content/sample_data/Titanic_test.csv')

Loading Titanic dataset...


In [3]:
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("\nMissing values:")
print(train.isnull().sum())

Train shape: (891, 12)
Test shape: (418, 11)

Missing values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
def preprocess_data(df, is_train=True):
    df = df.copy()

    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna('S', inplace=True)
    df['Fare'].fillna(df['Fare'].median(), inplace=True)

    df.drop(['Cabin', 'Name', 'Ticket', 'PassengerId'], axis=1, inplace=True)

    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df.drop(['SibSp', 'Parch'], axis=1, inplace=True)

    le_sex = LabelEncoder()
    le_embarked = LabelEncoder()

    df['Sex'] = le_sex.fit_transform(df['Sex'])
    df['Embarked'] = le_embarked.fit_transform(df['Embarked'])

    if not is_train:
        return df, le_sex, le_embarked

    return df, None, None

In [5]:
print("\nPreprocessing data...")
X_full = preprocess_data(train)[0]
y = train['Survived']
X = X_full.drop('Survived', axis=1)


Preprocessing data...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
print("Features:", X.columns.tolist())
print("Target distribution:")
print(y.value_counts(normalize=True))

Features: ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'IsAlone']
Target distribution:
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [8]:
print("\nTraining models...")
lgb_model = lgb.LGBMClassifier(random_state=42, n_estimators=100, verbose=-1)
xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=100, eval_metric='logloss')


Training models...


In [9]:
lgb_cv = cross_val_score(lgb_model, X, y, cv=5, scoring='accuracy')
xgb_cv = cross_val_score(xgb_model, X, y, cv=5, scoring='accuracy')

In [10]:
print(f"LGBM CV Accuracy: {lgb_cv.mean():.3f} (+/- {lgb_cv.std()*2:.3f})")
print(f"XGB CV Accuracy: {xgb_cv.mean():.3f} (+/- {xgb_cv.std()*2:.3f})")

LGBM CV Accuracy: 0.831 (+/- 0.058)
XGB CV Accuracy: 0.817 (+/- 0.042)


In [11]:
lgb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

In [12]:
lgb_pred_val = lgb_model.predict(X_val)
xgb_pred_val = xgb_model.predict(X_val)

In [13]:
print(f"\nLGBM Validation Accuracy: {accuracy_score(y_val, lgb_pred_val):.3f}")
print(f"XGB Validation Accuracy: {accuracy_score(y_val, xgb_pred_val):.3f}")


LGBM Validation Accuracy: 0.788
XGB Validation Accuracy: 0.799


In [14]:
print("\nLGBM Classification Report:")
print(classification_report(y_val, lgb_pred_val))
print("XGB Classification Report:")
print(classification_report(y_val, xgb_pred_val))


LGBM Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       110
           1       0.74      0.70      0.72        69

    accuracy                           0.79       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179

XGB Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       110
           1       0.75      0.72      0.74        69

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



In [15]:
X_test, _, _ = preprocess_data(test, is_train=False)
lgb_test_pred = lgb_model.predict(X_test)
xgb_test_pred = xgb_model.predict(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [16]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived_LGBM': lgb_test_pred,
    'Survived_XGB': xgb_test_pred
})

In [17]:
print("\nTest predictions sample:")
print(submission.head(10))


Test predictions sample:
   PassengerId  Survived_LGBM  Survived_XGB
0          892              0             0
1          893              0             0
2          894              0             0
3          895              0             0
4          896              1             1
5          897              0             0
6          898              0             0
7          899              0             0
8          900              1             1
9          901              0             0


In [18]:
print("\nFeature Importance (LGBM):")
feature_imp = pd.DataFrame({
    'feature': X.columns,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_imp.head(8))


Feature Importance (LGBM):
      feature  importance
3        Fare        1248
2         Age         973
5  FamilySize         147
4    Embarked         122
1         Sex         112
0      Pclass          94
6     IsAlone          14


In [19]:
print("\n" + "="*50)
print("ANALYSIS SUMMARY")
print("="*50)
print(f"Dataset: {train.shape[0]} train, {test.shape[0]} test samples")
print(f"Best Model: LGBM (Val Acc: {accuracy_score(y_val, lgb_pred_val):.3f})")
print(f"Survival Rate: {y.mean():.1%}")
print("Top Features: Sex, Pclass, Age, FamilySize, Fare")
print("Ready for Kaggle submission!")


ANALYSIS SUMMARY
Dataset: 891 train, 418 test samples
Best Model: LGBM (Val Acc: 0.788)
Survival Rate: 38.4%
Top Features: Sex, Pclass, Age, FamilySize, Fare
Ready for Kaggle submission!


In [20]:
submission.to_csv('titanic_lgbm_xgb_predictions.csv', index=False)
print("\nSubmission file saved: titanic_lgbm_xgb_predictions.csv")


Submission file saved: titanic_lgbm_xgb_predictions.csv
