In [172]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [173]:
# Columns that are not used as model inputs. PassengerId is kept in the test
# frame because the submission file written later is keyed on it.
train_unused_columns = ["Name", "PassengerId", "Ticket", "Cabin"]
test_unused_columns = ["Name", "Ticket", "Cabin"]

# Read each competition file and immediately strip the unused columns.
train = pd.read_csv("/kaggle/input/titanic/train.csv").drop(columns=train_unused_columns)
test = pd.read_csv("/kaggle/input/titanic/test.csv").drop(columns=test_unused_columns)
gender_submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")

In [174]:
from sklearn.preprocessing import LabelEncoder

# --- Missing values ---------------------------------------------------------
# Training rows without a port of embarkation are discarded.
train = train.dropna(subset=["Embarked"])

# Test rows are never dropped: the submission written later needs one
# prediction per test passenger. A missing port is filled with the most
# frequent port seen in training; a missing fare is filled with the mean
# test fare.
test = test.assign(
    Embarked=test["Embarked"].fillna(train["Embarked"].mode()[0]),
    Fare=test["Fare"].fillna(test["Fare"].mean()),
)

# Work on copies so the cleaned-but-unencoded frames stay available.
ctrain = train.copy()
ctest = test.copy()

# --- Categorical encoding ---------------------------------------------------
# Fit each encoder on the training column only and reuse it for the test
# column, so a given category always maps to the same integer in both frames.
# `transform` raises ValueError if the test set contains a category that was
# never seen in training, instead of silently assigning it a shifted code.
for col in ["Sex", "Embarked"]:
    le = LabelEncoder()
    ctrain[col] = le.fit_transform(ctrain[col].astype(str))
    ctest[col] = le.transform(ctest[col].astype(str))

### Filling in missing (NaN) values for Age

In [175]:
from sklearn.linear_model import LinearRegression
import random

features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked", "Fare"]


def impute_age(frame, model, feature_cols):
    """Return a copy of `frame` with missing "Age" values predicted by `model`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an "Age" column and every column in `feature_cols`.
    model : fitted regressor exposing `predict`
        Used to estimate Age from `feature_cols`.
    feature_cols : list of str
        Predictor columns passed to `model.predict`.

    Returns
    -------
    pandas.DataFrame
        A new frame; rows that already had an Age are left untouched.
    """
    filled = frame.copy()
    missing = filled["Age"].isnull()
    # Nothing to impute: skip predict(), which rejects an empty input.
    if not missing.any():
        return filled

    estimates = model.predict(filled.loc[missing, feature_cols])
    # A linear model can predict a negative age. Such values are mapped to
    # half their magnitude (-p / 2); non-negative predictions are kept as-is.
    # This equals the earlier np.maximum(p, -p + p / 2) expression.
    estimates = np.where(estimates < 0, -estimates / 2, estimates)
    filled.loc[missing, "Age"] = estimates
    return filled


# Fit the age regressor once, on training rows whose Age is known, and use
# that single model for both frames so train and test ages are imputed the
# same way.
# NOTE(review): this assumes "Sex"/"Embarked" are encoded with the same
# integer codes in ctrain and ctest -- confirm against the encoding cell.
age_df = ctrain[ctrain["Age"].notnull()]
lr = LinearRegression()
lr.fit(age_df[features], age_df["Age"])

ctrain = impute_age(ctrain, lr, features)
ctest = impute_age(ctest, lr, features)

### Removing outliers in train

In [176]:
# Keep only training rows whose value lies within 1.5 * IQR of the quartiles
# (bounds inclusive) for each listed column.
for column in ("Age",):
    first_quartile, third_quartile = ctrain[column].quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    within_fences = ctrain[column].between(
        first_quartile - 1.5 * spread,
        third_quartile + 1.5 * spread,
    )
    ctrain = ctrain[within_fences]

ctrain

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1,0,7.2500,2
1,1,1,0,38.000000,1,0,71.2833,0
2,1,3,0,26.000000,0,0,7.9250,2
3,1,1,0,35.000000,1,0,53.1000,2
4,0,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,2
887,1,1,0,19.000000,0,0,30.0000,2
888,0,3,0,19.660027,1,2,23.4500,2
889,1,1,1,26.000000,0,0,30.0000,0


### Model training with fixed, hand-picked parameters

In [183]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Hand-picked hyperparameters for the baseline classifier.
xgb_settings = {
    "n_estimators": 100,              # number of boosting rounds
    "max_depth": 4,                   # tree depth, controls complexity
    "learning_rate": 0.1,             # shrinkage applied to each round
    "subsample": 0.8,                 # fraction of rows sampled per tree
    "colsample_bytree": 0.8,          # fraction of features sampled per tree
    "objective": 'binary:logistic',
    "eval_metric": 'logloss',         # metric reported on the eval set
    "use_label_encoder": False,       # suppress label encoder warning
    "random_state": 42,
}
model = xgb.XGBClassifier(**xgb_settings)

# Split the cleaned training data into a fitting part and a holdout part.
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Embarked", "Fare"]
predictors = ctrain[features]
target = ctrain["Survived"]
X_train, X_test, y_train, y_test = train_test_split(predictors, target, random_state=42)

# The holdout part is used as the evaluation set for early stopping.
holdout = [(X_test, y_test)]
model.fit(X_train, y_train, eval_set=holdout, early_stopping_rounds=10, verbose=True)

# Predict the test passengers and write the submission file.
preds = model.predict(ctest[features])
ctest["Survived"] = preds

submission = ctest[["PassengerId", "Survived"]]
submission.to_csv("submission.csv", index=False)

[0]	validation_0-logloss:0.65531
[1]	validation_0-logloss:0.62717
[2]	validation_0-logloss:0.60348
[3]	validation_0-logloss:0.59248
[4]	validation_0-logloss:0.58331
[5]	validation_0-logloss:0.57435
[6]	validation_0-logloss:0.55352
[7]	validation_0-logloss:0.54721
[8]	validation_0-logloss:0.54119
[9]	validation_0-logloss:0.53407
[10]	validation_0-logloss:0.53047
[11]	validation_0-logloss:0.51731
[12]	validation_0-logloss:0.50465
[13]	validation_0-logloss:0.49573
[14]	validation_0-logloss:0.49364
[15]	validation_0-logloss:0.49288
[16]	validation_0-logloss:0.48742
[17]	validation_0-logloss:0.47982
[18]	validation_0-logloss:0.47809
[19]	validation_0-logloss:0.47575
[20]	validation_0-logloss:0.47125
[21]	validation_0-logloss:0.46834
[22]	validation_0-logloss:0.46570
[23]	validation_0-logloss:0.46409
[24]	validation_0-logloss:0.46226
[25]	validation_0-logloss:0.45853
[26]	validation_0-logloss:0.45479
[27]	validation_0-logloss:0.45238
[28]	validation_0-logloss:0.45122
[29]	validation_0-loglos



### Model training with grid search

In [179]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 * 4 * 3 * 3 * 3 = 324 hyperparameter combinations.
param_grid = dict(
    max_depth=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2, 0.5],
    n_estimators=[50, 100, 200],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Exhaustive 5-fold search over the grid, scored by F1, using all CPU cores.
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Take the best estimator found by the search and use it for the test set.
best_model = grid_search.best_estimator_
preds = best_model.predict(ctest[features])
ctest["Survived"] = preds

# Write the submission file (same path as the baseline cell uses).
ctest.loc[:, ["PassengerId", "Survived"]].to_csv("submission.csv", index=False)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
