In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [None]:
# Read the two CSV splits from the data/ directory.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Stack the frames row-wise (train rows first, then test rows) so that later
# cells can preprocess both together and split them back apart by position.
data = pd.concat(objs=[train_data, test_data], axis=0, sort=False)

# Preview the first rows of the combined frame.
data.head()

In [None]:
# --- Title -------------------------------------------------------------------
# Pull the word that directly precedes a '.' out of the passenger name.
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse rare or variant titles into broader groups. Every title is listed in
# exactly one group: 'Rev' used to appear under both 'Royalty' and 'Officer',
# and because the 'Royalty' replacement ran first it could never become
# 'Officer'.
title_groups = {
    'Royalty': ['Lady', 'Countess', 'Don', 'Sir', 'Jonkheer', 'Dona'],
    'Officer': ['Capt', 'Col', 'Dr', 'Major', 'Rev'],
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
}
for group_name, raw_titles in title_groups.items():
    data['Title'] = data['Title'].replace(raw_titles, group_name)

# --- Family features ---------------------------------------------------------
# Siblings/spouses + parents/children + the passenger themselves.
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

# 1 when the passenger has no relatives recorded, 0 otherwise.
data['IsAlone'] = (data['FamilySize'] == 1).astype(int)

# --- Missing values ----------------------------------------------------------
# NOTE(review): the medians below are computed over train and test rows
# together; fit on the training rows only if leakage is a concern.
imputer = SimpleImputer(strategy='median')
data[['Age']] = imputer.fit_transform(data[['Age']])

# NOTE(review): 'S' is presumably the most frequent embarkation port — confirm.
data['Embarked'] = data['Embarked'].fillna('S')

# Same column-list form as 'Age' so the (n, 1) array returned by the imputer is
# written back into a one-column selection rather than into a single Series.
data[['Fare']] = imputer.fit_transform(data[['Fare']])

# Deck is the first character of the cabin string; rows without a cabin get
# the placeholder 'Unknown'.
data['Deck'] = data['Cabin'].str[0].fillna('Unknown')

# --- Drop columns that are not used as features ------------------------------
# Rebinding instead of inplace=True keeps the operation explicit.
data = data.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

data.head()

In [None]:
# Columns holding string categories that must become integer codes.
categorical_features = ['Sex', 'Embarked', 'Title', 'Deck']

# One dedicated encoder per column, kept in a dict keyed by column name so the
# fitted encoders remain available after this cell.
label_encoders = {feature: LabelEncoder() for feature in categorical_features}

# Fit each encoder on its column and overwrite the column with the codes.
for feature, encoder in label_encoders.items():
    data[feature] = encoder.fit_transform(data[feature])

data.head()


In [None]:
# Undo the earlier concatenation by position: the first len(train_data) rows
# came from train.csv, everything after them from test.csv.
n_train = len(train_data)
train = data.iloc[:n_train]
test = data.iloc[n_train:]

# Separate the target column from the feature columns.
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out 20% of the training rows for validation; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Gradient-boosted tree classifier. `xgb` is the module alias imported in the
# first cell, so this cell needs no import of its own.
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,      # upper bound on boosting rounds; early stopping may end sooner
    max_depth=5,
    learning_rate=0.01,
    colsample_bytree=0.7,
    subsample=0.8,
    random_state=42,
    # Spelled with the trailing "s". The previous `early_stopping_round` is not
    # a recognised parameter, so early stopping was never active and all 1000
    # rounds were trained.
    early_stopping_rounds=50,
    eval_metric='logloss',
    # `use_label_encoder` is intentionally not passed: it is deprecated in the
    # XGBoost releases that accept `early_stopping_rounds` in the constructor.
)

# The validation split is monitored for early stopping.
# NOTE(review): the same split is later used to report accuracy, so that
# figure is somewhat optimistic.
xgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=50,             # log the eval metric every 50 rounds instead of every round
)


In [None]:
# Predict labels for the held-out validation rows.
y_pred = xgb_model.predict(X_val)

# Share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")


In [None]:
# Predict on the test rows. 'Survived' is dropped so the feature columns match
# the ones the model was trained on. The target column is float after the
# train/test concatenation, so the predictions are cast to int to guarantee
# 0/1 integers in the submission file.
test_features = test.drop(columns='Survived')
test_predictions = xgb_model.predict(test_features).astype(int)

# Fail early if predictions and the ids read from test.csv do not line up.
assert len(test_predictions) == len(test_data), (
    f"{len(test_predictions)} predictions for {len(test_data)} test rows"
)

# Build the submission table: passenger ids come from the untouched test.csv
# frame because 'PassengerId' was dropped from the feature frame.
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': test_predictions
})

# Save to CSV without the index column.
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")


In [None]:
# Rebuild the submission table from the ids in test.csv and the predictions
# already held in `test_predictions`. This repeats the work of the previous
# cell and overwrites the same output file.
submission = test_data[['PassengerId']].copy()
submission['Survived'] = test_predictions

# Write the table to disk without the index column.
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")
