In [1]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install xgboost catboost scikit-learn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [3]:
# Read the Titanic train/test CSV files from the mounted Drive folder.
train_data = pd.read_csv('/content/drive/MyDrive/titanic/train.csv')
test_data = pd.read_csv('/content/drive/MyDrive/titanic/test.csv')

In [4]:
# read_csv already returns DataFrames; take explicit copies so the working
# frames are independent of the raw loaded data.
train_df = train_data.copy()
test_df = test_data.copy()

In [5]:
# (rows, columns) of each split, one per line.
print(train_df.shape, test_df.shape, sep='\n')

(891, 12)
(418, 11)


In [6]:
# Columns present in the training frame but absent from the test frame.
set(train_df.columns).difference(test_df.columns)

{'Survived'}

In [7]:
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" line).
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


In [8]:
# Preview the first rows; as the cell's last expression the frame is rendered
# as a table instead of a wrapped plain-text dump.
train_df.head(5)

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [9]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
from sklearn.preprocessing import LabelEncoder

def clean(df):
    """Return a cleaned, fully numeric copy of a Titanic passenger frame.

    Steps, applied to a copy so the caller's frame is left untouched:

    1. drop ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId``;
    2. fill gaps in int64/float64 columns with the column median;
    3. fill gaps in object columns with the most frequent value;
    4. label-encode every object column.
    """
    cleaned = df.copy().drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'])

    # Numeric gaps -> column median.
    for name in cleaned.select_dtypes(include=['int64', 'float64']).columns:
        cleaned[name] = cleaned[name].fillna(cleaned[name].median())

    # Categorical gaps -> most frequent value (only touched when something is missing).
    text_cols = cleaned.select_dtypes(include=['object']).columns
    for name in text_cols:
        if cleaned[name].isnull().any():
            cleaned[name] = cleaned[name].fillna(cleaned[name].mode()[0])

    # One encoder instance, refit on each object column in turn.
    encoder = LabelEncoder()
    for name in text_cols:
        cleaned[name] = encoder.fit_transform(cleaned[name])

    return cleaned


In [11]:
# ====== CLEANING & FEATURE ENGINEERING ======
def clean_and_engineer(df, reference=None):
    """Clean a raw Titanic passenger frame and derive model features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with at least the columns ``PassengerId``, ``Ticket``,
        ``Cabin``, ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Fare``
        and ``Embarked``. It is copied, never modified.
    reference : pandas.DataFrame, optional
        Frame that supplies the imputation statistics (Age/Fare median,
        Embarked mode) and the Fare quartile edges. Pass the training frame
        when transforming another split so both are imputed and binned with
        the same values. Defaults to ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``PassengerId``, ``Ticket``, ``Cabin`` and
        ``Name``; with ``Age``/``Fare``/``Embarked`` imputed, ``Sex`` and
        ``Embarked`` integer-encoded, and the extra columns ``Title``,
        ``FamilySize``, ``IsAlone``, ``AgeBin`` and ``FareBin``.
    """
    df = df.copy()
    # Source of medians / mode / quartiles; still holds every raw column.
    stats = df if reference is None else reference

    # Drop irrelevant columns
    df = df.drop(columns=['PassengerId', 'Ticket', 'Cabin'])

    # Fill missing numeric values
    df['Age'] = df['Age'].fillna(stats['Age'].median())
    df['Fare'] = df['Fare'].fillna(stats['Fare'].median())

    # Fill missing categorical values
    df['Embarked'] = df['Embarked'].fillna(stats['Embarked'].mode()[0])

    # Title: the word directly before the first "." in the name. Everything
    # that is not one of the common titles below (Lady, Countess, Capt, Col,
    # Don, Dr, Major, Rev, Sir, Jonkheer, Dona, an unseen title, or a name with
    # no title at all) is grouped as "Rare" instead of failing the int cast.
    title_mapping = {'Master': 0, 'Miss': 1, 'Ms': 1, 'Mme': 1, 'Mlle': 1, 'Mrs': 2, 'Mr': 3}
    rare_title = 4
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = titles.map(title_mapping).fillna(rare_title).astype(int)

    # Family features
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Age bins; the last bin is open-ended so ages above 80 are not left as NaN.
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, float('inf')], labels=False)

    # Fare quartile bins.
    if reference is None:
        df['FareBin'] = pd.qcut(df['Fare'], 4, labels=False)
    else:
        # Reuse the reference frame's quartile edges so the bin numbers mean
        # the same fare ranges in every split; the outer edges are opened so
        # fares outside the reference range still land in the first/last bin.
        ref_fare = reference['Fare'].fillna(reference['Fare'].median())
        _, edges = pd.qcut(ref_fare, 4, labels=False, retbins=True)
        edges = [float('-inf'), *edges[1:-1], float('inf')]
        df['FareBin'] = pd.cut(df['Fare'], bins=edges, labels=False)

    # Drop Name
    df = df.drop(columns='Name')

    # Fixed integer codes (alphabetical order of the known categories) so the
    # encoding never depends on which categories happen to occur in a frame.
    # Values outside the known categories are encoded as -1.
    category_codes = {
        'Sex': {'female': 0, 'male': 1},
        'Embarked': {'C': 0, 'Q': 1, 'S': 2},
    }
    for col, codes in category_codes.items():
        df[col] = df[col].map(codes).fillna(-1).astype(int)

    return df

In [12]:
# Run both splits through the same cleaning / feature pipeline. Each call
# derives its medians, mode and fare quartile bins from the frame it is given.
train_clean, test_clean = (clean_and_engineer(frame) for frame in (train_df, test_df))

In [13]:
# Preview the cleaned training frame with its engineered feature columns.
train_clean.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,AgeBin,FareBin
0,0,3,1,22.0,1,0,7.25,2,3,2,0,2,0
1,1,1,0,38.0,1,0,71.2833,0,2,2,0,3,3
2,1,3,0,26.0,0,0,7.925,2,1,1,1,2,1
3,1,1,0,35.0,1,0,53.1,2,2,2,0,2,3
4,0,3,1,35.0,0,0,8.05,2,3,1,1,2,1


In [14]:
# Summary statistics of the cleaned training frame.
train_clean.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,AgeBin,FareBin
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.647587,29.361582,0.523008,0.381594,32.204208,1.536476,2.333333,1.904602,0.602694,2.034792,1.497194
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.791503,0.980713,1.613459,0.489615,0.839958,1.118156
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0,1.0,1.0,0.0,2.0,0.5
50%,0.0,3.0,1.0,28.0,0.0,0.0,14.4542,2.0,3.0,1.0,1.0,2.0,1.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,2.0,3.0,2.0,1.0,2.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0,4.0,11.0,1.0,4.0,3.0


In [15]:
# Import libraries
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, accuracy_score
from sklearn.linear_model import LogisticRegression

# ---- Features / target ----
X = train_clean.drop(columns='Survived')
y = train_clean['Survived']

# Hold out 20% of the labelled rows for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---- Base models ----
xgb_model = XGBClassifier(
    n_estimators=300, learning_rate=0.05, max_depth=4,
    subsample=0.8, colsample_bytree=0.8,
    random_state=42, eval_metric='logloss',
)

cat_model = CatBoostClassifier(
    iterations=300, learning_rate=0.05, depth=4,
    random_state=42,
    verbose=0,  # silence CatBoost output
)

# ---- Ensembles built on the two base models ----
base_estimators = [('xgb', xgb_model), ('cat', cat_model)]

# 'soft' voting combines the models' predicted probabilities.
voting = VotingClassifier(estimators=list(base_estimators), voting='soft')

# Stacking: logistic regression on top of the base models, 5-fold CV.
stack = StackingClassifier(
    estimators=list(base_estimators),
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5,
)

# ---- Fitting (base models first, then the ensembles) ----
for estimator in (xgb_model, cat_model, voting, stack):
    estimator.fit(X_train, y_train)

# ---- Predicting on the hold-out rows ----
y_pred_voting, y_pred_xgb, y_pred_cat, y_pred_stack = (
    estimator.predict(X_test) for estimator in (voting, xgb_model, cat_model, stack)
)

# ---- Validation accuracy ----
(
    accuracy_score_voting,
    accuracy_score_xgb,
    accuracy_score_cat,
    accuracy_score_stack,
) = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_voting, y_pred_xgb, y_pred_cat, y_pred_stack)
)

for label, score in (
    ("Accuracy Score Voting:", accuracy_score_voting),
    ("Accuracy Score XGB:", accuracy_score_xgb),
    ("Accuracy Score Cat:", accuracy_score_cat),
    ("Accuracy Score Stack:", accuracy_score_stack),
):
    print(label, score)


Accuracy Score Voting: 0.8268156424581006
Accuracy Score XGB: 0.8379888268156425
Accuracy Score Cat: 0.8268156424581006
Accuracy Score Stack: 0.8268156424581006


In [16]:
# Predict survival for the test passengers with the fitted XGBoost model.
y_pred_test = xgb_model.predict(test_clean)

# Submission table: the original PassengerId values from test.csv next to the
# predictions cast to integers (0 or 1).
submission = test_df[['PassengerId']].assign(Survived=y_pred_test.astype(int))

# Write the file without the index column.
submission.to_csv('xgb_submission.csv', index=False)

print("✅ Predictions saved successfully to xgb_submission.csv")

✅ Predictions saved successfully to xgb_submission.csv
