In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from xgboost import XGBClassifier

df_train_file = pd.read_csv(r"D:\PYTHON\Titanic_Survival_Kaggle\DATASET\train.csv")

print(f'Before Preprocessing stage: {df_train_file.isna().sum()}')

def title_group(title):
    if title in ['Mr']:
        return 1
    elif title in ['Miss', 'Ms']:
        return 2
    elif title in ['Mrs']:
        return 3
    elif title in ['Master']:
        return 4
    else:
        return 5


def preprocess_data(df_train):
    df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df_train[df_train['Age'].isna()]
    df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
    df_train['Cabin'].fillna('No Cabin', inplace=True)
    df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)
    df_train['Fare'].fillna(df_train['Fare'].median(), inplace=True)

    df_train['Sex'] = df_train['Sex'].map({'male': 0, 'female': 1})
    df_train['Embarked'] = df_train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})


    df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch'] + 1
    df_train['IsAlone'] = 1  # Initialize to 1 (alone)
    df_train.loc[df_train['FamilySize'] > 1, 'IsAlone'] = 0

    df_train['Title'] = df_train['Title'].apply(title_group).astype(int)

    
    df_train['AgeBand'] = pd.cut(df_train['Age'], 5)
    df_train['AgeBand'] = df_train['AgeBand'].cat.codes
    #df_train['HasCabin'] = df_train['Cabin'].notnull().astype(int)
    df_train['HasCabin'] = df_train['Cabin'].apply(lambda x: 0 if x == 'No Cabin' else 1)
    
    df_train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Age'], axis=1, inplace=True)
    #df_train['Fare'] = StandardScaler().fit_transform(df_train[['Fare']])
    return df_train

df_train = preprocess_data(df_train_file)
df_train.head()




Before Preprocessing stage: PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


  df_train['Title'] = df_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Cabin'].fillna('No Cabin', inplace=True)
The behavior will change in pandas 

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title,FamilySize,IsAlone,AgeBand,HasCabin
0,0,3,0,1,0,7.25,0,1,2,0,1,0
1,1,1,1,1,0,71.2833,1,3,2,0,2,1
2,1,3,1,0,0,7.925,0,2,1,1,1,0
3,1,1,1,1,0,53.1,0,3,2,0,2,1
4,0,3,0,0,0,8.05,0,1,1,1,2,0


In [46]:
%pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB 162.5 kB/s eta 0:15:23
   ---------------------------------------- 0.0/150.0 MB 162.5 kB/s eta 0:15:23
   ---------------------------------------- 0.0/150.0 MB 145.2 kB/s eta 0:17:13
   ---------------------------------------- 0.1/150.0 MB 174.3 kB/s eta 0:14:21
   ---------------------------------------- 0.1/150.0 MB 204.8 kB/s eta 0:12:12
   ---------------------------------------- 0.1/150.0 MB 327.2 kB/s eta 0:07:39
   ---------------------------------------- 0.2/150.0 MB 512.0 kB/s eta 0:04:53
   ---------------------------------------- 0.3/150.0 MB 630.9 kB/s eta 0:03:58

# EDA

In [None]:
group = df_train.groupby(['Sex', 'Survived']).size().reset_index(name='Count')

# Calculate percentage within each gender group
group['Percent'] = group.groupby('Sex')['Count'].transform(lambda x: x / x.sum() * 100)

print(group)

sns.barplot(data=group, x='Sex', y='Percent', hue='Survived')
plt.title('Gender-wise Survival Percentage')
plt.ylabel('Percentage (%)')
plt.show()

In [None]:
sns.histplot(df_train['Age'], bins=30, kde=True, color='blue')
plt.title('Age Distribution of Passengers')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()

In [None]:
sns.histplot(df_train['Fare'], bins=50, kde=True, color='green')
plt.title('Fare Distribution of Passengers')
plt.xlabel('Fare')
plt.ylabel('Frequency')
plt.show()

# Machine Learning Model

In [53]:
x = df_train.drop(['Survived'], axis=1)
y= df_train['Survived']




x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=1,
    gamma=0.1,
    random_state=42
)


#model = RandomForestClassifier(criterion='entropy',n_estimators=200, random_state=42, max_depth=15, min_samples_split=12, min_samples_leaf=4, max_samples=0.8)
"""model = RandomForestClassifier(
    criterion='entropy',
    n_estimators=300,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    class_weight='balanced',  
    max_samples=0.8,
    random_state=42
)"""
model.fit(x_train, y_train)
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)
"""print("training accuracy")
print(classification_report(y_train, pred_train))

print("test accuracy")
print(classification_report(y_test, pred_test))
print("Confusion Matrix:")
print(confusion_matrix(y_test, pred_test))"""
print(f"F1 Score for train {model.__class__.__name__}: {f1_score(y_train, pred_train)}")
print(f"F1 Score for test {model.__class__.__name__}: {f1_score(y_test, pred_test)}")

scores = cross_val_score(model, x, y, cv=5, scoring='accuracy')
print("CV Accuracy:", scores.mean())

F1 Score for train XGBClassifier: 0.847457627118644
F1 Score for test XGBClassifier: 0.8157894736842105
CV Accuracy: 0.8271734354403364


In [None]:
print("Confusion Matrix:")


cm = confusion_matrix(y_test, pred_test)
print(cm)

labels = ['Not Survived', 'Survived']

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix Heatmap')
plt.show()

# Prediction for Test dataset

In [49]:
df_test = pd.read_csv(r"D:\PYTHON\Titanic_Survival_Kaggle\DATASET\test.csv")
df_test1 = df_test.copy()

print(f'Before Preprocessing stage: {df_test.isna().sum()}')

df_test = preprocess_data(df_test)

print(f'After Preprocessing stage: {df_test.isna().sum()}')
df_test.head()
if 'Survived' in df_test.columns:
    df_test = df_test.drop('Survived', axis=1)




Before Preprocessing stage: PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64
After Preprocessing stage: Pclass        0
Sex           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Title         0
FamilySize    0
IsAlone       0
AgeBand       0
HasCabin      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Age'].fillna(df_train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['Cabin'].fillna('No Cabin', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [50]:
  # Ensure 'Survived' column is not present
df_test['Survived'] = model.predict(df_test)

sub = pd.DataFrame(
    {'PassengerId': df_test1['PassengerId'], 'Survived': df_test['Survived']}
)
print(sub)

sub.to_csv(r"D:\PYTHON\Titanic_Survival_Kaggle\DATASET\submission8.csv", index=False)


     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
