In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold




In [2]:

# Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")


In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data = train_data.drop(['PassengerId'],axis=1)


In [6]:
train_data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
#Feature Engineering

female = train_data.loc[train_data['Sex']=='female']['Survived']
print('% of female who survived : {:.3f}'.format(sum(female)/len(female)*100))

male = train_data[train_data['Sex']=='male']['Survived']
print('% of men who survived is : {:.3f}'.format(sum(male)/len(male)*100))


% of female who survived : 74.204
% of men who survived is : 18.891


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [9]:
# Feature Engineering

# Extract Title from Name
title_train = [name.split(',')[1].split(".")[0].strip() for name in train_data['Name']]
train_data['Title'] = pd.Series(title_train)
 
title_test = [name.split(',')[1].split(".")[0].strip() for name in test_data['Name']]  
test_data['Title'] = pd.Series(title_test)
train_data['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

In [10]:

test_data['Title'].value_counts()

Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64

In [11]:
train_data = train_data.drop(['Name'],axis=1)
test_data = test_data.drop(['Name'],axis=1)

In [12]:
#Replace rare titles with 'Rare'
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
train_data['Title'] = train_data['Title'].replace(rare_titles, 'Rare')
test_data['Title'] = test_data['Title'].replace(rare_titles, 'Rare')

In [13]:
# Create 'FamilySize' feature and 'IsAlone' feature
train_data['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1
train_data['IsAlone'] = 0
train_data.loc[train_data['FamilySize'] == 1, 'IsAlone'] = 1
test_data['IsAlone'] = 0
test_data.loc[test_data['FamilySize'] == 1, 'IsAlone'] = 1

In [14]:
# Handle Age missing data
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())

In [15]:
# Bin 'Age' into categories
train_data['AgeBin'] = pd.cut(train_data['Age'].astype(int), 5, labels=['Child','Teenager','Adult','MiddleAged', 'Elderly'])
test_data['AgeBin'] = pd.cut(test_data['Age'].astype(int), 5, labels=['Child','Teenager','Adult','MiddleAged', 'Elderly'])


In [16]:
# Bin 'Fare' into categories
train_data['FareBin'] = pd.qcut(train_data['Fare'], 4, labels=['Low', 'Medium', 'High', 'VeryHigh'])
test_data['FareBin'] = pd.qcut(test_data['Fare'], 4, labels=['Low', 'Medium', 'High', 'VeryHigh'])


In [17]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Survived    891 non-null    int64   
 1   Pclass      891 non-null    int64   
 2   Sex         891 non-null    object  
 3   Age         891 non-null    float64 
 4   SibSp       891 non-null    int64   
 5   Parch       891 non-null    int64   
 6   Ticket      891 non-null    object  
 7   Fare        891 non-null    float64 
 8   Cabin       204 non-null    object  
 9   Embarked    889 non-null    object  
 10  Title       891 non-null    object  
 11  FamilySize  891 non-null    int64   
 12  IsAlone     891 non-null    int64   
 13  AgeBin      891 non-null    category
 14  FareBin     891 non-null    category
dtypes: category(2), float64(2), int64(6), object(5)
memory usage: 92.8+ KB


In [18]:
# Split the data
y_train = train_data['Survived']
X_train = train_data.drop(['Survived','SibSp','Parch','Age'], axis=1)


In [19]:
# Define preprocessing for numeric columns (normalize them so they're on the same scale)
numeric_features = ['Pclass','Fare','FamilySize','IsAlone']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical features (encode them as one-hot vectors)
categorical_features = ['Embarked', 'Sex', 'Pclass', 'Title', 'IsAlone', 'AgeBin', 'FareBin']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [20]:

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



In [21]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Survived    891 non-null    int64   
 1   Pclass      891 non-null    int64   
 2   Sex         891 non-null    object  
 3   Age         891 non-null    float64 
 4   SibSp       891 non-null    int64   
 5   Parch       891 non-null    int64   
 6   Ticket      891 non-null    object  
 7   Fare        891 non-null    float64 
 8   Cabin       204 non-null    object  
 9   Embarked    889 non-null    object  
 10  Title       891 non-null    object  
 11  FamilySize  891 non-null    int64   
 12  IsAlone     891 non-null    int64   
 13  AgeBin      891 non-null    category
 14  FareBin     891 non-null    category
dtypes: category(2), float64(2), int64(6), object(5)
memory usage: 92.8+ KB


In [22]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# Define the XGBClassifier model with early stopping
model = XGBClassifier(n_jobs=-1)

# Create the pipeline
xgb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', model)])

# Define the parameter grid
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 5, 10],
    'classifier__min_child_weight': [1, 2, 4],
    'classifier__gamma': [0, 0.1, 0.2],
    'classifier__subsample': [0.5, 1],
    'classifier__colsample_bytree': [0.6, 0.8, 1.0]
}

# Create the randomized search object
random_search = RandomizedSearchCV(xgb, param_grid, cv=StratifiedKFold(n_splits=5), n_jobs=-1)

# Fit the model
random_search.fit(X_train, y_train)

# Print the best parameters
print("Best parameters: ", random_search.best_params_)

# Use the best model to make predictions
best_xgb = random_search.best_estimator_
predictions = best_xgb.predict(test_data)


Best parameters:  {'classifier__subsample': 1, 'classifier__n_estimators': 50, 'classifier__min_child_weight': 4, 'classifier__max_depth': 10, 'classifier__gamma': 0.1, 'classifier__colsample_bytree': 1.0}


In [23]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = random_search, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 82.95 %
Standard Deviation: 4.93 %


In [24]:
# Create a DataFrame with the passenger IDs and our prediction regarding whether they survived or not
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})

# Write the DataFrame to a CSV file
output.to_csv('submission_new10.csv', index=False)
