# Titanic Passanger Survival Analysis

In [1]:
from IPython.display import Image

# Display the image
Image(url="https://miro.medium.com/v2/resize:fit:786/format:webp/1*GulVod9PCsNLAx8Wjf1kcA.png")


In [2]:
 #Data Loading and Exploration

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


In [10]:
# Load the training and test datasets
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")


In [11]:
# Check for missing values in the datasets
print("Train missing values:\n", train.isnull().sum())
print("Test missing values:\n", test.isnull().sum())


Train missing values:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Test missing values:
 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [12]:
# Fill missing values for age with median age
train['Age'].fillna(train['Age'].median(), inplace=True)
test['Age'].fillna(test['Age'].median(), inplace=True)

# Fill missing values for fare with median fare
test['Fare'].fillna(test['Fare'].median(), inplace=True)

# Fill missing values for embarked with the mode (most frequent value)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

# Drop the 'Cabin' column if it exists
train.drop('Cabin', axis=1, inplace=True, errors='ignore')
test.drop('Cabin', axis=1, inplace=True, errors='ignore')


In [13]:
# Drop unnecessary features
features_to_drop = ['Name', 'Ticket', 'SibSp', 'Parch']
train.drop(features_to_drop, axis=1, inplace=True)
test.drop(features_to_drop, axis=1, inplace=True)


In [14]:
# Encode categorical variables
label_encoder = LabelEncoder()
train['Sex'] = label_encoder.fit_transform(train['Sex'])
test['Sex'] = label_encoder.transform(test['Sex'])

train['Embarked'] = label_encoder.fit_transform(train['Embarked'])
test['Embarked'] = label_encoder.transform(test['Embarked'])


In [15]:
# Define features and target variable
X = train.drop('Survived', axis=1)
y = train['Survived']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [16]:
# Initialize models
log_reg = LogisticRegression()
rf_classifier = RandomForestClassifier()
gb_classifier = GradientBoostingClassifier()
svm_classifier = SVC()

# Train models
log_reg.fit(X_train, y_train)
rf_classifier.fit(X_train, y_train)
gb_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

# Evaluate models
print("Logistic Regression Accuracy:", log_reg.score(X_val, y_val))
print("Random Forest Accuracy:", rf_classifier.score(X_val, y_val))
print("Gradient Boosting Accuracy:", gb_classifier.score(X_val, y_val))
print("SVM Accuracy:", svm_classifier.score(X_val, y_val))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.7988826815642458
Random Forest Accuracy: 0.8212290502793296
Gradient Boosting Accuracy: 0.8212290502793296
SVM Accuracy: 0.5977653631284916


The  accuracies represent the performance of different machine learning models on a given dataset. description for each model's accuracy:

Logistic Regression Accuracy: The logistic regression model achieved an accuracy of approximately 79.89%. Logistic regression is a linear model used for binary classification tasks. It calculates the probability that an instance belongs to a particular class and makes predictions based on a threshold.

Random Forest Accuracy: The random forest model achieved an accuracy of approximately 83.80%. Random forest is an ensemble learning method that constructs multiple decision trees during training and outputs the mode of the classes (classification) or the average prediction (regression) of the individual trees.

Gradient Boosting Accuracy: The gradient boosting model achieved an accuracy of approximately 82.68%. Gradient boosting is another ensemble learning technique that builds a sequence of weak learners (typically decision trees) and combines their predictions to improve accuracy.

Support Vector Machine (SVM) Accuracy: The SVM model achieved an accuracy of approximately 59.78%. SVM is a supervised learning algorithm used for classification tasks. It finds the hyperplane that best separates the classes in the feature space and maximizes the margin between them.

In [17]:
#Logistic Regression hyperparameter

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the hyperparameter grid
param_dist = {
    'C': uniform(loc=0, scale=4)  # Regularization parameter
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=LogisticRegression(),
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='accuracy',  # Scoring method
    cv=5,  # Number of folds in cross-validation
    verbose=1,  # Controls the verbosity: the higher, the more messages
    n_jobs=-1  # Number of jobs to run in parallel
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
print("Best Model Accuracy:", best_model.score(X_val, y_val))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'C': 2.9235032785505792}
Best Model Accuracy: 0.7988826815642458


In [19]:
#Random Forest hyperparameter

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter grid
param_dist = {
    'n_estimators': randint(100, 1000),  # Number of trees in the forest
    'max_features': ['auto', 'sqrt'],  # Number of features to consider at every split
    'max_depth': randint(5, 50),  # Maximum depth of the tree
    'min_samples_split': randint(2, 20),  # Minimum number of samples required to split a node
    'min_samples_leaf': randint(1, 20),  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(),
    param_distributions=param_dist,
    n_iter=100,  # Number of parameter settings that are sampled
    scoring='accuracy',  # Scoring method
    cv=5,  # Number of folds in cross-validation
    verbose=1,  # Controls the verbosity: the higher, the more messages
    n_jobs=-1  # Number of jobs to run in parallel
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
print("Best Model Accuracy:", best_model.score(X_val, y_val))


Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# hyperparameter for Gradient Boosting

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the hyperparameter grid
param_dist = {
    'n_estimators': randint(100, 1000),  # Number of boosting stages to be run
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],  # Learning rate shrinks the contribution of each tree
    'max_depth': randint(3, 10),  # Maximum depth of the individual regression estimators
    'min_samples_split': randint(2, 20),  # Minimum number of samples required to split an internal node
    'min_samples_leaf': randint(1, 20),  # Minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features to consider when looking for the best split
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(),
    param_distributions=param_dist,
    n_iter=50,  # Number of parameter settings that are sampled
    scoring='accuracy',  # Scoring method
    cv=5,  # Number of folds in cross-validation
    verbose=1,  # Controls the verbosity: the higher, the more messages
    n_jobs=-1  # Number of jobs to run in parallel
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
print("Best Model Accuracy:", best_model.score(X_val, y_val))


In [None]:
# hyperparameter for SVM 

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, loguniform

# Define the hyperparameter grid
param_dist = {
    'C': loguniform(1e-3, 1e3),  # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
}

# Initialize RandomizedSearchCV with reduced n_iter
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=50,  # Reduce the number of iterations
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Perform the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model on the validation set
best_model = random_search.best_estimator_
print("Best Model Accuracy:", best_model.score(X_val, y_val))


In [None]:
# Feature Engineering

In [None]:
# Revert dropping the 'Name' column
train = pd.read_csv("input/train.csv")
test = pd.read_csv("input/test.csv")

# Extract titles from names
train['Title'] = train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Create a family size feature
train['FamilySize'] = train['SibSp'] + train['Parch'] + 1
test['FamilySize'] = test['SibSp'] + test['Parch'] + 1

# Binning numerical variables like age and fare into categories
train['AgeBin'] = pd.cut(train['Age'], bins=[0, 18, 35, 50, 65, 100], labels=['0-18', '19-35', '36-50', '51-65', '66-100'])
test['AgeBin'] = pd.cut(test['Age'], bins=[0, 18, 35, 50, 65, 100], labels=['0-18', '19-35', '36-50', '51-65', '66-100'])

train['FareBin'] = pd.qcut(train['Fare'], q=5, labels=False)
test['FareBin'] = pd.qcut(test['Fare'], q=5, labels=False)


In [None]:
#Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Define a function to create bar charts
def bar_chart(feature):
    survived = train[train['Survived'] == 1][feature].value_counts()
    dead = train[train['Survived'] == 0][feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    df.plot(kind='bar', stacked=True, figsize=(10, 5))
    plt.title(f'Survival by {feature}')
    plt.xlabel(feature)
    plt.ylabel('Number of passengers')
    plt.show()
    
    # Visualize survival by sex
bar_chart('Sex')

# Visualize survival by passenger class
bar_chart('Pclass')

# Visualize survival by number of parents/children
bar_chart('Parch')

# Visualize survival by port of embarkation
bar_chart('Embarked')

# Visualize survival rate by different features
sns.barplot(x='Sex', y='Survived', data=train)
plt.title("Survival Rate by Sex")
plt.show()

sns.barplot(x='Pclass', y='Survived', data=train)
plt.title("Survival Rate by Pclass")
plt.show()

# More visualizations can be added for other features


In [None]:
# Visualize survival by age
plt.figure(figsize=(10, 5))
sns.histplot(data=train, x='Age', hue='Survived', kde=True, bins=20, palette='husl')
plt.title('Survival by Age')
plt.xlabel('Age')
plt.ylabel('Number of passengers')
plt.legend(['Dead', 'Survived'])
plt.show()


In [None]:
# Visualize survival by fare
plt.figure(figsize=(10, 5))
sns.histplot(data=train, x='Fare', hue='Survived', kde=True, bins=20, palette='husl')
plt.title('Survival by Fare')
plt.xlabel('Fare')
plt.ylabel('Number of passengers')
plt.legend(['Dead', 'Survived'])
plt.show()

In [None]:
##Survival by Passenger Class and Sex:
#This chart will show survival rates based on both passenger class and sex,
#providing insights into potential differences in survival rates among different groups.

In [None]:
# Visualize survival by passenger class and sex
plt.figure(figsize=(10, 6))
sns.barplot(data=train, x='Pclass', y='Survived', hue='Sex', palette='Set1')
plt.title('Survival by Passenger Class and Sex')
plt.xlabel('Passenger Class')
plt.ylabel('Survival Rate')
plt.legend(title='Sex', loc='upper right')
plt.show()

In [None]:
##Survival by Embarked Port and Passenger Class:
#This chart will display survival rates based on the port of embarkation and passenger class, 
#providing insights into potential differences in survival rates among passengers who embarked from different ports.

In [None]:
# Visualize survival by embarked port and passenger class
plt.figure(figsize=(10, 6))
sns.barplot(data=train, x='Embarked', y='Survived', hue='Pclass', palette='Set2')
plt.title('Survival by Embarked Port and Passenger Class')
plt.xlabel('Embarked Port')
plt.ylabel('Survival Rate')
plt.legend(title='Passenger Class', loc='upper right')
plt.show()

In [None]:
#Ensemble Methods

In [None]:
from sklearn.ensemble import VotingClassifier

# Initialize individual classifiers
log_reg = LogisticRegression()
rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, min_samples_leaf=1)
gb_classifier = GradientBoostingClassifier()
svm_classifier = SVC()

# Create a voting classifier
voting_classifier = VotingClassifier(
    estimators=[('lr', log_reg), ('rf', rf_classifier), ('gb', gb_classifier), ('svm', svm_classifier)],
    voting='hard'
)

# Train the voting classifier
voting_classifier.fit(X_train, y_train)

# Evaluate the voting classifier
print("Voting Classifier Accuracy:", voting_classifier.score(X_val, y_val))


In [None]:
#Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(rf_classifier, X, y, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


the cross-validation was performed using 5 folds, where the dataset was split into 5 equal parts, and the model was trained and evaluated 5 times, each time using a different fold as the validation set and the remaining folds as the training set.

Here's a breakdown of the  information:

Cross-Validation Scores: The array [0.7150838, 0.80898876, 0.83707865, 0.80337079, 0.8258427] contains the accuracy scores obtained for each fold of the cross-validation process. Each score represents the accuracy of the model on the validation set for a particular fold.

Mean CV Accuracy: The mean cross-validation accuracy, calculated by averaging the accuracy scores obtained across all folds. In this case, the mean cross-validation accuracy is approximately 0.7981, indicating the average accuracy of the model across all folds.

In [None]:
#Feature Selection

In [None]:
from sklearn.feature_selection import SelectFromModel

# Use feature importance scores for feature selection
feature_selection = SelectFromModel(rf_classifier)
feature_selection.fit(X_train, y_train)

# Get selected feature indices
selected_features = feature_selection.get_support(indices=True)

# Select features
X_train_selected = X_train.iloc[:, selected_features]
X_val_selected = X_val.iloc[:, selected_features]


In [None]:
#Handling Imbalanced Data

In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling the minority class
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
#Advanced Models

In [None]:
from sklearn.neural_network import MLPClassifier

# Initialize and train a neural network model
nn_classifier = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500)
nn_classifier.fit(X_train, y_train)

# Evaluate the neural network model
print("Neural Network Accuracy:", nn_classifier.score(X_val, y_val))


neural network accuracy value of 0.7150837988826816 represents the proportion of correctly predicted instances out of the total instances in the dataset, as determined by the trained neural network model.

In [None]:
# Error Analysis

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Fit the RandomForestClassifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = rf_classifier.predict(X_val)

# Create a confusion matrix
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Visualize the confusion matrix
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
