## Problem Statement and Dataset

### *Objective:* Predict whether a passenger survived or not based on different features.  
### *Data:* Titanic dataset with features like PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked.

## Import necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

## Load the Dataset

In [2]:
# Load the Titanic CSV into a DataFrame.
# The path is a raw string so the Windows backslashes stay literal; as a plain string
# it contained invalid escape sequences (\Y, \C, \T) that newer Python versions warn about.
# NOTE(review): this is an absolute, machine-specific path — consider a path relative to the notebook.
data = pd.read_csv(r"D:\Yuktha\Codsoft\Task1_Titanic_Dataset_Kaggle\Titanic-Dataset.csv")

## Exploratory Data Analysis (EDA)

In [3]:
# Display the loaded DataFrame (the notebook renders a truncated preview of its rows)
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Count the missing (null) values in each column
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Data Preprocessing

In [6]:
# Preprocess the loaded frame: drop unused columns, impute missing values,
# and one-hot encode the categorical columns.
# NOTE(review): the imputers and encoders are fitted on the full dataset before the
# train/test split, so held-out rows influence the fitted statistics — consider moving
# these steps into the modelling pipeline.

# Drop the columns that are not used as features
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Fill missing 'Age' with the median and missing 'Embarked' with the most frequent value.
# fit_transform returns a 2-D (n_rows, 1) array, so flatten it before assigning to a single column.
imputer_age = SimpleImputer(strategy='median')
data['Age'] = imputer_age.fit_transform(data[['Age']]).ravel()

imputer_embarked = SimpleImputer(strategy='most_frequent')
data['Embarked'] = imputer_embarked.fit_transform(data[['Embarked']]).ravel()

# One-hot encode the categorical columns; drop='first' omits the first category of each
encoder_sex = OneHotEncoder(drop='first', sparse_output=False)
encoded_sex = encoder_sex.fit_transform(data[['Sex']])

encoder_embarked = OneHotEncoder(drop='first', sparse_output=False)
encoded_embarked = encoder_embarked.fit_transform(data[['Embarked']])

# Wrap the encoded arrays in DataFrames that reuse data's index, so the column-wise
# concat below aligns row-for-row even if the index is not the default 0..n-1 range
encoded_sex_df = pd.DataFrame(
    encoded_sex,
    columns=encoder_sex.get_feature_names_out(['Sex']),
    index=data.index,
)
encoded_embarked_df = pd.DataFrame(
    encoded_embarked,
    columns=encoder_embarked.get_feature_names_out(['Embarked']),
    index=data.index,
)

# Append the encoded columns, then remove the original categorical columns
data = pd.concat([data, encoded_sex_df, encoded_embarked_df], axis=1)
data = data.drop(columns=['Sex', 'Embarked'])

# Display the first few rows of the processed DataFrame
data.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1.0,0.0,1.0
1,1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,1,3,26.0,0,0,7.925,0.0,0.0,1.0
3,1,1,35.0,1,0,53.1,0.0,0.0,1.0
4,0,3,35.0,0,0,8.05,1.0,0.0,1.0


## Splitting the Data

In [7]:
# Separate the target column from the feature columns
y = data['Survived']
X = data.drop(columns='Survived')

# Hold out 20% of the rows as the test split, with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the feature matrix (every column of data except 'Survived')
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,1.0,0.0,1.0
1,1,38.0,1,0,71.2833,0.0,0.0,0.0
2,3,26.0,0,0,7.9250,0.0,0.0,1.0
3,1,35.0,1,0,53.1000,0.0,0.0,1.0
4,3,35.0,0,0,8.0500,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,1.0,0.0,1.0
887,1,19.0,0,0,30.0000,0.0,0.0,1.0
888,3,28.0,1,2,23.4500,0.0,0.0,1.0
889,1,26.0,0,0,30.0000,1.0,0.0,0.0


## Model Building

In [9]:
# Two pipeline stages: standardise the features, then classify with a random forest
scaler = StandardScaler()
model = RandomForestClassifier(random_state=42, n_estimators=100)
pipeline = Pipeline(steps=[('scaler', scaler), ('model', model)])

# Train the whole pipeline on the training split
pipeline.fit(X_train, y_train)

# Test-split outputs: probability taken from column index 1 of predict_proba, and hard class labels
y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

## Model Evaluation

In [10]:
# Score the test-set predictions (y_pred) against the true labels,
# reporting each metric right after it is computed.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

precision = precision_score(y_test, y_pred)
print(f'Precision: {precision}')

recall = recall_score(y_test, y_pred)
print(f'Recall: {recall}')

f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

# Per-class precision / recall / F1 summary
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

Accuracy: 0.8212290502793296
Precision: 0.8
Recall: 0.7567567567567568
F1 Score: 0.7777777777777778
Confusion Matrix:
[[91 14]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



>>## As recall was the lowest of the scores, improving the model for recall using grid search

## Improving the model and evaluating

In [11]:
# Hyper-parameter search space. The 'model__' prefix routes each entry to the
# pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 5, 10, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Fresh pipeline (scaler + random forest) used as the template estimator for the search
model = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])

# 5-fold cross-validated grid search scored on recall, run on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best pipeline found by the search
best_model = grid_search.best_estimator_

# Predict on the test split with the best pipeline
y_pred = best_model.predict(X_test)

# Evaluate the best model with the same metrics as the baseline
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the chosen parameters and the evaluation metrics
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{class_report}')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Model Parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Accuracy: 0.8212
Precision: 0.8000
Recall: 0.7568
F1 Score: 0.7778
Confusion Matrix:
[[91 14]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179



>>## Grid search yielded the same results (the best parameters it found match the settings already used above); trying other models may give better performance.
>>## To maintain a balance between model complexity and computational efficiency, I chose to focus solely on the Random Forest algorithm.

## A few data records for Titanic survival prediction

In [14]:
# Hand-written passenger records to score with the model
new_data = {
    'Pclass': [3, 1, 2, 1, 3],
    'Age': [25, 40, 22, 35, 60],
    'SibSp': [1, 0, 2, 1, 0],
    'Parch': [0, 1, 1, 0, 0],
    'Fare': [7.75, 71.2833, 15.0, 53.1, 8.05],
    'Sex_male': [1, 0, 1, 0, 1],
    'Embarked': ['Q', 'C', 'S', 'C', 'Q']
}

# Convert new data to DataFrame
new_df = pd.DataFrame(new_data)

# Derive the Embarked_Q / Embarked_S indicator columns (vectorised comparison);
# any other port, such as 'C', gets 0 in both columns
new_df['Embarked_Q'] = new_df['Embarked'].eq('Q').astype('int64')
new_df['Embarked_S'] = new_df['Embarked'].eq('S').astype('int64')

# Drop the original 'Embarked' column
new_df = new_df.drop(columns='Embarked')

# Ensure columns are in the same order as X_train columns
new_df = new_df[X_train.columns]

# Fit the pipeline on the training split before predicting.
# NOTE(review): this fits and uses `pipeline`, not the grid-search `best_model` — confirm that is intended.
pipeline.fit(X_train, y_train)

# Predict class labels and the probability from column index 1 of predict_proba
new_pred = pipeline.predict(new_df)
new_pred_prob = pipeline.predict_proba(new_df)[:, 1]

# Print predictions for each record. Rows are paired with predictions by position,
# not by index label, so this stays correct for any DataFrame index.
print("New Data Points and Predictions:")
for pos, (_, row) in enumerate(new_df.iterrows()):
    print(f"\nRecord {pos + 1}:")
    print(f"Data: {row.to_dict()}")
    print(f"Predicted Class (Survived or Not Survived): {new_pred[pos]}")
    print(f"Predicted Probability of Survival: {new_pred_prob[pos]:.4f}")

New Data Points and Predictions:

Record 1:
Data: {'Pclass': 3.0, 'Age': 25.0, 'SibSp': 1.0, 'Parch': 0.0, 'Fare': 7.75, 'Sex_male': 1.0, 'Embarked_Q': 1.0, 'Embarked_S': 0.0}
Predicted Class (Survived or Not Survived): 0
Predicted Probability of Survival: 0.0794

Record 2:
Data: {'Pclass': 1.0, 'Age': 40.0, 'SibSp': 0.0, 'Parch': 1.0, 'Fare': 71.2833, 'Sex_male': 0.0, 'Embarked_Q': 0.0, 'Embarked_S': 0.0}
Predicted Class (Survived or Not Survived): 1
Predicted Probability of Survival: 1.0000

Record 3:
Data: {'Pclass': 2.0, 'Age': 22.0, 'SibSp': 2.0, 'Parch': 1.0, 'Fare': 15.0, 'Sex_male': 1.0, 'Embarked_Q': 0.0, 'Embarked_S': 1.0}
Predicted Class (Survived or Not Survived): 0
Predicted Probability of Survival: 0.2900

Record 4:
Data: {'Pclass': 1.0, 'Age': 35.0, 'SibSp': 1.0, 'Parch': 0.0, 'Fare': 53.1, 'Sex_male': 0.0, 'Embarked_Q': 0.0, 'Embarked_S': 0.0}
Predicted Class (Survived or Not Survived): 1
Predicted Probability of Survival: 0.9900

Record 5:
Data: {'Pclass': 3.0, 'Age': 