# 🔧  TITANIC SURVIVAL PREDICTION

In [1]:
# import the packages 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

# 💻 Upload the data

In [2]:
data=pd.read_csv("C:/Users/LENOVO/Desktop/Project/Intership/Data/Titanic-Dataset.csv")
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [3]:
print(data.columns.tolist())
print(data.shape)

['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
(891, 12)


In [4]:
# to finf are their any null values
pd.isnull(data).sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Their are 177 missing values in Age and 687 missing values in Cabin also 2 missing values in Embarked 

In [5]:
# to know the duplicate values
data.duplicated().sum()

0

Their is no duplicate values 

In [6]:
data.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


# ➡️ Handling missing values and Data cleaning

In [7]:
# Identify columns that are categorical (non-numeric)
categorical_cols = data.select_dtypes(include=['object']).columns
print("Categorical columns:", categorical_cols)

# You can either drop the columns or encode them into numeric values
#  Convert categorical columns into dummy variables (one-hot encoding)
data= pd.get_dummies(data, columns=categorical_cols, drop_first=True)

Categorical columns: Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')


In [8]:
# Now, check the dataset again
print(data.head(5))

# Fill missing values with median or mode for numerical or categorical columns
data.fillna(data.median(), inplace=True)

   PassengerId  Survived  Pclass   Age  SibSp  Parch     Fare  \
0            1         0       3  22.0      1      0   7.2500   
1            2         1       1  38.0      1      0  71.2833   
2            3         1       3  26.0      0      0   7.9250   
3            4         1       1  35.0      1      0  53.1000   
4            5         0       3  35.0      0      0   8.0500   

   Name_Abbott, Mr. Rossmore Edward  Name_Abbott, Mrs. Stanton (Rosa Hunt)  \
0                             False                                  False   
1                             False                                  False   
2                             False                                  False   
3                             False                                  False   
4                             False                                  False   

   Name_Abelson, Mr. Samuel  ...  Cabin_F G63  Cabin_F G73  Cabin_F2  \
0                     False  ...        False        False     False

In [9]:
pd.isnull(data).sum()

PassengerId    0
Survived       0
Pclass         0
Age            0
SibSp          0
              ..
Cabin_F4       0
Cabin_G6       0
Cabin_T        0
Embarked_Q     0
Embarked_S     0
Length: 1726, dtype: int64

# 📊  Logistic Regression model

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Prepare the features (X) and target (y)
X = data.drop(columns=['Survived'])
y = data['Survived']

# Handle missing data
X = X.fillna(X.mean())  # Filling missing values with column mean (can also use median)

# Convert categorical columns to numeric (One-Hot Encoding)
X = pd.get_dummies(X, drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
X[['Age', 'Fare']] = scaler.fit_transform(X[['Age', 'Fare']])

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and Train Logistic Regression Model
model = LogisticRegression(max_iter=10000, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_val)

# Evaluate the accuracy
print(f'Accuracy: {accuracy_score(y_val, y_pred)}')


Accuracy: 0.8156424581005587


#  📊 Hyperparameter grid method to improvise the Accuracy 

In [13]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],  # Regularization strength
    'solver': ['liblinear', 'saga'],  # Solvers for logistic regression
    'max_iter': [100, 200, 1000]  # Maximum number of iterations
}

# Initialize Logistic Regression model
model_lr = LogisticRegression(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model_lr, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit the model with the best parameters
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Train the model with the best parameters
best_model = grid_search.best_estimator_


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'C': 10, 'max_iter': 100, 'solver': 'liblinear'}


In [14]:
# Initialize the Logistic Regression model with best parameters
best_model = LogisticRegression(C=10, max_iter=100, solver='liblinear', random_state=42)

# Train the model with the training data
best_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = best_model.predict(X_val)

# Evaluate the model's accuracy
print(f'Improved Accuracy: {accuracy_score(y_val, y_pred)}')


Improved Accuracy: 0.8268156424581006


# 💾 Saving and Labeling Model Predictions for Titanic Survival Prediction

In [18]:
# Create a DataFrame with PassengerId and predicted Survived values
submission_df = pd.DataFrame({'PassengerId': X_val.index, 'Survived': y_pred})

# Specify the file path where you want to save the CSV
file_path = r'C:\Users\LENOVO\Desktop\Project\hackathon\titanic_predictions.csv'

# Save to CSV file
submission_df.to_csv(file_path, index=False)

print(f"Prediction file saved at: {file_path}")

Prediction file saved at: C:\Users\LENOVO\Desktop\Project\hackathon\titanic_predictions.csv


# ➡️ The final accuracy of the model is 82.7%