In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Titanic passenger records from the CSV located one directory up, under Data/
titanic_csv_path = "../Data/Titanic-Dataset.csv"
df = pd.read_csv(titanic_csv_path)

# Preview the first five records; as the cell's final expression the frame is rendered by the notebook
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Check basic information about the dataset
# DataFrame.info() writes its report (columns, non-null counts, dtypes, memory usage)
# straight to stdout and returns None, so it needs no print() wrapper.
print("\nDataset Information:")
df.info()

In [None]:
# Descriptive statistics for the dataset, using describe()'s default settings
print("\nSummary Statistics:")
summary_stats = df.describe()
# Leave the frame as the last expression so the notebook renders it as a table
summary_stats

In [None]:
# Count the missing entries in each column (isna() is the same check as isnull())
print("\nMissing Values:")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Handle missing values
# Fill missing 'Age' with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Age']: that chained in-place call acts on an
# intermediate Series, triggers a FutureWarning on pandas 2.2+, and under
# Copy-on-Write (the default from pandas 3.0) leaves df unchanged.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [None]:
# Drop 'Cabin' column due to too many missing values
# df is rebound to the returned frame rather than mutated in place, and
# errors='ignore' tolerates an already-removed column, so re-running this cell
# no longer raises KeyError.
df = df.drop(columns='Cabin', errors='ignore')

In [None]:
# Discard passengers whose 'Embarked' value is missing (noted as only 2 rows)
embarked_required = ['Embarked']
df.dropna(axis=0, subset=embarked_required, inplace=True)

In [None]:
# Re-count the missing entries per column to confirm the clean-up steps took effect
print("\nMissing Values After Handling:")
remaining_missing = df.isna().sum()
print(remaining_missing)

In [None]:
# Histogram of passenger ages with a kernel-density overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['Age'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of ticket fares with a kernel-density overlay
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='Fare', kde=True, color='green', ax=ax)
ax.set(title='Distribution of Fare', xlabel='Fare', ylabel='Frequency')
plt.show()

In [None]:
# Number of passengers in each ticket class
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Pclass', palette='Set2', ax=ax)
ax.set_title('Passenger Count by Class')
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Number of passengers per value of 'Survived'
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='Survived', palette='Set1', ax=ax)
ax.set(
    title='Survival Count',
    xlabel='Survived (1 = Yes, 0 = No)',
    ylabel='Count',
)
plt.show()

In [None]:
# Correlation matrix
plt.figure(figsize=(10, 8))
# Correlate numeric columns only. The frame still holds text columns
# (e.g. 'Sex', 'Embarked'), and since pandas 2.0 DataFrame.corr() no longer
# drops them silently but raises ValueError. select_dtypes works on every
# pandas version, unlike the newer numeric_only= keyword.
corr_matrix = df.select_dtypes(include='number').corr()
# annot/fmt print each coefficient in its cell, rounded to two decimals
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Mean of 'Survived' (i.e. the survival rate) for each value of 'Sex'
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='Sex', y='Survived', palette='Set3', ax=ax)
ax.set_title('Survival Rate by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Survival Rate')
plt.show()

In [None]:
# Mean of 'Survived' (i.e. the survival rate) within each ticket class
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='Pclass', y='Survived', palette='Set2', ax=ax)
ax.set(
    title='Survival Rate by Passenger Class',
    xlabel='Passenger Class',
    ylabel='Survival Rate',
)
plt.show()

In [None]:
# Stacked age histogram split by 'Survived', with a density curve per group
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    data=df,
    x='Age',
    hue='Survived',
    kde=True,
    multiple='stack',
    palette='Set1',
    ax=ax,
)
ax.set_title('Age Distribution by Survival')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot comparing the fare distribution between the two 'Survived' groups
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, x='Survived', y='Fare', palette='Set2', ax=ax)
ax.set(
    title='Fare Distribution by Survival',
    xlabel='Survived (1 = Yes, 0 = No)',
    ylabel='Fare',
)
plt.show()

In [None]:
# Mean of 'Survived' (i.e. the survival rate) for each value of 'Embarked'
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue`; the call is kept as-is so the figure stays identical.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='Embarked', y='Survived', palette='Set3', ax=ax)
ax.set_title('Survival Rate by Embarkation Point')
ax.set_xlabel('Embarkation Point')
ax.set_ylabel('Survival Rate')
plt.show()

# Summary of insights

1. Survival Rate: Approximately 38% of passengers survived.
2. Gender: Female passengers had a significantly higher survival rate than males.
3. Passenger Class: Passengers in first class had a higher survival rate compared to those in second and third classes.
4. Age: Younger passengers (children) had a higher survival rate than adults.
5. Fare: Passengers who paid higher fares had a better chance of survival.
6. Embarkation Point: Passengers who embarked from Cherbourg (C) had a higher survival rate than those who embarked from Queenstown (Q) or Southampton (S).