In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# pd.options.display.float_format = '{:.2f}'.format  # optional: show floats with 2 decimals; undo with pd.reset_option('display.float_format')

# Data ingestion: load the Titanic dataset through seaborn's dataset loader
df = sns.load_dataset('titanic')

# Inspection: header line followed by the first five rows
print("---First 5 Row---", df.head(), sep="\n")

---First 5 Row---
   survived  pclass     sex   age  sibsp  parch  fare embarked  class    who  \
0         0       3    male 22.00      1      0  7.25        S  Third    man   
1         1       1  female 38.00      1      0 71.28        C  First  woman   
2         1       3  female 26.00      0      0  7.92        S  Third  woman   
3         1       1  female 35.00      1      0 53.10        S  First  woman   
4         0       3    male 35.00      0      0  8.05        S  Third    man   

   adult_male deck  embark_town alive  alone  
0        True  NaN  Southampton    no  False  
1       False    C    Cherbourg   yes  False  
2       False  NaN  Southampton   yes   True  
3       False    C  Southampton   yes  False  
4        True  NaN  Southampton    no   True  


In [50]:
# Checking for missing values: null count per column, showing only columns that have any
print("---Missing Values---")
missing_values = df.isnull().sum()
print(missing_values.loc[missing_values.gt(0)])

# Same counts expressed as a percentage of all rows, largest first
print("\n---Missing Values Percentage---")
missing_percentage = missing_values.div(len(df)).mul(100).sort_values(ascending=False)
print(missing_percentage.loc[missing_percentage.gt(0)])

# describe() with no arguments summarises the numeric columns
print("--- Statistical Summary (Numerical Columns) ---")
print(df.describe())

# include='object' restricts the summary to object-dtype columns
print("\n--- Statistical Summary (Categorical Columns) ---")
print(df.describe(include='object'))

---Missing Values---
age            177
embarked         2
embark_town      2
dtype: int64

---Missing Values Percentage---
age           19.87
embarked       0.22
embark_town    0.22
dtype: float64
--- Statistical Summary (Numerical Columns) ---
       survived  pclass    age  sibsp  parch   fare  family_size
count    891.00  891.00 714.00 891.00 891.00 891.00       891.00
mean       0.38    2.31  29.70   0.52   0.38  32.20         1.90
std        0.49    0.84  14.53   1.10   0.81  49.69         1.61
min        0.00    1.00   0.42   0.00   0.00   0.00         1.00
25%        0.00    2.00  20.12   0.00   0.00   7.91         1.00
50%        0.00    3.00  28.00   0.00   0.00  14.45         1.00
75%        1.00    3.00  38.00   1.00   0.00  31.00         2.00
max        1.00    3.00  80.00   8.00   6.00 512.33        11.00

--- Statistical Summary (Categorical Columns) ---
         sex embarked  who  embark_town family_type
count    891      889  891          889         891
unique     2 

In [None]:
# Drop redundant columns in one re-runnable step.
# Each entry maps a column to the confirmation message printed once it is removed.
# Only columns that currently exist are dropped, so this cell no longer raises
# KeyError on a fresh kernel (no visible cell creates 'has_deck_info') or when
# it is executed a second time after the columns are already gone.
columns_to_drop = {
    'deck': "✓ Deck Column Dropped",                    # too many missing values, redundant with pclass
    'alive': "✓ Alive Column Dropped",                  # same information as survived
    'has_deck_info': "✓ Has Deck Info Column Dropped",  # redundant with deck
    'embarked': "✓ embarked Column Dropped",            # redundant with embark_town
}

present_columns = [column for column in columns_to_drop if column in df.columns]
absent_columns = [column for column in columns_to_drop if column not in df.columns]

# Rebind instead of inplace=True so the cell stays idempotent
df = df.drop(columns=present_columns)

for column in present_columns:
    print(columns_to_drop[column])
for column in absent_columns:
    print(f"- {column} column not present, skipped")

✓ Deck Column Dropped
✓ Alive Column Dropped
✓ Has Deck Info Column Dropped


In [41]:
# Remaining columns, followed by dtype / non-null overview and numeric summary
print(list(df.columns))
df.info()
df.describe()

['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alone']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  embark_town  889 non-null    object  
 12  alone        891 non-null    bool    
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.5+ KB


Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.38,2.31,29.7,0.52,0.38,32.2
std,0.49,0.84,14.53,1.1,0.81,49.69
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.12,0.0,0.0,7.91
50%,0.0,3.0,28.0,0.0,0.0,14.45
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.33


In [None]:
# Family size = siblings/spouses (sibsp) + parents/children (parch) + the passenger
df['family_size'] = df['sibsp'].add(df['parch']).add(1)


In [43]:
# Mean of `survived` for each family size, ordered by family size
print("---Survival by Family Size---")
print(df['survived'].groupby(df['family_size']).mean().sort_index())

---Survival by Family Size---
family_size
1    0.30
2    0.55
3    0.58
4    0.72
5    0.20
6    0.14
7    0.33
8    0.00
11   0.00
Name: survived, dtype: float64


In [49]:
# Categorizing Family Size

def categorize_family(size: int, small_family_max: int = 4) -> str:
    """Map a family size to a coarse family-type label.

    Parameters
    ----------
    size : int
        Number of people in the family, including the passenger.
    small_family_max : int, default 4
        Largest size still labelled 'Small_Family'. The default keeps the
        original 2-4 range.

    Returns
    -------
    str
        'Alone' when ``size == 1``, 'Small_Family' when
        ``2 <= size <= small_family_max``, otherwise 'Large_Family'.
        Any value outside the first two cases (including sizes below 1)
        falls through to 'Large_Family'.
    """
    if size == 1:
        return 'Alone'
    if 2 <= size <= small_family_max:
        return 'Small_Family'
    return 'Large_Family'
# Label every passenger with a family category
df['family_type'] = df['family_size'].map(categorize_family)

# Passenger count and mean of `survived` per family category
family_type_survival = df.groupby('family_type')['survived'].agg(['count', 'mean'])
print(family_type_survival)

              count  mean
family_type              
Alone           537  0.30
Large_Family     62  0.16
Small_Family    292  0.58


In [75]:
# Current column names, then the first 20 values of the `who` column
print(list(df.columns))
df['who'].iloc[:20]

['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'class', 'who', 'adult_male', 'embark_town', 'alone', 'family_size', 'family_type']


0       man
1     woman
2     woman
3     woman
4       man
5       man
6       man
7     child
8     woman
9     child
10    child
11    woman
12      man
13      man
14    child
15    woman
16    child
17      man
18    woman
19    woman
Name: who, dtype: object

In [79]:
# Checking embark_town distribution
print("---Embark_Town Distribution---")
print(df['embark_town'].value_counts())

# Most frequent value, computed once and reused for both the report and the fill
mode_embark_town = df['embark_town'].mode()[0]
print(f"\nMost common part: {mode_embark_town}")

# Filling missing with mode.
# Assign the result back to the column: calling fillna(inplace=True) on
# df['embark_town'] acts on an intermediate object, which emits a
# FutureWarning in pandas 2.x and leaves df unchanged under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna(mode_embark_town)

---Embark_Town Distribution---
embark_town
Southampton    644
Cherbourg      168
Queenstown      77
Name: count, dtype: int64

Most common part: Southampton


In [None]:
# Impute missing ages with a median (less sensitive to outliers), computed
# per (pclass, sex) group so each gap is filled from comparable passengers.
# transform('median') returns one value per row, aligned with df's index.
group_median_age = df.groupby(['pclass', 'sex'])['age'].transform('median')
df['age'] = df['age'].fillna(group_median_age)


In [95]:
# Confirm that no nulls remain after cleaning
print("--- MISSING VALUES AFTER CLEANING ---")
missing_after = df.isnull().sum()
columns_still_missing = missing_after[missing_after > 0]
print(columns_still_missing)

# Grand total of nulls across all columns, computed once
total_missing = missing_after.sum()
if total_missing != 0:
    print(f"\n⚠️ Still have {total_missing} missing values")
    print("Columns with missing data:")
    print(columns_still_missing)
else:
    n_rows, n_cols = df.shape
    print("\n✅ No missing values remaining!")
    print(f"\nDataset shape: {df.shape}")
    print(f"Total cells: {n_rows * n_cols}")

--- MISSING VALUES AFTER CLEANING ---
Series([], dtype: int64)

✅ No missing values remaining!

Dataset shape: (891, 14)
Total cells: 12474


In [97]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
# NOTE(review): the columns listed in this notebook include no passenger identifier,
# so identical rows may be different passengers - confirm before dropping any.
duplicates = df.duplicated(keep='first').sum()
print(f"--- Duplicate Rows: {duplicates} ---")

--- Duplicate Rows: 118 ---


In [105]:
# Show every member of each duplicate group (keep=False marks all copies),
# sorted on all columns so identical rows sit next to each other
df.loc[df.duplicated(keep=False)].sort_values(by=list(df.columns))

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,class,who,adult_male,embark_town,alone,family_size,family_type
263,0,1,male,40.00,0,0,0.00,First,man,True,Southampton,True,1,Alone
633,0,1,male,40.00,0,0,0.00,First,man,True,Southampton,True,1,Alone
815,0,1,male,40.00,0,0,0.00,First,man,True,Southampton,True,1,Alone
30,0,1,male,40.00,0,0,27.72,First,man,True,Cherbourg,True,1,Alone
64,0,1,male,40.00,0,0,27.72,First,man,True,Cherbourg,True,1,Alone
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
692,1,3,male,25.00,0,0,56.50,Third,man,True,Southampton,True,1,Alone
65,1,3,male,25.00,1,1,15.25,Third,man,True,Cherbourg,False,3,Small_Family
709,1,3,male,25.00,1,1,15.25,Third,man,True,Cherbourg,False,3,Small_Family
74,1,3,male,32.00,0,0,56.50,Third,man,True,Southampton,True,1,Alone


In [109]:
# Write the cleaned frame to titanic.csv in the working directory, without the index column
df.to_csv(path_or_buf="titanic.csv", index=False)