In [None]:
import pandas as pd
import numpy as np

# Load the raw CSV once and keep it untouched; every cleaning step below
# operates on an independent copy.
RAW_CSV_PATH = '/content/Titanic-Dataset.csv'
df = pd.read_csv(RAW_CSV_PATH)

df_clean = df.copy(deep=True)


In [None]:
# Standardize column names: trim surrounding whitespace, lower-case,
# then replace spaces with underscores.
trimmed_lower = df_clean.columns.str.strip().str.lower()
df_clean.columns = trimmed_lower.str.replace(" ", "_")

# Echo the resulting names as the cell output.
df_clean.columns

Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

In [None]:
# Data type fixing
# Store the categorical variables with pandas' 'category' dtype.
cat_cols = ['sex', 'embarked']
for cat_col in cat_cols:
    df_clean[cat_col] = df_clean[cat_col].astype('category')

In [None]:
# Cast the target variable to an integer dtype.
survived_as_int = df_clean['survived'].astype(int)
df_clean['survived'] = survived_as_int

In [None]:
# Missing value treatment
# Age: fill gaps with the column median (robust to outliers).
age_median = df_clean['age'].median()
df_clean['age'] = df_clean['age'].fillna(age_median)

In [None]:
# Embarked: fill gaps with the mode (most frequent value).
# mode() returns a Series of modal values; the first entry is used.
embarked_modes = df_clean['embarked'].mode()
df_clean['embarked'] = df_clean['embarked'].fillna(embarked_modes[0])


In [None]:
# Cabin: drop the column (too many missing values, low business value).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell
# re-runnable: executing it again is a no-op rather than a KeyError for the
# already-removed column.
df_clean = df_clean.drop(columns=['cabin'], errors='ignore')

In [None]:
# Duplicate handling: remove fully identical rows, keeping the first occurrence.
df_clean = df_clean.drop_duplicates(keep='first')

In [None]:
# Outlier treatment: cap extreme values instead of removing rows.
# Fare capping (IQR method): values beyond 1.5 * IQR from the quartiles are
# pulled back to the corresponding fence.
Q1, Q3 = df_clean['fare'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

# clip() leaves in-range (and missing) values untouched.
df_clean['fare'] = df_clean['fare'].clip(lower=lower, upper=upper)

In [None]:
# Feature engineering
# Family size: sibsp + parch + 1 (the +1 presumably counts the passenger
# themself — confirm against the dataset description).
sibsp_plus_parch = df_clean['sibsp'] + df_clean['parch']
df_clean['family_size'] = sibsp_plus_parch + 1

In [None]:
# Is-alone flag: 1 where family_size equals 1, otherwise 0.
df_clean['is_alone'] = df_clean['family_size'].eq(1).astype(int)

In [None]:
# Encoding (for ML readiness)
# Label encoding with fixed integer codes.
#
# A bare `.map(codes)` turns every value that is not a key of `codes` into
# NaN, so executing it a second time (when the column already holds the
# integer codes) wipes the whole column. Here only values that are still raw
# labels are translated and everything else is passed through unchanged,
# which makes the cell safe to re-run.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

for col, codes in (('sex', sex_codes), ('embarked', embarked_codes)):
    # Work on plain objects so the result is not tied to the category dtype.
    current = df_clean[col].astype(object)
    encoded = current.map(lambda value, codes=codes: codes.get(value, value))

    # Fail loudly on anything that is neither a known label nor a known code,
    # instead of silently producing NaN.
    unexpected = set(encoded.dropna().unique()) - set(codes.values())
    if unexpected:
        raise ValueError(
            f"Unexpected values in '{col}': {sorted(unexpected, key=str)}"
        )

    # Store a plain numeric dtype (int64 when nothing is missing) rather than
    # a categorical whose categories happen to be numbers.
    df_clean[col] = pd.to_numeric(encoded)

In [None]:
# Final data quality check (sanity)
# Column dtypes and non-null counts (info() prints directly).
df_clean.info()

# Only the last expression of a cell is echoed, so the per-column null counts
# have to be printed explicitly or they are silently discarded.
print(df_clean.isnull().sum())

# Preview of the first rows as the cell output.
df_clean.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   passengerid  891 non-null    int64   
 1   survived     891 non-null    int64   
 2   pclass       891 non-null    int64   
 3   name         891 non-null    object  
 4   sex          0 non-null      float64 
 5   age          891 non-null    float64 
 6   sibsp        891 non-null    int64   
 7   parch        891 non-null    int64   
 8   ticket       891 non-null    object  
 9   fare         891 non-null    float64 
 10  embarked     891 non-null    category
 11  family_size  891 non-null    int64   
 12  is_alone     891 non-null    int64   
dtypes: category(1), float64(3), int64(7), object(2)
memory usage: 84.7+ KB


Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,embarked,family_size,is_alone
0,1,0,3,"Braund, Mr. Owen Harris",,22.0,1,0,A/5 21171,7.25,0,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",,38.0,1,0,PC 17599,65.6344,1,2,0
2,3,1,3,"Heikkinen, Miss. Laina",,26.0,0,0,STON/O2. 3101282,7.925,0,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",,35.0,1,0,113803,53.1,0,2,0
4,5,0,3,"Allen, Mr. William Henry",,35.0,0,0,373450,8.05,0,1,1


In [None]:
# Persist the cleaned dataset as CSV, leaving out the DataFrame index.
cleaned_csv_path = "titanic_cleaned.csv"
df_clean.to_csv(cleaned_csv_path, index=False)