In [None]:
import pandas as pd
import seaborn as sns
import os
from ydata_profiling import ProfileReport
from datetime import datetime

In [None]:
def load_data(file_path="./titanic_data.csv"):
    """Read the Titanic CSV file into a DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the
            CSV to load. Defaults to ``./titanic_data.csv`` so the function
            can be called without arguments.

    Returns:
        A ``pandas.DataFrame`` holding the parsed CSV contents.
    """
    # Honour the caller-supplied path instead of a hard-coded location.
    return pd.read_csv(file_path)

def clean_data(df):
    """Return a cleaned copy of the passenger data.

    Steps:
      1. Drop rows whose ``Embarked`` value is missing.
      2. Replace missing ``Age`` values with the mean age of the remaining
         rows.
      3. If an ``embark_town`` column is present, replace its missing
         values with ``"Unknown"``.

    Args:
        df: DataFrame containing at least the ``Embarked`` and ``Age``
            columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    # Work on an explicit copy so the caller's frame is left untouched.
    df = df.dropna(subset=["Embarked"]).copy()

    # The mean is computed after the Embarked filter, i.e. over kept rows only.
    df["Age"] = df["Age"].fillna(df["Age"].mean())

    # The original looked up a non-existent "embark_townAge" column (KeyError).
    # Not every Titanic CSV carries "embark_town", so only fill it when present.
    if "embark_town" in df.columns:
        df["embark_town"] = df["embark_town"].fillna("Unknown")

    return df


def engineer_feature(df):
    """Add derived ``age_group`` and ``sex_encoded`` columns to ``df``.

    ``age_group`` buckets ``Age`` into five ranges:
    [0, 12] Child, (12, 18] Teen, (18, 35] Young, (35, 60] Adult,
    (60, 120] Senior. Ages outside 0-120 or missing become NaN.

    ``sex_encoded`` maps ``Sex`` to 0 for ``"male"`` and 1 for ``"female"``;
    any other value becomes NaN.

    Args:
        df: DataFrame with ``Age`` and ``Sex`` columns. It is modified in
            place.

    Returns:
        The same DataFrame object, with the two new columns added.
    """
    # Six bin edges define five intervals, so exactly five labels are needed.
    # The original passed six labels, which makes pd.cut raise ValueError.
    age_bins = [0, 12, 18, 35, 60, 120]
    age_labels = ["Child", "Teen", "Young", "Adult", "Senior"]

    # include_lowest=True keeps an age of exactly 0 inside the first bucket.
    df["age_group"] = pd.cut(
        df["Age"],
        bins=age_bins,
        labels=age_labels,
        include_lowest=True,
    )

    # Encode sex
    df["sex_encoded"] = df["Sex"].map({"male": 0, "female": 1})
    return df

def validate_data(df):
    """Build a profiling report for ``df`` and save it as HTML.

    The report is created with ``ProfileReport`` in explorative mode and
    written to ``titanic_data_profile.html``. Nothing is returned.
    """
    report = ProfileReport(
        df,
        title="Titanic Dataset Profiling Report",
        explorative=True,
    )
    report.to_file("titanic_data_profile.html")
    
def export_cleaned_data(df):
    """Write ``df`` to a timestamped CSV file and report where it went.

    The file is named ``cleaned_titanic_data_<YYYYmmdd_HHMMSS>.csv`` using
    the current local time, is written without the index column, and its
    path is printed afterwards. Nothing is returned.
    """
    now = datetime.now()
    stamp = now.strftime("%Y%m%d_%H%M%S")
    destination = f"cleaned_titanic_data_{stamp}.csv"

    df.to_csv(destination, index=False)
    print(f"Cleaned data exported to {destination}")

In [None]:
# Pipeline: load -> clean -> add features -> profile -> export.
# load_data is declared with a file_path parameter, so pass the path
# explicitly rather than calling it with no arguments.
df = load_data("./titanic_data.csv")
df = clean_data(df)
df = engineer_feature(df)
validate_data(df)  # writes the HTML profiling report
export_cleaned_data(df)  # writes the timestamped CSV