In [6]:
""" Data Ingestion and Initialization
Run this cell to import the necessary libraries and set your configuration variables."""

import pandas as pd
import numpy as np
import os
import urllib.request

# --- CONFIGURATION & FETCH ---
# Public URL for the Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
FILE_PATH = 'titanic_sample.csv'

# Fetch the file locally so our load_data function can process it.
# The download goes to a temporary name and is renamed only after it finishes:
# an interrupted transfer would otherwise leave a truncated FILE_PATH behind,
# and the existence check below would treat it as a valid dataset on every
# later run.
if not os.path.exists(FILE_PATH):
    print("Fetching dataset...")
    _partial_path = FILE_PATH + '.part'
    try:
        urllib.request.urlretrieve(url, _partial_path)
        os.replace(_partial_path, FILE_PATH)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(_partial_path):
            os.remove(_partial_path)
    print("Dataset downloaded successfully!")

# Choose your missing value strategy: 'drop', 'fill_mean_mode', or 'none'
MISSING_STRATEGY = 'fill_mean_mode'


Fetching dataset...
Dataset downloaded successfully!


In [7]:
"""1. Data Loading
This function detects the file extension and loads the data accordingly. It uses Python's built-in sniffer for text files to auto-detect delimiters."""

def load_data(file_path):
    """Read a tabular file into a DataFrame, choosing the reader by extension.

    Supported extensions are '.csv', '.json' and '.txt' (case-insensitive).

    Args:
        file_path: Path of the file to read.

    Returns:
        The loaded DataFrame, or None when the extension is unsupported or the
        reader fails (the error is printed rather than raised).

    Raises:
        FileNotFoundError: If file_path does not exist.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} was not found.")

    suffix = os.path.splitext(file_path)[-1].lower()

    readers = {
        '.csv': lambda path: pd.read_csv(path),
        '.json': lambda path: pd.read_json(path),
        # sep=None with the python engine lets pandas sniff the delimiter
        '.txt': lambda path: pd.read_csv(path, sep=None, engine='python'),
    }

    try:
        reader = readers.get(suffix)
        if reader is None:
            raise ValueError(f"Unsupported file extension: {suffix}")

        frame = reader(file_path)
        print(f"Successfully loaded {file_path} with shape {frame.shape}")
        return frame
    except Exception as err:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error loading data: {err}")
        return None

# Load the configured dataset; df is None when load_data reports a read/parse error.
df = load_data(FILE_PATH)

Successfully loaded titanic_sample.csv with shape (891, 12)


In [8]:
"""2. Normalization & Cleaning
Here we standardize the column names to `snake_case` and apply our chosen missing value strategy."""

def normalize_columns(df):
    """Rewrite the column labels of ``df`` in a lowercase, underscore-separated form.

    Each label is converted to a string, stripped of surrounding whitespace,
    lowercased, has its spaces replaced by underscores, and finally has every
    character that is neither a word character nor whitespace removed.

    Args:
        df: DataFrame whose columns are renamed. It is modified in place.

    Returns:
        The same DataFrame object, to allow ``df = normalize_columns(df)``.
    """
    df.columns = (df.columns
                  # Non-string labels (e.g. integer headers) would make the
                  # .str accessor raise, or turn into NaN in a mixed index.
                  .astype(str)
                  .str.strip()
                  .str.lower()
                  .str.replace(' ', '_', regex=False)
                  .str.replace(r'[^\w\s]', '', regex=True)) # Removes special characters
    print("Columns normalized.")
    return df

def handle_missing_values(df, strategy='none'):
    """Report the number of missing values in ``df`` and apply ``strategy``.

    Args:
        df: DataFrame to inspect.
        strategy: One of
            'drop'           - remove every row containing a missing value;
            'fill_mean_mode' - fill numeric columns with their mean and all
                               other columns with their most frequent value
                               ("Unknown" when the column has no values);
            'none'           - leave the data as is.
            Any other value is reported and treated like 'none'.

    Returns:
        A new DataFrame for 'drop' and 'fill_mean_mode' (the input frame is
        not modified); the input frame itself otherwise.
    """
    missing_count = df.isna().sum().sum()
    print(f"Total missing values found: {missing_count}")

    if strategy == 'drop':
        df = df.dropna()
        print("Dropped rows with missing values.")
    elif strategy == 'fill_mean_mode':
        # Work on a copy so this branch, like 'drop', leaves the caller's
        # frame untouched.
        df = df.copy()
        for col in df.columns:
            column = df[col]
            if not column.isna().any():
                # Nothing to fill; skipping also keeps the original dtype.
                continue
            # Any numeric dtype counts (float32, int32, nullable Int64, ...),
            # not just int64/float64. Booleans are treated as categorical.
            is_numeric = (pd.api.types.is_numeric_dtype(column)
                          and not pd.api.types.is_bool_dtype(column))
            if is_numeric:
                fill_value = column.mean()
            else:
                modes = column.mode()
                fill_value = modes.iloc[0] if not modes.empty else "Unknown"
            df[col] = column.fillna(fill_value)
        print("Filled missing values (mean for numerical, mode for categorical).")
    elif strategy == 'none':
        print("Missing values left untouched.")
    else:
        print(f"Unrecognized strategy '{strategy}'. Missing values left untouched.")

    return df

# Clean only when loading succeeded: normalize the headers first, then resolve
# missing values with the strategy chosen in the configuration cell.
if df is not None:
    df = handle_missing_values(
        normalize_columns(df),
        strategy=MISSING_STRATEGY,
    )

Columns normalized.
Total missing values found: 866
Filled missing values (mean for numerical, mode for categorical).


In [9]:
"""3. Exploratory Data Analysis (EDA)
Running basic statistical summaries and distributions to understand the dataset."""

def run_simple_eda(df):
    """Display a quick overview of ``df``.

    Shows, in order: the first rows, ``describe(include='all')``, and the five
    most frequent values of every object/category column. Prints a notice
    instead when there are no such columns.

    Args:
        df: DataFrame to summarize.
    """
    print("\n--- Data Preview (head) ---")
    preview = df.head()
    display(preview)

    print("\n--- Statistical Summary (describe) ---")
    summary = df.describe(include='all')
    display(summary)

    print("\n--- Categorical Distributions (value_counts) ---")
    label_columns = list(df.select_dtypes(include=['object', 'category']).columns)
    if not label_columns:
        print("No categorical columns found.")
        return

    for name in label_columns:
        print(f"\nValue counts for '{name}':")
        # Only the five most frequent categories, to keep the output short
        top_counts = df[name].value_counts().head(5)
        display(top_counts)

# Skip the EDA when loading failed earlier and left df as None.
if df is not None:
    run_simple_eda(df)


--- Data Preview (head) ---


Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S



--- Statistical Summary (describe) ---


Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
count,891.0,891.0,891.0,891,891,891.0,891.0,891.0,891.0,891.0,891,891
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,691,646
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,13.002015,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,22.0,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,29.699118,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,35.0,1.0,0.0,,31.0,,



--- Categorical Distributions (value_counts) ---

Value counts for 'name':


name
Braund, Mr. Owen Harris             1
Boulos, Mr. Hanna                   1
Frolicher-Stehli, Mr. Maxmillian    1
Gilinski, Mr. Eliezer               1
Murdlin, Mr. Joseph                 1
Name: count, dtype: int64


Value counts for 'sex':


sex
male      577
female    314
Name: count, dtype: int64


Value counts for 'ticket':


ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
Name: count, dtype: int64


Value counts for 'cabin':


cabin
B96 B98        691
G6               4
C23 C25 C27      4
C22 C26          3
F33              3
Name: count, dtype: int64


Value counts for 'embarked':


embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [10]:
"""4. Export
Saving the finalized, cleaned dataset to a new CSV."""

# Export only when there is a frame to write (df is None after a failed load).
if df is not None:
    output_filename = 'cleaned_data.csv'
    # index=False keeps the row index out of the CSV.
    df.to_csv(output_filename, index=False)
    print(f"\nPipeline complete. Cleaned data saved to '{output_filename}'.")


Pipeline complete. Cleaned data saved to 'cleaned_data.csv'.
