In [16]:
import pandas as pd
import numpy as np
import os
import json

# I highly recommend reading this blog post for beginners https://medium.com/@taribosamuel/titanic-dataset-survival-patterns-4d8d22ab6a5b

In [17]:
# Reading csv file
# The path is resolved against the current working directory, so the notebook
# has to be launched from the folder that contains "sample_csv".
base_dir = os.getcwd()
file_path = os.path.join(base_dir, "sample_csv", "titanic.csv")

try:
    df = pd.read_csv(file_path)
    print(f"Loaded '{file_path}' with shape {df.shape}")
except Exception as e:
    # Chain explicitly so the traceback reports the underlying read/parse
    # failure as the direct cause of this ValueError.
    raise ValueError(f"could not load file: {e}") from e

# Last expression of the cell: displays (rows, columns) of the loaded frame.
df.shape

Loaded 'c:\Users\Cheethan\OneDrive\Desktop\Github_contribution\AutoEDA-Automated-Data-Preprocessing-Toolkit\notebooks\sample_csv\titanic.csv' with shape (891, 12)


(891, 12)

In [None]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

# Fill missing values in numeric columns with the column mean
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].mean())

# Fill missing values in categorical columns with the most frequent value (mode)
for col in df.select_dtypes(include='object').columns:
    if df[col].isnull().any():
        modes = df[col].mode()
        # mode() comes back empty when every value in the column is missing;
        # there is nothing sensible to fill with, so leave the column untouched
        # instead of failing on the [0] lookup.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

# Strip leading/trailing whitespace from string values.
# Only real `str` values are touched: the `.str` accessor would replace any
# non-string entry of a mixed object column with NaN, re-introducing missing
# values right after they were filled. dtype=object keeps the column dtype stable.
for col in df.select_dtypes(include='object').columns:
    df[col] = pd.Series(
        [value.strip() if isinstance(value, str) else value for value in df[col]],
        index=df.index,
        dtype=object,
    )

print(f"Cleaned data shape: {df.shape}")


Cleaned data shape: (891, 12)


### Note on FutureWarning in Pandas

Pandas raises a `FutureWarning` when using `inplace=True` on expressions like `df[col]` because it creates an intermediate object (a copy), not a direct reference to the original DataFrame.

This behavior **will not be supported in pandas 3.0+**, and your changes may not apply as expected.

#####  Best Practice:
    Instead of this:

    df[col].fillna(df[col].mode()[0], inplace=True)  

    use: **df[col] = df[col].fillna(df[col].mode()[0])**

    Note: the code was intentionally left unchanged so that people notice the warning and break the habit of using `inplace=True`.

In [None]:
# Candidate columns: everything pandas reports as a numeric dtype
numeric_cols = df.select_dtypes(include='number').columns

numerical_cols = []
encoded_categorical_cols = []

# Heuristic: a numeric column holding at most 10 distinct values, all of them
# whole numbers, is treated as an encoded category; anything else stays numerical.
for col in numeric_cols:
    unique_vals = df[col].dropna().unique()
    looks_categorical = len(unique_vals) <= 10 and all(
        float(val).is_integer() for val in unique_vals
    )
    bucket = encoded_categorical_cols if looks_categorical else numerical_cols
    bucket.append(col)

In [None]:

# Columns the heuristic kept as genuinely numerical
df_numerical = df[numerical_cols]

# Categorical view: the object (text) columns followed by the numeric columns
# that the heuristic flagged as encoded categories
df_categorical = pd.concat(
    [df.select_dtypes(include='object'), df[encoded_categorical_cols]],
    axis=1,
)

print("Numerical columns:", df_numerical.columns.tolist())
print("Categorical columns (including encoded):", df_categorical.columns.tolist())


Numerical columns: ['PassengerId', 'Age', 'Fare']
Categorical columns (including encoded): ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Survived', 'Pclass', 'SibSp', 'Parch']


In [None]:

# only to be used on numerical data
def numerical_stats(num_df):
    """Compute summary statistics for each column of a numeric DataFrame.

    Parameters
    ----------
    num_df : pandas.DataFrame
        Frame whose columns support mean/median/min/max/std.

    Returns
    -------
    dict
        Maps each column label to a dict with the keys "mean", "median",
        "min", "max", "std" and "missing" (the number of null entries).
    """
    def summarise(values):
        # One column's statistics, in the key order used by the exported summary.
        return {
            "mean": values.mean(),
            "median": values.median(),
            "min": values.min(),
            "max": values.max(),
            "std": values.std(),
            "missing": values.isnull().sum(),
        }

    return {label: summarise(num_df[label]) for label in num_df.columns}

# returns most frequent values
def most_frequent_values(series):
    """Return every value that shares the highest occurrence count in `series`.

    Ties are all returned; a series with no countable values yields an empty list.
    """
    counts = series.value_counts()
    highest = counts.max()
    winners = counts.loc[counts == highest]
    return winners.index.tolist()


def categorical_stats(cat_df):
    """Summarise each column of a categorical DataFrame.

    Returns a dict mapping every column label to a dict holding
    "most_frequent_value" (the list produced by `most_frequent_values`) and
    "unique_counts" (the column's `nunique()` result).
    """
    return {
        label: {
            "most_frequent_value": most_frequent_values(cat_df[label]),
            "unique_counts": cat_df[label].nunique(),
        }
        for label in cat_df.columns
    }

# calls numerical_stats and categorical_stats func
def full_stats(numerical_df, categorical_df):
    """Combine the numerical and categorical summaries into one dictionary.

    The result has two keys: "Numerical Columns" (output of `numerical_stats`
    on `numerical_df`) and "Categorical Columns" (output of
    `categorical_stats` on `categorical_df`).
    """
    summary = {}
    summary["Numerical Columns"] = numerical_stats(numerical_df)
    summary["Categorical Columns"] = categorical_stats(categorical_df)
    return summary

#### Note: Functions like .mean(), .median(), and .mode() automatically ignore NaN values,
#### so missing data does not need to be removed for these calculations to work correctly.
#### However, .isnull() specifically identifies and counts NaN values.


In [24]:
def convert_to_builtin_types(obj):
    """Recursively replace NumPy scalars and arrays with built-in Python types.

    Parameters
    ----------
    obj : any
        A NumPy scalar, NumPy array, list, tuple, dict, or any other object.

    Returns
    -------
    any
        - NumPy integer / float / bool scalars become ``int`` / ``float`` / ``bool``.
        - NumPy arrays become (possibly nested) lists of converted elements.
        - Lists, tuples and dicts are rebuilt with every element, key and
          value converted.
        - Anything else is returned unchanged.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() unwraps numeric elements at any depth; recursing over the
        # result also converts NumPy values stored inside object-dtype arrays.
        return convert_to_builtin_types(obj.tolist())
    if isinstance(obj, list):
        # Elements must be converted too: a plain list can still hold NumPy
        # scalars or nested dicts that json cannot serialise.
        return [convert_to_builtin_types(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_builtin_types(item) for item in obj)
    if isinstance(obj, dict):
        # Keys are converted as well: json.dump rejects NumPy integer keys.
        return {
            convert_to_builtin_types(key): convert_to_builtin_types(value)
            for key, value in obj.items()
        }
    return obj

#### Why We Use `convert_to_builtin_types()`

When exporting a Python dictionary (such as the summary statistics) to a `.json` file using `json.dump()`, you may encounter the following error:

    TypeError: Object of type int64 is not JSON serializable

This happens because pandas and NumPy use specialized data types such as:
-  `np.int64`, `np.float64`, `np.bool_`
-  `np.ndarray`

These types are **not supported by Python's built-in `json` module**.

 To handle this, we use the `convert_to_builtin_types()` function, which:

- Converts NumPy integers to standard Python `int`
- Converts NumPy floats to standard Python `float`
- Converts NumPy booleans to Python `bool`
- Converts NumPy arrays to Python lists
- Recursively applies these conversions to values inside dictionaries

This ensures that the final dictionary contains **only native Python types**, making it safe to export to JSON format.

In [25]:
# Build the full summary, then make every value JSON-serialisable
raw_stats = full_stats(df_numerical, df_categorical)
cleaned_stats = convert_to_builtin_types(raw_stats)

# Name of the input file without directory or extension ("titanic.csv" -> "titanic")
source_name = os.path.basename(file_path)
filename = os.path.splitext(source_name)[0]

# Output folder is relative to the current working directory; create it if needed
summary_folder = os.path.join("output-files", "statistics_summary")
os.makedirs(summary_folder, exist_ok=True)

json_export_path = os.path.join(summary_folder, f"{filename}_stats_summary.json")

# Write the JSON file
with open(json_export_path, "w") as out_file:
    json.dump(cleaned_stats, out_file, indent=4)

print(f"Exported summary stats to: {json_export_path}")

Exported summary stats to: output-files\statistics_summary\titanic_stats_summary.json
