In [None]:
import numpy as np
import pandas as pd
from os.path import realpath as realpath

# NumPy >= 1.24 compatibility shim: bind the alias names np.float, np.int,
# np.object and np.bool on the numpy module to NumPy scalar types, so that
# code which still refers to those names keeps working.
np.float, np.int = np.float64, np.int_
np.object, np.bool = np.object_, np.bool_

# Enable pandas' copy-on-write mode for the whole session.
pd.options.mode.copy_on_write = True


# EDA Level 0 — Pure Understanding of Original Data

In [None]:
def column_summary(df: pd.DataFrame, top_n: int = 10) -> pd.DataFrame:
    """Build a one-row-per-column overview of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        The frame to describe.
    top_n : int, default 10
        Maximum number of distinct values reported per column in
        ``distinct_values_counts``. Columns with at most ``top_n`` distinct
        values report all of them; others report only the ``top_n`` most
        frequent.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the fields:

        - ``col_name``: the column label.
        - ``col_dtype``: the column's dtype.
        - ``num_of_nulls``: count of null entries.
        - ``num_of_nun_nulls``: count of non-null entries (the key keeps its
          historical spelling so existing callers are not broken).
        - ``num_of_distinct_values``: ``nunique()`` of the column (nulls are
          not counted).
        - ``distinct_values_counts``: ``{value: count}`` dict ordered by
          descending count, limited to ``top_n`` entries.

        If ``df`` has no columns, the result is empty but still carries the
        fields listed above.
    """
    summary_columns = [
        "col_name",
        "col_dtype",
        "num_of_nulls",
        "num_of_nun_nulls",
        "num_of_distinct_values",
        "distinct_values_counts",
    ]
    summary_rows = []

    # items() yields every column as a Series, even when column labels are
    # duplicated (df[label] would return a DataFrame in that case).
    for col_name, column in df.items():
        # value_counts() is already ordered by descending count and excludes
        # nulls, so head(top_n) returns every distinct value when there are
        # at most top_n of them and the most frequent top_n otherwise.
        distinct_values_counts = column.value_counts().head(top_n).to_dict()

        summary_rows.append({
            "col_name": col_name,
            "col_dtype": column.dtype,
            "num_of_nulls": column.isnull().sum(),
            # notnull is a method: it must be called before summing.
            "num_of_nun_nulls": column.notnull().sum(),
            "num_of_distinct_values": column.nunique(),
            "distinct_values_counts": distinct_values_counts,
        })

    # Passing the column list keeps the schema stable for a frame without columns.
    return pd.DataFrame(summary_rows, columns=summary_columns)