In [None]:
# Data Type Optimization in autoEDA

This notebook demonstrates how to optimize data types in a pandas DataFrame to reduce memory usage and improve performance.

In [12]:

import pandas as pd
import numpy as np

In [13]:
# Sample DataFrame for demonstration.
# A fixed seed feeds a local Generator so the random columns are identical
# on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame({
    'id': range(1, 1001),
    'category': ['A', 'B', 'C', 'D'] * 250,
    'value': rng.standard_normal(1000),
    # Integer bounds (not floats): draws from [1_000_000, 10_000_000).
    'big_int': rng.integers(1_000_000, 10_000_000, size=1000),
    # Dates are stored as strings on purpose so there is something to convert later.
    'date_col': pd.date_range('2023-01-01', periods=1000, freq='D').astype(str)
})

In [14]:
# Total footprint in bytes; deep=True asks pandas to inspect object columns too.
bytes_before = df.memory_usage(deep=True).sum()
print("Memory usage before optimization:", bytes_before, "bytes")

Memory usage before optimization: 129132 bytes


In [16]:
def optimize_dtypes(df, max_categories=50):
    """Convert the columns of ``df`` to leaner dtypes to reduce memory usage.

    Steps, in order:

    1. Non-numeric columns whose name contains ``'date'`` or ``'time'`` are
       parsed with ``pd.to_datetime``; a column that fails to parse is left
       as it was.
    2. Object / pandas-string columns with fewer than ``max_categories``
       distinct values are converted to ``category``.
    3. Float columns are downcast with ``pd.to_numeric(downcast='float')``
       and int columns with ``pd.to_numeric(downcast='integer')``.

    Columns are reassigned on ``df`` itself, so pass ``df.copy()`` if the
    original frame must stay untouched. Downcasting floats can reduce
    precision (the smallest target is float32).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; modified in place.
    max_categories : int, default 50
        A text column is converted to ``category`` only when its number of
        distinct values is strictly below this threshold.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with converted columns.
    """
    # 'timestamp' needs no separate entry: it already contains 'time'.
    date_hints = ('date', 'time')

    # Date-like columns first, so that parsed columns are no longer text
    # by the time the category pass runs.
    for col in df.columns:
        # str() keeps non-string column labels (e.g. integers) from raising.
        label = str(col).lower()
        if not any(hint in label for hint in date_hints):
            continue
        # Numeric columns such as 'runtime' or 'lifetime_value' match the name
        # heuristic, but pd.to_datetime would reinterpret their numbers as
        # offsets from the epoch and silently corrupt them.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        try:
            df[col] = pd.to_datetime(df[col])
        except (ValueError, TypeError, OverflowError):
            # Best effort: the name looked date-like but the values are not
            # parseable, so the column keeps its current dtype.
            pass

    # Low-cardinality text columns -> category.
    for col in df.columns:
        column = df[col]
        is_text = (
            pd.api.types.is_object_dtype(column)
            or isinstance(column.dtype, pd.StringDtype)
        )
        if is_text and column.nunique() < max_categories:
            df[col] = column.astype('category')

    # Downcast numeric columns to the smallest dtype that holds their values.
    for col in df.select_dtypes(include=['float']):
        df[col] = pd.to_numeric(df[col], downcast='float')
    for col in df.select_dtypes(include=['int']):
        df[col] = pd.to_numeric(df[col], downcast='integer')

    return df

In [17]:
# optimize_dtypes reassigns columns on the frame it receives, so hand it a copy
# and keep `df` with its original dtypes for the before/after comparison.
df_optimized = df.copy().pipe(optimize_dtypes)

In [18]:
# Same measurement as before optimization, taken on the converted frame.
bytes_after = df_optimized.memory_usage(deep=True).sum()
print("Memory usage after optimization:", bytes_after, "bytes")

Memory usage after optimization: 19504 bytes


In [None]:
## Observations

- Memory usage dropped from 129,132 bytes before optimization to 19,504 bytes after, a reduction of roughly 85%.
- Converting columns to more efficient types can save significant memory.
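- This approach can be applied to other datasets for better performance, but review the result: downcasting floats reduces their precision, and date columns are detected only by their column name.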