In [1]:
import math
import pandas as pd

In [2]:
# Entropy calculation functions
def entropy_categorical(values):
    """Return the Shannon entropy (base 2) of a sequence of discrete labels.

    Each distinct element of `values` is treated as one category; its
    probability is its relative frequency. `values` must support `len()`
    and its elements must be hashable. An empty sequence yields 0.
    """
    n_items = len(values)

    # Tally how often each distinct label occurs (first-seen order is kept).
    tally = {}
    for label in values:
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1

    # H = -sum(p * log2(p)), accumulated one category at a time.
    bits = 0
    for occurrences in tally.values():
        share = occurrences / n_items
        bits -= share * math.log2(share)
    return bits



def entropy_continuous(values, bins=10):
    """Estimate the Shannon entropy (base 2) of numeric data by binning.

    The interval [min(values), max(values)] is divided into `bins`
    equal-width bins, each value is assigned to a bin, and the entropy of
    the resulting histogram is returned.

    Args:
        values: Sized, re-iterable collection of numbers (it is scanned
            several times and `len()` is taken).
        bins: Number of equal-width bins; must be a positive integer.

    Returns:
        The histogram entropy. Returns 0.0 when `values` is empty or when
        every value is identical.

    Raises:
        ValueError: If the data has more than one distinct value and
            `bins` is less than 1.
    """
    # No data: report zero instead of letting min() raise on an empty
    # sequence, in line with entropy_categorical's result for empty input.
    if len(values) == 0:
        return 0.0

    min_val, max_val = min(values), max(values)

    # If all values are the same → entropy = 0
    if min_val == max_val:
        return 0.0

    # Only needed once we actually bin: a non-positive bin count would
    # otherwise surface as a ZeroDivisionError or IndexError further down.
    if bins < 1:
        raise ValueError(f"bins must be a positive integer, got {bins!r}")

    bin_size = (max_val - min_val) / bins
    freq = [0] * bins

    for v in values:
        # The maximum value lands exactly on the upper edge (index == bins),
        # so clamp it into the last bin.
        idx = min(int((v - min_val) / bin_size), bins - 1)
        freq[idx] += 1

    total = len(values)
    entropy = 0
    for count in freq:
        # Empty bins contribute nothing (and log2(0) is undefined).
        if count > 0:
            p = count / total
            entropy -= p * math.log2(p)
    return entropy



In [3]:
# Helper: Decide categorical vs continuous
def compute_entropy(values, dtype, bins=10, categorical_threshold=10):
    """
    Compute entropy based on column type.

    - If the dtype is object, a string dtype, or a categorical dtype →
      categorical.
    - If numeric with very few unique values → treat as categorical.
    - Otherwise → continuous (equal-width binning).

    Args:
        values: Sized collection of the column's non-missing values.
        dtype: The column's dtype (e.g. `df[col].dtype`) or the string
            'object'.
        bins: Number of bins passed to entropy_continuous.
        categorical_threshold: A non-text column with at most this many
            distinct values is treated as categorical.

    Returns:
        The entropy value, or None when `values` is empty.
    """
    if len(values) == 0:
        return None

    # Text is not always stored as 'object': pandas also has dedicated
    # string dtypes and a categorical dtype. Such values cannot be binned
    # arithmetically, so they must take the categorical path regardless of
    # how many distinct values they have.
    non_numeric = (
        dtype == 'object'
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if non_numeric:
        return entropy_categorical(values)

    # Low-cardinality numeric columns (flags, small counts) behave like
    # categories; binning them would only blur the distribution.
    unique_vals = len(set(values))
    if unique_vals <= categorical_threshold:
        return entropy_categorical(values)
    return entropy_continuous(values, bins=bins)

In [4]:
# Report the per-column entropy of every dataset that can be loaded
datasets = ["advertising.csv", "Housing.csv", "faa_ai_prelim.csv"]

for file in datasets:
    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining datasets still run.
        print(f"\n⚠️ Skipping {file} (file not found)")
    else:
        print(f"\nDataset: {file}")
        for col in df.columns:
            # Missing entries are dropped before measuring the column.
            values = df[col].dropna().tolist()
            if values:
                e = compute_entropy(values, df[col].dtype, bins=10)
                if e is not None:
                    print(f"Entropy of {col}: {e:.4f}")
                    continue
            # Reached when the column has no data or no entropy was returned.
            print(f"Entropy of {col}: skipped (empty column)")


Dataset: advertising.csv
Entropy of TV: 3.2985
Entropy of Radio: 3.2896
Entropy of Newspaper: 2.7522
Entropy of Sales: 3.0530

Dataset: Housing.csv
Entropy of price: 2.5584
Entropy of area: 2.4166
Entropy of bedrooms: 1.5784
Entropy of bathrooms: 0.9448
Entropy of stories: 1.6013
Entropy of mainroad: 0.5876
Entropy of guestroom: 0.6756
Entropy of basement: 0.9345
Entropy of hotwaterheating: 0.2686
Entropy of airconditioning: 0.8995
Entropy of parking: 1.5476
Entropy of prefarea: 0.7864
Entropy of furnishingstatus: 1.5573

Dataset: faa_ai_prelim.csv
Entropy of UPDATED: 0.2243
Entropy of ENTRY_DATE: 2.7897
Entropy of EVENT_LCL_DATE: 3.8502
Entropy of EVENT_LCL_TIME: 6.1973
Entropy of LOC_CITY_NAME: 6.2787
Entropy of LOC_STATE_NAME: 4.3330
Entropy of LOC_CNTRY_NAME: 0.0000
Entropy of RMK_TEXT: 6.3268
Entropy of EVENT_TYPE_DESC: 0.9101
Entropy of FSDO_DESC: 5.2054
Entropy of REGIST_NBR: 6.3750
Entropy of FLT_NBR: 2.5850
Entropy of ACFT_OPRTR: 2.5216
Entropy of ACFT_MAKE_NAME: 3.8938
Entro