In [None]:
import pandas as pd
import numpy as np

def calculate_entropy_robust(series, max_bins=10):
    """
    Calculate the Shannon entropy (in bits) of a pandas Series.

    Missing values are dropped first. Numeric series are discretised into
    equal-width bins with ``pd.cut`` before counting; every other dtype is
    treated as categorical and its distinct values are counted directly.

    Args:
        series: The pandas Series to analyse.
        max_bins: Upper limit on the number of equal-width bins used for
            numeric data. The actual number of bins is
            ``min(max_bins, number of distinct values)``. Defaults to 10.

    Returns:
        The entropy as a non-negative float. Returns 0.0 when the series is
        empty (after dropping missing values) or holds a single distinct value.
    """
    # Missing values are excluded from the distribution entirely.
    series = series.dropna()

    # An empty or constant series carries no uncertainty.
    distinct_count = series.nunique()
    if distinct_count <= 1:
        return 0.0

    # Numeric data is binned so near-unique values do not each form a class.
    if pd.api.types.is_numeric_dtype(series):
        n_bins = min(max_bins, distinct_count)
        if n_bins > 1:
            try:
                series = pd.cut(
                    series,
                    bins=n_bins,
                    labels=False,
                    include_lowest=True,
                    duplicates='drop',
                )
            except ValueError:
                # Deliberate best-effort: if pd.cut rejects the data, fall
                # back to counting the raw values as categories.
                pass

    # Count on the (possibly binned) series and normalise by the counted
    # total so the probabilities always sum to one.
    counts = series.value_counts()
    counts = counts[counts > 0]
    probabilities = counts / counts.sum()

    # Zero-count classes were filtered out above, so log2 is always defined
    # and no epsilon is needed (an epsilon would bias every result).
    entropy = float(-(probabilities * np.log2(probabilities)).sum())

    # Guard against returning -0.0 when every value lands in one class.
    return entropy if entropy > 0 else 0.0

# File paths
advertising_file = 'advertising.csv'
housing_file = 'Housing.csv'
faa_file = 'dataset.csv'


def _report_column_entropies(title, csv_path, *, catch_all=False, **read_csv_kwargs):
    """
    Print a header, then the entropy of every column of one CSV file.

    Args:
        title: Header line printed before the per-column results.
        csv_path: Path of the CSV file to load with ``pd.read_csv``.
        catch_all: When True, any exception other than FileNotFoundError is
            reported with a printed message instead of propagating.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A ``(dataframe, entropies)`` tuple. ``entropies`` maps column name to
        entropy. Either element is None if the step producing it failed.
    """
    frame = None
    entropies = None
    print(title)
    try:
        frame = pd.read_csv(csv_path, **read_csv_kwargs)
        # Compute everything first so no partial report is printed if the
        # calculation fails part-way through.
        entropies = {
            col: calculate_entropy_robust(frame[col]) for col in frame.columns
        }
        for col, entropy in entropies.items():
            print(f"Entropy of '{col}': {entropy:.4f}")
    except FileNotFoundError:
        print(f"Error: {csv_path} not found.")
    except Exception as e:
        if not catch_all:
            raise
        print(f"An error occurred: {e}")
    return frame, entropies


# --- Process advertising.csv ---
advertising_df, advertising_entropy = _report_column_entropies(
    "--- Advertising Dataset (advertising.csv) ---",
    advertising_file,
)

print("\n" + "="*50 + "\n")

# --- Process Housing.csv ---
housing_df, housing_entropy = _report_column_entropies(
    "--- Housing Dataset (Housing.csv) ---",
    housing_file,
)

print("\n" + "="*50 + "\n")

# --- Process the FAA preliminary dataset ---
# NOTE(review): the header below names faa_ai_prelim.csv, but the file that is
# actually read is `faa_file` ('dataset.csv') - confirm which name is intended.
# quotechar='"' is the pandas default; it is passed explicitly only to make the
# quoting of embedded quotes in this file visible to the reader.
faa_df, faa_entropy = _report_column_entropies(
    "--- FAA Preliminary Dataset (faa_ai_prelim.csv) ---",
    faa_file,
    catch_all=True,
    quotechar='"',
)

--- Advertising Dataset (advertising.csv) ---
Entropy of 'TV': 3.2985
Entropy of 'Radio': 3.2896
Entropy of 'Newspaper': 2.7522
Entropy of 'Sales': 3.0530


--- Housing Dataset (Housing.csv) ---
Entropy of 'price': 2.5489
Entropy of 'area': 2.4152
Entropy of 'bedrooms': 1.5784
Entropy of 'bathrooms': 0.9448
Entropy of 'stories': 1.6013
Entropy of 'mainroad': 0.5876
Entropy of 'guestroom': 0.6756
Entropy of 'basement': 0.9345
Entropy of 'hotwaterheating': 0.2686
Entropy of 'airconditioning': 0.8995
Entropy of 'parking': 1.5476
Entropy of 'prefarea': 0.7864
Entropy of 'furnishingstatus': 1.5573


--- FAA Preliminary Dataset (faa_ai_prelim.csv) ---
Entropy of 'UPDATED': 0.2243
Entropy of 'ENTRY_DATE': 2.7897
Entropy of 'EVENT_LCL_DATE': 3.8502
Entropy of 'EVENT_LCL_TIME': 6.1973
Entropy of 'LOC_CITY_NAME': 6.2787
Entropy of 'LOC_STATE_NAME': 4.3330
Entropy of 'LOC_CNTRY_NAME': 0.0000
Entropy of 'RMK_TEXT': 6.3268
Entropy of 'EVENT_TYPE_DESC': 0.9101
Entropy of 'FSDO_DESC': 5.2054
Entropy 