Upload the project_archive.zip file to the session before running the cell below.

In [5]:
import pandas as pd
import zipfile
from IPython.display import display

# --- 1. Define relative paths ---
zip_path = 'project_archive.zip'
csv_path = 'fl-tabular/health.csv'  # Relative to the directory the archive is extracted into

# --- 2. Define the descriptions from the paper's table ---
# Note: 'gender' and 'location' from the paper table are not in the CSV, so they are omitted.
descriptions = {
    "sex": "The gender of the primary member (male/female).",
    "hereditary_diseases": "The type of hereditary disease, if any.",
    "smoker": "Whether the member is a smoker (1 for yes, 0 for no).",
    "city": "The geographical location (city) of the member.",
    "diabetes": "Whether the member has diabetes (1 for yes, 0 for no).",
    "regular_ex": "Whether the member exercises regularly (1 for yes, 0 for no).",
    "job_title": "The job title of the primary member.",
    "age": "The age of the primary member.",
    "weight": "The weight of the primary member in kilograms.",
    "bmi": "The Body Mass Index of the primary member.",
    "no_of_dependents": "The number of dependents covered by the plan.",
    "bloodpressure": "The blood pressure reading of the member.",
    "claim": "The monetary value of the insurance claim.",
}

# --- 3. Explicitly list categorical and numerical columns ---
# The lists fix both the row order of the table and the reported data type,
# independent of the dtypes pandas infers from the CSV.
categorical_cols = [
    "sex", "hereditary_diseases", "smoker", "city",
    "diabetes", "regular_ex", "job_title",
]
numerical_cols = [
    "age", "weight", "bmi", "no_of_dependents",
    "bloodpressure", "claim",
]
ordered_columns = categorical_cols + numerical_cols


def build_feature_table(df, categorical_cols, numerical_cols, descriptions,
                        continuous_threshold=20):
    """Summarise the listed columns of *df* as a feature-description table.

    Args:
        df: DataFrame to describe.
        categorical_cols: Column names reported as "Categorical".
        numerical_cols: Column names reported as "Numerical".
        descriptions: Mapping of column name to description text; columns
            without an entry get "N/A".
        continuous_threshold: A numerical column with more distinct values
            than this is reported as "Continuous" instead of its count.

    Returns:
        A DataFrame with the columns "Feature", "Data Type", "Unique Values"
        and "Description": categorical rows first, then numerical rows, each
        in the order given. Columns absent from *df* are skipped.
    """
    numerical = set(numerical_cols)
    rows = []
    for col in [*categorical_cols, *numerical_cols]:
        if col not in df.columns:
            continue

        n_unique = df[col].nunique()
        if col in numerical:
            data_type = "Numerical"
            if n_unique > continuous_threshold:
                unique_values_str = "Continuous"
            else:
                unique_values_str = str(n_unique)
        else:
            data_type = "Categorical"
            unique_values_str = str(n_unique)

        rows.append({
            "Feature": col,
            "Data Type": data_type,
            "Unique Values": unique_values_str,
            "Description": descriptions.get(col, "N/A"),
        })

    # Passing the column names explicitly keeps the header even when no
    # listed column is present in the dataframe.
    return pd.DataFrame(
        rows,
        columns=["Feature", "Data Type", "Unique Values", "Description"],
    )


# --- 4. Unzip the project archive to access the CSV file ---
# Extraction and CSV loading are guarded separately so that each failure
# reports the file that is actually missing.
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall('.')
except FileNotFoundError:
    print(f"Error: Could not find {zip_path}. Please make sure 'project_archive.zip' is uploaded to your Colab session.")
except zipfile.BadZipFile:
    print(f"Error: {zip_path} is not a valid zip archive. Please upload it again.")
else:
    print(f"Successfully extracted {zip_path}")

    # --- 5. Load the dataset ---
    try:
        df = pd.read_csv(csv_path)
    except FileNotFoundError:
        print(f"Error: Could not find {csv_path} after extracting {zip_path}. Please check that the archive contains this file.")
    else:
        # --- 6. Create and display the final table ---
        feature_table = build_feature_table(
            df, categorical_cols, numerical_cols, descriptions
        )

        print("\nTable 1: Description of Features in the Health Insurance Claim Dataset")
        # Lift the column-width limit so long descriptions are shown in full
        # rather than being cut off with "...".
        with pd.option_context('display.max_colwidth', None):
            display(feature_table)

Successfully extracted project_archive.zip

Table 1: Description of Features in the Health Insurance Claim Dataset


Unnamed: 0,Feature,Data Type,Unique Values,Description
0,sex,Categorical,2,The gender of the primary member (male/female).
1,hereditary_diseases,Categorical,10,"The type of hereditary disease, if any."
2,smoker,Categorical,2,"Whether the member is a smoker (1 for yes, 0 f..."
3,city,Categorical,91,The geographical location (city) of the member.
4,diabetes,Categorical,2,"Whether the member has diabetes (1 for yes, 0 ..."
5,regular_ex,Categorical,2,Whether the member exercises regularly (1 for ...
6,job_title,Categorical,35,The job title of the primary member.
7,age,Numerical,Continuous,The age of the primary member.
8,weight,Numerical,Continuous,The weight of the primary member in kilograms.
9,bmi,Numerical,Continuous,The Body Mass Index of the primary member.
