In [9]:
import pandas as pd

def filter_open_hospitals(df, first_year=2019, last_year=2021):
    """Return the open-hospital rows whose ``Year`` lies in the given window.

    Args:
        df: Facility listing with ``FACILITY_STATUS_DESC``,
            ``LICENSE_TYPE_DESC`` and ``Year`` columns.
        first_year: First year to keep (inclusive).
        last_year: Last year to keep (inclusive).

    Returns:
        A copy of the matching rows with ``Year`` stored as an integer.
    """
    keep = (
        (df["FACILITY_STATUS_DESC"] == "Open")
        & (df["LICENSE_TYPE_DESC"] == "Hospital")
        & df["Year"].between(first_year, last_year)
    )
    selected = df[keep].copy()
    # Unparseable dates are coerced to NaT, which turns the derived year column
    # into floats. Rows with a missing year never pass ``between``, so the cast
    # is safe here and keeps "2019.0" out of the report and the CSV.
    selected["Year"] = selected["Year"].astype(int)
    return selected


def count_by_year_and_county(filtered_df):
    """Count rows per (``Year``, ``COUNTY_NAME``) pair.

    Returns:
        A DataFrame with columns ``Year``, ``COUNTY_NAME`` and ``Counts``.
    """
    return (
        filtered_df.groupby(["Year", "COUNTY_NAME"])
        .size()
        .reset_index(name="Counts")
    )


if __name__ == "__main__":
    import os

    # Load the dataset
    file_path = "data/current-healthcare-facility-listing.csv"  # Update with the actual file path
    df = pd.read_csv(file_path)

    # Convert 'FACILITY_STATUS_DATE' to datetime format and extract the year
    df["FACILITY_STATUS_DATE"] = pd.to_datetime(df["FACILITY_STATUS_DATE"], errors='coerce')
    df["Year"] = df["FACILITY_STATUS_DATE"].dt.year

    # Apply filters: Only Open Hospitals for the years 2019, 2020, 2021
    filtered_df = filter_open_hospitals(df, 2019, 2021)

    # Group by Year and County, then count facilities
    result_df = count_by_year_and_county(filtered_df)

    # Save to CSV (optional); create the target folder first so a fresh
    # checkout does not fail on a missing directory.
    output_file = "data/cleaned_data/filtered_facilities_count_2019_2021.csv"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    result_df.to_csv(output_file, index=False)

    # Display the processed data
    print(result_df)  # Prints the full table (pandas truncates very long frames)


    Year    COUNTY_NAME  Counts
0   2019        Alameda       1
1   2019      San Diego       1
2   2019  San Francisco       1
3   2019    Santa Clara       3
4   2020         Fresno       1
5   2020          Modoc       1
6   2020      Riverside       1
7   2020         Sutter       1
8   2020        Ventura       1
9   2021          Butte       1
10  2021     Sacramento       2
11  2021      San Diego       1
