In [None]:
import os
# Move from the notebook's directory up to the project root so that the
# `src` package and the data directories resolve. The guard keeps the cell
# idempotent: once `src` is visible in the working directory, re-running
# the cell no longer climbs another level.
if not os.path.isdir("src"):
    os.chdir("..")
os.listdir()

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

from src.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Notebook-wide pandas display configuration: no cap on rendered rows or
# columns, no truncation of cell contents.
# NOTE(review): display.width=0 is intended as "full width" — confirm that
# 0 has that effect on the pandas version in use.
for option_name, option_value in {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.max_colwidth": None,
    "display.width": 0,
}.items():
    pd.set_option(option_name, option_value)

### Shelter System Flow

In [None]:
# Load the raw shelter system flow export and preview its first 15 rows.
sys_flow_path = RAW_DATA_DIR / "toronto-shelter-system-flow.csv"
sys_flow = pd.read_csv(sys_flow_path)
sys_flow.head(n=15)

In [None]:
# Number of missing values in each column of the system flow table.
sys_flow.isna().sum(axis=0)

In [None]:
# Turn the "Mon-YY" text labels into timestamps so rows sort chronologically.
sys_flow["date(mmm-yy)"] = pd.to_datetime(sys_flow["date(mmm-yy)"], format="%b-%y")

# Row-wise total of the three flow columns; missing entries count as zero.
cols_to_sum = ["returned_from_housing", "newly_identified", "actively_homeless"]
flow_counts = sys_flow[cols_to_sum].fillna(0)
sys_flow["true_total"] = flow_counts.sum(axis=1)

# Reshape to one row per date and one column per population group.
pivot_df = sys_flow.pivot(
    index="date(mmm-yy)", columns="population_group", values="true_total"
)
pivot_df = pivot_df.sort_index()

# One line per population group.
ax = pivot_df.plot.line(figsize=(12, 7))
ax.set(
    title="Estimated 'True' Homeless Count Over Time by Population Group",
    xlabel="Date",
    ylabel="returned_from_housing + newly_identified + actively_homeless",
)
ax.grid(True)

### Demand for each month (Net Flow)

In [None]:
# Restrict the flow table to calendar year 2024.
is_2024 = sys_flow["date(mmm-yy)"].dt.year == 2024
sys_flow_2024 = sys_flow[is_2024]

# Keep only the "Single Adult" population group and the columns needed for
# the gender split, ordered by date with a fresh 0-based index.
is_single_adult = sys_flow_2024["population_group"] == "Single Adult"
single_adult_df = (
    sys_flow_2024.loc[
        is_single_adult,
        ["date(mmm-yy)", "true_total", "gender_male", "gender_female"],
    ]
    .sort_values("date(mmm-yy)")
    .reset_index(drop=True)
)
single_adult_df

In [None]:
# Share of true_total covered by the recorded male and female counts.
# ratio_check is the sum of the two shares, i.e. how much of the total the
# two recorded genders account for.
single_adult_total = single_adult_df["true_total"]
single_adult_df["ratio_male_with_total"] = single_adult_df["gender_male"].div(
    single_adult_total
)
single_adult_df["ratio_female_with_total"] = single_adult_df["gender_female"].div(
    single_adult_total
)
single_adult_df["ratio_check"] = single_adult_df["ratio_male_with_total"].add(
    single_adult_df["ratio_female_with_total"]
)
single_adult_df

#### Cleaning and Imputation 

In [None]:
# Known and unclassified counts
single_adult_df["known_gender_total"] = (
    single_adult_df["gender_male"] + single_adult_df["gender_female"]
)
single_adult_df["unclassified"] = (
    single_adult_df["true_total"] - single_adult_df["known_gender_total"]
)

# Observed male share among known-gender adults
single_adult_df["male_share_known"] = (
    single_adult_df["gender_male"] / single_adult_df["known_gender_total"]
)

# Impute missing gender proportionally
single_adult_df["adj_gender_male"] = (
    single_adult_df["gender_male"]
    + single_adult_df["unclassified"] * single_adult_df["male_share_known"]
)

single_adult_df["adj_gender_female"] = (
    single_adult_df["true_total"] - single_adult_df["adj_gender_male"]
)

# Ratios over the full true_total
single_adult_df["ratio_male"] = (
    single_adult_df["adj_gender_male"] / single_adult_df["true_total"]
)
single_adult_df["ratio_female"] = (
    single_adult_df["adj_gender_female"] / single_adult_df["true_total"]
)

single_adult_df["ratio_check"] = (
    single_adult_df["ratio_male"] + single_adult_df["ratio_female"]
)

# Add v2 integer-adjusted counts
single_adult_df["gender_male_v2"] = single_adult_df["adj_gender_male"].round().astype(int)
single_adult_df["gender_female_v2"] = (
    single_adult_df["true_total"] - single_adult_df["gender_male_v2"]
)

single_adult_df

In [None]:
# Columns exported for the per-month demand table.
cols = [
    "true_total",
    "ratio_male",
    "ratio_female",
    "gender_male_v2",
    "gender_female_v2"
]

# Index the table by calendar month taken from the date column itself, not
# by row position + 1, so a missing month cannot shift the labels of the
# months that follow it.
extracted_df = (
    single_adult_df
    .assign(month=single_adult_df["date(mmm-yy)"].dt.month)
    .set_index("month")[cols]
)
assert extracted_df.index.is_unique, "expected at most one row per month"
extracted_df

In [None]:
# Persist the per-month demand table (index included) to the processed data
# directory.
demand_csv_path = PROCESSED_DATA_DIR / "demand_per_month_by_gender.csv"
extracted_df.to_csv(demand_csv_path)

### Shelter Locations (City Hostel Serviced)

In [None]:
xls_path = RAW_DATA_DIR / "shelter-locations-readme.xls"

# Open the workbook once, inside a context manager, so the file handle is
# closed as soon as the sheet has been read. The same handle serves both the
# sheet listing and the read.
with pd.ExcelFile(xls_path, engine="xlrd") as xls:
    print(xls.sheet_names)  # See which sheets exist

    df = pd.read_excel(
        xls,
        header=1,  # Real headers start on the second row
        usecols="A:C",  # Item name, Description, Datatype cols
    )

# Discard fully empty rows, renumber from 0 and apply explicit column names.
df = df.dropna(how="all").reset_index(drop=True)
df.columns = ["ITEM_NAME", "DESCRIPTION", "DATATYPE"]
# Drop the row labelled 0 after the renumbering.
# NOTE(review): presumably a leftover header/banner row — confirm.
df = df.drop(0)
df

In [None]:
import geopandas as gpd

# Read the shelter locations shapefile into a GeoDataFrame.
shelter_shapefile = RAW_DATA_DIR / "shelters_wgs84.shp"
shelter_locations_df = gpd.read_file(shelter_shapefile)

# Also dump the attribute table (geometry included) as CSV.
# NOTE(review): the output name "shelter_wgs8.csv" looks like a typo of
# "shelters_wgs84", and it is written into the raw data directory — confirm
# nothing else reads this file before renaming or moving it.
shelter_locations_df.to_csv(RAW_DATA_DIR / "shelter_wgs8.csv")

shelter_locations_df.head(n=5)

In [None]:
# Number of missing values in each column of the shelter locations table.
shelter_locations_df.isna().sum(axis=0)

In [None]:
import contextily as cx

# Reproject to Web Mercator (EPSG:3857) to match the basemap tiles.
shelters_webmer = shelter_locations_df.to_crs(epsg=3857)

# TYPE2 holds the shelter category; list the values that occur.
shelter_type = shelter_locations_df["TYPE2"]
print(shelter_type.unique())

# One reprojected subset per category. The masks come from the original
# frame, which shares its index with the reprojected one.
shelter_locations_single_wom = shelters_webmer[shelter_type == "Single Women"]
shelter_locations_mixed = shelters_webmer[shelter_type == "Mixed Adult"]
shelter_locations_family = shelters_webmer[shelter_type == "Family"]
shelter_locations_youth = shelters_webmer[shelter_type == "Youth"]
shelter_locations_single_men = shelters_webmer[shelter_type == "Single Men"]

fig, ax = plt.subplots(figsize=(12, 10))

# (subset, marker colour, legend label), drawn in this order.
shelter_layers = [
    (shelter_locations_single_wom, "lightpink", "Single Women"),
    (shelter_locations_mixed, "yellow", "Mixed"),
    (shelter_locations_family, "lightgreen", "Family"),
    (shelter_locations_youth, "purple", "Youth"),
    (shelter_locations_single_men, "lightblue", "Single Men"),
]
for layer_gdf, layer_color, layer_label in shelter_layers:
    layer_gdf.plot(ax=ax, color=layer_color, markersize=30, label=layer_label)

# Basemap tiles underneath the points.
cx.add_basemap(ax=ax, source=cx.providers.CartoDB.Positron)

ax.set_title("Toronto Homeless Shelters by Type")
ax.axis("off")
ax.legend()
plt.show()

### Daily Shelter & Overnight Service Occupancy & Capacity

In [None]:
# Load the 2024 daily occupancy/capacity export and preview the first rows.
daily_occ_cap_path = (
    RAW_DATA_DIR / "daily-shelter-overnight-service-occupancy-capacity-2024.csv"
)
daily_occ_cap_df = pd.read_csv(daily_occ_cap_path)

daily_occ_cap_df.head(n=10)

In [None]:
# Keeping shelters that work with beds instead of rooms
daily_occ_cap_df = daily_occ_cap_df[daily_occ_cap_df['CAPACITY_TYPE'] == 'Bed Based Capacity']
daily_occ_cap_df.head()

In [None]:
# Table size, then summary statistics for the bed-related numeric columns.
# The "count" row is dropped from the summary.
print(daily_occ_cap_df.shape)

bed_metric_cols = [
    "SERVICE_USER_COUNT",
    "CAPACITY_ACTUAL_BED",
    "CAPACITY_FUNDING_BED",
    "OCCUPIED_BEDS",
    "UNOCCUPIED_BEDS",
    "UNAVAILABLE_BEDS",
    "OCCUPANCY_RATE_BEDS",
]
daily_occ_cap_df[bed_metric_cols].describe().drop(index="count")

In [None]:
# Number of missing values in each column of the occupancy table.
daily_occ_cap_df.isna().sum(axis=0)

In [None]:
# Compare how many shelters each dataset knows about. The two counts use
# different keys (distinct street addresses vs. distinct shelter names), so
# they are indicative, not directly comparable.
num_shelter_daily_occ_cap = daily_occ_cap_df["LOCATION_ADDRESS"].nunique()
num_shelter_locations = shelter_locations_df["NAME"].nunique()

print(f"There are a total of {num_shelter_daily_occ_cap} shelters in the daily occ + cap dataset")
print(f"There are a total of {num_shelter_locations} shelters in the shelter location dataset")

# Distinct programs in the occupancy data.
num_programs = daily_occ_cap_df["PROGRAM_NAME"].nunique()
actual_programs = daily_occ_cap_df["PROGRAM_NAME"].unique()
print(f"There are a total of {num_programs} programs")

# Distinct sectors in the occupancy data. The message names what is being
# counted, matching the programs message above.
num_sectors = daily_occ_cap_df["SECTOR"].nunique()
actual_sectors = daily_occ_cap_df["SECTOR"].unique()
print(f"There are a total of {num_sectors} sectors: {actual_sectors}")

In [None]:
# Number of distinct street addresses serving each sector. An address that
# appears under several sectors is counted once per sector.
for sector in actual_sectors:
    in_sector = daily_occ_cap_df["SECTOR"] == sector
    num_locations = daily_occ_cap_df.loc[in_sector, "LOCATION_ADDRESS"].nunique()
    print(f"There are {num_locations} shelters for the {sector} sector")

print("\nOverlap possible since a shelter can accommodate multiple different sectors")

#### Service Occupancy vs Capacity

In [None]:
import matplotlib.pyplot as plt

# One small line chart per program (occupied beds vs. actual bed capacity
# over time), laid out on pages of `plots_per_page` panels.
grouped = daily_occ_cap_df.groupby("PROGRAM_ID")
program_groups = list(grouped)

plots_per_page = 10
n_cols = 2
n_rows = -(-plots_per_page // n_cols)  # ceiling division keeps the grid large enough

for i in range(0, len(program_groups), plots_per_page):
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 18))
    axes = axes.flatten()

    page_groups = program_groups[i : i + plots_per_page]

    # Loop-local names (panel_ax, program_df) keep this loop from
    # overwriting the notebook-level `ax` and `df` set by earlier cells.
    for panel_ax, (program_id, program_df) in zip(axes, page_groups):
        beds_over_time = (
            program_df.set_index("OCCUPANCY_DATE")[["OCCUPIED_BEDS", "CAPACITY_ACTUAL_BED"]]
            .sort_index()
        )
        beds_over_time.plot(ax=panel_ax, title=f"Program {program_id}")

        panel_ax.tick_params(axis="x", labelrotation=90)

    # Hide the panels left empty on the final, partially filled page.
    for unused_ax in axes[len(page_groups):]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()
    # Release the figure so many pages do not pile up in memory.
    plt.close(fig)

#### Mapping

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as cx

from geopandas.tools import geocode

# Build one geocodable address string per row. .assign returns a new frame,
# so no column is written into a filtered slice of the original table.
daily_occ_cap_df = daily_occ_cap_df.assign(
    FULL_ADDRESS=(
        daily_occ_cap_df["LOCATION_ADDRESS"].fillna("").str.strip()
        + ", "
        + daily_occ_cap_df["LOCATION_CITY"].fillna("").str.strip()
        + ", "
        + daily_occ_cap_df["LOCATION_PROVINCE"].fillna("").str.strip()
        + ", Canada"
    )
)

daily_occ_cap_filtered_df = daily_occ_cap_df[["FULL_ADDRESS", "PROGRAM_AREA", "SECTOR"]]

# One row per distinct address, renumbered from 0. FULL_ADDRESS is never NaN
# here (the parts were filled with "" above), so the dropna is a no-op; it
# is kept so the row labels used below stay the same.
unique_addrs = (
    daily_occ_cap_filtered_df.dropna(subset=["FULL_ADDRESS"])
    .drop_duplicates(subset=["FULL_ADDRESS"])
    .reset_index(drop=True)
)

# NOTE(review): hard-coded row labels — presumably addresses that cannot be
# geocoded; confirm, and prefer a filter on the address text, since these
# labels go stale whenever the input data changes.
unique_addrs = unique_addrs.drop([34, 74, 77])
unique_addrs = unique_addrs[
    (unique_addrs["SECTOR"] != "Mixed Adult")
    & (unique_addrs["SECTOR"] != "Youth")
]

# One Nominatim lookup per remaining address (network calls; slow).
gdf_geo = geocode(
    unique_addrs["FULL_ADDRESS"],
    provider="nominatim",
    user_agent="geo_example",
    timeout=10,
)

# geocode() returns rows labelled with the input Series' index, which has
# gaps after the drop and filter above. Both sides are renumbered before the
# column-wise concat so each geometry is paired with its own address by
# position, not mis-joined on non-matching labels.
gdf = gpd.GeoDataFrame(
    pd.concat(
        [
            unique_addrs.reset_index(drop=True),
            gdf_geo[["geometry"]].reset_index(drop=True),
        ],
        axis=1,
    ),
    geometry="geometry",
    crs="EPSG:4326",
)
assert len(gdf) == len(unique_addrs), "geocoded rows must match input addresses 1:1"

# Reproject to Web Mercator (EPSG:3857) to match the basemap tiles.
gdf_3857 = gdf.to_crs(epsg=3857)

ax = gdf_3857.plot(figsize=(8, 8), markersize=10, column="SECTOR", legend=True)
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)
ax.set_axis_off()
plt.title("Shelter Locations")
plt.show()