In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import piplite
await piplite.install('seaborn')
import seaborn as sns

# Load the data
df = pd.read_csv('toronto.csv')

# Data Cleaning
# Drop irrelevant columns
columns_to_drop = ['PROGRAM_NAME', 'PROGRAM_AREA', 'OVERNIGHT_SERVICE_TYPE', 'PROGRAM_ID']
df_cleaned = df.drop(columns=columns_to_drop)


# Exploratory Data Analysis
# Visualizations

# Grouping data by month and calculating average occupancy rate

# Calculate occupancy rate for beds
df_cleaned['Occupancy_Rate_Bed'] = (df_cleaned['OCCUPIED_BEDS'] / df_cleaned['CAPACITY_ACTUAL_BED']) * 100

# Calculate occupancy rate for rooms
df_cleaned['Occupancy_Rate_Room'] = (df_cleaned['OCCUPIED_ROOMS'] / df_cleaned['CAPACITY_ACTUAL_ROOM']) * 100

# Calculate an overall occupancy rate by taking the average of occupancy rates for beds and rooms, replace null with 0
df_cleaned['Occupancy_Rate_Overall'] = (df_cleaned['Occupancy_Rate_Bed'].fillna(0) + df_cleaned['Occupancy_Rate_Room'].fillna(0)) / 2

# Convert 'date' column to datetime format
df_cleaned['OCCUPANCY_DATE'] = pd.to_datetime(df_cleaned['OCCUPANCY_DATE'])

# Grouping data by month and calculating average occupancy rate
monthly_occupancy = df_cleaned.groupby(df_cleaned['OCCUPANCY_DATE'].dt.to_period('M'))['Occupancy_Rate_Overall'].mean()

# Plotting the occupancy rate over time
plt.figure(figsize=(10, 6))
plt.plot(monthly_occupancy.index.to_timestamp(), monthly_occupancy.values, marker='8', linestyle='-')
plt.title('Average Shelter Occupancy Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Occupancy Rate (%)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

# Plotting average occupancy rate by program type
plt.figure(figsize=(10, 6))
sns.barplot(x='PROGRAM_MODEL', y='Occupancy_Rate_Overall', data=df_cleaned)
plt.title('Average Occupancy Rate by Program Type')
plt.xlabel('Program Type')
plt.ylabel('Occupancy Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Plotting occupancy rate distribution by shelter organization
plt.figure(figsize=(30, 6))
sns.boxplot(x='ORGANIZATION_NAME', y='Occupancy_Rate_Overall', data=df_cleaned)
plt.title('Occupancy Rate Distribution by Shelter Organization')
plt.xlabel('Shelter Organization')
plt.ylabel('Occupancy Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Conduct t-test to compare occupancy rates between different seasons or months
# Compare occupancy rates between winter and summer months

winter_occupancy = df_cleaned[df_cleaned['OCCUPANCY_DATE'].dt.month.isin([12, 1, 2])]['Occupancy_Rate_Overall']
summer_occupancy = df_cleaned[df_cleaned['OCCUPANCY_DATE'].dt.month.isin([6, 7, 8])]['Occupancy_Rate_Overall']

t_stat, p_value = ttest_ind(winter_occupancy, summer_occupancy)
print("T-Test Results for Seasonal Trends:")
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")

# Compare occupancy rates between different program models

program_type_1_occupancy = df_cleaned[df_cleaned['PROGRAM_MODEL'] == 'Emergency']['Occupancy_Rate_Overall']
program_type_2_occupancy = df_cleaned[df_cleaned['PROGRAM_MODEL'] == 'Transitional']['Occupancy_Rate_Overall']

t_stat, p_value = ttest_ind(program_type_1_occupancy, program_type_2_occupancy)
print("T-Test Results for Programmatic Disparities:")
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")

# Compare occupancy rates between different shelter organizations

org_1_occupancy = df_cleaned[df_cleaned['ORGANIZATION_NAME'] == 'City of Toronto']['Occupancy_Rate_Overall']
org_2_occupancy = df_cleaned[df_cleaned['ORGANIZATION_NAME'] == 'The Salvation Army of Canada']['Occupancy_Rate_Overall']

t_stat, p_value = ttest_ind(org_1_occupancy, org_2_occupancy)
print("T-Test Results for Organizational Performance:")
print(f"T-Statistic: {t_stat}")
print(f"P-Value: {p_value}")

T-Test Results for Seasonal Trends:
T-Statistic: 13.732150632203595
P-Value: 9.256519912353863e-43
T-Test Results for Programmatic Disparities:
T-Statistic: 39.0749698065413
P-Value: 0.0
T-Test Results for Organizational Performance:
T-Statistic: -27.399364757745072
P-Value: 1.9965633835452346e-161
