In [11]:
import os
import pandas as pd

base_dir = "/Users/natika/Downloads/past semesters/CHN-Project/BoS (Business Objects) Raw Data Reports - Deidentified/" 
program_ids = ["143", "1371 FINAL", "8319", "11495", "Erin Park", "MC PATH", "OC PATH", "SPC"]


In [None]:
for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"
    
    if "TEMPLATE" in file_path.upper():
        print(f"\nSkipping template file: {file_path}")
        continue

    if not os.path.exists(file_path):
        print(f"\nWarning: File not found for program {program_id}: {file_path}")
        continue

    print(f"\nProcessing program: {program_id}")

    try:
        all_sheets = pd.read_excel(file_path, sheet_name=None)

        for sheet_name, df in all_sheets.items():
            print(f"\n--- Program: {program_id} | Sheet: {sheet_name} ---")
            print(df.head(3))

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

### Number of clients who exited to permanent housing


In [None]:
target_column = 'Housing Move-in Date(12855)'

for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"
    
    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}")
        continue

    try:
        df = pd.read_excel(file_path, sheet_name="ENTRY-EXIT", dtype=str)
        total_clients = len(df)

        if target_column not in df.columns:
            print(f"Program: {program_id} | Column '{target_column}' not found.")
            continue

        num_permanent_housing = df[target_column].dropna().loc[lambda x: x.str.strip() != ''].count()

        print(f"Program: {program_id} | Total clients: {total_clients} | Exited to permanent housing: {num_permanent_housing}")

    except Exception as e:
        print(f"Error processing {program_id}: {e}")


### Breakdown of exit destinations (permanent housing, shelter, streets, etc.)

In [None]:
destination_col = 'Destination'

for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"

    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}")
        continue

    try:
        df = pd.read_excel(file_path, sheet_name="ENTRY-EXIT", dtype=str)

        if destination_col not in df.columns:
            print(f"Program: {program_id} | Column '{destination_col}' not found.")
            continue

        value_counts = df[destination_col].dropna().str.strip().value_counts()

        print(f"\nProgram: {program_id} | Destination value counts:")
        print(value_counts.to_string())  # clean printout without index formatting

    except Exception as e:
        print(f"Error processing {program_id}: {e}")


In [None]:
destination_col = 'Destination'

# List to collect destination data
destination_results = []

for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"

    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}")
        continue

    try:
        df = pd.read_excel(file_path, sheet_name="ENTRY-EXIT", dtype=str)

        if destination_col in df.columns:
            value_counts = df[destination_col].dropna().str.strip().value_counts()
            for destination, count in value_counts.items():
                destination_results.append({
                    "Program ID": program_id,
                    "Exit Destination": destination,
                    "Count": count
                })
        else:
            print(f"Program: {program_id} | Column '{destination_col}' not found.")

    except Exception as e:
        print(f"Error processing {program_id}: {e}")

# Convert to DataFrame
destination_df = pd.DataFrame(destination_results)

# Save to Excel
output_path = "destination_value_counts.xlsx"
destination_df.to_excel(output_path, sheet_name="Exit Destinations", index=False)

print(f"\n Exit destinations saved to: {output_path}")


In [None]:
movein_col = 'Housing Move-in Date(12855)'

# List to collect permanent housing counts
housing_movein_results = []

for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"

    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}")
        continue

    try:
        df = pd.read_excel(file_path, sheet_name="ENTRY-EXIT", dtype=str)
        total_clients = len(df)

        if movein_col in df.columns:
            num_permanent = df[movein_col].dropna().loc[lambda x: x.str.strip() != ''].count()
            housing_movein_results.append({
                "Program ID": program_id,
                "Total Clients": total_clients,
                "Exited to Permanent Housing": num_permanent
            })
        else:
            print(f"Program: {program_id} | Column '{movein_col}' not found.")

    except Exception as e:
        print(f"Error processing {program_id}: {e}")

# Convert to DataFrame
housing_df = pd.DataFrame(housing_movein_results)

# Save to Excel
output_path = "permanent_housing_counts.xlsx"
housing_df.to_excel(output_path, sheet_name="Permanent Housing Counts", index=False)

print(f"\n Permanent housing counts saved to: {output_path}")

### Average length of time to housing move-in 

In [13]:

keywords = ["Rapid Rehousing Program", "Permanent Supportive Housing"]  

# Loop through each program
for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"
    
    if "TEMPLATE" in file_path.upper():
        continue
    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}: {file_path}")
        continue

    try:
        # Read all sheets from the Excel file
        all_sheets = pd.read_excel(file_path, sheet_name=None, dtype=str)  # Read all sheets as strings

        # Loop through each sheet
        for sheet_name, df in all_sheets.items():
            print(f"Processing sheet: {sheet_name}...")  # Debugging output
            
            # Loop through each column in the sheet
            for col in df.columns:
                # Clean data by stripping any leading/trailing spaces
                df[col] = df[col].str.strip()

                # Check for rows that contain any of the keywords (whole phrases)
                matches = df[df[col].astype(str).str.contains('|'.join(keywords), case=False, na=False)]
                
                # Check if any matches are found
                if not matches.empty:
                    print(f"\n🔍 Match found!")
                    print(f"Program: {program_id}")
                    print(f"Sheet: {sheet_name}")
                    print(f"Column: {col}")
                    print(f"Rows matching any of the keywords {', '.join(keywords)}:")
                    print(matches[[col]].head(3))  # Show top 3 matching rows
                else:
                    print(f"No matches found in column: {col}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")

Processing sheet: Additional Info...
No matches found in column: Unnamed: 0

🔍 Match found!
Program: 143
Sheet: Additional Info
Column: Unnamed: 1
Rows matching any of the keywords Rapid Rehousing Program, Permanent Supportive Housing:
                                          Unnamed: 1
7  Community Housing Network, Inc. - Grafton SHU ...
Processing sheet: CARF WORKSHEET...
No matches found in column: Housing Status(6209)
No matches found in column: Unnamed: 1
No matches found in column: Total Homeless
No matches found in column: 255
No matches found in column: Unnamed: 4
No matches found in column: Steps
Processing sheet: ENTRY-EXIT...

🔍 Match found!
Program: 143
Sheet: ENTRY-EXIT
Column: EE Provider ID
Rows matching any of the keywords Rapid Rehousing Program, Permanent Supportive Housing:
                                      EE Provider ID
0  Community Housing Network - Oakland County - R...
1  Community Housing Network - Oakland County - R...
2  Community Housing Network - Oakla

In [14]:
# Define the phrases to search for in the PROVIDER sheet
keywords = ["Rapid Re-Housing", "Permanent Supportive Housing"]

# Loop through each program
for program_id in program_ids:
    file_path = f"{base_dir}{program_id} RAW Client Data Export v3_EE Workflow.xlsx"
    
    if "TEMPLATE" in file_path.upper():
        continue
    if not os.path.exists(file_path):
        print(f"File not found for program {program_id}: {file_path}")
        continue

    try:
        # Read all sheets from the Excel file
        all_sheets = pd.read_excel(file_path, sheet_name=None, dtype=str)  # Read all sheets as strings

        # Get PROVIDER and ENTRY-EXIT sheets
        provider_df = all_sheets.get("PROVIDER", pd.DataFrame())
        entry_exit_df = all_sheets.get("ENTRY-EXIT", pd.DataFrame())

        # Check if the PROVIDER sheet contains the necessary columns
        if provider_df.empty:
            print(f"Program {program_id}: PROVIDER sheet is missing or empty.")
            continue

        # Filter rows in the PROVIDER sheet that contain either of the keywords
        provider_matches = provider_df[provider_df.apply(lambda row: row.astype(str).str.contains('|'.join(keywords), case=False, na=False).any(), axis=1)]

        if provider_matches.empty:
            print(f"Program {program_id}: No matching rows found in PROVIDER sheet.")
            continue

        # Get the indices of the matched rows from the PROVIDER sheet
        matching_indices = provider_matches.index

        # Filter the ENTRY-EXIT sheet based on the matching indices
        entry_exit_filtered = entry_exit_df.loc[matching_indices]

        # Combine PROVIDER and ENTRY-EXIT filtered rows into a single dataset
        combined_df = pd.concat([provider_matches, entry_exit_filtered], axis=1)

        # Save the combined dataset to a single sheet in Excel
        output_path = f"{program_id}_combined_filtered_data.xlsx"
        combined_df.to_excel(output_path, sheet_name="Filtered Data", index=False)

        print(f"Program {program_id}: Filtered data saved to {output_path}")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")


Program 143: Filtered data saved to 143_combined_filtered_data.xlsx
Program 1371 FINAL: Filtered data saved to 1371 FINAL_combined_filtered_data.xlsx
Program 8319: Filtered data saved to 8319_combined_filtered_data.xlsx
Program 11495: Filtered data saved to 11495_combined_filtered_data.xlsx
Program Erin Park: No matching rows found in PROVIDER sheet.
Program MC PATH: No matching rows found in PROVIDER sheet.
Program OC PATH: No matching rows found in PROVIDER sheet.
Program SPC: Filtered data saved to SPC_combined_filtered_data.xlsx


In [25]:
output_base_dir = "/Users/natika/Downloads/past semesters/CHN-Project/TableExtraction"

In [32]:
 #List of program IDs that had data saved
program_ids_with_data = ["143", "1371 FINAL", "8319", "11495", "SPC"]

# List to store DataFrames of all programs
all_program_data = []

# Loop through each program and read the corresponding dataset
for program_id in program_ids_with_data:
    file_path = os.path.join(output_base_dir, f"{program_id}_combined_filtered_data.xlsx")

    if os.path.exists(file_path):
        try:
            # Read the Excel file into a DataFrame
            df = pd.read_excel(file_path, sheet_name="Filtered Data")
            all_program_data.append(df)
            print(f"Program {program_id}: Data loaded successfully.")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
    else:
        print(f"File for Program {program_id} not found at {file_path}.")

# Combine all the DataFrames into one
combined_df = pd.concat(all_program_data, ignore_index=True)


combined_df.reset_index(drop=True, inplace=True)


combined_output_path = "/Users/natika/Downloads/past semesters/CHN-Project/days_to_movein.xlsx"
combined_df.to_excel(combined_output_path, index=False)

# Print confirmation
print(f"All datasets combined and saved to: {combined_output_path}")

Program 143: Data loaded successfully.
Program 1371 FINAL: Data loaded successfully.
Program 8319: Data loaded successfully.
Program 11495: Data loaded successfully.
Program SPC: Data loaded successfully.
All datasets combined and saved to: /Users/natika/Downloads/past semesters/CHN-Project/days_to_movein.xlsx


In [33]:
combined_df['Entry Date'] = pd.to_datetime(combined_df['Entry Date'], errors='coerce')
combined_df['Housing Move-in Date'] = pd.to_datetime(combined_df['Housing Move-in Date(12855)'], errors='coerce')

# Create a new column for the duration (in days) between Entry Date and Housing Move-in Date
combined_df['Days_to_MoveIn'] = (combined_df['Housing Move-in Date'] - combined_df['Entry Date']).dt.days

# Display the first few rows of the dataframe to confirm the new column
print(combined_df[['Entry Date', 'Housing Move-in Date(12855)', 'Days_to_MoveIn']].head())

           Entry Date Housing Move-in Date(12855)  Days_to_MoveIn
0 2024-03-15 02:00:00         2024-03-15 12:00:00             0.0
1 2024-03-15 02:00:00         2024-03-15 12:00:00             0.0
2 2024-02-21 02:00:00         2024-03-09 12:00:00            17.0
3 2024-02-21 02:00:00         2024-03-09 12:00:00            17.0
4 2024-02-21 02:00:00         2024-03-09 12:00:00            17.0


In [34]:
new_dataset = combined_df[['Program Type Code', 'EE Provider ID', 'Days_to_MoveIn']]

# Display the first few rows of the new dataset
print(new_dataset.head())

                                   Program Type Code  \
0  PH - Permanent Supportive Housing (disability ...   
1  PH - Permanent Supportive Housing (disability ...   
2  PH - Permanent Supportive Housing (disability ...   
3  PH - Permanent Supportive Housing (disability ...   
4  PH - Permanent Supportive Housing (disability ...   

                                      EE Provider ID  Days_to_MoveIn  
0  Community Housing Network, Inc. - Oakland Coun...             0.0  
1  Community Housing Network, Inc. - Oakland Coun...             0.0  
2  Community Housing Network, Inc. - Oakland Coun...            17.0  
3  Community Housing Network, Inc. - Oakland Coun...            17.0  
4  Community Housing Network, Inc. - Oakland Coun...            17.0  


In [35]:

# Print the unique program types
print(new_dataset['Program Type Code'].unique())

['PH - Permanent Supportive Housing (disability required for entry) (HUD)'
 'PH - Rapid Re-Housing (HUD)']


In [None]:
new_dataset = df[['Program Type Code', 'EE Provider ID', 'Days_to_MoveIn']]


output_path = 'provider_move_in_data.xlsx'
new_dataset.to_excel(output_path, index=False)

# Display the first few rows of the new dataset
print(new_dataset.head())

In [36]:
# Group the dataset by 'Program Type Code' and calculate the average 'Days_to_MoveIn' for each program type
grouped_avg = new_dataset.groupby('Program Type Code')['Days_to_MoveIn'].mean()

print(grouped_avg)

Program Type Code
PH - Permanent Supportive Housing (disability required for entry) (HUD)    55.662109
PH - Rapid Re-Housing (HUD)                                                 3.325243
Name: Days_to_MoveIn, dtype: float64


In [22]:
# Check unique values in 'Program Type Code' column
print(new_dataset['Program Type Code'].unique())


['PH - Permanent Supportive Housing (disability required for entry) (HUD)']


In [37]:
# Map the average values back to the new dataset
new_dataset['Avg_Days_to_MoveIn'] = new_dataset['Program Type Code'].map(grouped_avg)

# Save the updated dataset to Excel
output_path = 'Avg_days_to_Movein.xlsx'
new_dataset.to_excel(output_path, index=False)

# Display the first few rows of the updated dataset
print(new_dataset.head())

                                   Program Type Code  \
0  PH - Permanent Supportive Housing (disability ...   
1  PH - Permanent Supportive Housing (disability ...   
2  PH - Permanent Supportive Housing (disability ...   
3  PH - Permanent Supportive Housing (disability ...   
4  PH - Permanent Supportive Housing (disability ...   

                                      EE Provider ID  Days_to_MoveIn  \
0  Community Housing Network, Inc. - Oakland Coun...             0.0   
1  Community Housing Network, Inc. - Oakland Coun...             0.0   
2  Community Housing Network, Inc. - Oakland Coun...            17.0   
3  Community Housing Network, Inc. - Oakland Coun...            17.0   
4  Community Housing Network, Inc. - Oakland Coun...            17.0   

   Avg_Days_to_MoveIn  
0           55.662109  
1           55.662109  
2           55.662109  
3           55.662109  
4           55.662109  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_dataset['Avg_Days_to_MoveIn'] = new_dataset['Program Type Code'].map(grouped_avg)
