## Test the function for preprocessing

In [120]:
# open the excel file on 22-23 Stats sheet as a pandas dataframe
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [121]:
# load and read the Status sheet from the workbook
status_df = pd.read_excel('/Users/thomasdoherty/Desktop/canadian-psi-project/psi_data/cleaning_copy_excel/on_college_2012-/2012-2022 college_enrolment_headcount.xlsx', sheet_name='Status')

In [122]:
status_df

Unnamed: 0,College Name,Fiscal Year,Status In Canada Description,Headcount Full-Time Fall
0,Algonquin College,2012-2013,Aboriginal ancestry (where self-identified),279
1,Algonquin College,2012-2013,Attending an off-shore school,0
2,Algonquin College,2012-2013,Canadian citizen,14239
3,Algonquin College,2012-2013,Convention refugee (protected person),13
4,Algonquin College,2012-2013,In Canada on authority of another visa (includ...,34
...,...,...,...,...
2010,St. Lawrence College,2022-2023,In Canada on authority of another visa (includ...,*
2011,St. Lawrence College,2022-2023,Other,*
2012,St. Lawrence College,2022-2023,"Permanent resident, formerly called landed imm...",165
2013,St. Lawrence College,2022-2023,Student permit/ Student visa: a permit obtaine...,6939


In [123]:

def preprocess_student_data(df, fiscal_year_col, group_col, count_col, new_total_col="Total Headcount"):
    """
    Preprocess the student demographic data by transforming and restructuring the dataframe.
    
    Parameters:
    - df: DataFrame to preprocess
    - fiscal_year_col: str, column name for fiscal year
    - group_col: str, column name for the demographic category (e.g., Gender, Status in Canada)
    - count_col: str, column name for the student count in each demographic group
    - new_total_col: str, optional, name for the column to store the total full-time enrollment
    
    Returns:
    - DataFrame: Transformed DataFrame with both absolute numbers, percentage shares for each demographic category, and YoY growth
    """
    
    # Step 1: Replace '*' representing 0-9 in the count data with an estimate of 5
    df[count_col] = df[count_col].replace('*', 5).astype(int)
    
    # Step 2: Check if fiscal year column needs conversion and extract start year if it's in "YYYY-YYYY" format
    if df[fiscal_year_col].dtype == 'object':  # Check if fiscal year is a string
        df[fiscal_year_col] = df[fiscal_year_col].apply(lambda x: int(x[:4]))  # Convert "2012-2013" to 2012
    
    # Step 3: Rename the fiscal year column to "FY Start"
    df.rename(columns={fiscal_year_col: "FY Start"}, inplace=True)
    
    # Step 4: Calculate total enrollment per college and fiscal year, ensuring it is an integer
    df[new_total_col] = df.groupby(['College Name', 'FY Start'])[count_col].transform('sum').astype(int)
    df = df.reset_index(drop=True)  # Ensure consistent indexing
    
    # Step 5: Calculate percentage share of headcount for each demographic category, rounded to 2 decimal places
    df['% Share of Headcount'] = round((df[count_col] / df[new_total_col]) * 100, 2)
    
    # Step 6: Calculate year-over-year (YoY) growth based on the total headcount
    #df = df.sort_values(['College Name', 'FY Start'])  # Sort to ensure correct order for shift
    #df['Headcount Growth YoY %'] = df.groupby('College Name')[new_total_col].apply(lambda x: x.pct_change() * 100).round(2)
    
    # Step 7: Pivot the table to get absolute headcounts per demographic category
    headcount_pivot = df.pivot_table(
        index=['College Name', 'FY Start', new_total_col],
        columns=group_col,
        values=count_col
    ).reset_index()

    # Step 8: Pivot the table to get percentage shares per demographic category
    share_pivot = df.pivot_table(
        index=['College Name', 'FY Start', new_total_col],
        columns=group_col,
        values='% Share of Headcount'
    ).reset_index()
    
    # Step 9: Rename the columns in share_pivot to include % for clarity
    share_pivot.columns = [
        f"{col} %" if col not in ['College Name', 'FY Start', new_total_col] else col for col in share_pivot.columns
    ]
    
    # Step 10: Merge the absolute and percentage pivots on common columns
    final_df = pd.merge(headcount_pivot, share_pivot, on=['College Name', 'FY Start', new_total_col])
    
    # Step 11: Fill NaNs in raw count columns with zero and convert to integers
    raw_count_columns = [col for col in final_df.columns if '%' not in col and col not in ['College Name', 'FY Start', new_total_col]]
    final_df[raw_count_columns] = final_df[raw_count_columns].fillna(0).astype(int)
    
    return final_df




In [124]:
# Assuming status_df is the DataFrame for the Status in Canada sheet
status_df_processed = preprocess_student_data(
    df=status_df,
    fiscal_year_col='Fiscal Year',
    group_col='Status In Canada Description',
    count_col='Headcount Full-Time Fall'
)


  df[count_col] = df[count_col].replace('*', 5).astype(int)


In [125]:
status_df_processed

Unnamed: 0,College Name,FY Start,Total Headcount,Aboriginal ancestry (where self-identified),Attending an off-shore school,Canadian citizen,Convention refugee (protected person),"In Canada on authority of another visa (including students who are in Canada on diplomatic, trade or other missions)",Other,"Permanent resident, formerly called landed immigrant",...,Aboriginal ancestry (where self-identified) %,Attending an off-shore school %,Canadian citizen %,Convention refugee (protected person) %,"In Canada on authority of another visa (including students who are in Canada on diplomatic, trade or other missions) %",Other %,"Permanent resident, formerly called landed immigrant %",Refugee status %,Student permit/ Student visa: a permit obtained by a student to enter Canada for the sole purpose of attending an educational postsecondary institution %,Unknown %
0,Algonquin College,2012,16068,279,0,14239,13,34,31,658,...,1.74,0.0,88.62,0.08,0.21,0.19,4.10,0.16,4.90,0.00
1,Algonquin College,2013,16844,290,0,14757,15,43,34,753,...,1.72,0.0,87.61,0.09,0.26,0.20,4.47,0.13,5.52,
2,Algonquin College,2014,17025,322,0,14655,16,44,35,794,...,1.89,0.0,86.08,0.09,0.26,0.21,4.66,0.08,6.73,
3,Algonquin College,2015,17435,361,0,14885,15,39,31,788,...,2.07,0.0,85.37,0.09,0.22,0.18,4.52,0.10,7.44,
4,Algonquin College,2016,17385,275,0,14601,26,36,41,801,...,1.58,,83.99,0.15,0.21,0.24,4.61,0.13,9.11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,St. Lawrence College,2018,8792,459,0,5060,0,0,5,158,...,5.22,,57.55,,0.00,0.06,1.80,,34.53,0.84
260,St. Lawrence College,2019,9014,423,0,4812,0,0,5,165,...,4.69,,53.38,,,0.06,1.83,,39.52,0.52
261,St. Lawrence College,2020,9229,337,0,4346,0,0,5,154,...,3.65,,47.09,,,0.05,1.67,,46.95,0.59
262,St. Lawrence College,2021,9039,310,0,4137,0,5,5,138,...,3.43,,45.77,,0.06,0.06,1.53,,48.80,0.37


In [126]:
status_df_processed.columns

Index(['College Name', 'FY Start', 'Total Headcount',
       'Aboriginal ancestry (where self-identified)',
       'Attending an off-shore school', 'Canadian citizen',
       'Convention refugee (protected person)',
       'In Canada on authority of another visa (including students who are in Canada on diplomatic, trade or other missions)',
       'Other', 'Permanent resident, formerly called landed immigrant',
       'Refugee status',
       'Student permit/ Student visa: a permit obtained by a student to enter Canada for the sole purpose of attending an educational postsecondary institution',
       'Unknown', 'Aboriginal ancestry (where self-identified) %',
       'Attending an off-shore school %', 'Canadian citizen %',
       'Convention refugee (protected person) %',
       'In Canada on authority of another visa (including students who are in Canada on diplomatic, trade or other missions) %',
       'Other %', 'Permanent resident, formerly called landed immigrant %',
       'Refu

In [127]:
import re

def rename_columns(df):
    """
    Renames columns in the DataFrame based on specified terms.
    
    Parameters:
    - df: DataFrame with columns to rename
    
    Returns:
    - DataFrame with renamed columns
    """
    
    # Define a mapping of keywords to new names
    rename_map = {
        'Aboriginal': 'Aboriginal',
        'Permanent Resident': 'PR',
        'Canadian Citizen': 'Canadian',
        'off-shore': 'Off-shore',
        'refugee': 'Refugee',
        'Student Permit|Student Visa': 'International',
        'another visa': 'Other visa'
    }
    
    # Go through each column in the DataFrame
    new_columns = []
    for col in df.columns:
        new_name = col  # Start with the original column name
        
        # Check if the column contains '%'; if so, leave it as part of the name
        if '%' in col:
            for pattern, replacement in rename_map.items():
                if re.search(pattern, col, re.IGNORECASE):
                    new_name = replacement + ' %'
                    break  # Stop once the first match is found
        else:
            # For non-percentage columns, replace with the mapped names
            for pattern, replacement in rename_map.items():
                if re.search(pattern, col, re.IGNORECASE):
                    new_name = replacement
                    break  # Stop once the first match is found

        # Append the final name to the new_columns list
        new_columns.append(new_name)
    
    # Update DataFrame columns
    df.columns = new_columns
    return df


In [128]:
status_df_processed = rename_columns(status_df_processed)

In [129]:
status_df_processed

Unnamed: 0,College Name,FY Start,Total Headcount,Aboriginal,Off-shore,Canadian,Refugee,Other visa,Other,PR,...,Aboriginal %,Off-shore %,Canadian %,Refugee %,Other visa %,Other %,PR %,Refugee %.1,International %,Unknown %
0,Algonquin College,2012,16068,279,0,14239,13,34,31,658,...,1.74,0.0,88.62,0.08,0.21,0.19,4.10,0.16,4.90,0.00
1,Algonquin College,2013,16844,290,0,14757,15,43,34,753,...,1.72,0.0,87.61,0.09,0.26,0.20,4.47,0.13,5.52,
2,Algonquin College,2014,17025,322,0,14655,16,44,35,794,...,1.89,0.0,86.08,0.09,0.26,0.21,4.66,0.08,6.73,
3,Algonquin College,2015,17435,361,0,14885,15,39,31,788,...,2.07,0.0,85.37,0.09,0.22,0.18,4.52,0.10,7.44,
4,Algonquin College,2016,17385,275,0,14601,26,36,41,801,...,1.58,,83.99,0.15,0.21,0.24,4.61,0.13,9.11,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,St. Lawrence College,2018,8792,459,0,5060,0,0,5,158,...,5.22,,57.55,,0.00,0.06,1.80,,34.53,0.84
260,St. Lawrence College,2019,9014,423,0,4812,0,0,5,165,...,4.69,,53.38,,,0.06,1.83,,39.52,0.52
261,St. Lawrence College,2020,9229,337,0,4346,0,0,5,154,...,3.65,,47.09,,,0.05,1.67,,46.95,0.59
262,St. Lawrence College,2021,9039,310,0,4137,0,5,5,138,...,3.43,,45.77,,0.06,0.06,1.53,,48.80,0.37
