## Test the function for preprocessing

In [3]:
# open the excel file on 22-23 Stats sheet as a pandas dataframe
# imports

import openpyxl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# load and read the Status sheet from the workbook
status_df = pd.read_excel('/Users/thomasdoherty/Desktop/canadian-psi-project/psi_data/cleaning_copy_excel/on_college_2012-/2012-2022 college_enrolment_headcount.xlsx', sheet_name='Status')

In [6]:
status_df

Unnamed: 0,College Name,Fiscal Year,Status In Canada Description,Headcount Full-Time Fall,Unnamed: 4
0,Algonquin College,2012-2013,Aboriginal ancestry (where self-identified),279,
1,Algonquin College,2012-2013,Attending an off-shore school,0,
2,Algonquin College,2012-2013,Canadian citizen,14239,
3,Algonquin College,2012-2013,Convention refugee (protected person),13,
4,Algonquin College,2012-2013,In Canada on authority of another visa (includ...,34,
...,...,...,...,...,...
2010,St. Lawrence College,2022-2023,In Canada on authority of another visa (includ...,*,
2011,St. Lawrence College,2022-2023,Other,*,
2012,St. Lawrence College,2022-2023,"Permanent resident, formerly called landed imm...",165,
2013,St. Lawrence College,2022-2023,Student permit/ Student visa: a permit obtaine...,6939,


In [9]:
status_df.drop("Unnamed: 4", inplace=True, axis=1)

In [10]:
status_df

Unnamed: 0,College Name,Fiscal Year,Status In Canada Description,Headcount Full-Time Fall
0,Algonquin College,2012-2013,Aboriginal ancestry (where self-identified),279
1,Algonquin College,2012-2013,Attending an off-shore school,0
2,Algonquin College,2012-2013,Canadian citizen,14239
3,Algonquin College,2012-2013,Convention refugee (protected person),13
4,Algonquin College,2012-2013,In Canada on authority of another visa (includ...,34
...,...,...,...,...
2010,St. Lawrence College,2022-2023,In Canada on authority of another visa (includ...,*
2011,St. Lawrence College,2022-2023,Other,*
2012,St. Lawrence College,2022-2023,"Permanent resident, formerly called landed imm...",165
2013,St. Lawrence College,2022-2023,Student permit/ Student visa: a permit obtaine...,6939


In [None]:
# define the preprocessing function

def preprocess_student_data(df, fiscal_year_col, group_col, count_col, new_total_col="School's Total Headcount This Year"):
    """
    Preprocess the student demographic data by transforming and restructuring the dataframe.
    
    Parameters:
    - df: DataFrame to preprocess
    - fiscal_year_col: str, column name for fiscal year
    - group_col: str, column name for the demographic category (e.g., Gender, Status in Canada)
    - count_col: str, column name for the student count in each demographic group
    - new_total_col: str, optional, name for the column to store the total full-time enrollment
    
    Returns:
    - DataFrame: Transformed DataFrame with pivoted percentage shares for each demographic category
    """
    
    # Step 1: Replace '*' representing 0-9 in the count data with an estimate of 5
    df[count_col] = df[count_col].replace('*', 5).astype(float)
    
    # Step 2: Shorten the fiscal year format (e.g., 2012-2013 -> 12-13)
    df[fiscal_year_col] = df[fiscal_year_col].apply(lambda x: f"{x[2:4]}-{x[7:]}")
    
    # Step 3: Calculate total enrollment per college and fiscal year
    df[new_total_col] = df.groupby(['College Name', fiscal_year_col])[count_col].transform('sum')
    
    # Step 4: Calculate percentage share of headcount for each demographic category
    df['% Share of Headcount'] = (df[count_col] / df[new_total_col]) * 100
    
    # Step 5: Pivot the table so that each demographic group becomes its own column
    df_pivot = df.pivot_table(
        index=['College Name', fiscal_year_col, new_total_col],
        columns=group_col,
        values='% Share of Headcount'
    ).reset_index()
    
    # Step 6: Rename the pivoted columns to include % sign for clarity
    df_pivot.columns = [f"{col} %" if col not in ['College Name', fiscal_year_col, new_total_col] else col for col in df_pivot.columns]
    
    return df_pivot



In [12]:
# Assuming status_df is the DataFrame for the Status in Canada sheet
status_df_processed = preprocess_student_data(
    df=status_df,
    fiscal_year_col='Fiscal Year',
    group_col='Status In Canada Description',
    count_col='Headcount Full-Time Fall'
)


  df[count_col] = df[count_col].replace('*', 5).astype(float)


In [13]:
status_df_processed

Unnamed: 0,College Name,Fiscal Year,School's Total Headcount This Year,Aboriginal ancestry (where self-identified) %,Attending an off-shore school %,Canadian citizen %,Convention refugee (protected person) %,"In Canada on authority of another visa (including students who are in Canada on diplomatic, trade or other missions) %",Other %,"Permanent resident, formerly called landed immigrant %",Refugee status %,Student permit/ Student visa: a permit obtained by a student to enter Canada for the sole purpose of attending an educational postsecondary institution %,Unknown %
0,Algonquin College,12-13,16068.0,1.736370,0.0,88.617127,0.080906,0.211601,0.192930,4.095096,0.161812,4.904157,0.000000
1,Algonquin College,13-14,16844.0,1.721681,0.0,87.609831,0.089052,0.255284,0.201852,4.470435,0.130610,5.521254,
2,Algonquin College,14-15,17025.0,1.891336,0.0,86.079295,0.093979,0.258443,0.205580,4.663730,0.076358,6.731278,
3,Algonquin College,15-16,17435.0,2.070548,0.0,85.374247,0.086034,0.223688,0.177803,4.519644,0.103241,7.444795,
4,Algonquin College,16-17,17385.0,1.581823,,83.986195,0.149554,0.207075,0.235835,4.607420,0.126546,9.105551,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,St. Lawrence College,18-19,8792.0,5.220655,,57.552320,,0.000000,0.056870,1.797088,,34.531392,0.841674
260,St. Lawrence College,19-20,9014.0,4.692700,,53.383625,,,0.055469,1.830486,,39.516308,0.521411
261,St. Lawrence College,20-21,9229.0,3.651533,,47.090692,,,0.054177,1.668653,,46.949832,0.585112
262,St. Lawrence College,21-22,9039.0,3.429583,,45.768337,,0.055316,0.055316,1.526718,,48.799646,0.365085
