# StatCan Data on Credential type across institutions from 2011-2021

As Canadian PSIs have declined in domestic enrolment and replaced this with international students, has their credential offering changed as a result?

In [158]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [159]:
# Reading in the CSV
df = pd.read_csv("/Users/thomasdoherty/Desktop/canadian-psi-project/psi_data/statcan_data/statcan-credentials.csv", encoding='utf-8')

In [160]:
df.sample(10)

Unnamed: 0,REF_DATE,GEO,DGUID,Field of study,Program type,Credential type,Institution type,Registration status,Status of student in Canada,Gender,...,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
11395,2020/2021,"Olds College, Alberta",,"Total, field of study","Total, program type",Diploma,"Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1444722098,214.1.1.4.1.2.3.1,63.0,,,,0
765,2018/2019,"Acadia University, Nova Scotia",,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1439544187,10.1.1.3.1.2.3.1,6.0,,,,0
8187,2012/2013,Loyalist College of Applied Arts and Technolog...,,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1443109171,158.1.1.3.1.2.2.1,246.0,,,,0
3335,2016/2017,"Télé-université, Quebec",,"Total, field of study","Total, program type",Degree (includes applied degree),"Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1440842975,47.1.1.5.1.2.3.1,3.0,,,,0
12602,2018/2019,"Simon Fraser University, British Columbia",,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1445259134,230.1.1.3.1.2.2.1,543.0,,,,0
10474,2012/2013,Alberta,2021A000248,"Total, field of study","Total, program type","Not applicable, credential type","Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1444313660,199.1.1.9.1.2.3.1,1386.0,,,,0
8172,2021/2022,Georgian College of Applied Arts and Technolog...,,"Total, field of study","Total, program type",Degree (includes applied degree),"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1443069471,157.1.1.5.1.2.2.1,720.0,,,,0
11599,2012/2013,"Lethbridge College, Alberta",,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1444766560,217.1.1.3.1.2.2.1,249.0,,,,0
7612,2015/2016,Algonquin College of Applied Arts and Technolo...,,"Total, field of study","Total, program type",Degree (includes applied degree),"Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1442726651,149.1.1.5.1.2.3.1,105.0,,,,0
1645,2016/2017,"University of New Brunswick, New Brunswick",,"Total, field of study","Total, program type",Diploma,"Total, institution type",Full-time student,International students,"Total, gender",...,223,units,0,v1439890715,24.1.1.4.1.2.3.1,3.0,,,,0


In [161]:
print(df.columns)

Index(['REF_DATE', 'GEO', 'DGUID', 'Field of study', 'Program type',
       'Credential type', 'Institution type', 'Registration status',
       'Status of student in Canada', 'Gender', 'UOM', 'UOM_ID',
       'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS',
       'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')


drop unnecessary columns

In [162]:
df.drop(['DGUID', 'Field of study', 'Program type', 'Institution type', 'Registration status', 'Gender', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'], axis=1, inplace=True)

In [163]:
df.sample(5)

Unnamed: 0,REF_DATE,GEO,Credential type,Status of student in Canada,VALUE
8219,2013/2014,Loyalist College of Applied Arts and Technolog...,Diploma,International students,72.0
9896,2013/2014,"L'École Technique et Professionnelle, Manitoba",Certificate,Canadian students,24.0
4178,2012/2013,"CÉGEP de Matane, Quebec","Not applicable, credential type",Canadian students,24.0
820,2019/2020,"Atlantic School of Theology, Nova Scotia",Certificate,Canadian students,0.0
10435,2017/2018,Alberta,Diploma,International students,3714.0


Rename columns so it is clearer to read - same processing as the international/domestic split

In [164]:
# rename columns
df.rename(columns={"REF_DATE": "FY Start","GEO": "School/Locality", "Status of student in Canada": "Status", "VALUE": "Enrolment"}, inplace=True)

In [165]:
df["FY Start"] = df["FY Start"].apply(lambda x: int(x[:4]))

split up the provincial data from individual schools

In [166]:
# List of provinces and territories in Canada
canadian_provinces_territories = [
    "Alberta", "British Columbia", "Manitoba", "New Brunswick", "Newfoundland and Labrador",
    "Nova Scotia", "Ontario", "Prince Edward Island", "Quebec", "Saskatchewan",
    "Northwest Territories", "Nunavut", "Yukon", "Canada"
]

# Convert the list to a set for fast exact matching
province_set = set(canadian_provinces_territories)

# Create a mask for exact matches with Canada or any province/territory
exact_match_mask = df['School/Locality'].isin(province_set)

# Create the Canada & Provinces DataFrame (exact matches)
canada_df = df[exact_match_mask]

# For rows that do not match exactly, extract the college/university name before the last comma
# We will create a mask where School/Locality does not match any province or Canada
schools_df = df[~exact_match_mask]

# Split 'School/Locality' by ", " from the right and extract school name and province/territory, delimiter is ', '
schools_df[['Institution Name', 'Province/Territory']] = schools_df['School/Locality'].str.rsplit(", ", n=1, expand=True)

# Drop the original 'School/Locality' column if no longer needed
schools_df.drop(columns=['School/Locality'], inplace=True)



# Now, `canada_df` contains rows where School/Locality is exactly a province or territory, and
# `schools_df` contains rows with college/university names (before the last comma)

# Display the results
print(f"Number of rows in canada_df: {len(canada_df)}")
print(f"Number of rows in colleges_universities_df: {len(schools_df)}")

Number of rows in canada_df: 1313
Number of rows in colleges_universities_df: 13903


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_df[['Institution Name', 'Province/Territory']] = schools_df['School/Locality'].str.rsplit(", ", n=1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_df[['Institution Name', 'Province/Territory']] = schools_df['School/Locality'].str.rsplit(", ", n=1, expand=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_df.drop(co

In [167]:
# move Institution Name to the front of the DataFrame
cols = schools_df.columns.tolist()
cols = cols[-2:] + cols[:-2] # this just redefines cols as the last element of cols first, then the rest of the elements in the same order
schools_df = schools_df[cols]

# light cleaing - remove "of Applied Arts & Technology"ArithmeticError# Remove "of Applied Arts and Technology" from all school names
schools_df['Institution Name'] = schools_df['Institution Name'].str.replace(
    " of Applied Arts and Technology", "", regex=False
)

schools_df['Institution Name'] = schools_df['Institution Name'].str.replace(
    " Institute of Technology and Advanced Learning", "", regex=False
)

In [168]:
schools_df.sample(10)

Unnamed: 0,Institution Name,Province/Territory,FY Start,Credential type,Status,Enrolment
13143,Douglas College,British Columbia,2020,Associate degree,Canadian students,1353.0
2004,New Brunswick College of Craft and Design,New Brunswick,2015,Other type of credential associated with a pro...,Canadian students,3.0
6439,Laurentian University,Ontario,2020,"Not applicable, credential type",Canadian students,15.0
11838,Keyano College,Alberta,2018,"Not applicable, credential type",Canadian students,528.0
12026,Northern Alberta Institute of Technology,Alberta,2019,"Not applicable, credential type",International students,48.0
5458,Collège LaSalle,Quebec,2019,"Not applicable, credential type",International students,102.0
287,Memorial University of Newfoundland,Newfoundland and Labrador,2017,Degree (includes applied degree),Canadian students,11397.0
5986,Collège TAV,Quebec,2021,"Not applicable, credential type",International students,3.0
7022,Western University,Ontario,2017,Degree (includes applied degree),International students,4383.0
15082,Aurora College,Northwest Territories,2012,Certificate,Canadian students,39.0


In [169]:
# rename the School/Locality column to Province in canada_df
canada_df.rename(columns={"School/Locality": "Province/Territory"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canada_df.rename(columns={"School/Locality": "Province/Territory"}, inplace=True)


In [170]:
canada_df.sample(5)

Unnamed: 0,FY Start,Province/Territory,Credential type,Status,Enrolment
12387,2019,British Columbia,Other type of credential associated with a pro...,Canadian students,3048.0
194,2018,Newfoundland and Labrador,Degree (includes applied degree),International students,2862.0
652,2011,Nova Scotia,Certificate,Canadian students,2367.0
6067,2018,Ontario,Degree (includes applied degree),Canadian students,406830.0
15055,2019,Northwest Territories,Diploma,Canadian students,90.0


Add Francophone tag

In [172]:
# List of strings that indicate the school is Francophone
francophone_schools = [
    "Université Sainte-Anne", "Collège Boréal", "Collège d'Alfred", 
    "Collège dominicain", "La Cité collégiale", "Université de Hearst", 
    "Université de l'Ontario français", "Université de Moncton", 
    "Collège Communautaire du Nouveau-Brunswick", "Collège de l'Île", "L'École Technique et Professionnelle"
]

# Create the "Francophone" column with 0 as the default value
schools_df['Francophone'] = 0

# Update the "Francophone" column based on the School Name
schools_df['Francophone'] = schools_df.apply(
    lambda row: 1 if (
        any(francophone in row['Institution Name'] for francophone in francophone_schools) or 
        (row['Province/Territory'] == "Quebec" and "McGill University" not in row['Institution Name'])
    ) else 0,
    axis=1
)

# Display the updated DataFrame
print(schools_df[['Institution Name', 'Province/Territory', 'Francophone']].head())


                        Institution Name         Province/Territory  \
238  Memorial University of Newfoundland  Newfoundland and Labrador   
239  Memorial University of Newfoundland  Newfoundland and Labrador   
240  Memorial University of Newfoundland  Newfoundland and Labrador   
241  Memorial University of Newfoundland  Newfoundland and Labrador   
242  Memorial University of Newfoundland  Newfoundland and Labrador   

     Francophone  
238            0  
239            0  
240            0  
241            0  
242            0  


In [174]:
schools_df.sample(5)

Unnamed: 0,Institution Name,Province/Territory,FY Start,Credential type,Status,Enrolment,Francophone
5690,Collège Stanislas inc.,Quebec,2011,"Not applicable, credential type",International students,42.0,1
5039,Collège Heritage / Heritage College,Quebec,2018,Diploma,Canadian students,1050.0,1
11084,Grant MacEwan University,Alberta,2015,"Not applicable, credential type",International students,360.0,0
5295,École de Musique Vincent d'Indy,Quebec,2014,Diploma,Canadian students,39.0,1
8316,Niagara College Canada,Ontario,2020,Certificate,Canadian students,807.0,0


## National & Provincial level changes in credentials offered 2011-2021

In [176]:
canada_df

Unnamed: 0,FY Start,Province/Territory,Credential type,Status,Enrolment
0,2011,Canada,Certificate,Canadian students,68367.0
1,2012,Canada,Certificate,Canadian students,69231.0
2,2013,Canada,Certificate,Canadian students,74811.0
3,2014,Canada,Certificate,Canadian students,74370.0
4,2015,Canada,Certificate,Canadian students,71352.0
...,...,...,...,...,...
15166,2017,Nunavut,"Not applicable, credential type",Canadian students,99.0
15167,2018,Nunavut,"Not applicable, credential type",Canadian students,117.0
15168,2019,Nunavut,"Not applicable, credential type",Canadian students,129.0
15169,2020,Nunavut,"Not applicable, credential type",Canadian students,147.0


Let's pivot the table so that every row is a different province/territory in a given year for either Canadian or international students, columns will be certificate / diploma / degree

In [178]:
# List of territories to exclude - numbers are too small
territories = ["Yukon", "Northwest Territories", "Nunavut"]

canada_df = canada_df[
    ~canada_df['Province/Territory'].isin(territories)                     
]

# pivot the table for separate columns for each unique credential type
canada_df = canada_df.pivot_table(
    index=['FY Start', 'Province/Territory'], # specify what is staying the same
    columns='Credential type', # specify what is being pivoted
    values='Enrolment', # specifying the values to fill the new pivoted columns
    aggfunc='sum', 
    fill_value=0 # replace NaN with 0
).reset_index()

# rename the index column
canada_df.columns.name = None

In [179]:
canada_df

Unnamed: 0,FY Start,Province/Territory,Associate degree,Certificate,Degree (includes applied degree),Diploma,"Not applicable, credential type",Other type of credential associated with a program
0,2011,Alberta,0.0,8664.0,93621.0,27378.0,21093.0,0.0
1,2011,British Columbia,7968.0,11004.0,86202.0,27714.0,3489.0,6288.0
2,2011,Canada,7968.0,75144.0,886653.0,390870.0,68448.0,35484.0
3,2011,Manitoba,0.0,3072.0,29652.0,4890.0,5820.0,1092.0
4,2011,New Brunswick,0.0,2655.0,17931.0,3810.0,870.0,0.0
...,...,...,...,...,...,...,...,...
116,2021,Nova Scotia,0.0,1179.0,36012.0,6876.0,1344.0,15.0
117,2021,Ontario,0.0,67830.0,508533.0,166200.0,10551.0,1785.0
118,2021,Prince Edward Island,0.0,495.0,4749.0,1617.0,99.0,0.0
119,2021,Quebec,0.0,9366.0,191985.0,172713.0,14547.0,0.0


## Visualising school changes in credentials offered over time

Similar to the international/domestic notebook, we probably want to ignore the territories for now due to low numbers here.

We will also now pivot the table so that every school and every province shows enrolment by unique credential type as a column. 

First find the full name of what is listed in Credential type

In [None]:
# List of territories to exclude
territories = ["Yukon", "Northwest Territories", "Nunavut"]

# Step 1: Filter out territories from colleges_universities_df and include 'Province/Territory' in the index
schools_df = schools_df[
    (colleges_universities_df['Status'] == 'Canadian students') &
    (~colleges_universities_df['Province/Territory'].isin(territories))
].pivot_table(
    index=['School Name', 'Province/Territory', 'FY Start'],
    values='Enrolment',
    aggfunc='sum'
).reset_index()

# Rename the 'Enrolment' column to 'Domestic Enrolment'
domestic_df.rename(columns={'Enrolment': 'Domestic Enrolment'}, inplace=True)

# Step 2: Filter out territories from colleges_universities_df_intl and include 'Province/Territory' in the index
international_df = colleges_universities_df_intl[
    (colleges_universities_df_intl['Status'] == 'International students') &
    (~colleges_universities_df_intl['Province/Territory'].isin(territories))
].pivot_table(
    index=['School Name', 'Province/Territory', 'FY Start'],
    values='Enrolment',
    aggfunc='sum'
).reset_index()

# Rename the 'Enrolment' column to 'International Enrolment'
international_df.rename(columns={'Enrolment': 'International Enrolment'}, inplace=True)

# Step 3: Join the two DataFrames on 'School Name', 'Province/Territory', and 'FY Start'
combined_df = pd.merge(
    domestic_df,
    international_df,
    on=['School Name', 'Province/Territory', 'FY Start'],
    how='outer'
)

# Fill any NaN values with 0, as some institutions might not have data for both domestic and international enrolments
combined_df.fillna(0, inplace=True)

# Display the combined DataFrame
print(combined_df)

NameError: name 'colleges_universities_df' is not defined