# StatCan Data on Credential type across institutions from 2011-2021

As Canadian PSIs have declined in domestic enrolment and replaced this with international students, has their credential offering changed as a result?

In [23]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Reading in the CSV
df = pd.read_csv("/Users/thomasdoherty/Desktop/canadian-psi-project/psi_data/statcan_data/statcan-credentials.csv", encoding='utf-8')

In [25]:
df

Unnamed: 0,REF_DATE,GEO,DGUID,Field of study,Program type,Credential type,Institution type,Registration status,Status of student in Canada,Gender,...,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2011/2012,Canada,2021A000011124,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1438513299,1.1.1.3.1.2.2.1,68367.0,,,,0
1,2012/2013,Canada,2021A000011124,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1438513299,1.1.1.3.1.2.2.1,69231.0,,,,0
2,2013/2014,Canada,2021A000011124,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1438513299,1.1.1.3.1.2.2.1,74811.0,,,,0
3,2014/2015,Canada,2021A000011124,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1438513299,1.1.1.3.1.2.2.1,74370.0,,,,0
4,2015/2016,Canada,2021A000011124,"Total, field of study","Total, program type",Certificate,"Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1438513299,1.1.1.3.1.2.2.1,71352.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15211,2017/2018,"Nunavut Arctic College, Nunavut",,"Total, field of study","Total, program type","Not applicable, credential type","Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1445971484,258.1.1.9.1.2.2.1,99.0,,,,0
15212,2018/2019,"Nunavut Arctic College, Nunavut",,"Total, field of study","Total, program type","Not applicable, credential type","Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1445971484,258.1.1.9.1.2.2.1,117.0,,,,0
15213,2019/2020,"Nunavut Arctic College, Nunavut",,"Total, field of study","Total, program type","Not applicable, credential type","Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1445971484,258.1.1.9.1.2.2.1,129.0,,,,0
15214,2020/2021,"Nunavut Arctic College, Nunavut",,"Total, field of study","Total, program type","Not applicable, credential type","Total, institution type",Full-time student,Canadian students,"Total, gender",...,223,units,0,v1445971484,258.1.1.9.1.2.2.1,147.0,,,,0


In [26]:
print(df.columns)

Index(['REF_DATE', 'GEO', 'DGUID', 'Field of study', 'Program type',
       'Credential type', 'Institution type', 'Registration status',
       'Status of student in Canada', 'Gender', 'UOM', 'UOM_ID',
       'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS',
       'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')


drop unnecessary columns

In [27]:
df.drop(['DGUID', 'Field of study', 'Program type', 'Institution type', 'Gender', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'], axis=1, inplace=True)

In [28]:
df

Unnamed: 0,REF_DATE,GEO,Credential type,Registration status,Status of student in Canada,VALUE
0,2011/2012,Canada,Certificate,Full-time student,Canadian students,68367.0
1,2012/2013,Canada,Certificate,Full-time student,Canadian students,69231.0
2,2013/2014,Canada,Certificate,Full-time student,Canadian students,74811.0
3,2014/2015,Canada,Certificate,Full-time student,Canadian students,74370.0
4,2015/2016,Canada,Certificate,Full-time student,Canadian students,71352.0
...,...,...,...,...,...,...
15211,2017/2018,"Nunavut Arctic College, Nunavut","Not applicable, credential type",Full-time student,Canadian students,99.0
15212,2018/2019,"Nunavut Arctic College, Nunavut","Not applicable, credential type",Full-time student,Canadian students,117.0
15213,2019/2020,"Nunavut Arctic College, Nunavut","Not applicable, credential type",Full-time student,Canadian students,129.0
15214,2020/2021,"Nunavut Arctic College, Nunavut","Not applicable, credential type",Full-time student,Canadian students,147.0


Rename columns so it is clearer to read - same processing as the international/domestic split

In [29]:
# rename columns
df.rename(columns={"REF_DATE": "FY Start","GEO": "School/Locality", "Status of student in Canada": "Status", "VALUE": "Enrolment"}, inplace=True)

In [30]:
df["FY Start"] = df["FY Start"].apply(lambda x: int(x[:4]))

split up the provincial data from individual schools

In [31]:
# List of provinces and territories in Canada
canadian_provinces_territories = [
    "Alberta", "British Columbia", "Manitoba", "New Brunswick", "Newfoundland and Labrador",
    "Nova Scotia", "Ontario", "Prince Edward Island", "Quebec", "Saskatchewan",
    "Northwest Territories", "Nunavut", "Yukon", "Canada"
]

# Convert the list to a set for fast exact matching
province_set = set(canadian_provinces_territories)

# Create a mask for exact matches with Canada or any province/territory
exact_match_mask = df['School/Locality'].isin(province_set)

# Create the Canada & Provinces DataFrame (exact matches)
canada_df = df[exact_match_mask]

# For rows that do not match exactly, extract the college/university name before the last comma
# We will create a mask where School/Locality does not match any province or Canada
schools_df = df[~exact_match_mask]

# Correctly extract the institution name (everything before the last comma)
# Ensure you use rsplit correctly, specifying the delimiter (comma) and maxsplit=1
schools_df['Institution Name'] = schools_df['School/Locality'].str.rsplit(',', n=1).str[0]

# Now, `canada_df` contains rows where School/Locality is exactly a province or territory, and
# `schools_df` contains rows with college/university names (before the last comma)
# If you want to merge both DataFrames back into a single one:
schools_df.drop(columns=['School/Locality'], inplace=True)

# Display the results
print(f"Number of rows in canada_df: {len(canada_df)}")
print(f"Number of rows in colleges_universities_df: {len(schools_df)}")

Number of rows in canada_df: 1313
Number of rows in colleges_universities_df: 13903


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_df['Institution Name'] = schools_df['School/Locality'].str.rsplit(',', n=1).str[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schools_df.drop(columns=['School/Locality'], inplace=True)


In [33]:
# move Institution Name to the front of the DataFrame
cols = schools_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
schools_df = schools_df[cols]

In [39]:
schools_df.sample(10)

Unnamed: 0,Institution Name,FY Start,Credential type,Registration status,Status,Enrolment
10599,Athabasca University,2015,Diploma,Full-time student,International students,3.0
5154,Collège Centennal / Centennial College,2014,Diploma,Full-time student,Canadian students,75.0
3192,Université du Québec à Rimouski,2019,"Not applicable, credential type",Full-time student,Canadian students,6.0
6783,University of Toronto,2020,Certificate,Full-time student,Canadian students,9.0
10821,University of Lethbridge,2016,Certificate,Full-time student,Canadian students,9.0
10724,University of Calgary,2015,"Not applicable, credential type",Full-time student,International students,309.0
12886,British Columbia Institute of Technology,2015,Diploma,Full-time student,Canadian students,4977.0
5220,Collège André-Grasset,2013,Diploma,Full-time student,Canadian students,1212.0
10391,Great Plains College,2017,"Not applicable, credential type",Full-time student,Canadian students,12.0
6962,University of Waterloo,2021,Degree (includes applied degree),Full-time student,International students,8619.0


In [38]:
canada_df.sample(10)

Unnamed: 0,FY Start,School/Locality,Credential type,Registration status,Status,Enrolment
10430,2012,Alberta,Diploma,Full-time student,International students,2517.0
9152,2016,Manitoba,Other type of credential associated with a pro...,Full-time student,Canadian students,948.0
50,2017,Canada,Degree (includes applied degree),Full-time student,Canadian students,839289.0
6091,2020,Ontario,Other type of credential associated with a pro...,Full-time student,Canadian students,1422.0
55,2011,Canada,Degree (includes applied degree),Full-time student,International students,86757.0
10479,2017,Alberta,"Not applicable, credential type",Full-time student,International students,1608.0
67,2012,Canada,Associate degree,Full-time student,Canadian students,9510.0
9163,2016,Manitoba,Other type of credential associated with a pro...,Full-time student,International students,93.0
9086,2016,Manitoba,Certificate,Full-time student,International students,627.0
705,2020,Nova Scotia,Degree (includes applied degree),Full-time student,Canadian students,26622.0


In [40]:
# rename the School/Locality column to Province in canada_df
canada_df.rename(columns={"School/Locality": "Province"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canada_df.rename(columns={"School/Locality": "Province"}, inplace=True)


In [42]:
canada_df.sample(5)

Unnamed: 0,FY Start,Province,Credential type,Registration status,Status,Enrolment
12356,2021,British Columbia,Degree (includes applied degree),Full-time student,International students,25836.0
15074,2015,Northwest Territories,"Not applicable, credential type",Full-time student,Canadian students,3.0
79,2013,Canada,Associate degree,Full-time student,International students,828.0
6103,2021,Ontario,Other type of credential associated with a pro...,Full-time student,International students,291.0
9084,2014,Manitoba,Certificate,Full-time student,International students,261.0
