In [301]:
# 3rd Party Imports
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt 
import numpy as np
import matplotlib as plt
import geopandas as gpd
from fuzzywuzzy import fuzz, process

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')

In [302]:
# Import All Data
# The three CSV inputs are read in one pass with geopandas and bound, in order,
# to the EQAO results, the OSSLT results and the secondary-school table.
eqao, osslt, high = (
    gpd.read_file(csv_path)
    for csv_path in (
        "csv/2021_EQAO.csv",
        "csv/2021_OSSLT.csv",
        "csv/secondary_schools_combined.csv",
    )
)

In [303]:
# Clean High School Ranking Data
# Keep only the rows whose "field_4" value is "Toronto", discard "field_5" and
# "field_6", and renumber the surviving rows from zero.
toronto_rows = high["field_4"] == "Toronto"
high = (
    high.loc[toronto_rows]
    .drop(columns=["field_5", "field_6"])
    .reset_index(drop=True)
)

# Name the four remaining columns, then remove "City" (the column that was
# just used for the Toronto filter).
high.columns = ["School Name", "Score", "Rank", "City"]
high = high.drop(columns=["City"])
high.head(30)

Unnamed: 0,School Name,Score,Rank
0,A Y Jackson,7.9,77/689
1,Agincourt,8.3,41/689
2,Albert Campbell,7.0,210/689
3,Archbishop Romero,1.7,674/689
4,Birchmount Park,5.2,496/689
5,Bishop Allen,8.0,66/689
6,Bishop Marrocco/Thomas Merton,3.8,611/689
7,Blessed Mother Teresa,3.9,606/689
8,Bloor,8.5,26/689
9,Brebeuf,6.1,357/689


In [304]:
# Clean EQAO DataFrame
# Drop every row that contains a missing value, then show the remaining
# (rows, columns). A notebook cell only displays its final expression, so the
# former intermediate `eqao.head(30)` produced no output and has been removed.
eqao = eqao.dropna()
eqao.shape

(69, 5)

In [305]:
# Clean OSSLT Data
# Drop every row that contains a missing value, then show the remaining
# (rows, columns). A notebook cell only displays its final expression, so the
# former intermediate `osslt.head(30)` produced no output and has been removed.
osslt = osslt.dropna()
osslt.shape

(104, 6)

In [306]:
# Rename Columns
# Tag every column with the name of its source table so the two tables can be
# told apart after they are joined. 'School Name' is left untouched because it
# is the join key.
# Note: running this cell a second time prefixes the columns again.
for frame, tag in ((eqao, 'EQAO_'), (osslt, 'OSSLT_')):
    frame.columns = [
        name if name == 'School Name' else tag + name
        for name in frame.columns
    ]

In [307]:
# Join the EQAO and OSSLT tables on 'School Name'.
# No name normalization is applied before the join, so a school is kept only
# when its name is identical in both frames (inner join, the pandas default).
# NOTE(review): in the saved output below, rows 59-62 show a single value
# repeated across every OSSLT column (e.g. 257, 257, 257, ...) -- worth
# checking those rows in the OSSLT CSV.
merged_df = pd.merge(left=eqao, right=osslt, how="inner", on="School Name")

# Display the merged dataframe
merged_df

Unnamed: 0,School Name,EQAO_Total # of\nStudents,EQAO_# of\nParticipating\nStudents,EQAO_# of\nStudents at\nLevels 3/4,EQAO_% at Levels\n3/4,OSSLT_Number of\nPreviously\nEligible\nStudents,OSSLT_Absent\n(All\nPreviously\nEligible\nStudents),OSSLT_Deferred\n(All\nPreviously\nEligible\nStudents),OSSLT_Successful\n(Fully\nParticipating),OSSLT_Not\nSuccessful\n(Fully\nParticipating)
0,A Y Jackson Secondary School,112,103,96,93%,312,8%,0%,88%,12%
1,Agincourt Collegiate Institute,184,172,125,73%,388,1%,<1%,94%,6%
2,Albert Campbell Collegiate Institute,169,158,100,63%,261,4%,7%,88%,12%
3,Birchmount Park Collegiate Institute,104,87,28,32%,169,22%,1%,79%,21%
4,Bloor Collegiate Institute,78,75,59,79%,197,4%,0%,94%,6%
...,...,...,...,...,...,...,...,...,...,...
59,Wexford Collegiate School for the Arts,118,107,45,42%,257,257,257,257,257
60,Winston Churchill Collegiate Institute,64,54,11,20%,173,173,173,173,173
61,Woburn Collegiate Institute,92,83,49,59%,247,247,247,247,247
62,York Memorial Collegiate Institute,67,63,21,33%,210,210,210,210,210


In [308]:
# Compare the school names in merged_df (full names) with those in high
# (short names). Both sides are lower-cased so the comparison ignores case.
merged_schools = merged_df['School Name'].str.lower()
high_schools = high['School Name'].str.lower()

# A short name from high counts as a match when it occurs as a literal
# substring of a full name in merged_df. regex=False matters here: without it
# pandas interprets characters such as '.', '(' or '+' inside a school name as
# regular-expression syntax.
common_schools = []        # full names (merged_df) hit by at least one short name
matched_partials = set()   # short names (high) that hit at least one full name
for partial_name in high_schools:
    matches = merged_schools[
        merged_schools.str.contains(partial_name, case=False, na=False, regex=False)
    ]
    if not matches.empty:
        matched_partials.add(partial_name)
    common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (short names with no full match).
# The short names must be checked against the short names that matched:
# common_schools holds *full* names, so testing membership there would report
# matched schools as unmatched.
high_only_schools = [name for name in high_schools if name not in matched_partials]

# Print the results
print("Common Schools (partial match in high within full names in merged_df):", common_schools)
print("Schools only in merged_df:", merged_only_schools)
print("Schools only in high:", high_only_schools)

Common Schools (partial match in high within full names in merged_df): ['a y jackson secondary school', 'agincourt collegiate institute', 'albert campbell collegiate institute', 'birchmount park collegiate institute', 'bloor collegiate institute', 'c w jefferys collegiate institute', 'cedarbrae collegiate institute', 'central technical school', 'central toronto academy', 'don mills collegiate institute', 'downsview secondary school', 'dr norman bethune collegiate institute', 'earl haig secondary school', 'east york collegiate institute', 'etobicoke collegiate institute', 'etobicoke school of the arts', 'forest hill collegiate institute', 'george s henry academy', 'georges vanier secondary school', 'harbord collegiate institute', 'humberside collegiate institute', 'jarvis collegiate institute', 'john polanyi collegiate institute', 'kipling collegiate institute', "l'amoreaux collegiate institute", 'lakeshore collegiate institute', 'lawrence park collegiate institute', 'leaside high schoo

In [309]:
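# Problem Schools
# Greenwood Secondary School: Not included as it is only for Immigrant Students
# Heydon Park Secondary School: Not included as it is only for Young Women, Transgender and Non-Binary Students
# Native Learning Centre East: Not included as it is only for Indigenous Students
# Rosedale Heights School of the Arts: No substring match; assigned the Partial Name 'rosedale heights-arts' manually
# SATEC @ WA Porter Collegiate Institute: No substring match; assigned the Partial Name 'w a porter' manually
# Wexford Collegiate School for the Arts: No substring match; assigned the Partial Name 'wexford collegiate-arts' manually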

In [310]:
# Extract the 'School Name' columns, lower-cased for case-insensitive comparison
merged_schools = merged_df['School Name'].str.lower()
# Kept as a plain list because it is iterated repeatedly below
high_schools = high['School Name'].str.lower().tolist()

# A short name from high counts as a match when it occurs as a literal
# substring of a full name in merged_df. regex=False keeps characters such as
# '.', '(' or '+' in a school name from being read as regular-expression
# syntax, which also makes this loop agree with the plain `in` test used for
# the 'Partial Name' column further down.
common_schools = []        # full names (merged_df) hit by at least one short name
matched_partials = set()   # short names (high) that hit at least one full name
for partial_name in high_schools:
    matches = merged_schools[
        merged_schools.str.contains(partial_name, case=False, na=False, regex=False)
    ]
    if not matches.empty:
        matched_partials.add(partial_name)
    common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (short names with no full match).
# The short names must be checked against the short names that matched:
# common_schools holds *full* names, so testing membership there would report
# matched schools as unmatched.
high_only_schools = [name for name in high_schools if name not in matched_partials]

# Add a "Partial Name" column to merged_df: the first short name from high
# that is contained in the row's full name, or None when there is none.
merged_df['Partial Name'] = merged_df['School Name'].apply(
    lambda x: next((name for name in high_schools if name.lower() in x.lower()), None)
)

# Add a "Partial Name" column to high.
# NOTE(review): this searches for a full merged_df name *inside* the short
# ranking name, i.e. the opposite direction to the lookup above, so it can only
# succeed when the full name is itself contained in the short one. The saved
# output of the later merge shows this column ('Partial Name_y') empty, and a
# later cell drops it -- confirm whether it is still needed.
high['Partial Name'] = high['School Name'].apply(lambda x: next((name for name in merged_df['School Name'] if name.lower() in x.lower()), None))

# Print the results
print("Common Schools (partial match in high within full names in merged_df):", common_schools)
print("Schools only in merged_df:", merged_only_schools)
print("Schools only in high:", high_only_schools)

high["School Name"]


Common Schools (partial match in high within full names in merged_df): ['a y jackson secondary school', 'agincourt collegiate institute', 'albert campbell collegiate institute', 'birchmount park collegiate institute', 'bloor collegiate institute', 'c w jefferys collegiate institute', 'cedarbrae collegiate institute', 'central technical school', 'central toronto academy', 'don mills collegiate institute', 'downsview secondary school', 'dr norman bethune collegiate institute', 'earl haig secondary school', 'east york collegiate institute', 'etobicoke collegiate institute', 'etobicoke school of the arts', 'forest hill collegiate institute', 'george s henry academy', 'georges vanier secondary school', 'harbord collegiate institute', 'humberside collegiate institute', 'jarvis collegiate institute', 'john polanyi collegiate institute', 'kipling collegiate institute', "l'amoreaux collegiate institute", 'lakeshore collegiate institute', 'lawrence park collegiate institute', 'leaside high schoo

0            A Y Jackson
1              Agincourt
2        Albert Campbell
3      Archbishop Romero
4        Birchmount Park
             ...        
100    Winston Churchill
101               Woburn
102        York Memorial
103           York Mills
104        Étienne-Brûlé
Name: School Name, Length: 105, dtype: object

In [311]:
# Schools whose 'Partial Name' is set by hand, keyed by the exact
# 'School Name' value in merged_df.
special_partial_names = {
    'Rosedale Heights School of the Arts': 'rosedale heights-arts',
    'SATEC @ WA Porter Collegiate Institute': 'w a porter',
    'Wexford Collegiate School for the Arts': 'wexford collegiate-arts',
}
special_schools = list(special_partial_names.values())

# Overwrite 'Partial Name' on the row(s) whose full name matches each key.
for full_name, short_name in special_partial_names.items():
    merged_df.loc[merged_df['School Name'] == full_name, 'Partial Name'] = short_name

In [312]:
# Normalize the two join keys in place: missing values become "", everything
# is cast to str and lower-cased.
for frame, key in ((merged_df, "Partial Name"), (high, "School Name")):
    frame[key] = frame[key].fillna("").astype(str).str.lower()

# Left join: every row of merged_df is kept, and the columns of high are
# attached where merged_df's 'Partial Name' equals high's 'School Name'.
# Both frames carry 'School Name' and 'Partial Name' columns, so pandas
# suffixes them with _x (merged_df) and _y (high) in the result.
merged_result = pd.merge(
    left=merged_df,
    right=high,
    how="left",
    left_on="Partial Name",
    right_on="School Name",
)

# Display the merged dataframe
merged_result

Unnamed: 0,School Name_x,EQAO_Total # of\nStudents,EQAO_# of\nParticipating\nStudents,EQAO_# of\nStudents at\nLevels 3/4,EQAO_% at Levels\n3/4,OSSLT_Number of\nPreviously\nEligible\nStudents,OSSLT_Absent\n(All\nPreviously\nEligible\nStudents),OSSLT_Deferred\n(All\nPreviously\nEligible\nStudents),OSSLT_Successful\n(Fully\nParticipating),OSSLT_Not\nSuccessful\n(Fully\nParticipating),Partial Name_x,School Name_y,Score,Rank,Partial Name_y
0,A Y Jackson Secondary School,112,103,96,93%,312,8%,0%,88%,12%,a y jackson,a y jackson,7.9,77/689,
1,Agincourt Collegiate Institute,184,172,125,73%,388,1%,<1%,94%,6%,agincourt,agincourt,8.3,41/689,
2,Albert Campbell Collegiate Institute,169,158,100,63%,261,4%,7%,88%,12%,albert campbell,albert campbell,7,210/689,
3,Birchmount Park Collegiate Institute,104,87,28,32%,169,22%,1%,79%,21%,birchmount park,birchmount park,5.2,496/689,
4,Bloor Collegiate Institute,78,75,59,79%,197,4%,0%,94%,6%,bloor,bloor,8.5,26/689,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Wexford Collegiate School for the Arts,118,107,45,42%,257,257,257,257,257,wexford collegiate-arts,wexford collegiate-arts,6.6,273/689,
60,Winston Churchill Collegiate Institute,64,54,11,20%,173,173,173,173,173,winston churchill,winston churchill,3.4,628/689,
61,Woburn Collegiate Institute,92,83,49,59%,247,247,247,247,247,woburn,woburn,6.9,227/689,
62,York Memorial Collegiate Institute,67,63,21,33%,210,210,210,210,210,york memorial,york memorial,6.7,260/689,


In [313]:
# Remove the 'Partial Name_y' column first, then drop every row that still
# contains a missing value (which includes merged_df rows that found no
# partner in the left join).
merged_result = merged_result.drop(columns=["Partial Name_y"]).dropna()
merged_result


Unnamed: 0,School Name_x,EQAO_Total # of\nStudents,EQAO_# of\nParticipating\nStudents,EQAO_# of\nStudents at\nLevels 3/4,EQAO_% at Levels\n3/4,OSSLT_Number of\nPreviously\nEligible\nStudents,OSSLT_Absent\n(All\nPreviously\nEligible\nStudents),OSSLT_Deferred\n(All\nPreviously\nEligible\nStudents),OSSLT_Successful\n(Fully\nParticipating),OSSLT_Not\nSuccessful\n(Fully\nParticipating),Partial Name_x,School Name_y,Score,Rank
0,A Y Jackson Secondary School,112,103,96,93%,312,8%,0%,88%,12%,a y jackson,a y jackson,7.9,77/689
1,Agincourt Collegiate Institute,184,172,125,73%,388,1%,<1%,94%,6%,agincourt,agincourt,8.3,41/689
2,Albert Campbell Collegiate Institute,169,158,100,63%,261,4%,7%,88%,12%,albert campbell,albert campbell,7,210/689
3,Birchmount Park Collegiate Institute,104,87,28,32%,169,22%,1%,79%,21%,birchmount park,birchmount park,5.2,496/689
4,Bloor Collegiate Institute,78,75,59,79%,197,4%,0%,94%,6%,bloor,bloor,8.5,26/689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,Wexford Collegiate School for the Arts,118,107,45,42%,257,257,257,257,257,wexford collegiate-arts,wexford collegiate-arts,6.6,273/689
60,Winston Churchill Collegiate Institute,64,54,11,20%,173,173,173,173,173,winston churchill,winston churchill,3.4,628/689
61,Woburn Collegiate Institute,92,83,49,59%,247,247,247,247,247,woburn,woburn,6.9,227/689
62,York Memorial Collegiate Institute,67,63,21,33%,210,210,210,210,210,york memorial,york memorial,6.7,260/689


In [315]:
# Write the combined table to disk without the row index.
output_path = "combined_schools.csv"
merged_result.to_csv(output_path, index=False)