In [1]:
# 3rd Party Imports
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt 
import numpy as np
import matplotlib as plt
import geopandas as gpd
from fuzzywuzzy import fuzz, process

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path



In [2]:
# Load the cleaned EQAO, OSSLT and high-school CSVs.
# Paths are resolved relative to the parent of the current working directory.
current_dir = Path.cwd()
parent_dir = current_dir.parent
cleaned_csv_dir = parent_dir / "2_data_cleaning" / "cleaned_csv"

eqao = pd.read_csv(cleaned_csv_dir / "EQAO_final.csv")
osslt = pd.read_csv(cleaned_csv_dir / "OSSLT_final.csv")
high = pd.read_csv(cleaned_csv_dir / "Highschool_final.csv")

In [3]:
# Join the EQAO and OSSLT frames on their shared 'School Name' column
# (inner join, which is the merge default), then display the result.
merged_df = eqao.merge(osslt, on='School Name')

merged_df

Unnamed: 0,School Name,EQAO % at Levels 3/4,OSSLT % Successful
0,A Y Jackson Secondary School,93.0,88.0
1,Agincourt Collegiate Institute,73.0,94.0
2,Albert Campbell Collegiate Institute,63.0,88.0
3,Birchmount Park Collegiate Institute,32.0,79.0
4,Bloor Collegiate Institute,79.0,94.0
...,...,...,...
59,Wexford Collegiate School for the Arts,42.0,86.0
60,Winston Churchill Collegiate Institute,20.0,70.0
61,Woburn Collegiate Institute,59.0,82.0
62,York Memorial Collegiate Institute,33.0,83.0


In [4]:
# investigate issues
# Compare the school names in merged_df with the names in high, where a name in high
# counts as a match when it appears inside a merged_df name (case-insensitive).

merged_schools = merged_df['School Name'].str.lower()
high_schools = high['School Name'].str.lower()

# Find common schools (partial match in high within full names in merged_df)
common_schools = []         # merged_df names that contain at least one name from high
matched_partial_names = []  # names from high that were found inside at least one merged_df name
for partial_name in high_schools:
    # regex=False: treat the name as literal text, so characters such as "." or "("
    # in a school name are not interpreted as regular-expression syntax.
    matches = merged_schools[merged_schools.str.contains(partial_name, case=False, na=False, regex=False)]
    if not matches.empty:
        matched_partial_names.append(partial_name)
    common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (these are the partial names with no full match).
# The comparison must be against the names from high that matched; common_schools holds
# merged_df names, so comparing against it would flag matched partial names as unmatched.
high_only_schools = high_schools[~high_schools.isin(matched_partial_names)].tolist()

# Print the results
print("Common Schools (partial match in high within full names in merged_df):", common_schools)
print("Schools only in merged_df:", merged_only_schools)
print("Schools only in high:", high_only_schools)

Common Schools (partial match in high within full names in merged_df): ['a y jackson secondary school', 'agincourt collegiate institute', 'albert campbell collegiate institute', 'birchmount park collegiate institute', 'bloor collegiate institute', 'c w jefferys collegiate institute', 'cedarbrae collegiate institute', 'central technical school', 'central toronto academy', 'don mills collegiate institute', 'downsview secondary school', 'dr norman bethune collegiate institute', 'earl haig secondary school', 'east york collegiate institute', 'etobicoke collegiate institute', 'etobicoke school of the arts', 'forest hill collegiate institute', 'george s henry academy', 'georges vanier secondary school', 'harbord collegiate institute', 'humberside collegiate institute', 'jarvis collegiate institute', 'john polanyi collegiate institute', 'kipling collegiate institute', "l'amoreaux collegiate institute", 'lakeshore collegiate institute', 'lawrence park collegiate institute', 'leaside high schoo

In [5]:
# Problem Schools
# Greenwood Secondary School: not included because it serves only immigrant students
# Heydon Park Secondary School: not included because it serves only young women, transgender and non-binary students
# Native Learning Centre East: not included because it serves only Indigenous students
# Rosedale Heights School of the Arts:
# SATEC @ WA Porter Collegiate Institute:
# Wexford Collegiate School for the Arts:

In [6]:
# Prepare to merge the OSSLT/EQAO frame (merged_df) with the Fraser ranking CSV (high).
# A name in high counts as a match when it appears inside a merged_df name
# (case-insensitive), and that name is recorded in a 'Partial Name' column used for joining.

# Extract the 'School Name' columns, lower-cased for case-insensitive comparison
merged_schools = merged_df['School Name'].str.lower()
high_schools = high['School Name'].str.lower().tolist()

# Find common schools (partial match in high within full names in merged_df)
common_schools = []            # merged_df names that contain at least one name from high
matched_partial_names = set()  # names from high found inside at least one merged_df name
for partial_name in high_schools:
    # regex=False: literal substring test, consistent with the `in` test used for the
    # 'Partial Name' column below, and safe for names containing regex characters.
    matches = merged_schools[merged_schools.str.contains(partial_name, case=False, na=False, regex=False)]
    if not matches.empty:
        matched_partial_names.add(partial_name)
    common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (these are the partial names with no full match).
# Compare against the names from high that matched: common_schools holds merged_df names,
# so testing membership in it would report matched partial names as unmatched.
high_only_schools = [name for name in high_schools if name not in matched_partial_names]

# Add a new "Partial Name" column in merged_df: the first name from high (in high's row
# order) that occurs inside the school's name, or None when no name from high occurs in it.
merged_df['Partial Name'] = merged_df['School Name'].apply(
    lambda x: next((name for name in high_schools if name.lower() in x.lower()), None)
)

# Add a new "Partial Name" column in high: the first merged_df name that occurs inside
# the high name, or None.
# NOTE(review): this tests the merged_df name inside the high name, the reverse of the
# direction used above - confirm that is intended. The column is kept so that merging
# the two frames yields both 'Partial Name_x' and 'Partial Name_y'.
high['Partial Name'] = high['School Name'].apply(
    lambda x: next((name for name in merged_df['School Name'] if name.lower() in x.lower()), None)
)


In [7]:
# fix problem schools
# Assign the 'Partial Name' by hand for the schools listed below.

special_partial_names = {
    'Rosedale Heights School of the Arts': 'rosedale heights-arts',
    'SATEC @ WA Porter Collegiate Institute': 'w a porter',
    'Wexford Collegiate School for the Arts': 'wexford collegiate-arts',
}
special_schools = list(special_partial_names.values())

for full_name, partial_name in special_partial_names.items():
    is_target_school = merged_df['School Name'] == full_name
    merged_df.loc[is_target_school, 'Partial Name'] = partial_name

In [8]:
# Normalise the join key on each side: missing values become "", everything is text and lower-case
for frame, key in ((merged_df, "Partial Name"), (high, "School Name")):
    frame[key] = frame[key].fillna("").astype(str).str.lower()

# Left merge: every row of merged_df is kept, and matching rows from high are attached
merged_result = merged_df.merge(
    high,
    how='left',
    left_on='Partial Name',
    right_on='School Name',
)

# Columns present in both frames ('School Name', 'Partial Name') come back with
# '_x' (from merged_df) and '_y' (from high) suffixes in merged_result.

In [9]:
# clean columns: remove the join-helper columns, drop rows containing any missing value,
# and rename 'School Name_x' back to 'School Name'
helper_columns = ['Partial Name_y', 'Partial Name_x', 'School Name_y']
merged_result = (
    merged_result
    .drop(columns=helper_columns)
    .dropna()
    .rename(columns={'School Name_x': 'School Name'})
)
merged_result


Unnamed: 0,School Name,EQAO % at Levels 3/4,OSSLT % Successful,Fraiser Score
0,A Y Jackson Secondary School,93.0,88.0,7.9
1,Agincourt Collegiate Institute,73.0,94.0,8.3
2,Albert Campbell Collegiate Institute,63.0,88.0,7.0
3,Birchmount Park Collegiate Institute,32.0,79.0,5.2
4,Bloor Collegiate Institute,79.0,94.0,8.5
...,...,...,...,...
59,Wexford Collegiate School for the Arts,42.0,86.0,6.6
60,Winston Churchill Collegiate Institute,20.0,70.0,3.4
61,Woburn Collegiate Institute,59.0,82.0,6.9
62,York Memorial Collegiate Institute,33.0,83.0,6.7


In [10]:
# Export to CSV
# The file is written only when over_ride is set to True.
over_ride = False
if over_ride:
    output_path = parent_dir / "2_data_cleaning" / "cleaned_csv" / "Combined_schools_final.csv"
    merged_result.to_csv(output_path, index=False)