In [35]:
# 3rd Party Imports
import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt 
import numpy as np
import matplotlib as plt
import geopandas as gpd
from fuzzywuzzy import fuzz, process

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path

In [36]:
# Load the EQAO, OSSLT and high-school ranking tables.
# The CSVs are looked up under <parent of the working directory>/data/csv/School.
current_dir = Path.cwd()
parent_dir = current_dir.parent
school_csv_dir = parent_dir / "data" / "csv" / "School"

eqao = pd.read_csv(school_csv_dir / "EQAO_final.csv")
osslt = pd.read_csv(school_csv_dir / "OSSLT_final.csv")
high = pd.read_csv(school_csv_dir / "Highschool_final.csv")

In [37]:
# Join the EQAO and OSSLT tables on the school name. The default inner join
# keeps only schools that appear in both tables.
merged_df = eqao.merge(osslt, on='School Name')

merged_df

Unnamed: 0,School Name,EQAO % at Levels 3/4,OSSLT % Successful
0,A Y Jackson Secondary School,93.0,88.0
1,Agincourt Collegiate Institute,73.0,94.0
2,Albert Campbell Collegiate Institute,63.0,88.0
3,Birchmount Park Collegiate Institute,32.0,79.0
4,Bloor Collegiate Institute,79.0,94.0
...,...,...,...
59,Wexford Collegiate School for the Arts,42.0,86.0
60,Winston Churchill Collegiate Institute,20.0,70.0
61,Woburn Collegiate Institute,59.0,82.0
62,York Memorial Collegiate Institute,33.0,83.0


In [38]:
# Investigate which school names fail to line up between merged_df and high.
# A name from `high` counts as matched when it occurs as a substring of a
# (lower-cased) school name in merged_df.

merged_schools = merged_df['School Name'].str.lower()
high_schools = high['School Name'].str.lower()

# Find common schools (partial match in high within full names in merged_df)
common_schools = []          # full names from merged_df that contain some name from high
matched_partial_names = set()  # names from high that were found inside at least one full name
for partial_name in high_schools:
    # regex=False: names are matched literally, so punctuation inside a school
    # name is never interpreted as a regular-expression operator.
    # Both sides are already lower-cased, so no extra case handling is needed.
    matches = merged_schools[merged_schools.str.contains(partial_name, regex=False, na=False)]
    if not matches.empty:
        matched_partial_names.add(partial_name)
        common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (partial names with no full match).
# This must be tested against the partial names that matched; comparing against
# common_schools (full names) reported matched partial names as missing.
high_only_schools = high_schools[~high_schools.isin(matched_partial_names)].tolist()

# Print the results
print("Common Schools (partial match in high within full names in merged_df):", common_schools)
print("Schools only in merged_df:", merged_only_schools)
print("Schools only in high:", high_only_schools)

Common Schools (partial match in high within full names in merged_df): ['a y jackson secondary school', 'agincourt collegiate institute', 'albert campbell collegiate institute', 'birchmount park collegiate institute', 'bloor collegiate institute', 'c w jefferys collegiate institute', 'cedarbrae collegiate institute', 'central technical school', 'central toronto academy', 'don mills collegiate institute', 'downsview secondary school', 'dr norman bethune collegiate institute', 'earl haig secondary school', 'east york collegiate institute', 'etobicoke collegiate institute', 'etobicoke school of the arts', 'forest hill collegiate institute', 'george s henry academy', 'georges vanier secondary school', 'harbord collegiate institute', 'humberside collegiate institute', 'jarvis collegiate institute', 'john polanyi collegiate institute', 'kipling collegiate institute', "l'amoreaux collegiate institute", 'lakeshore collegiate institute', 'lawrence park collegiate institute', 'leaside high schoo

In [39]:
# Problem Schools
# Greenwood Secondary School: Not included as only for Immigrant Students
# Heydon Park Secondary School: Not included as only for Young Women, Transgender and Non-Binary Students
# Native Learning Centre East: Not included as only for Indigenous Students
# Rosedale Heights School of the Arts:
# SATEC @ WA Porter Collegiate Institute:
# Wexford Collegiate School for the Arts:

In [40]:
# Prepare the join key between the EQAO/OSSLT table and the Fraser ranking csv

# Lower-cased name columns for case-insensitive comparison
merged_schools = merged_df['School Name'].str.lower()
high_schools = high['School Name'].str.lower().tolist()

# Find common schools (partial match in high within full names in merged_df)
common_schools = []          # full names from merged_df that contain some name from high
matched_partial_names = set()  # names from high that were found inside at least one full name
for partial_name in high_schools:
    # regex=False: names are matched literally, so punctuation inside a school
    # name is never interpreted as a regular-expression operator.
    matches = merged_schools[merged_schools.str.contains(partial_name, regex=False, na=False)]
    if not matches.empty:
        matched_partial_names.add(partial_name)
        common_schools.extend(matches.tolist())

# Find schools in merged_df but not in high
merged_only_schools = merged_schools[~merged_schools.isin(common_schools)].tolist()

# Find schools in high but not in merged_df (partial names with no full match).
# This must be tested against the partial names that matched; comparing against
# common_schools (full names) reported matched partial names as missing.
high_only_schools = [name for name in high_schools if name not in matched_partial_names]

# Add a new "Partial Name" column in merged_df: the first name from high that is
# contained in the row's full school name, or None when nothing matches.
# NOTE(review): "first match" can pick the wrong school if one short name is a
# substring of several full names -- confirm against the data.
merged_df['Partial Name'] = merged_df['School Name'].apply(
    lambda full_name: next((name for name in high_schools if name in full_name.lower()), None)
)

# Add a new "Partial Name" column in high. This looks for a full merged_df name
# inside the name from high, i.e. the opposite direction to the column above.
# The resulting column is dropped again when the merged result is cleaned, but
# it must exist for the merge to produce the `_x`/`_y` suffixed columns.
merged_full_names = merged_df['School Name'].tolist()
high['Partial Name'] = high['School Name'].apply(
    lambda short_name: next((name for name in merged_full_names if name.lower() in short_name.lower()), None)
)


In [41]:
# Hand-assign the join key for schools whose full name does not contain the
# shortened name used in `high`.
manual_partial_names = {
    'Rosedale Heights School of the Arts': 'rosedale heights-arts',
    'SATEC @ WA Porter Collegiate Institute': 'w a porter',
    'Wexford Collegiate School for the Arts': 'wexford collegiate-arts',
}
special_schools = list(manual_partial_names.values())

for full_name, partial_name in manual_partial_names.items():
    is_target_school = merged_df['School Name'] == full_name
    merged_df.loc[is_target_school, 'Partial Name'] = partial_name

In [42]:
# Normalise both join keys: missing values become "", everything is a
# lower-case string.
for frame, key_column in ((merged_df, "Partial Name"), (high, "School Name")):
    frame[key_column] = frame[key_column].fillna("").astype(str).str.lower()

# Left join: every row of merged_df is kept and matching rows of high are
# attached. Column names present in both frames receive `_x` / `_y` suffixes.
merged_result = merged_df.merge(
    high,
    how='left',
    left_on='Partial Name',
    right_on='School Name',
)

In [43]:
# Tidy the merged table: remove the helper/join columns, discard every row
# that still has a missing value, and restore the plain 'School Name' header.
helper_columns = ['Partial Name_y', 'Partial Name_x', 'School Name_y']
merged_result = (
    merged_result
    .drop(columns=helper_columns)
    .dropna()
    .rename(columns={'School Name_x': 'School Name'})
)
merged_result


Unnamed: 0,School Name,EQAO % at Levels 3/4,OSSLT % Successful,Fraiser Score
0,A Y Jackson Secondary School,93.0,88.0,7.9
1,Agincourt Collegiate Institute,73.0,94.0,8.3
2,Albert Campbell Collegiate Institute,63.0,88.0,7.0
3,Birchmount Park Collegiate Institute,32.0,79.0,5.2
4,Bloor Collegiate Institute,79.0,94.0,8.5
...,...,...,...,...
59,Wexford Collegiate School for the Arts,42.0,86.0,6.6
60,Winston Churchill Collegiate Institute,20.0,70.0,3.4
61,Woburn Collegiate Institute,59.0,82.0,6.9
62,York Memorial Collegiate Institute,33.0,83.0,6.7


In [None]:
# Export to CSV -- only when the flag below is switched on, so that running
# the whole notebook does not overwrite the existing file.
over_ride = False
combined_csv_path = parent_dir / "data" / "csv" / "School" / "Combined_schools_final.csv"
if over_ride:
    merged_result.to_csv(combined_csv_path, index=False)

In [45]:
# Map dependencies used by this cell. They were previously imported only in a
# later cell, so on a fresh kernel this cell raised NameError for
# Point / folium / HeatMap.
import folium
from folium.plugins import HeatMap
from shapely.geometry import Point

# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# relative path, which is resolved against the kernel's working directory --
# confirm where the file lives before re-running.
data = pd.read_csv("combined_schools_with_cords.csv")
# Column headers may contain embedded Windows line breaks; flatten them to spaces
data.columns = data.columns.str.replace('\r\n', ' ', regex=True)

# Coerce every column used below to numeric; unparseable values become NaN
numeric_columns = [
    'latitude',
    'longitude',
    "EQAO_% at Levels 3/4",
    "OSSLT_Successful (Fully Participating)",
    "Score",
]
for numeric_column in numeric_columns:
    data[numeric_column] = pd.to_numeric(data[numeric_column], errors='coerce')

# Drop rows without usable coordinates (missing or non-numeric)
data = data.dropna(subset=['latitude', 'longitude'])

# Create GeoDataFrame with geometry (Point takes x=longitude, y=latitude)
geometry = [Point(xy) for xy in zip(data['longitude'], data['latitude'])]
gdf = gpd.GeoDataFrame(data, geometry=geometry)

# Set CRS to EPSG:4326 (WGS84)
gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

# Create the map centered on a specific location (e.g., Toronto)
m = folium.Map(location=[43.7, -79.4], zoom_start=11)

# Function to add a heatmap layer with custom shades and gradient
def add_heatmap(map_obj, gdf, column, name):
    """Add a named HeatMap layer, weighted by ``column``, to ``map_obj``.

    Rows whose ``column`` value is missing are left out. Each remaining row
    contributes one ``[latitude, longitude, value]`` point.

    Parameters:
        map_obj: map the layer is added to.
        gdf: frame with 'latitude', 'longitude' and ``column`` columns.
        column: name of the column used as the point weight.
        name: layer name passed to HeatMap.
    """
    usable_rows = gdf.dropna(subset=[column])

    # One [lat, lon, weight] triple per school
    heat_points = [
        [lat, lon, weight]
        for lat, lon, weight in zip(
            usable_rows['latitude'], usable_rows['longitude'], usable_rows[column]
        )
    ]

    # Custom colour scale for the layer
    colour_stops = {
        0.0: 'blue',
        0.2: 'green',
        0.4: 'yellow',
        0.6: 'orange',
        0.8: 'red',
        1.0: 'darkred',
    }

    heat_layer = HeatMap(heat_points, name=name, radius=15, max_zoom=13, gradient=colour_stops)
    heat_layer.add_to(map_obj)

# One heatmap layer per ranking: (source column, layer name)
heatmap_layers = [
    ("EQAO_% at Levels 3/4", "EQAO Heatmap"),
    ("OSSLT_Successful (Fully Participating)", "OSSLT Heatmap"),
    ("Score", "School Score Heatmap"),
]
for value_column, layer_name in heatmap_layers:
    add_heatmap(m, gdf, value_column, layer_name)

# Layer control lets the viewer toggle the heatmaps on and off
layer_control = folium.LayerControl()
layer_control.add_to(m)

# Function to add a custom legend for the heatmap
def add_legend(map_obj):
    """Attach a fixed HTML legend box (colour swatches with labels) to ``map_obj``.

    The legend is a static block of HTML appended to the map's root document.
    Its labels are hard-coded text and are not computed from the data.
    """
    legend_markup = '''
        <div style="position: absolute; top: 10px; left: 10px; background-color: white; 
        padding: 10px; border-radius: 5px; border: 2px solid black; z-index: 9999;">
            <b>Heatmap Legend</b><br>
            <i style="background: blue; width: 20px; height: 20px; display: inline-block;"></i> 0-20%<br>
            <i style="background: green; width: 20px; height: 20px; display: inline-block;"></i> 20-40%<br>
            <i style="background: yellow; width: 20px; height: 20px; display: inline-block;"></i> 40-60%<br>
            <i style="background: orange; width: 20px; height: 20px; display: inline-block;"></i> 60-80%<br>
            <i style="background: red; width: 20px; height: 20px; display: inline-block;"></i> 80-100%<br>
            <i style="background: darkred; width: 20px; height: 20px; display: inline-block;"></i> 100%+
        </div>
    '''
    legend_element = folium.Element(legend_markup)
    map_root = map_obj.get_root()
    map_root.html.add_child(legend_element)

# Attach the custom legend overlay to the map
add_legend(m)

# Leave the map as the cell's last expression so the notebook renders it
m

FileNotFoundError: [Errno 2] No such file or directory: 'combined_schools_with_cords.csv'

In [None]:
import folium
import pandas as pd
import geopandas as gpd
import numpy as np
from folium.plugins import HeatMap
from shapely.geometry import Point

# Load data and flatten line breaks embedded in the column headers
coordinate_columns = ['latitude', 'longitude']
ranking_columns = [
    "EQAO_% at Levels 3/4",
    "OSSLT_Successful (Fully Participating)",
    "Score",
]

data = pd.read_csv("combined_schools_with_cords.csv")
data.columns = data.columns.str.replace('\r\n', ' ', regex=True)
data = data.dropna(subset=coordinate_columns)

# Coordinates: coerce to numbers, then drop rows that failed to parse
for coordinate_column in coordinate_columns:
    data[coordinate_column] = pd.to_numeric(data[coordinate_column], errors='coerce')
data = data.dropna(subset=coordinate_columns)

# Rankings (EQAO, OSSLT, Score): coerce to numbers, unparseable values become NaN
for ranking_column in ranking_columns:
    data[ranking_column] = pd.to_numeric(data[ranking_column], errors='coerce')

# One Point per school (x=longitude, y=latitude), wrapped in a GeoDataFrame
geometry = [Point(lon_lat) for lon_lat in zip(data['longitude'], data['latitude'])]
gdf = gpd.GeoDataFrame(data, geometry=geometry)

# Tag the frame as EPSG:4326 (WGS84)
gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

# Base map centred on a specific location (e.g., Toronto)
map_centre = [43.7, -79.4]
m = folium.Map(location=map_centre, zoom_start=11)

# Function to add a heatmap layer
def add_heatmap(map_obj, gdf, column, name):
    """Add a named HeatMap layer, weighted by ``column``, to ``map_obj``.

    Rows whose ``column`` value is missing are left out. Each remaining row
    contributes one ``[latitude, longitude, value]`` point. The layer uses
    HeatMap's default colour gradient.

    Parameters:
        map_obj: map the layer is added to.
        gdf: frame with 'latitude', 'longitude' and ``column`` columns.
        column: name of the column used as the point weight.
        name: layer name passed to HeatMap.
    """
    usable_rows = gdf.dropna(subset=[column])

    # One [lat, lon, weight] triple per school
    heat_points = [
        [lat, lon, weight]
        for lat, lon, weight in zip(
            usable_rows['latitude'], usable_rows['longitude'], usable_rows[column]
        )
    ]

    heat_layer = HeatMap(heat_points, name=name, radius=15, max_zoom=13)
    heat_layer.add_to(map_obj)

# One heatmap layer per ranking: (source column, layer name)
heatmap_layers = [
    ("EQAO_% at Levels 3/4", "EQAO Heatmap"),
    ("OSSLT_Successful (Fully Participating)", "OSSLT Heatmap"),
    ("Score", "School Score Heatmap"),
]
for value_column, layer_name in heatmap_layers:
    add_heatmap(m, gdf, value_column, layer_name)

# Layer control lets the viewer toggle the heatmaps on and off
layer_control = folium.LayerControl()
layer_control.add_to(m)

# Last expression of the cell: render the map
m

In [None]:
import pandas as pd
import folium
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

coordinate_columns = ['latitude', 'longitude']
ranking_columns = [
    "EQAO_% at Levels 3/4",
    "OSSLT_Successful (Fully Participating)",
    "Score",
]

# Read the table and flatten line breaks embedded in the column headers
data = pd.read_csv("combined_schools_with_cords.csv")
data.columns = data.columns.str.replace('\r\n', ' ', regex=True)
data = data.dropna(subset=coordinate_columns)

# Coordinates: coerce to numbers, then drop rows that failed to parse
for coordinate_column in coordinate_columns:
    data[coordinate_column] = pd.to_numeric(data[coordinate_column], errors='coerce')
data = data.dropna(subset=coordinate_columns)

# Rankings (EQAO, OSSLT, Score): coerce to numbers, unparseable values become NaN
for ranking_column in ranking_columns:
    data[ranking_column] = pd.to_numeric(data[ranking_column], errors='coerce')

# One Point per school (x=longitude, y=latitude), wrapped in a GeoDataFrame
geometry = [Point(lon_lat) for lon_lat in zip(data['longitude'], data['latitude'])]
gdf = gpd.GeoDataFrame(data, geometry=geometry)

# Tag the frame as EPSG:4326 (WGS84)
gdf.set_crs("EPSG:4326", allow_override=True, inplace=True)

# Function to calculate bins for choropleth
def calculate_bins(gdf, column, n_bins=5):
    """Return evenly spaced bin edges spanning the non-missing values of ``column``.

    Parameters:
        gdf: frame (DataFrame or GeoDataFrame) containing ``column``.
        column: name of a numeric column.
        n_bins: number of bins; ``n_bins + 1`` edges are returned. The default
            of 5 reproduces the six edges this function always returned.

    Returns:
        list[float]: strictly increasing edges from the column minimum to the
        column maximum.

    Raises:
        ValueError: if ``column`` has no non-missing values, since no
            meaningful edges exist in that case.
    """
    col_data = gdf[column].dropna()
    if col_data.empty:
        raise ValueError(f"Column {column!r} has no non-missing values to bin")

    min_val, max_val = float(col_data.min()), float(col_data.max())
    if min_val == max_val:
        # A constant column would give identical edges, which cannot serve as
        # bins; widen the range symmetrically so the edges stay increasing.
        min_val, max_val = min_val - 0.5, max_val + 0.5

    return np.linspace(min_val, max_val, n_bins + 1).tolist()

# Function to add choropleth layer to the map
def add_choropleth(map_obj, gdf, name, column):
    """Add a Choropleth layer coloured by ``column`` to ``map_obj`` and return it.

    The frame is serialised to GeoJSON for the geometry and also passed as the
    data table; features and rows are linked through the 'School Name_x'
    property. Bin edges come from ``calculate_bins``.

    Parameters:
        map_obj: map the layer is added to.
        gdf: GeoDataFrame with a 'School Name_x' column and ``column``.
        name: used both as the layer name and as the legend caption.
        column: name of the value column that drives the colouring.

    Returns:
        The Choropleth layer that was added.
    """
    # NOTE(review): the frames passed in this notebook are built from Point
    # geometries -- confirm the fill styling renders as intended for points.
    layer = folium.Choropleth(
        geo_data=gdf.to_json(),
        name=name,  # shown in the layer control
        data=gdf,
        columns=["School Name_x", column],
        key_on="feature.properties.School Name_x",
        fill_color="YlOrRd",
        fill_opacity=0.7,
        line_opacity=0.2,
        bins=calculate_bins(gdf, column),
        legend_name=name,
    )
    layer.add_to(map_obj)
    return layer

# Base map that will hold both the school markers and the choropleth layers
m = folium.Map(location=[43.7, -79.4], zoom_start=11)

# One blue circle per school; the popup lists the school's name and rankings
for _, school in data.iterrows():
    popup_html = "<br>".join([
        f"School: {school['School Name_x']}",
        f"EQAO: {school['EQAO_% at Levels 3/4']}",
        f"OSSLT: {school['OSSLT_Successful (Fully Participating)']}",
        f"Score: {school['Score']}",
    ])
    marker = folium.CircleMarker(
        location=[school['latitude'], school['longitude']],
        radius=5,
        color="blue",
        fill=True,
        fill_color="blue",
        fill_opacity=0.6,
        popup=popup_html,
    )
    marker.add_to(m)

# One choropleth layer per ranking: (layer name, source column)
choropleth_layers = [
    ("EQAO Percentage at Levels 3/4", "EQAO_% at Levels 3/4"),
    ("OSSLT Successful", "OSSLT_Successful (Fully Participating)"),
    ("Score", "Score"),
]
for layer_name, value_column in choropleth_layers:
    add_choropleth(m, gdf, layer_name, value_column)

# Layer control lets the viewer toggle the layers on and off
layer_control = folium.LayerControl()
layer_control.add_to(m)

# Last expression of the cell: render the map
m