# Clean school CSV and Export Final Cleaned CSVs

KR - 11/28

In [1]:
# imports
import pdfplumber
import pandas as pd
from pathlib import Path
import os
import numpy as np
import geopandas as gpd

In [2]:
# Paths
# The raw school CSVs are looked up relative to the working directory:
#   <cwd>/../0_raw_data/school_data/csv
current_dir = Path.cwd()
parent_dir = current_dir.parent
csv_folder = parent_dir.joinpath('0_raw_data', 'school_data', 'csv')

In [3]:
# final cleaned osslt
# Reduces the raw 2021 OSSLT table to two columns: the first column of the
# file and the success rate as a float, renamed 'OSSLT % Successful'.
osslt = pd.read_csv(csv_folder / '2021_OSSLT.csv')

# For each row, scan the columns at positions 16-19 and keep the first string
# that is not a 'None' / 'N/R' placeholder; rows with no such value get None.
# NOTE(review): the original predicate also had `x.endswith('%') or ...`,
# which the placeholder check already covers (neither placeholder ends in
# '%'). If only '%' values were meant to be accepted, tighten this -- confirm.
osslt['Successful (Fully Participating)'] = osslt.iloc[:, 16:20].apply(
    lambda row: next(
        (x for x in row if isinstance(x, str) and x not in ('None', 'N/R')),
        None,
    ),
    axis=1,
)

# Drop rows without a usable school name (missing, placeholder text, or a
# repeated header row). .copy() makes the filtered frame independent, so the
# column assignment below is not a write into a slice of the unfiltered frame.
osslt = osslt[
    osslt['School Name'].notna()
    & ~osslt['School Name'].isin(['None', 'School Name', ''])
].copy()

# '85%' -> 85.0; placeholders become NaN. Assigning the column directly
# (instead of .loc[:, col] = ...) replaces it with the float result.
osslt['Successful (Fully Participating)'] = (
    osslt['Successful (Fully Participating)']
    .replace(['N/R', ''], np.nan)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Keep only the first column plus the cleaned success rate.
osslt = osslt.iloc[:, [0]].join(osslt[['Successful (Fully Participating)']])

osslt = osslt.rename(columns={'Successful (Fully Participating)': 'OSSLT % Successful'})

# Safety switch: set to True to (re)write the exported CSV.
over_ride = False
if over_ride:
    osslt.to_csv(parent_dir / "2_data_cleaning" / "cleaned_csv" / 'OSSLT_final.csv', index=False)

In [4]:
# final cleaned eqao
# Reduces the raw 2021 EQAO table to the columns at positions 0 and 4 and
# converts the '% at Levels 3/4' column to float, renamed
# 'EQAO % at Levels 3/4'.

eqao = pd.read_csv(csv_folder / '2021_EQAO.csv')

# .copy() makes the two-column selection independent of the raw frame, so the
# column assignment below is not a write into a slice of it.
eqao = eqao.iloc[:, [0, 4]].copy()
# Header cells may contain embedded newlines; flatten them to spaces so the
# column can be addressed by the name used below.
eqao.columns = eqao.columns.str.replace('\n', ' ')
# '72%' -> 72.0; 'N/R' becomes NaN. Assigning the column directly (instead of
# .loc[:, col] = ...) replaces it with the float result.
eqao['% at Levels 3/4'] = (
    eqao['% at Levels 3/4']
    .replace('N/R', np.nan)
    .str.replace('%', '', regex=False)
    .astype(float)
)

eqao = eqao.rename(columns={'% at Levels 3/4': 'EQAO % at Levels 3/4'})

# Safety switch: set to True to (re)write the exported CSV.
over_ride = False
if over_ride:
    eqao.to_csv(parent_dir / "2_data_cleaning" / "cleaned_csv" / 'EQAO_final.csv', index=False)

In [5]:
# final cleaned highschool
# Keeps the rows whose column "3" equals "Toronto", discards columns
# '2', '4' and '5', and ends with two columns: the school name and a
# numeric score.
high = (
    pd.read_csv(csv_folder / "secondary_schools_combined.csv")
    .loc[lambda frame: frame["3"] == "Toronto"]
    .drop(columns=['2', '4', '5'])
    .reset_index(drop=True)
)

# The three remaining columns are relabelled by position; "City" is only
# needed for the filter above and is dropped straight away.
high.columns = ["School Name", "Score", "City"]
high = high.drop(columns="City")

# Scores that cannot be parsed as numbers become NaN.
high['Score'] = pd.to_numeric(high['Score'], errors='coerce')

# NOTE(review): 'Fraiser' looks like a misspelling of 'Fraser'; the name is
# left as-is because other code may key on this exact column label.
high = high.rename(columns={'Score': 'Fraiser Score'})

# Safety switch: set to True to (re)write the exported CSV.
over_ride = False
if over_ride:
    high.to_csv(parent_dir / "2_data_cleaning" / "cleaned_csv" / 'Highschool_final.csv', index=False)