In [1]:
import pdfplumber
import pandas as pd
from pathlib import Path

current_dir = Path.cwd()

# Get the parent directory
parent_dir = current_dir.parent
csv_folder = parent_dir / 'school_data' / 'csv'
pdf_path = parent_dir / "school_data" / "pdf" / "2021_EQAO.pdf"

csv_folder.mkdir(parents=True, exist_ok=True)

all_tables = []

# Open the PDF with pdfplumber
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        all_tables.append(page.extract_tables())
print(all_tables)
            
final_tables = []
for i in range(len(all_tables)):
    final_tables.extend(all_tables[i][0][3:])

final_df = pd.DataFrame(final_tables, columns = all_tables[1][0][0])

csv_file_path = csv_folder / '2021_EQAO.csv'

final_df.to_csv(csv_file_path, index=False)

print(final_df)


[[[['School Name', 'Total # of\nStudents', '# of\nParticipating\nStudents', '# of\nStudents at\nLevels 3/4', '% at Levels\n3/4'], ['Province', '70279', '66805', '34954', '52%'], ['TDSB', '7512', '6832', '3606', '53%'], ['A Y Jackson Secondary School', '112', '103', '96', '93%'], ['Agincourt Collegiate Institute', '184', '172', '125', '73%'], ['Albert Campbell Collegiate Institute', '169', '158', '100', '63%'], ['Birchmount Park Collegiate Institute', '104', '87', '28', '32%'], ['Bloor Collegiate Institute', '78', '75', '59', '79%'], ['C W Jefferys Collegiate Institute', '84', '73', '14', '19%'], ['Cedarbrae Collegiate Institute', '120', '114', '59', '52%'], ['Central Technical School', '30', '28', '8', '29%'], ['Central Toronto Academy', '68', '50', '22', '44%'], ['Danforth Collegiate and Technical\nInstitute', '180', '169', '89', '53%'], ['David and Mary Thomson Collegiate\nInstitute', '125', '105', '20', '19%'], ['Don Mills Collegiate Institute', '116', '103', '63', '61%'], ['Downsvi

In [13]:
# Use a relative path
pdf_path = parent_dir/'school_data'/"pdf"/ "2021_osslt.pdf"
# Initialize an empty list to hold table data
all_tables = []
 
# Open the PDF with pdfplumber
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Extract tables from the current page
        all_tables.append(page.extract_tables())

final_tables = []
for i in range(1, len(all_tables)):
    final_tables.extend(all_tables[i][0])

final_df = pd.DataFrame(final_tables, columns = all_tables[1][0][0])

final_df.to_csv(csv_folder/'2021_OSSLT.csv', index=False)

   
# Display the DataFrame
final_df

Unnamed: 0,School Name,None,None.1,Number of\nPreviously\nEligible\nStudents,None.2,None.3,Absent\n(All\nPreviously\nEligible\nStudents),None.4,None.5,Deferred\n(All\nPreviously\nEligible\nStudents),...,Unnamed: 12,Number of\nFully\nParticipating\nStudents,None.6,None.7,Successful\n(Fully\nParticipating),None.8,None.9,Not\nSuccessful\n(Fully\nParticipating),None.10,None.11
0,School Name,,,Number of\nPreviously\nEligible\nStudents,,,Absent\n(All\nPreviously\nEligible\nStudents),,,Deferred\n(All\nPreviously\nEligible\nStudents),...,,Number of\nFully\nParticipating\nStudents,,,Successful\n(Fully\nParticipating),,,Not\nSuccessful\n(Fully\nParticipating),,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,Woburn Collegiate Institute,247,,,5%,,,0%,,,...,,,82%,,,18%,,,,
132,York Humber High School,,N/R,,,N/R,,,N/R,,...,N/R,,,N/R,,,N/R,,,
133,York Memorial Collegiate Institute,210,,,<1%,,,3%,,,...,,,83%,,,17%,,,,
134,York Mills Collegiate Institute,258,,,5%,,,1%,,,...,,,92%,,,8%,,,,


In [3]:
def consolidate_by_increment(df, id_col, start_col, increment):
    """
    Consolidates columns in incremental groups into one column each.

    Parameters:
    df (pd.DataFrame): The input dataframe.
    id_col (str): The name of the identifier column (e.g., 'School Name').
    start_col (int): The column index where grouping starts (after the identifier column).
    increment (int): The number of columns in each group to consolidate.

    Returns:
    pd.DataFrame: A dataframe with consolidated columns.
    """
    result = df[[id_col]].copy()  # Start with the identifier column
    total_cols = len(df.columns)

    # Loop through the columns in increments
    for i in range(start_col, total_cols, increment):
        # Select the group of `increment` columns
        group = df.columns[i:i + increment]  # Group of three columns at a time

        # Check for valid column names
        group = [col for col in group if col in df.columns]

        # Skip empty or incomplete groups
        if not group:
            continue

        # Consolidate non-NaN values
        result[f'Consolidated_{(i - start_col) // increment + 1}'] = df[group].bfill(axis=1).iloc[:, 0]

    return result



result_df = consolidate_by_increment(final_df, id_col='School Name', start_col=1, increment=3)

In [11]:
result_df.head(20)

Unnamed: 0,School Name,Number of\nPreviously\nEligible\nStudents,Absent\n(All\nPreviously\nEligible\nStudents),Deferred\n(All\nPreviously\nEligible\nStudents),Successful\n(Fully\nParticipating),Not\nSuccessful\n(Fully\nParticipating)
0,A Y Jackson Secondary School,312,8%,0%,88%,12%
1,Agincourt Collegiate Institute,388,1%,<1%,94%,6%
2,Albert Campbell Collegiate Institute,261,4%,7%,88%,12%
3,ALPHA II Alternative School (Sec),N/R1,N/R1,N/R1,N/R1,N/R1
4,Alternative Scarborough Education 1,N/R,N/R,N/R,N/R,N/R
5,Avondale Secondary Alternative School,19,16%,5%,73%,27%
6,Birchmount Park Collegiate Institute,169,22%,1%,79%,21%
7,Bloor Collegiate Institute,197,4%,0%,94%,6%
8,Burnhamthorpe Collegiate Institute,N/R,N/R,N/R,N/R,N/R
9,C W Jefferys Collegiate Institute,255,36%,0%,72%,28%


In [5]:
result_df.columns = result_df.iloc[0]  # Set the first row as column headers
result_df = result_df[1:]  # Drop the first row since it's now the header
result_df.reset_index(drop=True, inplace=True) 

In [6]:
result_df.head()

Unnamed: 0,School Name,Number of\nPreviously\nEligible\nStudents,Absent\n(All\nPreviously\nEligible\nStudents),Deferred\n(All\nPreviously\nEligible\nStudents),Unnamed: 5,Taking,Successful\n(Fully\nParticipating),Not\nSuccessful\n(Fully\nParticipating),None
0,,,,,,OSSLC,,,
1,,,,,,(All,,,
2,,,,,,Previously,,,
3,,,,,,Eligible,,,
4,,,,,,Students),,,


In [7]:
result_df = result_df[~result_df["School Name"].isin([None, "School Name", ""])]
result_df = result_df.drop(columns=[col for col in result_df.columns if not col or col == "Taking"])
result_df.reset_index(drop=True, inplace=True)
result_df

Unnamed: 0,School Name,Number of\nPreviously\nEligible\nStudents,Absent\n(All\nPreviously\nEligible\nStudents),Deferred\n(All\nPreviously\nEligible\nStudents),Successful\n(Fully\nParticipating),Not\nSuccessful\n(Fully\nParticipating)
0,A Y Jackson Secondary School,312,8%,0%,88%,12%
1,Agincourt Collegiate Institute,388,1%,<1%,94%,6%
2,Albert Campbell Collegiate Institute,261,4%,7%,88%,12%
3,ALPHA II Alternative School (Sec),N/R1,N/R1,N/R1,N/R1,N/R1
4,Alternative Scarborough Education 1,N/R,N/R,N/R,N/R,N/R
...,...,...,...,...,...,...
99,Woburn Collegiate Institute,247,247,247,247,247
100,York Humber High School,,,,,
101,York Memorial Collegiate Institute,210,210,210,210,210
102,York Mills Collegiate Institute,258,258,258,258,258


In [9]:
result_df

Unnamed: 0,School Name,Number of\nPreviously\nEligible\nStudents,Absent\n(All\nPreviously\nEligible\nStudents),Deferred\n(All\nPreviously\nEligible\nStudents),Successful\n(Fully\nParticipating),Not\nSuccessful\n(Fully\nParticipating)
0,A Y Jackson Secondary School,312,8%,0%,88%,12%
1,Agincourt Collegiate Institute,388,1%,<1%,94%,6%
2,Albert Campbell Collegiate Institute,261,4%,7%,88%,12%
3,ALPHA II Alternative School (Sec),N/R1,N/R1,N/R1,N/R1,N/R1
4,Alternative Scarborough Education 1,N/R,N/R,N/R,N/R,N/R
...,...,...,...,...,...,...
99,Woburn Collegiate Institute,247,247,247,247,247
100,York Humber High School,,,,,
101,York Memorial Collegiate Institute,210,210,210,210,210
102,York Mills Collegiate Institute,258,258,258,258,258


In [8]:
result_df.to_csv(csv_folder/'2021_OSSLT.csv', index=False)