In [3]:
# Make the notebook automatically reload imported modules before each cell runs
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import camelot
# NOTE: the next import raises "ModuleNotFoundError: No module named 'tabula'" unless the tabula-py package is installed
import tabula.io as tb
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import os

In [79]:
from numpy import mean


def extract_licensure(df: pd.DataFrame):
    """Read the licensure figures from fixed cells of a licensure table.

    The table must have exactly 4 rows and 2 columns. When it does, the
    second-column cells of rows 1, 2 and 3 are returned under the keys
    ``lic_total``, ``licensed`` and ``licensure_extracted``. When it does
    not, a message is printed and an empty dict is returned.

    Args:
        df: The licensure table.

    Returns:
        Dict of the extracted cell values (empty on a shape mismatch).
    """
    expected_shape = (4, 2)
    # (row, column) position of every value to pull out of the table.
    cell_positions = {
        "lic_total": (1, 1),
        "licensed": (2, 1),
        "licensure_extracted": (3, 1),
    }

    # Guard clause: positional lookups are only meaningful for the known layout.
    if df.shape != expected_shape:
        print(f"Licensure Table has shape {df.shape} instead of {expected_shape}")
        return {}

    return {
        field: df.iloc[row, col]
        for field, (row, col) in cell_positions.items()
    }
def extract_time_to_complete(df: pd.DataFrame):
    """Pull completion-time statistics out of a time-to-completion table.

    The first column is scanned for the first row whose label contains
    "Mean number of years to complete" (spaces are ignored when matching).
    Only the remaining cells of that row that parse as finite numbers are
    considered.

    Args:
        df: Table whose first column holds the row labels.

    Returns:
        A dict with ``years_long`` (numerically largest cell),
        ``years_short`` (numerically smallest cell) and ``years_mean`` (the
        last numeric cell of the row -- presumably the table's overall/total
        column; confirm against the source PDFs). Each value is the cell
        exactly as it appears in the table. The dict is empty when no row
        matches or the matching row holds no numeric cell.
    """
    import math

    def _to_number(cell):
        # Return the cell as a finite float, or None when it is not a number.
        try:
            number = float(str(cell).strip())
        except ValueError:
            return None
        return number if math.isfinite(number) else None

    wanted = "Mean number of years to complete".replace(" ", "")
    info = {}
    for ir, label in enumerate(df.iloc[:, 0]):
        # str() guards against non-string labels (e.g. NaN for a blank cell).
        if wanted not in str(label).replace(" ", ""):
            continue
        numeric_cells = [
            (number, cell)
            for cell in df.iloc[ir, 1:].to_list()
            for number in [_to_number(cell)]
            if number is not None
        ]
        if numeric_cells:
            # Rank by numeric value: the cells are usually strings, and a
            # plain string comparison would place "10" below "9.5".
            info['years_long'] = max(numeric_cells, key=lambda pair: pair[0])[1]
            info['years_short'] = min(numeric_cells, key=lambda pair: pair[0])[1]
            info['years_mean'] = numeric_cells[-1][1]
        # Only the first matching row is used.
        break
    return info
def extract_tuition(df: pd.DataFrame):
    """Extract the out-of-state tuition figure from a tuition table.

    The first column is scanned for the first row whose label contains
    "out-of-state" (case-insensitive). The last non-blank cell of that row
    is taken as the tuition and prefixed with "$" when it has none.

    Args:
        df: Table whose first column holds the row labels.

    Returns:
        ``{'tuition': '<amount with $>'}``, or an empty dict when no row
        matches or the matching row has no non-blank value.
    """
    info = {}
    for ir, label in enumerate(df.iloc[:, 0]):
        # str() guards against non-string labels (e.g. NaN for a blank cell).
        if "out-of-state" not in str(label).lower():
            continue
        # Keep only cells that actually carry text; missing values and
        # blank/whitespace-only strings are dropped.
        fees = [
            str(cell).strip()
            for cell in df.iloc[ir, 1:].to_list()
            if not pd.isna(cell) and str(cell).strip()
        ]
        if fees:
            tuition = fees[-1]
            # If the tuition is not preceded by a $ sign, add it.
            if "$" not in tuition:
                tuition = "$" + tuition
            info['tuition'] = tuition
        # Only the first matching row is used.
        break
    return info
def extract_internship(df: pd.DataFrame):
    """Extract the latest accredited-internship figure from an internship table.

    The first column is scanned for the first row whose label contains
    "Students who obtained APA/CPA-accredited internships" (spaces are
    ignored when matching). The last purely numeric cell of that row is
    returned -- presumably the percentage for the most recent year; confirm
    against the source tables.

    Args:
        df: Table whose first column holds the row labels.

    Returns:
        ``{'internship_rate_lastest': '<value>'}`` (the key's spelling is
        kept as-is because callers read it), or an empty dict when no row
        matches or the matching row has no numeric cell.
    """
    wanted = "Students who obtained APA/CPA-accredited internships".replace(" ", "")
    info = {}
    for ir, label in enumerate(df.iloc[:, 0]):
        # str() guards against non-string labels (e.g. NaN for a blank cell).
        if wanted not in str(label).replace(" ", ""):
            continue
        interns = [
            text
            for text in (str(cell).strip() for cell in df.iloc[ir, 1:].to_list())
            if text.isnumeric()
        ]
        # A row of placeholders ("-", "N/A", blanks) yields no result
        # instead of an IndexError.
        if interns:
            info['internship_rate_lastest'] = interns[-1]
        # Only the first matching row is used.
        break
    return info
def extract_attrition(df: pd.DataFrame):
    """Extract enrollment and attrition figures from an attrition table.

    Every row of the first column is checked (spaces ignored) for two labels:

    * "Students for whom this is the year of first enrollment": the integer
      cells of the row give ``enrollment_latest`` (last value) and
      ``enrollment_ave`` (mean of all values).
    * "Students no longer enrolled for any reason": the integer cells of the
      row give ``attrition_rate_high`` (max of the values at odd positions),
      ``attrition_ave`` (mean of the values at even positions) and
      ``attrition_latest`` (last value). Presumably the cells alternate
      between a count and a percentage column -- confirm against the source
      tables.

    All rows are scanned, so a later matching row overwrites an earlier one.

    Args:
        df: Table whose first column holds the row labels.

    Returns:
        Dict of the figures above, each as a string. Keys whose source row
        is missing or holds no integer cell are omitted.
    """
    def _integers(cells):
        # Keep only cells made of decimal digits. str.isdecimal() matches
        # exactly what int() can parse, unlike str.isnumeric(), which also
        # accepts characters such as superscript footnote marks.
        values = []
        for cell in cells:
            text = str(cell).strip()
            if text.isdecimal():
                values.append(int(text))
        return values

    enrollment_label = "Students for whom this is the year of first enrollment".replace(" ", "")
    attrition_label = "Students no longer enrolled for any reason".replace(" ", "")

    info = {}
    for ir, label in enumerate(df.iloc[:, 0]):
        # str() guards against non-string labels (e.g. NaN for a blank cell).
        squeezed = str(label).replace(" ", "")
        if enrollment_label in squeezed:
            enrollment = _integers(df.iloc[ir, 1:].to_list())
            if enrollment:
                info['enrollment_latest'] = str(enrollment[-1])
                info['enrollment_ave'] = str(mean(enrollment))
        if attrition_label in squeezed:
            attrition = _integers(df.iloc[ir, 1:].to_list())
            if attrition:
                rates = attrition[1::2]
                # A single-value row has no odd-position entry to maximise.
                if rates:
                    info['attrition_rate_high'] = str(max(rates))
                info['attrition_ave'] = str(mean(attrition[::2]))
                info['attrition_latest'] = str(attrition[-1])
    return info

# extract info from effective tables
def extract_effective_tables(tables: dict):
    """Run every applicable extractor over a collection of titled tables.

    Each table title is lower-cased and tested for the keywords below; for
    every keyword it contains, the corresponding extractor is applied to the
    table and its result merged into one dict. A title may trigger several
    extractors, and later results overwrite earlier ones on key clashes.

    Args:
        tables: Mapping of table title to the table passed to the extractors.

    Returns:
        The merged dict of everything the extractors returned.
    """
    # (keyword, extractor) pairs, listed in the order they are applied.
    dispatch = (
        ("licensure", extract_licensure),
        ("time", extract_time_to_complete),
        ("tuition", extract_tuition),
        ("internship", extract_internship),
        ("attrition", extract_attrition),
    )

    merged = {}
    for title, table in tables.items():
        lowered = title.lower()
        for keyword, extractor in dispatch:
            if keyword in lowered:
                merged.update(extractor(table))
    return merged

In [80]:
from scrap.parse_pdf import parse_pdf_file, find_effective_tables

# Spot-check the extraction pipeline on a single program's PDF.
# Workbook that lists the programs; it is expected to have a "pdf_filename" column.
verified_excel = "scrap/result/apa_programs_licensure.xlsx"
df = pd.read_excel(verified_excel)
# Hard-coded row label of the program to inspect.
index = 43
pdf_filename = df.at[index, "pdf_filename"]
# parse_pdf_file returns a pair; only `tables` is used below.
format_indicators, tables = parse_pdf_file(pdf_filename)
# NOTE(review): find_effective_tables presumably returns a {title: DataFrame}
# mapping, since extract_effective_tables iterates over `.items()` -- confirm
# in scrap.parse_pdf.
effective_tables = find_effective_tables(tables)
info = extract_effective_tables(effective_tables)
print(info)


{'years_long': '5.5', 'years_short': '5.05', 'years_mean': '5.232', 'tuition': '37962', 'internship_rate_lastest': '100', 'enrollment_latest': '34', 'enrollment_ave': '29.3', 'attrition_rate_high': '16', 'attrition_ave': '2.2', 'attrition_latest': '6'}
