In [35]:
# ADDING FUNCTIONS TO TEST HERE

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import os
from pathlib import Path
import warnings
warnings.simplefilter(action='ignore', category=Warning)


def clean_percents(item):
    '''
    Converts a percent string into its fractional float value (the '%' is removed and the number is
    divided by 100). Anything whose string form has no '%' is handed back unchanged.
    '''
    # Guard clause: values without a percent sign pass straight through
    if '%' not in str(item):
        return item
    number_text = item.replace('%', '')
    return float(number_text) / 100

def create_census_df():
    '''
    Reads census files for every year and concatenates the desired info into a single dataframe

    Every file in the census_info folder whose name ends in '.csv' is loaded, tagged with a year
    (starting at 2010 and increasing by one per CSV file) and reduced to three positional groups of
    rows, each labelled through a new 'Metric' column. The combined DataFrame is returned indexed
    by 'Year'.

    NOTE(review): a file is matched to its year purely by modification-time order, so re-saving or
    re-downloading a file out of order would mislabel the years -- confirm the folder is populated
    in year order.
    '''
    # Generate file paths for each spreadsheet in the census_info folder, oldest modification first
    paths = sorted(Path('census_info').iterdir(), key=os.path.getmtime)
    data = []
    year = 2010
    # Iterate through each filepath (ordered by year) and convert each CSV into a DataFrame
    for path in paths:
        if not path.name.endswith('.csv'):
            continue
        # Drop incomplete rows and the final row; copy so the assignments below act on an
        # independent frame instead of a slice of the dropna() result
        census = pd.read_csv(path).dropna()[:-1].copy()
        census['Year'] = year
        census.columns = census.columns.str.replace('!!', ' ')
        # Slight difference in format between newer (post-2015) and older files
        if year < 2015:
            percent_column = 'United States Total Estimate'
        else:
            percent_column = 'United States Percent Estimate'
        census[percent_column] = census[percent_column].apply(clean_percents)
        # Extract the data we care about by index number to account for format of files
        groups = (
            (census.iloc[1:5], 'Educational Attainment: 18-24'),
            (census.iloc[6:13], 'Educational Attainment: 25 and Over'),
            (census.tail(), 'Median Earnings'),
        )
        for rows, metric in groups:
            # Copy before labelling so the new column is never written onto a slice of census
            rows = rows.copy()
            rows['Metric'] = metric
            data.append(rows)
        year += 1

    # Add all DataFrames into one larger DataFrame, grouped by year
    census_df = pd.concat(data)
    census_df['Label'] = census_df['Label'].apply(str.strip)
    census_df = census_df.set_index('Year')
    return census_df

def some_college_1824(frame):
    '''
    A function used in conjunction with .apply() that returns the combined share of Americans aged
    18-24 with at least some college: the "some college or associate's degree" value plus the
    "bachelor's degree or higher" value, looked up for the first year found in the frame's index
    '''
    year = frame.index[0]
    # Files before 2015 keep the share in the "Total" column, later files in the "Percent" column
    if year < 2015:
        column = 'United States Total Estimate'
    else:
        column = 'United States Percent Estimate'
    attainment = frame[frame['Metric'] == 'Educational Attainment: 18-24']
    some_rows = attainment[attainment['Label'] == 'Some college or associate\'s degree']
    higher_rows = attainment[attainment['Label'] == 'Bachelor\'s degree or higher']
    return some_rows[column][year] + higher_rows[column][year]

def convert_to_int(item):
    '''
    Parses a number written as a string with comma separators (e.g. "1,200,300") into a numpy int64
    '''
    without_commas = item.replace(',', '')
    return np.int64(without_commas)

def get_median_yearly(frame):
    '''
    A function used in conjunction with .apply() that returns the mean of the 'Median Earnings'
    rows of the given frame

    The 'United States Total Estimate' values of those rows are comma-formatted strings; each is
    converted with convert_to_int before averaging. The input frame is not modified.
    '''
    # Select just the earnings values rather than assigning a converted column onto a filtered slice
    earnings = frame.loc[frame['Metric'] == 'Median Earnings', 'United States Total Estimate']
    return earnings.apply(convert_to_int).mean()

def create_ap_volume():
    '''
    Reads the Excel sheet of AP Exam volume changes and returns the '   TOTAL ' row as a series holding
    the total volume of AP exams for each year, 2010-2020, with the percent-change entries removed
    '''
    sheet = pd.read_excel(r'ap_info/AP-Exam-Volume-Changes-2010-2020.xls', header=1)
    ap_volume = sheet[:88].set_index('   SUBJECT ')
    tot_volume = ap_volume.loc['   TOTAL ']
    # Labels of the percent-change entries: the bare label plus its .1 - .9 numbered repeats
    change_labels = ['%\nChange'] + [f'%\nChange.{i}' for i in range(1, 10)]
    return tot_volume[~tot_volume.index.isin(change_labels)]

def get_school_data():
    '''
    Reads the Excel sheet with the number of schools offering AP exams from 2011-2020 and returns a tuple of two
    series: 1. The 'TOTAL SCHOOLS' row, 2. The 'Subjects Per School' row, both with the percent entries dropped
    '''
    sheet = pd.read_excel(r'ap_info/Number-of-Schools-Offering-AP-Exams-2011-2020.xls', header=1)
    ap_schools = sheet[1:43].set_index('SUBJECT')
    ap_schools.columns = ap_schools.columns.astype(str)
    ap_schools = ap_schools.loc[:, :'2020']
    # Labels of the percent entries: the bare '%' plus its .1 - .8 numbered repeats
    percent_labels = ['%'] + [f'%.{i}' for i in range(1, 9)]
    total_schools = ap_schools.loc['TOTAL SCHOOLS'].drop(percent_labels)
    total_subjects = ap_schools.loc['Subjects Per School'].drop(percent_labels)
    return (total_schools, total_subjects)


In [36]:
# Check clean_percents on strings that carry a percent sign
assert clean_percents("%15") == 0.15
assert clean_percents("%0") == 0

In [37]:
# Test create_census_df: read the census files once, then check the frame and its 'Metric' column
census_frame = create_census_df()
assert isinstance(census_frame, pd.DataFrame)
assert isinstance(census_frame["Metric"], pd.Series)

In [38]:
# Test some_college_1824: compute the value once; isinstance also accepts float subclasses
# such as np.float64, which an exact type comparison would reject
some_college_share = some_college_1824(create_census_df())
assert isinstance(some_college_share, float)
assert some_college_share < 1

In [39]:
# Test convert_to_int (compare against plain ints; the np.int alias no longer exists in current NumPy)
assert convert_to_int("1,200,300") == 1200300
assert convert_to_int("1") == 1

In [40]:
# Test get_median_yearly: compute the value once and accept any floating-point result type
median_earnings = get_median_yearly(create_census_df())
assert isinstance(median_earnings, (float, np.floating))
assert median_earnings > 30000

In [42]:
# Test create_ap_volume: read the Excel file once, then check type and length
ap_volume_totals = create_ap_volume()
assert isinstance(ap_volume_totals, pd.Series)
assert len(ap_volume_totals) == 11

In [49]:
# Test get_school_data: read the Excel file once, then check the tuple and both series in it
school_data = get_school_data()
assert isinstance(school_data, tuple)
assert len(school_data) == 2
for school_series in school_data:
    assert isinstance(school_series, pd.Series)
    assert len(school_series) == 10