In [4]:
# Overview
# This notebook contains code to generate output files from a series of spreadsheets
# The overall problem:
#
# input data contains a data measurement, such as rate, by location, for a particular health condition
# so a file named condition_a.xls would have a series of rows
# county, rate
# location_a, 0.4
# location_b, 0.5
# ...
#
# the goal is to output one file per location, with one row per condition containing the condition name and its rate
# for example location_a.xls would have a series of rows
# condition_a, 0.4
# condition_b, 0.6
# ...


In [5]:
import glob
from pathlib import Path

import pandas as pd

def condition_name(path):
    """Return the condition name encoded in an input file path.

    The name is the file name stripped of its directory and final extension,
    e.g. 'input_data/condition_a.xls' -> 'condition_a'.
    """
    return Path(path).stem


def location_table(idfs, location):
    """Build the condition/val table for a single location.

    Parameters
    ----------
    idfs : dict
        Maps condition name -> DataFrame indexed by location.
    location : label
        Index label to look up in every condition frame.

    Returns
    -------
    pandas.DataFrame
        Columns 'condition' and 'val', one row per entry of ``idfs`` in
        insertion order; 'val' is the first data column of the location's row.

    Raises
    ------
    KeyError
        If ``location`` is missing from any condition frame.
    """
    conditions = list(idfs)
    # .iloc[0] selects the first data column of the row explicitly; a bare [0]
    # on a label-indexed Series relies on a deprecated positional fallback
    vals = [idfs[k].loc[location].iloc[0] for k in conditions]
    return pd.DataFrame({'condition': conditions, 'val': vals})


# read all condition files from the input data directory
# sorted so the "first" file and the output row order do not depend on the
# arbitrary order in which glob returns matches
input_files = sorted(glob.glob('input_data/*.xls'))

# create a dictionary that contains the name of the condition (extracted from the file name)
# and a dataframe holding the rate for each location, indexed by the 'county' column
# NOTE(review): files are matched by an .xls extension but parsed with read_csv,
# so they must contain CSV text - confirm, or switch to pd.read_excel
idfs = {condition_name(f): pd.read_csv(f, index_col='county') for f in input_files}

if idfs:
    # get the location_names from the first condition file
    # note - this assumes that all files have the same list of locations
    location_names = next(iter(idfs.values())).index.tolist()
    # make sure the output directory exists before writing into it
    Path('output_data').mkdir(parents=True, exist_ok=True)
else:
    location_names = []
    print('no input files matched input_data/*.xls - nothing to write')

# go through each location name
# create a spreadsheet containing each condition and rate
# for that location
for n in location_names:
    df = location_table(idfs, n)
    # NOTE(review): writing the legacy .xls format depends on an Excel writer
    # engine that recent pandas releases may no longer provide - confirm
    # against the installed version
    df.to_excel(f'output_data/{n}.xls', index=False)