In [None]:
## Relevant modules.
import os
import pandas as pd
from googletrans import Translator

In [None]:
## Load the raw csv with default parsing to inspect its layout:
## outpatient_costs_firstlook.
file_in = 'original_data/costos_promedios_ambulatorios.csv'
outpatient_costs_firstlook = pd.read_csv(filepath_or_buffer=file_in)
firstlook_preview = outpatient_costs_firstlook.head()
print(firstlook_preview)

In [None]:
## Re-read the csv with structure: outpatient_costs.
## - the first two rows become a two-level column header,
## - the first two columns become a two-level row index,
## - 'nd' entries are parsed as missing values,
## - rows are sorted on the second index level and values rounded to 2 decimals.
outpatient_costs = (
    pd.read_csv(file_in, header=[0, 1], index_col=[0, 1], na_values=['nd'])
    .sort_index(level=1)
    .round(2)
)

## Rebuild the row index with the second level lower-cased.
index_level_0 = outpatient_costs.index.get_level_values(0)
index_level_1_lower = outpatient_costs.index.get_level_values(1).str.lower()
outpatient_costs.index = [index_level_0, index_level_1_lower]
print(outpatient_costs.head())

In [None]:
## Relabel columns as a (year, avg_cost_usd) MultiIndex and name the row
## index levels: outpatient_costs.
years = range(2008, 2013)
cost_kinds = ['c_visit', 'c_surgery']
## Each year is repeated once per cost kind; the cost kinds cycle per year.
col_0 = [year for year in years for _ in cost_kinds]
col_1 = cost_kinds * len(years)
arrays = [col_0, col_1]
outpatient_costs.columns = pd.MultiIndex.from_arrays(
    arrays, names=['year', 'avg_cost_usd'])
outpatient_costs.index.names = ['spec_code', 'spec_es']
print(outpatient_costs.head())

In [None]:
## Translate the second index level from Spanish to English and append the
## lower-cased result as a third index level, spec_en: outpatient_costs.
translator = Translator()
spanish_names = outpatient_costs.index.get_level_values(1)
## One translate call per row label.
translated_words = [
    translator.translate(name, src='es', dest='en').text
    for name in spanish_names
]
outpatient_costs['spec_en'] = [word.lower() for word in translated_words]
outpatient_costs.set_index('spec_en', append=True, inplace=True)
print(outpatient_costs.head())

In [None]:
## Optional: code to access multi-index outpatient_costs if necessary.
#idx = pd.IndexSlice
#print(outpatient_costs.loc[:,idx[2012,:]].head())
#print(outpatient_costs.loc[:,idx[:,['c_visit','c_surgery']]].head())

In [None]:
## Pivot the first column level into the row index (long format) without
## dropping rows that contain only missing values: outpatient_costs_stack.
outpatient_costs_stack = outpatient_costs.stack(level=0, dropna=False)
stack_preview = outpatient_costs_stack.head(10)
print(stack_preview)

In [None]:
## Optional: code to access multi-index outpatient_costs_stack if necessary.
#idx = pd.IndexSlice
#print(outpatient_costs_stack.loc[idx[:,:,:,[2008,2009]],'c_surgery'].head())

In [None]:
## Making a simple dataframe: mean_outpatient_costs_usd.
## spec_es, spec_en and year move from the index into regular columns,
## leaving spec_code as the only index level.
mean_outpatient_costs_usd = (
    outpatient_costs_stack
    .reset_index(level=['spec_es', 'spec_en', 'year'])
)
## Clear the leftover label on the columns axis. Assigning None is used
## instead of `del ...columns.name` because Index.name is a property with no
## deleter in current pandas, so `del` raises AttributeError there.
mean_outpatient_costs_usd.columns.name = None

## Sorting by spec_code, the current index (alternative: by spec_es).
mean_outpatient_costs_usd.sort_index(inplace=True)
#mean_outpatient_costs_usd.sort_values('spec_es', inplace=True)
print(mean_outpatient_costs_usd.head(10))

In [None]:
## Move the current index back into a regular column before exporting.
mean_outpatient_costs_usd.reset_index(inplace=True)
print(mean_outpatient_costs_usd.head(10))

## Save mean_outpatient_costs_usd as csv in the data folder, creating the
## folder when it is missing; an already existing csv is never overwritten.
if os.path.exists('data/mean_outpatient_costs_usd.csv'):
    print('mean_outpatient_costs_usd.csv already exists')
else:
    if not os.path.exists('data'):
        os.mkdir('data')
    file_out = 'data/mean_outpatient_costs_usd.csv'
    mean_outpatient_costs_usd.to_csv(path_or_buf=file_out, index=False)