In [1]:
## Relevant modules.
import os
import pandas as pd
from googletrans import Translator

In [2]:
## First look at the original dataframe: outpatient_costs_firstlook.
file_in = 'original_data/costos_promedios_ambulatorios.csv'
outpatient_costs_firstlook = pd.read_csv(file_in)
print(outpatient_costs_firstlook.head())

  COSTO PROMEDIO PARA LA ASEGURADORA                            US$  \
0                          COD_ESPEC                   ESPECIALIDAD   
1                                389                   ADOLESCENCIA   
2                                 22   ANESTESIOLOGIA Y REANIMACION   
3                                120                    CARDIOLOGIA   
4                                211  CARDIOLOGIA ELECTROFISIOLOGIA   

                  2008a                2008b                 2009a  \
0  CONSULTA AMBULATORIA  CIRUGIA AMBULATORIA  CONSULTA AMBULATORIA   
1    29.043628650677107                   nd     32.09116621819961   
2    21.689252583869877   205.85068883330013     22.71958577951729   
3     29.51413869708569   209.01869158878506    30.897962342587373   
4     25.23658448617874   43.065589818893784    27.069279011356457   

                 2009b                 2010a                2010b  \
0  CIRUGIA AMBULATORIA  CONSULTA AMBULATORIA  CIRUGIA AMBULATORIA   
1    101.27201

In [3]:
## Organizing dataframe better: outpatient_costs.
outpatient_costs = pd.read_csv(file_in, header=[0,1], index_col=[0,1],\
                               na_values=['nd']).sort_index(level=1).round(2)
outpatient_costs.index = [outpatient_costs.index.get_level_values(0),\
                          outpatient_costs.index.get_level_values(1).str.lower()]
print(outpatient_costs.head())

COSTO PROMEDIO PARA LA ASEGURADORA                2008a               2008b  \
COD_ESPEC                          CONSULTA AMBULATORIA CIRUGIA AMBULATORIA   
389 adolescencia                                  29.04                 NaN   
22  anestesiologia y reanimacion                  21.69              205.85   
120 cardiologia                                   29.51              209.02   
211 cardiologia electrofisiologia                 25.24               43.07   
130 cirugia cardiovascular                        25.46              527.50   

COSTO PROMEDIO PARA LA ASEGURADORA                2009a               2009b  \
COD_ESPEC                          CONSULTA AMBULATORIA CIRUGIA AMBULATORIA   
389 adolescencia                                  32.09              101.27   
22  anestesiologia y reanimacion                  22.72              234.47   
120 cardiologia                                   30.90              266.23   
211 cardiologia electrofisiologia                 2

In [4]:
## Organizing dataframe as multi-index dataframe: outpatient_costs.
col_0 = [2008]*2 + [2009]*2 + [2010]*2 + [2011]*2 + [2012]*2
col_1 = ['c_visit','c_surgery']*5
arrays = [col_0, col_1]
outpatient_costs.columns = pd.MultiIndex.from_arrays(arrays,\
                                                     names=['year','avg_cost_usd'])
outpatient_costs.index.names = ['spec_code','spec_es']
print(outpatient_costs.head())

year                                       2008              2009            \
avg_cost_usd                            c_visit c_surgery c_visit c_surgery   
spec_code spec_es                                                             
389       adolescencia                    29.04       NaN   32.09    101.27   
22        anestesiologia y reanimacion    21.69    205.85   22.72    234.47   
120       cardiologia                     29.51    209.02   30.90    266.23   
211       cardiologia electrofisiologia   25.24     43.07   27.07    167.22   
130       cirugia cardiovascular          25.46    527.50   27.13    651.50   

year                                       2010              2011            \
avg_cost_usd                            c_visit c_surgery c_visit c_surgery   
spec_code spec_es                                                             
389       adolescencia                    36.43     87.81   33.12    115.13   
22        anestesiologia y reanimacion      NaN    

In [5]:
## Adding new column with specialty in english: outpatient_costs.
translator = Translator()
translated_words = []
for i in outpatient_costs.index.get_level_values(1):
    j = translator.translate(i, src='es', dest='en')
    translated_words.append(j.text)
outpatient_costs['spec_en'] = list(i.lower() for i in translated_words)
outpatient_costs.set_index('spec_en', append=True, inplace=True)
print(outpatient_costs.head())

year                                                                        2008  \
avg_cost_usd                                                             c_visit   
spec_code spec_es                       spec_en                                    
389       adolescencia                  adolescence                        29.04   
22        anestesiologia y reanimacion  anesthesiology and resuscitation   21.69   
120       cardiologia                   cardiology                         29.51   
211       cardiologia electrofisiologia electrophysiology cardiology       25.24   
130       cirugia cardiovascular        cardiovascular surgery             25.46   

year                                                                                \
avg_cost_usd                                                             c_surgery   
spec_code spec_es                       spec_en                                      
389       adolescencia                  adolescence                  

In [None]:
## Optional: code to access multi-index outpatient_costs if necessary.
#idx = pd.IndexSlice
#print(outpatient_costs.loc[:,idx[2012,:]].head())
#print(outpatient_costs.loc[:,idx[:,['c_visit','c_surgery']]].head())

In [6]:
## Stacking data in outpatient_costs: outpatient_costs_stack.
outpatient_costs_stack = outpatient_costs.stack(level=0, dropna=False)
print(outpatient_costs_stack.head(10))

avg_cost_usd                                                                  c_surgery  \
spec_code spec_es                      spec_en                          year              
389       adolescencia                 adolescence                      2008        NaN   
                                                                        2009     101.27   
                                                                        2010      87.81   
                                                                        2011     115.13   
                                                                        2012     368.24   
22        anestesiologia y reanimacion anesthesiology and resuscitation 2008     205.85   
                                                                        2009     234.47   
                                                                        2010        NaN   
                                                                        2011     324.77   

In [None]:
## Optional: code to access multi-index outpatient_costs_stack if necessary.
#idx = pd.IndexSlice
#print(outpatient_costs_stack.loc[idx[:,:,:,[2008,2009]],'c_surgery'].head())

In [7]:
## Making a simple dataframe: mean_outpatient_costs_usd.
mean_outpatient_costs_usd = outpatient_costs_stack\
                            .reset_index(level=['spec_es','spec_en','year'])
del mean_outpatient_costs_usd.columns.name

## Sorting by spec_code, the current index (alternative: by spec_es).
mean_outpatient_costs_usd.sort_index(inplace=True)
#mean_outpatient_costs_usd.sort_values('spec_es', inplace=True)
print(mean_outpatient_costs_usd.head(10))

                                spec_es                           spec_en  \
spec_code                                                                   
22         anestesiologia y reanimacion  anesthesiology and resuscitation   
22         anestesiologia y reanimacion  anesthesiology and resuscitation   
22         anestesiologia y reanimacion  anesthesiology and resuscitation   
22         anestesiologia y reanimacion  anesthesiology and resuscitation   
22         anestesiologia y reanimacion  anesthesiology and resuscitation   
120                         cardiologia                        cardiology   
120                         cardiologia                        cardiology   
120                         cardiologia                        cardiology   
120                         cardiologia                        cardiology   
120                         cardiologia                        cardiology   

           year  c_surgery  c_visit  
spec_code                            

In [8]:
## Reseting the index, just in case.
mean_outpatient_costs_usd.reset_index(inplace=True)
print(mean_outpatient_costs_usd.head(10))

## Saving mean_outpatient_costs_usd as csv to data folder.
if os.path.exists('data/mean_outpatient_costs_usd.csv'):
    print('data/mean_outpatient_costs_usd.csv already exists.')
elif os.path.exists('data'):
    file_out = 'data/mean_outpatient_costs_usd.csv'
    mean_outpatient_costs_usd.to_csv(path_or_buf=file_out, index=False)
    print('data/ already exists.')
    print('data/mean_outpatient_costs_usd.csv has been created.')
else:
    os.mkdir('data')
    file_out = 'data/mean_outpatient_costs_usd.csv'
    mean_outpatient_costs_usd.to_csv(path_or_buf=file_out, index=False)
    print('data/mean_outpatient_costs_usd.csv has been created.')

   spec_code                       spec_es                           spec_en  \
0         22  anestesiologia y reanimacion  anesthesiology and resuscitation   
1         22  anestesiologia y reanimacion  anesthesiology and resuscitation   
2         22  anestesiologia y reanimacion  anesthesiology and resuscitation   
3         22  anestesiologia y reanimacion  anesthesiology and resuscitation   
4         22  anestesiologia y reanimacion  anesthesiology and resuscitation   
5        120                   cardiologia                        cardiology   
6        120                   cardiologia                        cardiology   
7        120                   cardiologia                        cardiology   
8        120                   cardiologia                        cardiology   
9        120                   cardiologia                        cardiology   

   year  c_surgery  c_visit  
0  2012     400.23    54.42  
1  2008     205.85    21.69  
2  2009     234.47    22.72  