In [1]:
## Relevant modules.
import pandas as pd
import os

In [3]:
## Input locations: one outpatient extract per year, plus the mean-cost table.
file_2008 = 'data/data_outpatient_2008.csv'
file_2009 = 'data/data_outpatient_2009.csv'
file_2010 = 'data/data_outpatient_2010.csv'
file_2011 = 'data/data_outpatient_2011.csv'
file_2012 = 'data/data_outpatient_2012.csv'
file_costs = 'data/mean_outpatient_costs_usd.csv'

## Load the yearly extracts in chronological order (header=0: the first row
## of each file supplies the column names), then bind each to its own name.
yearly_frames = [
    pd.read_csv(path, header=0)
    for path in (file_2008, file_2009, file_2010, file_2011, file_2012)
]
(data_outpatient_2008,
 data_outpatient_2009,
 data_outpatient_2010,
 data_outpatient_2011,
 data_outpatient_2012) = yearly_frames
mean_outpatient_costs_usd = pd.read_csv(file_costs, header=0)

## Reference figures taken from the separate files, to be compared against
## the combined table later on.
## Row count summed over the 2008-2012 files.
exp_len = sum(len(frame) for frame in yearly_frames)
## Visits summed over the 2008-2012 files.
exp_vis = sum(frame['n_visits'].sum() for frame in yearly_frames)
## Surgeries summed over the 2008-2012 files.
exp_sur = sum(frame['n_surgeries'].sum() for frame in yearly_frames)
## Patients summed over the 2008-2012 files.
exp_pat = sum(frame['n_patients'].sum() for frame in yearly_frames)

## Report each figure; the last line deliberately ends without a newline.
expected_report = (
    ('The expected length when concatenating 2008-2012 files is', exp_len, '.\n'),
    ('The expected total visits is', exp_vis, '.\n'),
    ('The expected total surgeries is', exp_sur, '.\n'),
    ('The expected total patients is', exp_pat, '.'),
)
for message, value, terminator in expected_report:
    print(message, value, end=terminator)


The expected length when concatenating 2008-2012 files is 9420.
The expected total visits is 1044072.
The expected total surgeries is 108118.
The expected total patients is 870724.

In [4]:
## Concatenating 2008-2012 files.
## ignore_index=True discards the per-file row labels and renumbers the
## combined rows consecutively from 0.
concat_data = pd.concat(
    [
        data_outpatient_2008,
        data_outpatient_2009,
        data_outpatient_2010,
        data_outpatient_2011,
        data_outpatient_2012,
    ],
    ignore_index=True,
)

print(concat_data.head(), '\n')
print(concat_data.tail(), '\n')
## DataFrame.info() writes its report to stdout itself and returns None, so
## it is called directly; wrapping it in print() would emit a stray "None".
concat_data.info()
print()
## Sanity check: the combined row count must equal the sum of the file lengths.
print('Length as expected?', len(concat_data) == exp_len, end='.')

   spec_code                       spec_es  year  doc_code  n_visits  \
0         22  anestesiologia y reanimacion  2008  16449291        59   
1         22  anestesiologia y reanimacion  2008  31865019        85   
2         22  anestesiologia y reanimacion  2008   8682278         6   
3        120                   cardiologia  2008  19188542        24   
4        120                   cardiologia  2008  70115939       429   

   n_surgeries  n_patients  
0            1          67  
1            2          89  
2            4           6  
3            3          24  
4          104         430   

      spec_code                   spec_es  year  doc_code  n_visits  \
9415        775       urologia pediatrica  2012  79939705        48   
9416        775       urologia pediatrica  2012  79557883        82   
9417        992  cirugia de hombro y codo  2012  80418242       105   
9418        999          medicina general  2012  41797433        27   
9419        999          medicina ge

In [5]:
## Grouping by spec_code, spec_es, year and doc_code.
## Rows sharing the same four keys are collapsed into one row whose remaining
## columns are summed. as_index=False keeps the keys as ordinary columns and
## sort=False keeps groups in order of first appearance.
## NOTE(review): by default groupby drops rows whose key columns contain NaN;
## that is one way the totals below could differ from the expected ones --
## confirm the key columns are complete if any check prints False.
group_keys = ['spec_code', 'spec_es', 'year', 'doc_code']
concat_data_unique = concat_data.groupby(group_keys, as_index=False, sort=False).sum()

tot_vis = concat_data_unique['n_visits'].sum()
tot_sur = concat_data_unique['n_surgeries'].sum()
tot_pat = concat_data_unique['n_patients'].sum()

print(concat_data_unique.head(), '\n')
print(concat_data_unique.tail(), '\n')
## DataFrame.info() writes its report to stdout itself and returns None, so
## it is called directly; wrapping it in print() would emit a stray "None".
concat_data_unique.info()
print()
## Grouping must not change the grand totals computed from the raw files.
print('Total visits is', tot_vis, end='.')
print(' As expected?', tot_vis == exp_vis, end='.\n')
print('Total surgeries is', tot_sur, end='.')
print(' As expected?', tot_sur == exp_sur, end='.\n')
print('Total patients is', tot_pat, end='.')
print(' As expected?', tot_pat == exp_pat, end='.')

   spec_code                       spec_es  year  doc_code  n_visits  \
0         22  anestesiologia y reanimacion  2008  16449291        59   
1         22  anestesiologia y reanimacion  2008  31865019        85   
2         22  anestesiologia y reanimacion  2008   8682278         6   
3        120                   cardiologia  2008  19188542        24   
4        120                   cardiologia  2008  70115939       429   

   n_surgeries  n_patients  
0            1          67  
1            2          89  
2            4           6  
3            3          24  
4          104         430   

      spec_code                   spec_es  year  doc_code  n_visits  \
8299        775       urologia pediatrica  2012  79939705        48   
8300        775       urologia pediatrica  2012  79557883        82   
8301        992  cirugia de hombro y codo  2012  80418242       105   
8302        999          medicina general  2012  41797433        27   
8303        999          medicina ge

In [6]:
## Putting everything together, a.k.a. merging.
## Left join: every row of concat_data_unique is kept and the columns of the
## cost table are attached where spec_code, spec_es and year all match; rows
## without a match get NaN in the cost columns.
all_together = pd.merge(
    concat_data_unique,
    mean_outpatient_costs_usd,
    on=['spec_code', 'spec_es', 'year'],
    how='left',
)
print(all_together.head(), '\n')
print(all_together.tail(), '\n')
## DataFrame.info() writes its report to stdout itself and returns None, so
## it is called directly; wrapping it in print() would emit a stray "None".
all_together.info()

   spec_code                       spec_es  year  doc_code  n_visits  \
0         22  anestesiologia y reanimacion  2008  16449291        59   
1         22  anestesiologia y reanimacion  2008  31865019        85   
2         22  anestesiologia y reanimacion  2008   8682278         6   
3        120                   cardiologia  2008  19188542        24   
4        120                   cardiologia  2008  70115939       429   

   n_surgeries  n_patients                           spec_en  c_surgery  \
0            1          67  anesthesiology and resuscitation     205.85   
1            2          89  anesthesiology and resuscitation     205.85   
2            4           6  anesthesiology and resuscitation     205.85   
3            3          24                        cardiology     209.02   
4          104         430                        cardiology     209.02   

   c_visit  
0    21.69  
1    21.69  
2    21.69  
3    29.51  
4    29.51   

      spec_code                   sp

In [8]:
## Making some rearrangement.
## Column order for the final table: identifiers first, then counts, then costs.
## Selecting with this list also drops any column that is not named in it.
col_order = [
    'doc_code', 'spec_code', 'spec_es', 'spec_en',
    'year', 'n_visits', 'n_surgeries', 'n_patients',
    'c_visit', 'c_surgery',
]
all_together = all_together[col_order]
print(all_together.head(), '\n')
print(all_together.tail(), '\n')
## DataFrame.info() writes its report to stdout itself and returns None, so
## it is called directly; wrapping it in print() would emit a stray "None".
all_together.info()

   doc_code  spec_code                       spec_es  \
0  16449291         22  anestesiologia y reanimacion   
1  31865019         22  anestesiologia y reanimacion   
2   8682278         22  anestesiologia y reanimacion   
3  19188542        120                   cardiologia   
4  70115939        120                   cardiologia   

                            spec_en  year  n_visits  n_surgeries  n_patients  \
0  anesthesiology and resuscitation  2008        59            1          67   
1  anesthesiology and resuscitation  2008        85            2          89   
2  anesthesiology and resuscitation  2008         6            4           6   
3                        cardiology  2008        24            3          24   
4                        cardiology  2008       429          104         430   

   c_visit  c_surgery  
0    21.69     205.85  
1    21.69     205.85  
2    21.69     205.85  
3    29.51     209.02  
4    29.51     209.02   

      doc_code  spec_code           

In [10]:
## Saving all_together as csv to data folder.
## An existing data_panel.csv is never overwritten: in that case only a
## message is printed and nothing is written.
panel_already_saved = os.path.exists('data/data_panel.csv')
if panel_already_saved:
    print('data_panel.csv already exists')
else:
    ## Create the output folder first when it is missing.
    if not os.path.exists('data'):
        os.mkdir('data')
    file_out = 'data/data_panel.csv'
    ## index=False leaves the row index out of the file.
    all_together.to_csv(path_or_buf=file_out, index=False)

data_panel.csv already exists
