In [1]:
## Relevant modules.
import pandas as pd
import os
## Optional: only needed for "alternative code two" (see the fourth cell).
#import re

In [2]:
## Quick preview of the raw yearly file: data_outpatient_year_firstlook.
## Set `year` to any value from 2008 through 2012.
year = 2008

## The file name embeds the selected year between a fixed prefix and suffix.
file_in = ''.join(('original_data/datos_ambulatorios_', str(year), '_new.csv'))
data_outpatient_year_firstlook = pd.read_csv(file_in)
first_rows = data_outpatient_year_firstlook.head()
print(first_rows)

                          MEDICO  COD_ESPEC                  ESPECIALIDAD  \
0  CC-16449291-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
1  CC-31865019-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
2   CC-8682278-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
3  CC-19188542-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   
4  CC-70115939-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   

   NO CONSULTAS  NO CIRUGIAS  NO PACIENTES  
0            59            1            67  
1            85            2            89  
2             6            4             6  
3            24            3            24  
4           429          104           430  


In [3]:
## Organizing dataframe better: data_outpatient_year.
## The raw file is read again here, so this cell always starts from the
## untouched source data and can be re-run on its own (it only needs
## `file_in` and `year` from the previous cell).
col_names = ['doc_name','spec_code','spec_es','n_visits','n_surgeries','n_patients','year']
col_order = ['spec_code','spec_es','year','doc_name','n_visits','n_surgeries','n_patients']

data_outpatient_year = pd.read_csv(file_in, header=0)
## A scalar is broadcast to every row; no need to build a list of repeats.
data_outpatient_year['year'] = year
## Positional rename: the i-th column of the file (plus the appended 'year')
## receives the i-th entry of col_names. Raises ValueError if the number of
## columns in the file differs from what col_names expects.
data_outpatient_year.columns = col_names
## Vectorised lower-casing; unlike calling .lower() on each element, missing
## specialty names stay NaN instead of raising AttributeError.
data_outpatient_year['spec_es'] = data_outpatient_year['spec_es'].str.lower()
data_outpatient_year = data_outpatient_year[col_order]

print(data_outpatient_year.head(),'\n')
## info() writes its report itself and returns None, so it must not be
## wrapped in print() (that would emit a stray "None" line).
data_outpatient_year.info()
## Sanity check on dtypes: count how many cells in each column are strings.
for col_name in data_outpatient_year.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_year[col_name])
    print('Column', col_name, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  \
0         22  anestesiologia y reanimacion  2008   
1         22  anestesiologia y reanimacion  2008   
2         22  anestesiologia y reanimacion  2008   
3        120                   cardiologia  2008   
4        120                   cardiologia  2008   

                        doc_name  n_visits  n_surgeries  n_patients  
0  CC-16449291-NOMBRE*DEL*DOCTOR        59            1          67  
1  CC-31865019-NOMBRE*DEL*DOCTOR        85            2          89  
2   CC-8682278-NOMBRE*DEL*DOCTOR         6            4           6  
3  CC-19188542-NOMBRE*DEL*DOCTOR        24            3          24  
4  CC-70115939-NOMBRE*DEL*DOCTOR       429          104         430   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 7 columns):
spec_code      1600 non-null int64
spec_es        1600 non-null object
year           1600 non-null int64
doc_name       1600 non-null object
n_visits       16

In [4]:
## Getting code for doctor, doc_code, and adding it to data_outpatient_year.
## doc_name has the form 'CC-16449291-NOMBRE*DEL*DOCTOR'; the code is the first
## hyphen-delimited token made only of digits. Exactly one value is extracted
## per row and it stays index-aligned with the dataframe, so a name with no
## numeric token (or with several) can never shift the codes of other rows.
doc_code = data_outpatient_year['doc_name'].str.extract(r'(?:^|-)(\d+)(?=-|$)', expand=False)
rows_without_code = doc_code.isnull()
if rows_without_code.any():
    ## Fail loudly rather than storing NaN or misaligned codes.
    raise ValueError(
        'No numeric doctor code found in doc_name for row(s): '
        + str(list(doc_code.index[rows_without_code][:10]))
    )
doc_code = doc_code.astype('int64')

## Alternative code one.
## NOTE: appends once per numeric token across all rows, so the list can end
## up misaligned with the rows if any name has zero or several such tokens.
#doc_code = [int(j) for i in data_outpatient_year['doc_name'] for j in i.split('-') if j.isdigit()]            

## Alternative code two.
## NOTE: requires `import re`; takes the first run of digits anywhere in the
## name and raises IndexError if a name contains no digits.
#doc_code = []
#for i in data_outpatient_year['doc_name']:
#    temp = re.findall(r'\d+', i) 
#    res = list(map(int, temp))
#    doc_code.append(res[0])

col_order = ['spec_code','spec_es','year','doc_code','doc_name','n_visits','n_surgeries','n_patients']
## assign() returns a new frame, so re-running this cell never mutates the
## frame produced by the previous cell in place.
data_outpatient_year = data_outpatient_year.assign(doc_code=doc_code)[col_order]

print(data_outpatient_year.head(),'\n')
## info() writes its report itself and returns None, so it must not be
## wrapped in print() (that would emit a stray "None" line).
data_outpatient_year.info()
## Sanity check on dtypes: count how many cells in each column are strings.
for col_name in data_outpatient_year.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_year[col_name])
    print('Column', col_name, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2008  16449291   
1         22  anestesiologia y reanimacion  2008  31865019   
2         22  anestesiologia y reanimacion  2008   8682278   
3        120                   cardiologia  2008  19188542   
4        120                   cardiologia  2008  70115939   

                        doc_name  n_visits  n_surgeries  n_patients  
0  CC-16449291-NOMBRE*DEL*DOCTOR        59            1          67  
1  CC-31865019-NOMBRE*DEL*DOCTOR        85            2          89  
2   CC-8682278-NOMBRE*DEL*DOCTOR         6            4           6  
3  CC-19188542-NOMBRE*DEL*DOCTOR        24            3          24  
4  CC-70115939-NOMBRE*DEL*DOCTOR       429          104         430   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 8 columns):
spec_code      1600 non-null int64
spec_es        1600 non-null object
year           1600 non-null

In [5]:
## Sorting by spec_code and resetting index.
## Written as one chain that returns a new frame instead of three in-place
## mutations:
##  - errors='ignore' makes the cell safe to re-run after doc_name has
##    already been dropped (the in-place version raised KeyError on re-run);
##  - kind='mergesort' is a stable sort, so doctors sharing a spec_code keep
##    their original relative order on every run.
data_outpatient_year = (
    data_outpatient_year
    .drop('doc_name', axis=1, errors='ignore')
    .sort_values('spec_code', kind='mergesort')
    .reset_index(drop=True)
)
print(data_outpatient_year.head(10),'\n')

   spec_code                       spec_es  year  doc_code  n_visits  \
0         22  anestesiologia y reanimacion  2008  16449291        59   
1         22  anestesiologia y reanimacion  2008  31865019        85   
2         22  anestesiologia y reanimacion  2008   8682278         6   
3        120                   cardiologia  2008  19188542        24   
4        120                   cardiologia  2008  70115939       429   
5        120                   cardiologia  2008  71593942       348   
6        120                   cardiologia  2008  71622813       354   
7        120                   cardiologia  2008  73085294        61   
8        130        cirugia cardiovascular  2008  98543766        13   
9        130        cirugia cardiovascular  2008   8313225        22   

   n_surgeries  n_patients  
0            1          67  
1            2          89  
2            4           6  
3            3          24  
4          104         430  
5           61         311  
6   