In [1]:
## Relevant modules.
import pandas as pd
import os
## Optional for alternative code two (see the fourth cell below).
#import re

In [2]:
## Quick peek at the raw 2010 outpatient CSV, before any renaming or cleanup.
## file_in is reused by the next cell to re-read the same file.
file_in = 'original_data/datos_ambulatorios_2010_new.csv'
data_outpatient_2010_firstlook = pd.read_csv(filepath_or_buffer=file_in)
## Show only the first five rows rather than dumping the whole frame.
print(data_outpatient_2010_firstlook.head(5))

                          MEDICO  COD_ESPEC                  ESPECIALIDAD  \
0  CC-16449291-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
1   CC-8682278-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
2  CC-19188542-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   
3  CC-70079989-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   
4  CC-70115939-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   

   NO CONSULTAS  NO CIRUGIAS  NO PACIENTES  
0            84            1            90  
1            10            8            10  
2            45            2            33  
3           251            1           180  
4           712          162           630  


In [3]:
## Organizing dataframe better: data_outpatient_2010.
## The file is re-read here (instead of reusing the first-look frame) so this
## cell always starts from the raw data and can be re-run safely.
data_outpatient_2010 = pd.read_csv(file_in, header=0)
## A scalar is broadcast to every row; no need to build a list of 2010s.
data_outpatient_2010['year'] = 2010
## col_names is applied positionally: it must follow the CSV column order,
## with 'year' last because it was appended above.
col_names = ['doc_name','spec_code','spec_es','n_visits','n_surgeries','n_patients','year']
col_order = ['spec_code','spec_es','year','doc_name','n_visits','n_surgeries','n_patients']
data_outpatient_2010.columns = col_names
## .copy() makes the reordered frame independent of the one it was selected
## from, so later column assignments cannot trigger SettingWithCopyWarning.
data_outpatient_2010 = data_outpatient_2010[col_order].copy()

## Vectorized lower-casing; unlike calling .lower() on each element, it
## leaves missing values as NaN instead of raising AttributeError.
data_outpatient_2010['spec_es'] = data_outpatient_2010['spec_es'].str.lower()

print(data_outpatient_2010.head(),'\n')
## info() prints its report itself and returns None, so it is not wrapped in
## print() (that would add a stray "None" line to the output).
data_outpatient_2010.info()
## Sanity check: count how many cells in each column hold Python strings.
for col_name in data_outpatient_2010.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_2010[col_name])
    print('Column', col_name, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  \
0         22  anestesiologia y reanimacion  2010   
1         22  anestesiologia y reanimacion  2010   
2        120                   cardiologia  2010   
3        120                   cardiologia  2010   
4        120                   cardiologia  2010   

                        doc_name  n_visits  n_surgeries  n_patients  
0  CC-16449291-NOMBRE*DEL*DOCTOR        84            1          90  
1   CC-8682278-NOMBRE*DEL*DOCTOR        10            8          10  
2  CC-19188542-NOMBRE*DEL*DOCTOR        45            2          33  
3  CC-70079989-NOMBRE*DEL*DOCTOR       251            1         180  
4  CC-70115939-NOMBRE*DEL*DOCTOR       712          162         630   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 7 columns):
spec_code      1659 non-null int64
spec_es        1659 non-null object
year           1659 non-null int64
doc_name       1659 non-null object
n_visits       16

In [4]:
## Getting code for doctor, doc_code, and adding it to data_outpatient_2010.
## doc_name is dash-separated (e.g. 'CC-16449291-NOMBRE*DEL*DOCTOR') and the
## doctor's numeric ID is one of the segments. Exactly one code is extracted
## per row (the first all-digit segment), so codes can never drift out of
## alignment with their rows. A nested loop that appends every all-digit
## segment would silently misalign if one row had none and another had two.
doc_code_text = data_outpatient_2010['doc_name'].str.extract(r'(?:^|-)(\d+)(?=-|$)', expand=False)
rows_without_code = doc_code_text.isna()
if rows_without_code.any():
    ## Fail loudly rather than store a missing/shifted code.
    raise ValueError(
        'doc_name has no numeric segment in {} row(s); first index labels: {}'.format(
            int(rows_without_code.sum()),
            list(doc_code_text.index[rows_without_code][:5]),
        )
    )
## Kept as a plain list of ints, one entry per row.
doc_code = [int(code) for code in doc_code_text]

## Alternative code one.
## NOTE: yields one entry per all-digit segment (not one per row), so it is
## only correct when every doc_name has exactly one such segment.
#doc_code = [int(j) for i in data_outpatient_2010['doc_name'] for j in i.split('-') if j.isdigit()]

## Alternative code two.
#doc_code = []
#for i in data_outpatient_2010['doc_name']:
#    temp = re.findall(r'\d+', i) 
#    res = list(map(int, temp))
#    doc_code.append(res[0])

data_outpatient_2010['doc_code'] = doc_code
col_order = ['spec_code','spec_es','year','doc_code','doc_name','n_visits','n_surgeries','n_patients']
## .copy() keeps the reordered frame independent, so in-place operations in
## later cells do not trigger SettingWithCopyWarning.
data_outpatient_2010 = data_outpatient_2010[col_order].copy()

print(data_outpatient_2010.head(),'\n')
## info() prints its report itself and returns None, so it is not wrapped in
## print() (that would add a stray "None" line to the output).
data_outpatient_2010.info()
## Sanity check: count how many cells in each column hold Python strings.
for col_name in data_outpatient_2010.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_2010[col_name])
    print('Column', col_name, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2010  16449291   
1         22  anestesiologia y reanimacion  2010   8682278   
2        120                   cardiologia  2010  19188542   
3        120                   cardiologia  2010  70079989   
4        120                   cardiologia  2010  70115939   

                        doc_name  n_visits  n_surgeries  n_patients  
0  CC-16449291-NOMBRE*DEL*DOCTOR        84            1          90  
1   CC-8682278-NOMBRE*DEL*DOCTOR        10            8          10  
2  CC-19188542-NOMBRE*DEL*DOCTOR        45            2          33  
3  CC-70079989-NOMBRE*DEL*DOCTOR       251            1         180  
4  CC-70115939-NOMBRE*DEL*DOCTOR       712          162         630   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1659 entries, 0 to 1658
Data columns (total 8 columns):
spec_code      1659 non-null int64
spec_es        1659 non-null object
year           1659 non-null

In [5]:
## Sorting by spec_code and resetting index.
## The result is rebound instead of sorted in place: this avoids
## SettingWithCopyWarning on a frame derived from a column selection.
## mergesort is stable, so rows sharing a spec_code keep their input order
## (the default quicksort gives no such guarantee).
data_outpatient_2010 = (
    data_outpatient_2010
    .sort_values('spec_code', kind='mergesort')
    .reset_index(drop=True)
)
print(data_outpatient_2010.head(10),'\n')

## Saving data_outpatient_2010 as csv to data folder.
## An existing file is never overwritten; delete it by hand to regenerate.
file_out = 'data/data_outpatient_2010.csv'
if os.path.exists(file_out):
    print('data_outpatient_2010.csv already exists')
else:
    ## exist_ok=True creates the folder only when missing, replacing the
    ## separate "folder exists" / "folder missing" branches with one write.
    os.makedirs('data', exist_ok=True)
    data_outpatient_2010.to_csv(path_or_buf=file_out, index=False)

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2010  16449291   
1         22  anestesiologia y reanimacion  2010   8682278   
2        120                   cardiologia  2010  19188542   
3        120                   cardiologia  2010  70079989   
4        120                   cardiologia  2010  70115939   
5        120                   cardiologia  2010  70557356   
6        120                   cardiologia  2010  71593942   
7        120                   cardiologia  2010  71622813   
8        120                   cardiologia  2010  79451776   
9        130        cirugia cardiovascular  2010  19111501   

                        doc_name  n_visits  n_surgeries  n_patients  
0  CC-16449291-NOMBRE*DEL*DOCTOR        84            1          90  
1   CC-8682278-NOMBRE*DEL*DOCTOR        10            8          10  
2  CC-19188542-NOMBRE*DEL*DOCTOR        45            2          33  
3  CC-70079989-NOMBRE*DEL*DOCTOR     