In [1]:
## Relevant modules.
import pandas as pd
import os
## Optional for alternative code two (see fourth cell below)
#import re

In [2]:
## Quick preview of the raw 2012 outpatient file before any clean-up.
## file_in is kept as a module-level name because later cells read it again.
file_in = 'original_data/datos_ambulatorios_2012_new.csv'
data_outpatient_2012_firstlook = pd.read_csv(filepath_or_buffer=file_in)
## Show the first five rows exactly as they come from the file.
print(data_outpatient_2012_firstlook.head(n=5))

                         MEDICO  COD_ESPEC                  ESPECIALIDAD  \
0  C-31988075-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
1  C-39779019-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
2  C-73071180-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
3  C-19188542-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   
4  C-70115939-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   

   NO CONSULTAS  NO CIRUGIAS  NO PACIENTES  
0            72            2            76  
1            38            1            30  
2           113            1           125  
3            96           10            75  
4           730          207           565  


In [3]:
## Organizing dataframe better: data_outpatient_2012.
## Re-read the raw file and tag every row with the reporting year.
data_outpatient_2012 = pd.read_csv(file_in, header=0)
data_outpatient_2012['year'] = 2012  # a scalar is broadcast to every row

## Columns are renamed positionally, so col_names must follow the file's own
## column order (plus the appended 'year'); col_order is the layout kept afterwards.
col_names = ['doc_name','spec_code','spec_es','n_visits','n_surgeries','n_patients','year']
col_order = ['spec_code','spec_es','year','doc_name','n_visits','n_surgeries','n_patients']
data_outpatient_2012.columns = col_names
data_outpatient_2012 = data_outpatient_2012[col_order]

## Lower-case the specialty names. The .str accessor leaves missing values as
## NaN instead of raising AttributeError the way a per-element .lower() call does.
data_outpatient_2012['spec_es'] = data_outpatient_2012['spec_es'].str.lower()

print(data_outpatient_2012.head(),'\n')
## info() prints its report itself and returns None, so wrapping it in print()
## would only add a stray "None" line to the output.
data_outpatient_2012.info()
## Sanity check: count how many values in each column are Python strings.
for col_label in data_outpatient_2012.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_2012[col_label])
    print('Column', col_label, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  \
0         22  anestesiologia y reanimacion  2012   
1         22  anestesiologia y reanimacion  2012   
2         22  anestesiologia y reanimacion  2012   
3        120                   cardiologia  2012   
4        120                   cardiologia  2012   

                       doc_name  n_visits  n_surgeries  n_patients  
0  C-31988075-NOMBRE*DEL*DOCTOR        72            2          76  
1  C-39779019-NOMBRE*DEL*DOCTOR        38            1          30  
2  C-73071180-NOMBRE*DEL*DOCTOR       113            1         125  
3  C-19188542-NOMBRE*DEL*DOCTOR        96           10          75  
4  C-70115939-NOMBRE*DEL*DOCTOR       730          207         565   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1837 entries, 0 to 1836
Data columns (total 7 columns):
spec_code      1837 non-null int64
spec_es        1837 non-null object
year           1837 non-null int64
doc_name       1837 non-null object
n_visits       1837 non

In [4]:
## Getting code for doctor, doc_code, and adding it to data_outpatient_2012.
## Take the first '-'-delimited segment of doc_name that consists only of
## digits (e.g. 'C-31988075-NOMBRE*DEL*DOCTOR' -> 31988075). Extracting exactly
## one value per row keeps the codes aligned with their rows; collecting every
## digit segment into a flat list could end up shorter or longer than the frame,
## or silently shifted, if any name had zero or several such segments.
doc_code_text = data_outpatient_2012['doc_name'].str.extract(r'(?:^|-)(\d+)(?=-|$)', expand=False)
rows_without_code = doc_code_text.isnull()
if rows_without_code.any():
    ## Fail loudly rather than store a partial or misaligned column.
    raise ValueError(
        'doc_name without a numeric code in rows: '
        + str(list(doc_code_text.index[rows_without_code]))
    )
## Kept as a plain list of Python ints, as in the original loop-based version.
doc_code = doc_code_text.astype('int64').tolist()

## Alternative code one.
#doc_code = [int(j) for i in data_outpatient_2012['doc_name'] for j in i.split('-') if j.isdigit()]            

## Alternative code two.
#doc_code = []
#for i in data_outpatient_2012['doc_name']:
#    temp = re.findall(r'\d+', i) 
#    res = list(map(int, temp))
#    doc_code.append(res[0])

data_outpatient_2012['doc_code'] = doc_code
## Place doc_code right before doc_name in the column layout.
col_order = ['spec_code','spec_es','year','doc_code','doc_name','n_visits','n_surgeries','n_patients']
data_outpatient_2012 = data_outpatient_2012[col_order]

print(data_outpatient_2012.head(),'\n')
## info() prints its report itself and returns None, so wrapping it in print()
## would only add a stray "None" line to the output.
data_outpatient_2012.info()
## Sanity check: count how many values in each column are Python strings.
for col_label in data_outpatient_2012.columns:
    n_strings = sum(isinstance(value, str) for value in data_outpatient_2012[col_label])
    print('Column', col_label, 'has', n_strings, 'strings.')

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2012  31988075   
1         22  anestesiologia y reanimacion  2012  39779019   
2         22  anestesiologia y reanimacion  2012  73071180   
3        120                   cardiologia  2012  19188542   
4        120                   cardiologia  2012  70115939   

                       doc_name  n_visits  n_surgeries  n_patients  
0  C-31988075-NOMBRE*DEL*DOCTOR        72            2          76  
1  C-39779019-NOMBRE*DEL*DOCTOR        38            1          30  
2  C-73071180-NOMBRE*DEL*DOCTOR       113            1         125  
3  C-19188542-NOMBRE*DEL*DOCTOR        96           10          75  
4  C-70115939-NOMBRE*DEL*DOCTOR       730          207         565   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1837 entries, 0 to 1836
Data columns (total 8 columns):
spec_code      1837 non-null int64
spec_es        1837 non-null object
year           1837 non-null int64

In [5]:
## Sorting by spec_code and resetting index.
## Built as a non-mutating chain so the cell can be re-run safely:
##  - errors='ignore' makes the drop a no-op once doc_name is already gone;
##  - mergesort is a stable sort, so rows sharing a spec_code keep their
##    previous relative order on every run.
data_outpatient_2012 = (
    data_outpatient_2012
    .drop('doc_name', axis=1, errors='ignore')
    .sort_values('spec_code', kind='mergesort')
    .reset_index(drop=True)
)
print(data_outpatient_2012.head(10),'\n')

## Saving data_outpatient_2012 as csv to data folder.
## An existing file is never overwritten; otherwise the folder is created on
## demand (exist_ok avoids an error when it is already there) and the file written.
file_out = 'data/data_outpatient_2012.csv'
if os.path.exists(file_out):
    print('data_outpatient_2012.csv already exists')
else:
    os.makedirs('data', exist_ok=True)
    data_outpatient_2012.to_csv(path_or_buf=file_out, index=False)

   spec_code                       spec_es  year  doc_code  n_visits  \
0         22  anestesiologia y reanimacion  2012  31988075        72   
1         22  anestesiologia y reanimacion  2012  39779019        38   
2         22  anestesiologia y reanimacion  2012  73071180       113   
3        120                   cardiologia  2012  19188542        96   
4        120                   cardiologia  2012  70115939       730   
5        120                   cardiologia  2012  70557356       238   
6        120                   cardiologia  2012  71593942       358   
7        120                   cardiologia  2012  71622813       254   
8        120                   cardiologia  2012  79451776        23   
9        130        cirugia cardiovascular  2012   8313225        32   

   n_surgeries  n_patients  
0            2          76  
1            1          30  
2            1         125  
3           10          75  
4          207         565  
5            1         197  
6   