In [1]:
## Relevant modules.
import pandas as pd
import os
## Optional for alternative code two (see forth cell below)
#import re

In [2]:
## First look at the original dataframe: data_outpatient_2011_firstlook.
file_in = 'original_data/datos_ambulatorios_2011_new.csv'
data_outpatient_2011_firstlook = pd.read_csv(file_in)
print(data_outpatient_2011_firstlook.head())

                         MEDICO  COD_ESPEC                  ESPECIALIDAD  \
0  C-15349090-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
1  C-31988075-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
2  C-39779019-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
3   C-8693061-NOMBRE*DEL*DOCTOR         22  ANESTESIOLOGIA Y REANIMACION   
4  C-10266901-NOMBRE*DEL*DOCTOR        120                   CARDIOLOGIA   

   NO CONSULTAS  NO CIRUGIAS  NO PACIENTES  
0            22            3            31  
1            37            1            43  
2             5            1             5  
3             6            3             8  
4             6            1            25  


In [3]:
## Organizing dataframe better: data_outpatient_2011.
data_outpatient_2011 = pd.read_csv(file_in, header=0)
data_outpatient_2011['year'] = [2011]*len(data_outpatient_2011)
col_names = ['doc_name','spec_code','spec_es','n_visits','n_surgeries','n_patients','year']
col_order = ['spec_code','spec_es','year','doc_name','n_visits','n_surgeries','n_patients']
data_outpatient_2011.columns = col_names
data_outpatient_2011 = data_outpatient_2011[col_order]

data_outpatient_2011['spec_es'] = [i.lower() for i in data_outpatient_2011['spec_es']]

print(data_outpatient_2011.head(),'\n')
print(data_outpatient_2011.info())
for i in range(len(data_outpatient_2011.columns)):
    i_name = data_outpatient_2011.iloc[:,i].name
    i_value = sum(list(type(i)==str for i in data_outpatient_2011.iloc[:,i]))
    print('Column', i_name, 'has', i_value, 'strings.')

   spec_code                       spec_es  year  \
0         22  anestesiologia y reanimacion  2011   
1         22  anestesiologia y reanimacion  2011   
2         22  anestesiologia y reanimacion  2011   
3         22  anestesiologia y reanimacion  2011   
4        120                   cardiologia  2011   

                       doc_name  n_visits  n_surgeries  n_patients  
0  C-15349090-NOMBRE*DEL*DOCTOR        22            3          31  
1  C-31988075-NOMBRE*DEL*DOCTOR        37            1          43  
2  C-39779019-NOMBRE*DEL*DOCTOR         5            1           5  
3   C-8693061-NOMBRE*DEL*DOCTOR         6            3           8  
4  C-10266901-NOMBRE*DEL*DOCTOR         6            1          25   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2736 entries, 0 to 2735
Data columns (total 7 columns):
spec_code      2736 non-null int64
spec_es        2736 non-null object
year           2736 non-null int64
doc_name       2736 non-null object
n_visits       2736 non

In [4]:
## Getting code for doctor, doc_code, and adding it to data_outpatient_2011.
doc_code = []
for i in data_outpatient_2011['doc_name']:
    for j in i.split('-'):
        if j.isdigit():
            doc_code.append(int(j))

## Alternative code one.
#doc_code = [int(j) for i in data_outpatient_2011['doc_name'] for j in i.split('-') if j.isdigit()]            

## Alternative code two.
#doc_code = []
#for i in data_outpatient_2011['doc_name']:
#    temp = re.findall(r'\d+', i) 
#    res = list(map(int, temp))
#    doc_code.append(res[0])

data_outpatient_2011['doc_code'] = doc_code
col_order = ['spec_code','spec_es','year','doc_code','doc_name','n_visits','n_surgeries','n_patients']
data_outpatient_2011 = data_outpatient_2011[col_order]

print(data_outpatient_2011.head(),'\n')
print(data_outpatient_2011.info())
for i in range(len(data_outpatient_2011.columns)):
    i_name = data_outpatient_2011.iloc[:,i].name
    i_value = sum(list(type(i)==str for i in data_outpatient_2011.iloc[:,i]))
    print('Column', i_name, 'has', i_value, 'strings.')

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2011  15349090   
1         22  anestesiologia y reanimacion  2011  31988075   
2         22  anestesiologia y reanimacion  2011  39779019   
3         22  anestesiologia y reanimacion  2011   8693061   
4        120                   cardiologia  2011  10266901   

                       doc_name  n_visits  n_surgeries  n_patients  
0  C-15349090-NOMBRE*DEL*DOCTOR        22            3          31  
1  C-31988075-NOMBRE*DEL*DOCTOR        37            1          43  
2  C-39779019-NOMBRE*DEL*DOCTOR         5            1           5  
3   C-8693061-NOMBRE*DEL*DOCTOR         6            3           8  
4  C-10266901-NOMBRE*DEL*DOCTOR         6            1          25   

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2736 entries, 0 to 2735
Data columns (total 8 columns):
spec_code      2736 non-null int64
spec_es        2736 non-null object
year           2736 non-null int64

In [5]:
## Sorting by spec_code and reseting index.
data_outpatient_2011.sort_values('spec_code', inplace=True)
data_outpatient_2011.reset_index(drop=True, inplace=True)
print(data_outpatient_2011.head(10),'\n')

## Saving data_outpatient_2011 as csv to data folder.
if os.path.exists('data/data_outpatient_2011.csv'):
    print('data_outpatient_2011.csv already exists')
elif os.path.exists('data'):
    file_out = 'data/data_outpatient_2011.csv'
    data_outpatient_2011.to_csv(path_or_buf=file_out, index=False)
else:
    os.mkdir('data')
    file_out = 'data/data_outpatient_2011.csv'
    data_outpatient_2011.to_csv(path_or_buf=file_out, index=False)

   spec_code                       spec_es  year  doc_code  \
0         22  anestesiologia y reanimacion  2011  15349090   
1         22  anestesiologia y reanimacion  2011  31988075   
2         22  anestesiologia y reanimacion  2011  39779019   
3         22  anestesiologia y reanimacion  2011   8693061   
4        120                   cardiologia  2011  98561217   
5        120                   cardiologia  2011  71622813   
6        120                   cardiologia  2011  71593942   
7        120                   cardiologia  2011  70115939   
8        120                   cardiologia  2011  19188542   
9        120                   cardiologia  2011  98561217   

                        doc_name  n_visits  n_surgeries  n_patients  
0   C-15349090-NOMBRE*DEL*DOCTOR        22            3          31  
1   C-31988075-NOMBRE*DEL*DOCTOR        37            1          43  
2   C-39779019-NOMBRE*DEL*DOCTOR         5            1           5  
3    C-8693061-NOMBRE*DEL*DOCTOR     