# DataFrame generation for Modelling

In [1]:
import pandas as pd
import numpy as np
import csv
from glob import glob

Realizamos una lista del directorio con los Ficheros por Contaminante agrupados y filtrados por fallecimientos del año 2015 que vamos a procesar a posterior

In [2]:
#Listado de ficheros entrada DeathsEmissions por Contaminante
ListFichDeathsEmissionsFinal = glob('../data/csv/final_csv/DeathsEmissions_final_*.csv')

In [3]:
## INFORMACIÓN DE VARIABLES A TRATAR:
#'Id'				-- > Id fallecimiento
#'ProvinciaReside'	-- > Provincia Residencia fallecido
#'MunicipioReside'	-- > Municipio residencia fallecido
#'Sexo'				-- > Sexo fallecido
#'Ocupacion'			-- > Ocupación fallecido
#'AnioCumplidos'		-- > Años cumplidos fallecimiento
#'TamanioMuniResi'	-- > Tamaño Municipio Residencia
#'CausaMuertebas1'	-- > Codigo CI10 Alfabético
#'CausaMuertebas2'	-- > Código CI10 Numérico
#'CausaMuertebas3'	-- > Código CI10 Numérico
#'CausaMuertebas4'	-- > Código CI10 Numérico
#'CausaMortaReduc'	-- > Causa Mortalidad reducida
#'CausaMortaperin'	-- > Causa Mortalidad Perinatal
#'CausaMortaInfan'	-- > Causa Mortalidad infantil
#'NivelEstudios'		-- > Nivel estudios fallecido
#'Población'			-- > Población resi fallecido
#'Provincia'			-- > Provincia resi fallecido
#'Comunidad'			-- > Comunidad resi fallecido
#'Latitud'			-- > Latitud fallecimiento
#'Longitud'			-- > Longitud fallecimiento
#'Habitantes'		-- > Num habitantes población
#'Hombres'			-- > Num Hombres población
#'Mujeres'			-- > Num mujeres población
#'CodigoPRTR'		-- > Codigo PRTR emisión
#'LongitudE'			-- > Longitud Emisión
#'LatitudE'			-- > Latitud Emisión
#'Contaminante'		-- > Tipo de Contaminante emitido
#'CantidadTotalkg'	-- > Cantidad total de Kg emitido
#'TotalAniosId'		-- > Total años exposición fallecido

In [4]:
#Escribimos las columnas que más nos interesan añadir al Dataframe
cols_of_interest= ['Id','ProvinciaReside', 'MunicipioReside','Sexo', 'AnioCumplidos', \
       'TamanioMuniResi', 'CausaMuertebas1', 'CausaMuertebas2', 'CausaMuertebas3', 'CausaMuertebas4', 
        'CausaMortaReduc','CausaMortaperin', 'CausaMortaInfan', 'NivelEstudios', 'Población', 'Provincia', \
        'Comunidad', 'Latitud', 'Longitud', 'Habitantes', 'Hombres', 'Mujeres', 'CodigoPRTR','LongitudE','LatitudE',\
        'Contaminante','CantidadTotalkg','TotalAniosId']
      

Esta iteración genera dataframes temporales para volcarlos a disco como CSV y construir luego el dataframe general para el modelo.
El tiempo de ejecución medio según la máquina es de entre 5' a 15' (generación csv gzip)

In [5]:
for i in range(0,len(ListFichDeathsEmissionsFinal)):   
#for i in range(0,4):  
        
    print ('Contaminante: '+ListFichDeathsEmissionsFinal[i])
            
    #Leemos de los CSV por Contaminantes
    DfDeathsEmissions = pd.read_csv(ListFichDeathsEmissionsFinal[i],sep=';', encoding = 'utf-8', compression='gzip', index_col=False) #crgamos df con cada fichero
    
    DfDeathsEmissions.drop(['Unnamed: 0'], axis=1, inplace=True) #Borramos columnas innecesarias
    DfDeathsEmissions.dropna(subset=['LatitudE', 'LongitudE'], how="any") #Eliminamos las filas no georreferenciadas.
    
    #Seleccionamos columnas de interes
    DfDeathsEmissions =  DfDeathsEmissions[cols_of_interest]
    
    # Recogemos el dataframe ya agrupado por Contaminante y agrupamos por Id, CodPRTR (necesario luego para geometrías)
    # y CantidadTotalKg.
    Columnas = ('Id','CodigoPRTR','CantidadTotalkg')
    
    #Quitamos duplicados para cada contaminante.
    DfDeathsEmissions = DfDeathsEmissions.drop_duplicates(Columnas, keep="first", inplace=False)
    
    DfAgrupado = DfDeathsEmissions.groupby(Columnas).agg({'CantidadTotalkg':'sum'})#Agrupamos...
    DfAgrupado = DfAgrupado.rename(columns={"CantidadTotalkg":"Total_Kg_expo"}) #Renombramos
    # 
    #Realizamos merge de la agrupación..
    DfDeathsEmissions = DfDeathsEmissions.merge(DfAgrupado, how='left', left_on=Columnas, right_index=True)
    
    #Salvamos a csv temporales para utilizarlos más abajo para montar el dataframe general.
    DfDeathsEmissions.to_csv('../data/csv/temp_csv_for_modelling/'+'Temp_Cont_GroupByIdCodPRTR_for_model_'+ str(i) + '.csv', sep=';', encoding= 'utf-8',compression='gzip')

    del DfDeathsEmissions
    

Contaminante: ../data/csv/final_csv\DeathsEmissions_final_1_1_1-tricloroetano_TCE_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_1_1_2_2-tetracloroetano_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_1_2-dicloroetano_DCE_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_1_2_3_4_5_6-hexaclorociclohexano_HCH_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Amoniaco_NH3_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Antraceno_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Arsénico_y_compuestos_como_As_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Benceno__1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Cadmio_y_compuestos_como_Cd_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Cianuro_de_hidrogeno_HCN_1.csv
Contaminante: ../data/csv/final_csv\DeathsEmissions_final_Clorofluorocarburos_CFC__1.csv
Contaminante: ../data/csv/final_csv\DeathsEmission

Obtenemos el listado de los CSV parciales agrupados

In [6]:
ListCSV_for_model = glob('../data/csv/temp_csv_for_modelling/*.csv')

#Iniciamos el dataframe para el modelo
df_model = pd.DataFrame()

Generamos el dataframe para el modelo a partir del listado de los ficheros CSV

In [7]:
for i in range(0,len(ListCSV_for_model)):   
    df_partial =pd.DataFrame()
    df_partial = pd.read_csv(ListCSV_for_model[i], sep=';', encoding = 'utf-8', compression='gzip', index_col=False) #cargamos df parcial con cada fichero
    df_model = df_model.append(df_partial)
    del df_partial

In [8]:
df_model.shape

(2664633, 30)

In [9]:
pd.options.display.max_columns = None

In [10]:
df_model.head(1000).sort_values(by='TotalAniosId', ascending=False)

Unnamed: 0.1,Unnamed: 0,Id,ProvinciaReside,MunicipioReside,Sexo,AnioCumplidos,TamanioMuniResi,CausaMuertebas1,CausaMuertebas2,CausaMuertebas3,CausaMuertebas4,CausaMortaReduc,CausaMortaperin,CausaMortaInfan,NivelEstudios,Población,Provincia,Comunidad,Latitud,Longitud,Habitantes,Hombres,Mujeres,CodigoPRTR,LongitudE,LatitudE,Contaminante,CantidadTotalkg,TotalAniosId,Total_Kg_expo
0,0,597382,33,44,1,64,6,C,0,6,9.0,9,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
671,671,600458,33,44,1,56,6,C,3,4,9.0,18,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
658,658,600409,33,44,6,90,6,M,8,4,4.0,75,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
659,659,600412,33,44,6,97,6,I,1,3,2.0,54,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
660,660,600423,33,44,1,79,6,I,6,4,,59,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
661,661,600430,33,44,6,82,6,J,1,8,1.0,63,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
662,662,600433,33,44,1,87,6,G,2,0,,52,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
663,663,600435,33,44,6,55,6,I,2,1,9.0,55,,,12,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
664,664,600436,33,44,6,88,6,I,6,2,0.0,59,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0
665,665,600438,33,44,6,89,6,C,6,7,9.0,31,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0


In [11]:
df_model.dtypes

Unnamed: 0           int64
Id                   int64
ProvinciaReside      int64
MunicipioReside      int64
Sexo                 int64
AnioCumplidos        int64
TamanioMuniResi      int64
CausaMuertebas1     object
CausaMuertebas2      int64
CausaMuertebas3      int64
CausaMuertebas4    float64
CausaMortaReduc      int64
CausaMortaperin    float64
CausaMortaInfan    float64
NivelEstudios        int64
Población           object
Provincia           object
Comunidad           object
Latitud            float64
Longitud           float64
Habitantes           int64
Hombres              int64
Mujeres              int64
CodigoPRTR           int64
LongitudE          float64
LatitudE           float64
Contaminante        object
CantidadTotalkg    float64
TotalAniosId         int64
Total_Kg_expo      float64
dtype: object

#### Vamos a generar un dataframe con las codificaciones del CI-10 según el tipo de enfermedad

In [12]:
cie10 = pd.ExcelFile('../data/excel/CIE10_10rev.xlsx', sheetname='CIE10')
df_cie10 = pd.read_excel(cie10,index_col=None,na_values=['NA'])

Concatenamos los 3 campos que nos indicaran el tipo según la codificación internacional CI-10

In [13]:
df_model['COD_3'] = df_model['CausaMuertebas1']+ df_model['CausaMuertebas2'].astype(str)+ df_model['CausaMuertebas3'].astype(str)

In [14]:
df_cie10.head()

Unnamed: 0,COD_3,Descripcion_Cod_3,COD_4,Descripcion_Cod_4
0,A00,COLERA,A000,"COLERA DEBIDO A VIBRIO CHOLERAE O1, BIOTIPO CH..."
1,,,A001,"COLERA DEBIDO A VIBRIO CHOLERAE O1, BIOTIPO EL..."
2,,,A009,COLERA NO ESPECIFICADO
3,A01,FIEBRES TIFOIDEA Y PARATIFOIDEA,A010,FIEBRE TIFOIDEA
4,,,A011,FIEBRE PARATIFOIDEA A


In [15]:
df_cie10 = df_cie10.dropna(subset = ['COD_3', 'Descripcion_Cod_3'])

In [16]:
df_cie10 = df_cie10[['COD_3', 'Descripcion_Cod_3']]
df_cie10.head()

Unnamed: 0,COD_3,Descripcion_Cod_3
0,A00,COLERA
3,A01,FIEBRES TIFOIDEA Y PARATIFOIDEA
8,A02,OTRAS INFECCIONES DEBIDAS SALMONELLA
13,A03,SHIGELOSIS
19,A04,OTRAS INFECCIONES INTESTINALES BACTERIANAS


In [17]:
df_merge = pd.merge(df_model, df_cie10, on='COD_3', how='left')

In [18]:
df_merge.head(1000)

Unnamed: 0.1,Unnamed: 0,Id,ProvinciaReside,MunicipioReside,Sexo,AnioCumplidos,TamanioMuniResi,CausaMuertebas1,CausaMuertebas2,CausaMuertebas3,CausaMuertebas4,CausaMortaReduc,CausaMortaperin,CausaMortaInfan,NivelEstudios,Población,Provincia,Comunidad,Latitud,Longitud,Habitantes,Hombres,Mujeres,CodigoPRTR,LongitudE,LatitudE,Contaminante,CantidadTotalkg,TotalAniosId,Total_Kg_expo,COD_3,Descripcion_Cod_3
0,0,597382,33,44,1,64,6,C,0,6,9.0,9,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C06,TUMOR MALIGNO DE OTRAS PARTES Y DE LAS NO ESPE...
1,1,597392,33,44,1,72,6,C,6,1,,28,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C61,TUMOR MALIGNO DE LA PROSTATA
2,2,597397,33,44,1,98,6,I,4,9,9.0,58,,,5,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,I49,OTRAS ARRITMIAS CARDIACAS
3,3,597403,33,44,1,82,6,C,3,4,9.0,18,,,6,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON
4,4,597405,33,44,6,86,6,C,4,3,9.0,21,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C43,MELANOMA MALIGNO DE LA PIEL
5,5,597428,33,44,1,67,6,C,3,4,9.0,18,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON
6,6,597429,33,44,6,69,6,C,5,0,9.0,23,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C50,TUMOR MALIGNO DE LA MAMA
7,7,597433,33,44,6,89,6,N,3,9,0.0,80,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,N39,OTROS TRASTORNOS DEL SISTEMA URINARIO
8,8,597435,33,44,1,78,6,C,9,2,0.0,37,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C92,LEUCEMIA MIELOIDE
9,9,597440,33,44,6,90,6,K,8,0,3.0,72,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,K80,COLELITIASIS


Examinando el excel del CI-10 vamos a filtrar por palabras clave para detectar defunciones provocadas por exposición a las emisiones industriales. Las palabras clave una vez examinado el Excel CI10 son: 
"tumor", "Pulmo", "Respi", "Toxico", "Carbono"

In [19]:
df_filtro = df_merge[df_merge['Descripcion_Cod_3'].str.contains(r"[Tt][Uu][Mm][Oo][Rr]|[Pp][Uu][Ll][Mn][Oo]|[Rr][Ee][Ss][Pp][Ii]|[Tt][Oo][Xx][Ii][Cc]|[Cc][Aa][Rr][Bb][Oo][Nn][Oo]",regex=True, na=False)]

In [20]:
df_filtro.head(1000)

Unnamed: 0.1,Unnamed: 0,Id,ProvinciaReside,MunicipioReside,Sexo,AnioCumplidos,TamanioMuniResi,CausaMuertebas1,CausaMuertebas2,CausaMuertebas3,CausaMuertebas4,CausaMortaReduc,CausaMortaperin,CausaMortaInfan,NivelEstudios,Población,Provincia,Comunidad,Latitud,Longitud,Habitantes,Hombres,Mujeres,CodigoPRTR,LongitudE,LatitudE,Contaminante,CantidadTotalkg,TotalAniosId,Total_Kg_expo,COD_3,Descripcion_Cod_3
0,0,597382,33,44,1,64,6,C,0,6,9.0,9,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C06,TUMOR MALIGNO DE OTRAS PARTES Y DE LAS NO ESPE...
1,1,597392,33,44,1,72,6,C,6,1,,28,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C61,TUMOR MALIGNO DE LA PROSTATA
3,3,597403,33,44,1,82,6,C,3,4,9.0,18,,,6,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON
5,5,597428,33,44,1,67,6,C,3,4,9.0,18,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON
6,6,597429,33,44,6,69,6,C,5,0,9.0,23,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C50,TUMOR MALIGNO DE LA MAMA
10,10,597445,33,44,1,87,6,J,9,8,8.0,67,,,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,J98,OTROS TRASTORNOS RESPIRATORIOS
12,12,597448,33,44,1,57,6,C,3,4,9.0,18,,,5,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON
14,14,597451,33,44,6,52,6,C,7,1,9.0,33,,,10,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C71,TUMOR MALIGNO DEL ENCEFALO
17,17,597468,33,44,1,86,6,C,1,8,9.0,12,,,2,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C18,TUMOR MALIGNO DEL COLON
22,22,597491,33,44,1,89,6,C,6,1,,28,,,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",337.0,2,337.0,C61,TUMOR MALIGNO DE LA PROSTATA


Eliminamos las columnas que no necesitamos

In [21]:
 #Borramos columnas innecesarias
df_filtro.drop(['Unnamed: 0'], axis=1, inplace=True)
df_filtro.drop(['CausaMuertebas1'], axis=1, inplace=True)
df_filtro.drop(['CausaMuertebas2'], axis=1, inplace=True)
df_filtro.drop(['CausaMuertebas3'], axis=1, inplace=True)
df_filtro.drop(['CausaMuertebas4'], axis=1, inplace=True)
df_filtro.drop(['CausaMortaperin'], axis=1, inplace=True)
df_filtro.drop(['CausaMortaInfan'], axis=1, inplace=True)
df_filtro.drop(['CantidadTotalkg'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

Renombramos algunas columnas

In [22]:
df_filtro.rename(columns={'Población': 'Poblacion', 'TotalAniosId': 'total_anios_Expo_Id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


#### Convirtiendo Variables categoricas

Convertimos el código CI10 que nos dará una estimación de la enfermedad por defunción

In [23]:
df_filtro['COD_3'].unique()

array(['C06', 'C61', 'C34', 'C50', 'J98', 'C71', 'C18', 'C90', 'J84',
       'I27', 'C16', 'C80', 'J44', 'I26', 'C64', 'C22', 'C15', 'C20',
       'C13', 'C67', 'J22', 'C25', 'C49', 'C24', 'D47', 'C53', 'C56',
       'C55', 'C54', 'C17', 'C32', 'C12', 'C02', 'C70', 'C26', 'D29',
       'C44', 'D32', 'C23', 'C19', 'C41', 'W79', 'J81', 'C60', 'C11',
       'C38', 'C48', 'C76', 'C14', 'R06', 'J96', 'C01', 'C10', 'G92',
       'C03', 'C37', 'C57', 'J85', 'W80', 'C73', 'C33', 'D41', 'R09',
       'C68', 'C04', 'C21', 'D43', 'C51', 'K71', 'J70', 'D40', 'C65',
       'C30', 'C07', 'C96', 'D48', 'D35', 'C69', 'D37', 'D44', 'R91',
       'R04', 'C75', 'C74', 'E04', 'B21', 'W84', 'D39', 'C47', 'C66',
       'C09', 'E05', 'C52', 'C31', 'D12', 'D38', 'C62', 'C40', 'C00',
       'C08', 'C63', 'D02', 'J80', 'D13', 'C05', 'D17', 'D42', 'C39',
       'D33', 'J39', 'J06', 'C72', 'I37', 'P26', 'D15', 'W77', 'D36',
       'J82', 'D21', 'D23', 'D24', 'D30', 'P27', 'P22', 'D27', 'D34',
       'D26', 'P28',

In [24]:
df_filtro['cod_3_num'] = df_filtro['COD_3'].astype('category')
df_filtro['cod_3_num'] = df_filtro['cod_3_num'].cat.reorder_categories(['B21','C00','C01','C02','C03','C04','C05','C06','C07','C08','C09',\
                                                      'C10','C11','C12','C13','C14','C15','C16','C17','C18','C19','C20',\
                                                      'C21','C22','C23','C24','C25','C26','C30','C31','C32','C33','C34',\
                                                      'C37','C38','C39','C40','C41','C44','C47','C48','C49','C50','C51',\
                                                      'C52','C53','C54','C55','C56','C57','C60','C61','C62','C63','C64',\
                                                      'C65','C66','C67','C68','C69','C70','C71','C72','C73','C74','C75',\
                                                      'C76','C80','C90','C96','D02','D11','D12','D13','D14','D15','D16',\
                                                      'D17','D21','D23','D24','D26','D27','D29','D30','D32','D33','D34',\
                                                      'D35','D36','D37','D38','D39','D40','D41','D42','D43','D44','D47',\
                                                      'D48','E04','E05','G92','I26','I27','I28','I37','J06','J22','J39',\
                                                      'J44','J68','J70','J80','J81','J82','J84','J85','J96','J98','K71',\
                                                      'P22','P26','P27','P28','R04','R06','R09','R91','W77','W79','W80',\
                                                      'W84'], ordered=True)
df_filtro['cod_3_num'] = df_filtro['cod_3_num'].cat.codes
print(df_filtro['cod_3_num'].unique())

[  7  51  32  42 119  61  19  68 116 104  17  67 110 103  54  23  16  21
  14  57 108  26  41  25  98  45  48  47  46  18  30  13   3  60  27  83
  38  85  24  20  37 130 114  50  12  34  40  66  15 126 118   2  11 102
   4  33  49 117 131  63  31  94 127  58   5  22  96  43 120 112  93  55
  28   8  69  99  88  59  90  97 128 125  65  64 100   0 132  92  39  56
  10 101  44  29  72  91  52  36   1   9  53  70 113  73   6  77  95  35
  86 109 107  62 106 122  75 129  89 115  78  79  80  84 123 121  82  87
  81 124 111 105  71  76  74]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
df_filtro.head()

Unnamed: 0,Id,ProvinciaReside,MunicipioReside,Sexo,AnioCumplidos,TamanioMuniResi,CausaMortaReduc,NivelEstudios,Poblacion,Provincia,Comunidad,Latitud,Longitud,Habitantes,Hombres,Mujeres,CodigoPRTR,LongitudE,LatitudE,Contaminante,total_anios_Expo_Id,Total_Kg_expo,COD_3,Descripcion_Cod_3,cod_3_num
0,597382,33,44,1,64,6,9,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",2,337.0,C06,TUMOR MALIGNO DE OTRAS PARTES Y DE LAS NO ESPE...,7
1,597392,33,44,1,72,6,28,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",2,337.0,C61,TUMOR MALIGNO DE LA PROSTATA,51
3,597403,33,44,1,82,6,18,6,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON,32
5,597428,33,44,1,67,6,18,3,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",2,337.0,C34,TUMOR MALIGNO DE LOS BRONQUIOS Y DEL PULMON,32
6,597429,33,44,6,69,6,23,4,Oviedo,Asturias,Asturias,43.36026,-5.844759,224005,104605,119400,4168,-5.811063,43.416965,"1,1,1-tricloroetano (TCE)",2,337.0,C50,TUMOR MALIGNO DE LA MAMA,42


Convertimos igualmente el tipo de Contaminante

In [26]:
df_filtro['Contaminante'].unique()

array(['1,1,1-tricloroetano (TCE)', '1,1,2,2-tetracloroetano',
       'Clorofluorocarburos (CFC) ',
       'Cloro y compuestos inorgánicos (como HCl)', 'Cloruro de vinilo',
       'Cobre y compuestos (como Cu)',
       'Compuestos orgánicos volátiles distintos del metano (COVNM)',
       'Cromo y compuestos (como Cr)', 'Diclorometano (DCM)',
       'Dióxido de carbono (CO2)', 'Hexaclorobenceno (HCB)',
       'Hidroclorofluorocarburos (HCFC)', '1,2-dicloroetano (DCE)',
       'Hidrofluorocarburos (HFC)', 'Mercurio y compuestos (como Hg)',
       'Metano (CH4)', 'Monóxido de carbono (CO)', 'Naftaleno',
       'Níquel y compuestos (como Ni)', 'Óxidos de azufre (SOx/SO2)',
       'Óxidos de nitrógeno (NOx/NO2)', 'Óxido de etileno',
       'Óxido nitroso (N2O)', '1,2,3,4,5,6-hexaclorociclohexano (HCH)',
       'Partículas (PM10)', 'PCDD + PCDF (dioxinas + furanos) (como Teq)',
       'Pentaclorofenol (PCP)', 'Perfluorocarburos (PFC)',
       'Plomo y compuestos (como Pb)', 'Policlorobifenil

In [27]:
df_filtro['Contaminante_num'] = df_filtro['Contaminante'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
df_filtro['Contaminante_num'] = df_filtro['Contaminante_num'].cat.reorder_categories(['1,1,1-tricloroetano (TCE)',\
    '1,1,2,2-tetracloroetano','1,2,3,4,5,6-hexaclorociclohexano (HCH)','1,2-dicloroetano (DCE)','Amoniaco (NH3)',\
    'Antraceno','Arsénico y compuestos (como As)','Benceno ','Cadmio y compuestos (como Cd)','Cianuro de hidrógeno (HCN)',\
    'Cloro y compuestos inorgánicos (como HCl)','Clorofluorocarburos (CFC) ','Cloruro de vinilo',\
    'Cobre y compuestos (como Cu)','Compuestos orgánicos volátiles distintos del metano (COVNM)',\
    'Cromo y compuestos (como Cr)','Diclorometano (DCM)','Dióxido de carbono (CO2)','Hexaclorobenceno (HCB)',\
    'Hidroclorofluorocarburos (HCFC)','Hidrofluorocarburos (HFC)','Mercurio y compuestos (como Hg)',\
    'Metano (CH4)','Monóxido de carbono (CO)','Naftaleno','Níquel y compuestos (como Ni)','Óxido de etileno',\
    'Óxido nitroso (N2O)','Óxidos de azufre (SOx/SO2)','Óxidos de nitrógeno (NOx/NO2)','Partículas (PM10)',\
    'PCDD + PCDF (dioxinas + furanos) (como Teq)','Pentaclorofenol (PCP)','Perfluorocarburos (PFC)',\
    'Plomo y compuestos (como Pb)','Policlorobifenilos (PCB)','Tetracloroetileno (PER) ','Tetraclorometano (TCM)',\
    'Triclorobencenos totales (TCB)','Tricloroetileno ','Triclorometano','Zinc y compuestos (como Zn)'], ordered=True)

df_filtro['Contaminante_num'] = df_filtro['Contaminante_num'].cat.codes
print(df_filtro['Contaminante_num'].unique())

[ 0  1 11 10 12 13 14 15 16 17 18 19  3 20 21 22 23 24 25 28 29 26 27  2
 30 31 32 33 34 35 36 37 38 39  4 40 41  5  6  7  8  9]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
df_filtro.sample(100)

Unnamed: 0,Id,ProvinciaReside,MunicipioReside,Sexo,AnioCumplidos,TamanioMuniResi,CausaMortaReduc,NivelEstudios,Poblacion,Provincia,Comunidad,Latitud,Longitud,Habitantes,Hombres,Mujeres,CodigoPRTR,LongitudE,LatitudE,Contaminante,total_anios_Expo_Id,Total_Kg_expo,COD_3,Descripcion_Cod_3,cod_3_num,Contaminante_num
1707523,931208,50,297,1,89,6,30,2,Zaragoza,Zaragoza,Aragón,41.65629,-0.876538,674317,328470,345847,3067,-0.824546,41.691825,Partículas (PM10),4,2.428500e+06,C64,TUMOR MALIGNO DEL RINON EXCEPTO DE LA PELVIS R...,54,30
2338344,99012,15,30,1,71,6,67,4,Coruña (A),A Coruña,Galicia,43.37087,-8.395835,246056,114498,131558,1526,-8.430197,43.347640,Zinc y compuestos (como Zn),1,2.070000e+02,J84,OTRAS ENFERMEDADES PULMONARES INTERSTICIALES,116,41
580016,378118,28,79,1,64,6,15,5,Madrid,Madrid,Madrid,40.41669,-3.700346,3255944,1532079,1723865,2682,-3.703790,40.416775,Hidrofluorocarburos (HFC),7,1.046700e+04,C25,TUMOR MALIGNO DEL PANCREAS,26,20
847171,516521,30,16,1,82,5,11,5,Cartagena,Murcia,Murcia,37.60565,-0.991294,211996,106755,105241,5771,-0.923359,37.600591,Metano (CH4),2,1.629000e+06,C16,TUMOR MALIGNO DEL ESTOMAGO,17,22
1563377,809297,46,110,1,77,3,32,2,Xirivella,Valencia/València,Valencia,39.46669,-0.427794,30691,15474,15217,3619,-0.376288,39.469907,Óxido nitroso (N2O),3,1.424224e+05,C65,TUMOR MALIGNO DE LA PELVIS RENAL,55,27
2344506,106950,15,30,1,71,6,28,6,Coruña (A),A Coruña,Galicia,43.37087,-8.395835,246056,114498,131558,746,-8.434987,43.345044,Antraceno,8,1.443000e+03,C61,TUMOR MALIGNO DE LA PROSTATA,51,5
1793904,1018252,8,123,6,100,3,67,3,Molins de Rei,Barcelona,Catalunya,41.41392,2.015836,24067,11753,12314,331,2.064150,41.382450,PCDD + PCDF (dioxinas + furanos) (como Teq),1,2.220000e-03,J98,OTROS TRASTORNOS RESPIRATORIOS,119,31
1192651,371369,28,79,6,83,6,23,2,Madrid,Madrid,Madrid,40.41669,-3.700346,3255944,1532079,1723865,1710,-3.703790,40.416775,Óxidos de azufre (SOx/SO2),3,1.193736e+06,C50,TUMOR MALIGNO DE LA MAMA,42,28
2125264,1135132,8,298,6,83,3,36,4,Vic,Barcelona,Catalunya,41.93029,2.254350,39844,19940,19904,419,2.239349,41.945882,Amoniaco (NH3),2,1.029487e+06,C90,MIELOMA MULTIPLES Y TUMORES MALIGNOS DE CELULA...,68,4
1356856,1080803,8,19,1,42,6,93,99,Barcelona,Barcelona,Catalunya,41.38792,2.169919,1621537,771570,849967,3037,2.173404,41.385064,Óxidos de nitrógeno (NOx/NO2),3,6.463252e+06,W80,INHALACION E INGESTION DE OTROS OBJETOS QUE CA...,131,29


Salvamos el dataframe generado a CSV para disponer de este más adelante

In [30]:
df_filtro.to_csv('../data/csv/model_csv/df_model.csv', sep=';', encoding= 'utf-8', compression='gzip')