# Getting the TOTAL DEATHS from COVID-19 of the 24 departments of Peru

## 0. Run libraries
Empezamos cargando las librerías que necesitamos y leyendo nuestro csv actualizado de fallecidos por COVID-19.

In [82]:
import pandas as pd
import numpy as np

import functions as fn

## 1.

In [83]:
def just_cities(fal_url):
    """
    Función para filtrar a cada fallecido reportado en su año
    y semana epidemiológica en ciudad (Fallecidos = fal, Vacunados = vac)
    """     
    # Seleccionamos solo las col 'FECHA_FALLECIMIENTO' y 'DEPARTAMENTO'      
    fal_col = ['FECHA_FALLECIMIENTO', 'DEPARTAMENTO']   
    df_fal = fn.read_largeCSV_file(fal_url, ';', fal_col)
    fn.variable_fecha(df_fal, 'FECHA_FALLECIMIENTO')
    fn.date_to_epiweek(df_fal,'FECHA_FALLECIMIENTO')
    df_fal['fallecido'] = 1                                         
    df_fal['fallecido'].apply(np.int8) 
    df_fal.info()
    
    return df_fal


fal_url = "RawData/fallecidos_covid.csv"
df = just_cities(fal_url)
del fal_url

Read csv with dask:  0.672107458114624 sec
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200246 entries, 0 to 200245
Columns: 2 entries, FECHA_FALLECIMIENTO to DEPARTAMENTO
dtypes: category(1), int64(1)
memory usage: 1.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200246 entries, 0 to 200245
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   DEPARTAMENTO  200246 non-null  category
 1   epi_year      200246 non-null  int64   
 2   epi_week      200246 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 3.2 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200246 entries, 0 to 200245
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   DEPARTAMENTO  200246 non-null  category
 1   epi_year      200246 non-null  int64   
 2   epi_week      200246 non-null  int64   
 3   fallecido     200246 non-null  int64   
dtypes

In [84]:
print(df.head())

  DEPARTAMENTO  epi_year  epi_week  fallecido
0         LIMA      2021        17          1
1         LIMA      2021        17          1
2     AYACUCHO      2021        17          1
3         LIMA      2021        16          1
4     AREQUIPA      2021        31          1


Realizamos un crosstab de del total de fallecidos por COVID-19 por departamento del Perú, por cada semana y año epidemiológico.

In [85]:
epi_ciudades = pd.crosstab(index=[df['epi_year'], df['epi_week']],
                           columns=[df['fallecido'], df['DEPARTAMENTO']],
                           margins = True)

epi_ciudades.to_csv('Data/fallecidosXciudadesXsemanasEpi.csv')
                           

Ahora realizamos un crosstab del TOTAL de fallecidos por departamento.

In [86]:
fal_ciudades = pd.crosstab(index = df['DEPARTAMENTO'],
                           columns = df['fallecido'],
                           margins = True)
del fal_ciudades["All"]
fal_ciudades.columns = ['fallecidos']
fal_ciudades.rename(index={'All': 'PERU'})

Unnamed: 0_level_0,fallecidos
DEPARTAMENTO,Unnamed: 1_level_1
AMAZONAS,1258
ANCASH,6674
APURIMAC,1510
AREQUIPA,9680
AYACUCHO,2121
CAJAMARCA,4135
CALLAO,9982
CUSCO,4794
HUANCAVELICA,1161
HUANUCO,2700


In [None]:
# https://es.wikipedia.org/wiki/Anexo:Departamentos_del_Per%C3%BA_por_poblaci%C3%B3n
dict_dep = {
    "AMAZONAS"	:	426806,
    "ANCASH"	:	1180638,
    "APURIMAC"	:	430736,
    "AREQUIPA"	:	1497438,
    "AYACUCHO"	:	668213,
    "CAJAMARCA"	:	1453711,
    "CALLAO"	:	1129854,
    "CUSCO"	    :	1357075,
    "HUANCAVELICA":	365317,
    "HUANUCO"	:	760267,
    "ICA"	    :	975182,
    "JUNIN"	    :	1361467,
    "LALIBERTAD":	2016771,
    "LAMBAYEQUE":	1310785,
    "LIMA"	    :	10628470,
    "LORETO"	:	1027559,
    "MADREDEDIOS":	173811,
    "MOQUEGUA"	:	192740,
    "PASCO"	    :	271904,
    "PIURA"	    :	2047954,
    "PUNO"	    :	1237997,
    "SANMARTIN"	:	899648,
    "TACNA"	    :	370974,
    "TUMBES"	:	251521,
    "UCAYALI"	:	589110,
    "PERÚ"  	:	32625948,
}



no_habitantes = [426806,
                 1180638,
                 430736,
                 1497438,
                 668213,
                 1453711,
                 1129854,
                 1357075,
                 365317,
                 760267,
                 975182,
                 1361467,
                 2016771,
                 1310785,
                 10628470,
                 1027559,
                 173811,
                 192740,
                 271904,
                 2047954,
                 1237997,
                 899648,
                 370974,
                 251521,
                 589110]

no_habitantes.append(sum(no_habitantes))
fal_ciudades['no_habitantes'] = no_habitantes
del no_habitantes

fal_ciudades['tasa_mortalidad'] = (fal_ciudades['fallecidos']/fal_ciudades['no_habitantes'])*100

fal_ciudades.to_csv('Data/TOTAL_fallecidosXciudades.csv')

# Getting the total of people FULLY VACCINATED (2 doses) by the 24 departments of Peru

There are no direct way to found the total of people fully vaccinated per departments. To achieve this the following is planned:

1. The vaccination dataset **(RawData/TB_VACUNACION_COVID19.csv)** only gives information about the vaccination center called *id_centro_vacunacion*. NOT the department or another relevant location.

2. The vaccination centers dataset **(RawData/TB_CENTRO_VACUNACION.csv)** can be used to "match" the *id_centro_vacunacion* with the *id_ubigeo*. Which is a numeric variable from 0 to 1894 of the specifict district.

3.  Finally with the UBIGEO dataset **(RawData/TB_UBIGEOS.csv)** it is possible to "match" each *id_ubigeo* with the correct department of Peru.


In [99]:
def vac_department(vac_url):
    """
    Función que toma la dirección del  dataset de vacunados y devuelve el número de VACUNADOS 
    por los 24 departamentos del Perú
    """
    vac_col = ['id_centro_vacunacion', 'dosis']                     
    df_vac = fn.read_largeCSV_file(vac_url, ',', vac_col)    
    lst_vac = fn.df_into_chunks(df_vac)               
    
    for df in lst_vac:                                           
        df = df.drop(df[df["dosis"] == 1].index,  inplace=True)     # Drop non fully vaccinated (1 dose)

    for df in lst_vac:
        df['vacunado'] = 1  # To count vaccinated
        df['vacunado'] = df['vacunado'].apply(np.int8)
        del df['dosis']     # Dose var is no needed anymore

    return lst_vac

vac_url = "RawData/TB_VACUNACION_COVID19.csv"
vacxdep = vac_department(vac_url)
del vac_url

Read csv with dask:  6.994384527206421 sec
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34638125 entries, 0 to 544332
Columns: 2 entries, id_centro_vacunacion to dosis
dtypes: int64(1), int8(1)
memory usage: 561.6 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats i

# See

Dataframe of all **fully vaccinated** people (2 doses) with the *'id_centro_vacunacion'* or place of vaccination.

**IMPORTANT:** "vacxdep" variable is actually a list of *dataframes* or *chunks*.

In [100]:
def vacxdep_chunks(dfs_vac):
    """
    Devuelve un dataframe con el TOTAL DE VACUNADOS por DEPARTAMENTO 
    del PERÚ (recibe una lista de dataframes o chunks)
    """
    var_holder = {}     # Diccionario para guardar nombres                                             
    lst_epi_vac = []    # Lista de dfs para cada sumatoria de chunks
                                         
    for i, chunk in enumerate(dfs_vac):
        var_holder['epi_vac_' + str(i)]= pd.crosstab(index=[chunk['id_centro_vacunacion']],
                                                     columns=chunk['vacunado'])
        lst_epi_vac.append(var_holder['epi_vac_' + str(i)])
    
    merged_epivac = pd.concat(lst_epi_vac, axis=1)  # Merge all dfs
    epi_vac = pd.DataFrame(merged_epivac.sum(numeric_only=True, axis=1))
    epi_vac.columns = ['vacunados']
    epi_vac['vacunados'] = epi_vac['vacunados'].astype(np.int64)
    epi_vac.reset_index(level=0, inplace=True)

    return epi_vac


print("Slice of a dataframe of all fully vaccinated people in Peru (2 doses) per place of vaccination:")
print(vacxdep[0].head()), print("\n")

vacxdep_sum = vacxdep_chunks(vacxdep)
del vacxdep
print("Slice of a SUMMARY dataframe of all fully vaccinated people in Peru (2 doses) per place of vaccination:")
print(vacxdep_sum.head()), print("\n")
print('Fully vaccinated (2 doses): '), 
vacxdep_sum['vacunados'].sum()



Slice of a dataframe of all fully vaccinated people in Peru (2 doses) per place of vaccination:
   id_centro_vacunacion  vacunado
0                    17         1
1                  1828         1
2                103617         1
3                   891         1
7                108281         1


Slice of a SUMMARY dataframe of all fully vaccinated people in Peru (2 doses) per place of vaccination:
   id_centro_vacunacion  vacunados
0                     1        331
1                     3     150269
2                     5       7998
3                     6      33530
4                     8      44992


Fully vaccinated (2 doses): 


15676507

## Read the 2 other csv with the directions

For this case is necesary to match ...

In [101]:
ubigeo_url = 'RawData/TB_UBIGEOS.csv'
vaccenter_url = 'RawData/TB_CENTRO_VACUNACION.csv'

ubigeo = pd.read_csv(ubigeo_url, usecols = ['id_ubigeo', 'departamento'])
vaccenter = pd.read_csv(vaccenter_url, usecols= ['id_centro_vacunacion','id_ubigeo'])

del ubigeo_url, vaccenter_url

vaccenter = vaccenter.merge(ubigeo, on = 'id_ubigeo', how = 'left')
del vaccenter['id_ubigeo']

print("Merged dataframe with: 'id_centro_vacunacion' and 'departamento'") 
print(vaccenter.head(10))
print("NOTE: Necesary to match the department var with the SUMMARY dataframe of all fully vaccinated")



Merged dataframe with: 'id_centro_vacunacion' and 'departamento'
   id_centro_vacunacion departamento
0                  2021         PUNO
1                  3699   SAN MARTIN
2                   154   SAN MARTIN
3                   155   SAN MARTIN
4                  3260   SAN MARTIN
5                  2906       ANCASH
6                  2907       ANCASH
7                  2909       ANCASH
8                  2910       ANCASH
9                  2912       ANCASH
NOTE: Necesary to match the department var with the SUMMARY dataframe of all fully vaccinated


## Now it is possible to found the deparment of all fully vaccinated

In [102]:
vacxdep_sum = vacxdep_sum.merge(vaccenter, on = 'id_centro_vacunacion', how = 'left')
del vacxdep_sum['id_centro_vacunacion']

Finally just get the total of vaccinated grouping by department.

In [109]:
ct_vacxdep = vacxdep_sum.groupby(['departamento']).sum()    # Sum by departments
ct_vacxdep.loc['PERU',:]= ct_vacxdep.sum(axis=0)            # Total of fully vaccinated
ct_vacxdep['vacunados'] = ct_vacxdep['vacunados'].apply(np.int64) 

print(ct_vacxdep)

               vacunados
departamento            
AMAZONAS          165057
ANCASH            644423
APURIMAC          201423
AREQUIPA          763523
AYACUCHO          243124
CAJAMARCA         610599
CALLAO            695497
CUSCO             575067
HUANCAVELICA      149145
HUANUCO           265974
ICA               515352
JUNIN             651236
LA LIBERTAD       935166
LAMBAYEQUE        597063
LIMA             6011436
LORETO            272941
MADRE DE DIOS      49748
MOQUEGUA          116037
PASCO             129634
PIURA             777954
PUNO              345062
SAN MARTIN        374244
TACNA             211017
TUMBES            128651
UCAYALI           188548
PERU            15617921


In [None]:
del ubigeo, vaccenter, vacxdep_sum

Also it is possible to merge this dataframe of fully vaccinated people per department with the total of deceased per department.