# 0. Import libraries and define functions

In [78]:
import pandas as pd
import numpy as np

import functions as fn



def just_dates(vac_url, fal_url):
    """
    Load only the date column of the vaccinated and deceased datasets and
    prepare every record to be counted per epidemiological year and week
    (fal = deceased / "fallecidos", vac = vaccinated / "vacunados").

    Parameters
    ----------
    vac_url: Path or URL of the COVID-19 VACCINATED dataset (',' separated)
    fal_url: Path or URL of the COVID-19 DECEASED dataset (';' separated)

    Returns
    -------
    lst_vac: List of 'chunks' (dataframes) of the VACCINATED dataset, each
             with an int8 column 'vacunado' set to 1
    df_fal: Dataframe of the DECEASED dataset with an int8 column
            'fallecido' set to 1

    """
    # Only the date column of each dataset is needed
    vac_col = ['fecha_vacunacion']
    fal_col = ['FECHA_FALLECIMIENTO']

    # Read both datasets
    df_vac = fn.read_largeCSV_file(vac_url, ',', vac_col)
    df_fal = fn.read_largeCSV_file(fal_url, ';', fal_col)

    # Convert the date columns to a date dtype (the two files use different
    # raw formats, hence the two helpers)
    fn.variable_fecha_ymd(df_vac, 'fecha_vacunacion')
    fn.variable_fecha(df_fal, 'FECHA_FALLECIMIENTO')
    # Epidemiological week of the deceased.
    # NOTE(review): the return value is ignored here, so date_to_epiweek is
    # presumably adding 'epi_year'/'epi_week' in place - confirm in functions.py
    fn.date_to_epiweek(df_fal, 'FECHA_FALLECIMIENTO')

    # Split the vaccinated dataset into chunks
    lst_vac = fn.df_into_chunks(df_vac)

    # Epidemiological week of the vaccinated, chunk by chunk. The helper is
    # relied on for its in-place effect, exactly as for df_fal above;
    # rebinding the loop variable (as the previous version did) never
    # changed the contents of the list.
    for chunk in lst_vac:
        fn.date_to_epiweek(chunk, 'fecha_vacunacion')

    # Column of ones used to count each deceased case. It is created as int8
    # directly: the former `.apply(np.int8)` call discarded its result, so
    # the column silently stayed int64.
    df_fal['fallecido'] = np.ones(len(df_fal), dtype=np.int8)
    df_fal.info()

    # Same int8 counter column in every chunk of vaccinated
    for chunk in lst_vac:
        chunk['vacunado'] = np.ones(len(chunk), dtype=np.int8)

    return lst_vac, df_fal
    


def epiweeks(df_fal):
    """
    Return a dataframe with the total number of DECEASED per epidemiological
    year and week.

    The input needs the columns 'epi_year', 'epi_week' and 'fallecido'; the
    result is indexed by (epi_year, epi_week) and has a single column
    'fallecidos'.
    """
    # Row keys of the contingency table: one row per (year, week) pair
    week_keys = [df_fal['epi_year'], df_fal['epi_week']]

    # 'fallecido' provides the table's columns; the result is then given a
    # single column label
    epi_fal = pd.crosstab(index=week_keys, columns=df_fal['fallecido'])
    epi_fal.columns = ['fallecidos']

    return epi_fal
    
  
    
def epiweeks_chunks(dfs_vac):
    """
    Return a dataframe with the total number of VACCINATED per
    epidemiological year and week.

    Receives a list of dataframes (chunks), each with the columns
    'epi_year', 'epi_week' and 'vacunado'. The result is indexed by
    (epi_year, epi_week) and has a single column 'vacunados'.
    """
    # One weekly count table per chunk
    weekly_counts = [
        pd.crosstab(index=[part['epi_year'], part['epi_week']],
                    columns=part['vacunado'])
        for part in dfs_vac
    ]

    # Put the per-chunk tables side by side (aligned on year/week) and add
    # them up row-wise to obtain a single total per week
    side_by_side = pd.concat(weekly_counts, axis=1)
    totals = side_by_side.sum(numeric_only=True, axis=1)

    return totals.to_frame('vacunados')
  
    
  
def merged_epiweeks(epi_vac, epi_fal):
    """
    Join the two weekly totals (vaccinated and deceased) into one dataframe
    indexed by epidemiological week, with English column names
    'deceased' and 'vaccinated'.
    """
    english_names = {'fallecidos': 'deceased', 'vacunados': 'vaccinated'}

    combined = (
        # Side-by-side join; weeks present in only one input produce NaN
        pd.concat([epi_fal, epi_vac], axis=1)
        # A missing week means zero cases
        .fillna(value=0)
        # The NaN values turned the columns into float64; go back to integers
        .astype('Int64')
        .rename(english_names, axis=1)
    )

    return combined

# 1. Check that the raw data still has the same format as the one used in this code

In [75]:
# Locations of the raw datasets (also used by the run cell further down)
vac_url = 'RawData/TB_VACUNACION_COVID19.csv'
fal_url = 'RawData/fallecidos_covid.csv'

# Preview the first 3 rows of the vaccinated dataset and its column dtypes
df_vac = pd.read_csv(vac_url, sep=',', nrows=3)
print("Dataset of all those vaccinated against COVID-19 in Peru:")
print(df_vac)
print("\n")
df_vac.info()
print("\n")

# Same preview for the deceased dataset (this file is ';' separated)
df_fal = pd.read_csv(fal_url, sep=';', nrows=3)
print("Dataset of all those deceased by COVID-19 in Peru:")
print(df_fal)
print("\n")
df_fal.info()

# The previews are only for visual inspection; drop them
del df_vac, df_fal


Dataset of all those vaccinated against COVID-19 in Peru:
   id_persona  id_vacunados_covid19 fecha_vacunacion  id_eess  \
0    14395344              11932154       19/07/2021     5814   
1    15330904              13342810       17/06/2021     4339   
2    15611041              13342938       11/06/2021     3617   

   id_centro_vacunacion  id_vacuna  id_grupo_riesgo  dosis  edad  
0                    17          5               62      2    57  
1                  1828          5               54      2    69  
2                103617          2               54      2    70  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id_persona            3 non-null      int64 
 1   id_vacunados_covid19  3 non-null      int64 
 2   fecha_vacunacion      3 non-null      object
 3   id_eess               3 non-null      int64 
 4   id_cen

# 2. Run code

In [80]:
if __name__ == '__main__':
    # Per-record frames tagged with epidemiological year/week
    dfs_vac, df_fal = just_dates(vac_url, fal_url)

    # Weekly totals for each dataset
    epi_fal = epiweeks(df_fal)
    epi_vac = epiweeks_chunks(dfs_vac)

    # The result gets its own name: binding it to `epiweeks` replaced the
    # epiweeks() function, so this cell failed when executed a second time.
    epiweeks_df = merged_epiweeks(epi_vac, epi_fal)

    # Written inside the guard so it only runs when the result exists.
    # vac_url / fal_url are intentionally left defined: deleting them made a
    # re-run of this cell raise NameError.
    epiweeks_df.to_csv('Data/epi_weeks.csv')

Read csv with dask:  13.33286452293396 sec
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34638125 entries, 0 to 544332
Columns: 1 entries, fecha_vacunacion to fecha_vacunacion
dtypes: object(1)
memory usage: 2.4 GB
Read csv with dask:  0.14126873016357422 sec
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200246 entries, 0 to 200245
Columns: 1 entries, FECHA_FALLECIMIENTO to FECHA_FALLECIMIENTO
dtypes: int64(1)
memory usage: 1.5 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200246 entries, 0 to 200245
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   epi_year  200246 non-null  int64
 1   epi_week  200246 non-null  int64
dtypes: int64(2)
memory usage: 3.1 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del df[date_name]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  '''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 0 to 499999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   epi_year  500000 non-null  int64
 1   epi_week  500000 non-null  int64
dtypes: int64(2)
memory usage: 11.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 500000 to 999999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   epi_year  500000 non-null  int64
 1   epi_week  500000 non-null  int64
dtypes: int64(2)
memory usage: 11.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 1000000 to 181831
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   epi_year  500000 non-null  int64
 1   epi_week  500000 non-null  int64
dtypes: int64(2)
memory usage: 11.4 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 500000 entries, 181832 to 681831
Dat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['vacunado'] = 1
