In [1]:
import pandas as pd
import numpy as np 
import wget
import zipfile
import os
from datetime import date
import datetime

In [2]:
# Calendar window covered by the historical daily files. break_date is the
# cut-off that later cells compare each day against when selecting columns.
start_date  = date(2020, 4, 12)
end_date    = date(2020, 10, 4)
break_date  = date(2020, 10, 6)
dateseries  = [
    start_date + datetime.timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# Placeholder frame: one row per day, 'id' fixed at 0 and 'DeathsR' left
# empty until the download loop fills it in.
columns     = ['id','DeathsR']
mxr = (
    pd.DataFrame(index=dateseries, columns=columns)
    .rename_axis('date')
    .reset_index()
)
mxr['date'] = pd.to_datetime(mxr['date'])
mxr['id']   = 0

# Guarantee a clean 0..n-1 row index
mxr = mxr.reset_index(drop=True)

In [3]:
# Download every daily raw file and store, for each day, the last value of the
# cumulative death count computed from that day's file (data "as reported").
# Data address: https://www.gob.mx/salud/documentos/datos-abiertos-152127

for i in dateseries:
    print('Working on: '+str(i), end="\r")

    # Zero-padded two-digit year, month and day, as used in the file names
    year  = i.strftime('%y')
    month = i.strftime('%m')
    day   = i.strftime('%d')

    # Download and extract file
    filenamezip = 'datos_abiertos_covid19_'+day+'.'+month+'.20'+year+'.zip'
    if i.year<=2020:
        url = 'http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/'+month+'/'+filenamezip
    else:
        url = 'http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/20'+year+'/'+month+'/'+filenamezip
    filenamezip = wget.download(url)
    with zipfile.ZipFile(filenamezip, 'r') as zip_ref:
        zip_ref.extractall()

    # Read data and delete files
    filename = year+month+day+'COVID19MEXICO.csv'
    df = pd.read_csv(filename,encoding='latin1',low_memory=False)
    os.remove(filenamezip)
    os.remove(filename)

    # Data selection: files up to break_date are filtered on RESULTADO,
    # later ones on CLASIFICACION_FINAL.
    if i<=break_date:
        count_col = 'RESULTADO'
        selected  = df['RESULTADO'] == 1
    else:
        count_col = 'CLASIFICACION_FINAL'
        selected  = df['CLASIFICACION_FINAL'] <= 3

    # NOTE(review): '9999-99-99' presumably marks records without a death
    # date -- confirm against the data dictionary.
    dated = df['FECHA_DEF'] != '9999-99-99'

    # Rows per death date, accumulated into a running total
    s = df[dated & selected][['FECHA_DEF',count_col]].groupby("FECHA_DEF").count().cumsum()
    s.index = pd.to_datetime(s.index)

    # Put the series on a gap-free daily calendar; days without rows carry the
    # previous total forward, days before the first row become 0.
    idx = pd.date_range(start_date.strftime("%Y-%m-%d"), i.strftime("%Y-%m-%d"))
    s = s.reindex(idx).ffill().fillna(0)
    s = s.rename(columns={count_col: 'DeathsR'})

    # .iloc[-1] picks the last row by position; plain [-1] on a date-labelled
    # Series relies on a deprecated positional fallback.
    mxr.loc[(mxr['date']==pd.to_datetime(i)),'DeathsR'] = s['DeathsR'].iloc[-1]

    # Release the per-day objects before the next download
    del s
    del idx
    del df

    

Working on: 2020-10-04

In [6]:
# Persist the as-reported series to CSV, leaving out the row index
mxr.to_csv(path_or_buf='Data_Reported_all.csv', index=False)


In [7]:
# Get data as occurred: build the daily cumulative death series from a single
# file, the one published on end_date.

#end_date    = date(2021, 2, 11)
end_date    = date(2021, 3, 1)
start_date  = date(2020, 4, 12)
break_date  = date(2020, 10, 6)
dateseries  = [date.fromordinal(i) for i in range(start_date.toordinal(), end_date.toordinal()+1)]

# Output layout; only DeathsO is populated below, the other columns stay empty
columns     = ['id','CLAVE_MUNICIPIO','CLAVE_ENTIDAD','DeathsO']
frames      = []

for i in [end_date]:
    print('Working on: '+str(i), end="\r")

    # Zero-padded two-digit year, month and day, as used in the file names
    year  = i.strftime('%y')
    month = i.strftime('%m')
    day   = i.strftime('%d')

    # Download and extract file
    filenamezip = 'datos_abiertos_covid19_'+day+'.'+month+'.20'+year+'.zip'
    if i.year<=2020:
        url = 'http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/'+month+'/'+filenamezip
    else:
        url = 'http://datosabiertos.salud.gob.mx/gobmx/salud/datos_abiertos/historicos/20'+year+'/'+month+'/'+filenamezip
    filenamezip = wget.download(url)
    with zipfile.ZipFile(filenamezip, 'r') as zip_ref:
        zip_ref.extractall()

    # Read data and delete files
    filename = year+month+day+'COVID19MEXICO.csv'
    df = pd.read_csv(filename,encoding='latin1',low_memory=False)
    os.remove(filenamezip)
    os.remove(filename)

    # Data selection: files up to break_date are filtered on RESULTADO,
    # later ones on CLASIFICACION_FINAL. The earlier branch used to filter on
    # the undefined names `im` / `ie` as well, which would raise NameError.
    if i<=break_date:
        count_col = 'RESULTADO'
        selected  = df['RESULTADO'] == 1
    else:
        count_col = 'CLASIFICACION_FINAL'
        selected  = df['CLASIFICACION_FINAL'] <= 3

    # NOTE(review): '9999-99-99' presumably marks records without a death
    # date -- confirm against the data dictionary.
    dated = df['FECHA_DEF'] != '9999-99-99'

    # Rows per death date, accumulated into a running total
    s = df[dated & selected][['FECHA_DEF',count_col]].groupby("FECHA_DEF").count().cumsum()
    s.index = pd.to_datetime(s.index)

    # Put the series on a gap-free daily calendar; days without rows carry the
    # previous total forward, days before the first row become 0.
    idx = pd.date_range(start_date.strftime("%Y-%m-%d"), i.strftime("%Y-%m-%d"))
    s = s.reindex(idx).ffill().fillna(0)
    s = s.rename(columns={count_col: 'DeathsO'})

    frames.append(s)
    del s,idx
    del df

# Assemble once with concat (DataFrame.append no longer exists in current
# pandas). Reindexing adds the empty placeholder columns in the alphabetical
# order that append() used to produce.
mxo         = pd.concat(frames).reindex(columns=sorted(columns))
mxo         = mxo.reset_index()
mxo         = mxo.rename(columns={"index": "date"})

Working on: 2021-03-01

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [8]:
# Display the mxo frame (date, empty placeholder columns, DeathsO) for inspection
mxo

Unnamed: 0,date,CLAVE_ENTIDAD,CLAVE_MUNICIPIO,DeathsO,id
0,2020-04-12,,,663,
1,2020-04-13,,,750,
2,2020-04-14,,,847,
3,2020-04-15,,,945,
4,2020-04-16,,,1035,
5,2020-04-17,,,1153,
6,2020-04-18,,,1273,
7,2020-04-19,,,1421,
8,2020-04-20,,,1597,
9,2020-04-21,,,1773,


In [10]:
# Put both together: attach the as-reported series (DeathsR) to the
# as-occurred frame (DeathsO).
mxr = pd.read_csv('Data_Reported_all.csv',encoding='latin1',low_memory=False)
mxr['date'] = mxr['date'].astype('datetime64[ns]')

# Match rows on the calendar date instead of on row position, so the two
# series stay aligned even if they do not start on the same day. The left
# merge keeps every mxo row; days without a reported value get NaN.
reported = mxr.loc[mxr['date']<='2020-10-05', ['date','DeathsR']]
mx = mxo.merge(reported, on='date', how='left').reset_index()

# Drop the placeholder columns that were never filled
mx = mx.drop(columns=['CLAVE_ENTIDAD','CLAVE_MUNICIPIO','id'])

In [14]:
# Write the combined reported / occurred table to CSV without the row index
mx.to_csv(path_or_buf='Data_Reported_all_both.csv', index=False)