## Algorand Covid-19 Project
## Demo: Bulk survey transaction scrape from Algorand mainnet via PureStake API

Source documentation: https://github.com/algorandfoundation/IReport-Covid/blob/master/js/retrieveData.js

In [4]:
import math
import os
from datetime import datetime

import algosdk
import numpy as np
import pandas as pd

from py_algorand import Algorand_IReportScrape # class sript

# Lift pandas' display truncation so wide/long frames render in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [5]:
# Survey attribute codes. The decoding cell seeds every record with these
# keys (value None) before filling in whatever the transaction note carries.
headers = [
    "a",    # overwritten during decoding with the transaction's 'tx' value
    "_t",   # presumably the record type (sample rows show 'report') -- confirm against the schema
    "_v",   # presumably the schema version (sample rows show 1) -- confirm against the schema

    # --- general demographic data ---
    "gc",   # string: country code (see Location Data section of the source documentation)
    "gr",   # string: region code (see Location Data section of the source documentation)
    "gzp",  # string: 3-digit zip code (US only)
    "ga",   # integer: age group; documented values 1,11,21,31,41,51,56,61,66,71,76,81,85
            # NOTE(review): sample rows in this notebook show 55, 40, 65, 20 -- confirm the list is current
    "gs",   # string: gender, 'm' or 'f' when present

    # --- symptoms ---
    "sz",   # integer: is symptomatic (0 = no answer, -1 = no, 1 = yes)
    "s1",   # boolean: fever
    "s2",   # boolean: cough
    "s3",   # boolean: difficulty breathing
    "s4",   # boolean: fatigue
    "s5",   # boolean: sore throat
    "sds",  # date (yyyy-mm-dd): when symptoms started
    "sde",  # date (yyyy-mm-dd): when symptoms ended
    "sdn",  # boolean: still symptomatic

    # --- testing ---
    "tz",   # integer: tested (0 = no answer, -1 = no, 1 = yes)
    "tt",   # integer: tried to get tested (-1 = no, 1 = yes, 2 = yes but was denied)
    "td",   # date (yyyy-mm-dd): test date
    "tr",   # integer: test result (-1 = negative, 1 = positive, 2 = waiting for result)
    "tl",   # integer: test location (1 = Dr office, 2 = Hospital, 3 = Urgent care, 4 = Ad-hoc center, 5 = Other)

    # --- medical care ---
    "mz",   # integer: received care (0 = no answer, -1 = no, 1 = yes)
    "m1",   # boolean: doctor's office
    "m2",   # boolean: walk-in clinic
    "m3",   # boolean: virtual care
    "m4",   # boolean: hospital/ER
    "m5",   # boolean: other
    "mh",   # integer: hospitalized (0 = no answer, -1 = no, 1 = yes)
    "mhs",  # date (yyyy-mm-dd): when admitted
    "mhe",  # date (yyyy-mm-dd): when discharged
    "mhn",  # boolean: still in hospital

    # --- quarantine ---
    "qz",   # integer: was quarantined (0 = no answer, -1 = no, 1 = yes)
    "q1",   # boolean: due to symptoms
    "q2",   # boolean: voluntarily
    "q3",   # boolean: personally required
    "q4",   # boolean: general quarantine
    "qds",  # date (yyyy-mm-dd): when quarantine started
    "qde",  # date (yyyy-mm-dd): when quarantine ended
    "qdn",  # boolean: still quarantined
    "ql",   # integer: left quarantine temporarily (0 = no answer, -1 = no, 1 = yes)

    "consent",  # boolean: user's consent; mandatory, must be 'true'
]

In [7]:
# Take the PureStake API key from the PURESTAKE_API_KEY environment variable so
# a real key never has to be pasted into (and committed with) the notebook.
# The original placeholder is kept as the fallback when the variable is unset.
purestake_api_key = os.environ.get('PURESTAKE_API_KEY', 'xxxxxXXXXXXXXXXXXXXXXXXXXxxxxx')

# Scraper helper from the local py_algorand module, constructed with the key.
covidData_scraper = Algorand_IReportScrape(purestake_api_key)

In [9]:
# Retrieve the survey transactions through the scraper.
# NOTE(review): presumably a network call against the PureStake API -- confirm in py_algorand.
# The decoding cell below treats the result as a sequence of dicts carrying
# 'tx' and 'noteb64' keys.
txns = covidData_scraper.get_txns()

In [5]:
# Number of transactions returned by the scrape.
len(txns)

1282

In [6]:
###### DECODING DATA
# Turn each transaction's base64/msgpack note into one flat record, then build
# the DataFrame in a single step. Appending to a DataFrame inside the loop
# copies the whole frame on every iteration, and DataFrame.append no longer
# exists in pandas >= 2.0.
records = []
for i, tx_dict in enumerate(txns):
    if i % 1000 == 0:
        print("{} transactions decoded".format(i))

    # The note is base64-encoded msgpack; the survey answers sit under b'd'.
    raw_note = algosdk.encoding.base64.b64decode(tx_dict['noteb64'])
    payload = algosdk.encoding.msgpack.unpackb(raw_note)[b'd']

    # Start from every known header set to None so unanswered fields still
    # get a column, then overlay the decoded answers (bytes -> str).
    record = {key: None for key in headers}
    record.update({
        key.decode() if isinstance(key, bytes) else key:
        val.decode() if isinstance(val, bytes) else val
        for key, val in payload.items()
    })
    # 'a' always carries the transaction's 'tx' value, replacing whatever
    # the note itself may have supplied under that key.
    record['a'] = tx_dict['tx']
    records.append(record)

data_df = pd.DataFrame(records)
# Keep the columns in alphabetical order: that is the layout the old
# append-based code produced (see the saved output), and the rename cell
# further down assigns the readable names by position.
data_df = data_df[sorted(data_df.columns)]

0 transactions decoded
1000 transactions decoded


In [7]:
# Preview the first decoded records; columns still carry the short survey codes.
data_df.head()

Unnamed: 0,_t,_v,a,consent,ga,gc,gr,gs,gzp,m1,m2,m3,m4,m5,mh,mhe,mhn,mhs,mz,q1,q2,q3,q4,qde,qdn,qds,ql,qz,s1,s2,s3,s4,s5,sde,sdn,sds,sz,td,tl,tr,tt,tz
0,report,1,UGY5YWRRYLLCDC5SD2BIAUEBXKZROG4N56VT6QRZBZD566...,1.0,55,US,NY,f,100.0,,,,,,,,,,-1,,1.0,,,,1.0,2020-03-10,1.0,1,,,,,,,,,-1,,,,-1.0,-1
1,report,1,5USYC4OOGQYGSKRS3LLTRCSLYIX5CY6Y2HIDRY6S3MISED...,1.0,40,AU,ACT,m,,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1
2,report,1,MSJEL2ZTRSNKJLA6G5H5ITPLY6ZFYBFVG47G3765XM7OQJ...,1.0,65,US,,m,100.0,,,,,,,,,,-1,True,,,,2020-03-20,,2020-03-13,-1.0,1,,,,,,,,,-1,2020-03-20,3.0,-1.0,,1
3,report,1,FJD33KOQXQ5E2SA5VWB5MAIHXZTKVZUM5NEK7FJN56UKGS...,1.0,40,US,NY,f,104.0,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1
4,report,1,QTLPDQHRDHMD5NGGIZFSRZ4MQACT3MK76JZ6JMDXDNLIRZ...,1.0,20,US,MD,m,207.0,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1


In [8]:
# Readable column names, listed in the same order as data_df's (alphabetically
# sorted) code columns; the trailing comment gives the code each one replaces.
cols = [
    "_t",                               # _t
    "_v",                               # _v
    "tx_id",                            # a
    "consent",                          # consent
    "age_group",                        # ga
    "country_code",                     # gc
    "region_code",                      # gr
    "gender",                           # gs
    "3_dig_zip",                        # gzp
    "doctors_office",                   # m1
    "walk_in_clinic",                   # m2
    "virtual_care",                     # m3
    "hospital_or_ER",                   # m4
    "other",                            # m5
    "hospitalized",                     # mh
    "when_discharged",                  # mhe
    "still_in_hospital",                # mhn
    "when_admitted",                    # mhs
    "received_care",                    # mz
    "symptom_quarantine",               # q1
    "voluntary_quarantine",             # q2
    "personally_required_quarantine",   # q3
    "general_quarantine",               # q4
    "when_quarantine_ended",            # qde
    "still_in_quarantine",              # qdn
    "when_quarantine_started",          # qds
    "left_quarantine_temporarily",      # ql
    "was_quarantined",                  # qz
    "fever",                            # s1
    "cough",                            # s2
    "difficulty_breathing",             # s3
    "fatigue",                          # s4
    "sore_throat",                      # s5
    "when_symptoms_ended",              # sde
    "still_symptomatic",                # sdn
    "when_symptoms_started",            # sds
    "is_symptomatic",                   # sz
    "test_date",                        # td
    "test_location",                    # tl
    "test_results",                     # tr
    "tried_to_get_tested",              # tt
    "tested",                           # tz
]

# Sanity check before renaming: the three column counts must all agree.
print(len(cols) == len(headers) and len(headers) == len(data_df.columns))

True


In [9]:
# Replace the short survey codes with the readable names (assigned by position),
# then report the frame's dimensions and show the first five rows.
data_df.columns = list(cols)
print((len(data_df.index), len(data_df.columns)))
display(data_df.head(5))

(1282, 42)


Unnamed: 0,_t,_v,tx_id,consent,age_group,country_code,region_code,gender,3_dig_zip,doctors_office,walk_in_clinic,virtual_care,hospital_or_ER,other,hospitalized,when_discharged,still_in_hospital,when_admitted,received_care,symptom_quarantine,voluntary_quarantine,personally_required_quarantine,general_quarantine,when_quarantine_ended,still_in_quarantine,when_quarantine_started,left_quarantine_temporarily,was_quarantined,fever,cough,difficulty_breathing,fatigue,sore_throat,when_symptoms_ended,still_symptomatic,when_symptoms_started,is_symptomatic,test_date,test_location,test_results,tried_to_get_tested,tested
0,report,1,UGY5YWRRYLLCDC5SD2BIAUEBXKZROG4N56VT6QRZBZD566...,1.0,55,US,NY,f,100.0,,,,,,,,,,-1,,1.0,,,,1.0,2020-03-10,1.0,1,,,,,,,,,-1,,,,-1.0,-1
1,report,1,5USYC4OOGQYGSKRS3LLTRCSLYIX5CY6Y2HIDRY6S3MISED...,1.0,40,AU,ACT,m,,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1
2,report,1,MSJEL2ZTRSNKJLA6G5H5ITPLY6ZFYBFVG47G3765XM7OQJ...,1.0,65,US,,m,100.0,,,,,,,,,,-1,True,,,,2020-03-20,,2020-03-13,-1.0,1,,,,,,,,,-1,2020-03-20,3.0,-1.0,,1
3,report,1,FJD33KOQXQ5E2SA5VWB5MAIHXZTKVZUM5NEK7FJN56UKGS...,1.0,40,US,NY,f,104.0,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1
4,report,1,QTLPDQHRDHMD5NGGIZFSRZ4MQACT3MK76JZ6JMDXDNLIRZ...,1.0,20,US,MD,m,207.0,,,,,,,,,,-1,,,,,,,,,-1,,,,,,,,,-1,,,,-1.0,-1


In [10]:
# Today's local date as YYYY-MM-DD; used as the suffix of the export filename.
date = datetime.today().strftime('%Y-%m-%d')

In [11]:
# Write the renamed frame to a date-stamped CSV under data/ (row index omitted).
output_path = 'data/covidData' + date + '.csv'
data_df.to_csv(output_path, index=False)