# Algorand Covid-19 Project

## Demo: Bulk survey transaction scrape from Algorand mainnet via PureStake API
## Rahul Zalkikar | rz1567@nyu.edu

### Source documentation:
https://github.com/algorandfoundation/IReport-Covid/blob/master/js/retrieveData.js

In [148]:
import math
import os
from datetime import datetime

import algosdk
import numpy as np
import pandas as pd

import py_algorand # class sript

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [115]:
# Attribute codes expected in every survey record, grouped by survey section.
# The groups are concatenated at the bottom in the order listed here.

# Record-level fields.
_record_fields = [
    'a',    # set to the transaction id (tx_dict['tx']) when the notes are decoded
    '_t',   # NOTE(review): meaning not shown in this notebook -- confirm against the IReport-Covid source
    '_v',   # NOTE(review): meaning not shown in this notebook -- confirm against the IReport-Covid source
]

# General demographic data.
_demographic_fields = [
    'gc',   # string: country code
    'gr',   # string: region code
    'gzp',  # string: 3-digit zip code (US only)
    'ga',   # integer: age group; one of 1,11,21,31,41,51,56,61,66,71,76,81,85 when present
    'gs',   # string: gender; 'm' or 'f' when present
]

# Symptoms.
_symptom_fields = [
    'sz',   # integer: is symptomatic (no-answer=0, no=-1, yes=1)
    's1',   # boolean: fever
    's2',   # boolean: cough
    's3',   # boolean: difficulty breathing
    's4',   # boolean: fatigue
    's5',   # boolean: sore throat
    'sds',  # date (yyyy-mm-dd): when symptoms started
    'sde',  # date (yyyy-mm-dd): when symptoms ended
    'sdn',  # boolean: still symptomatic
]

# Testing.
_testing_fields = [
    'tz',   # integer: tested (no-answer=0, no=-1, yes=1)
    'tt',   # integer: tried to get tested (no=-1, yes=1, yes but was denied=2)
    'td',   # date (yyyy-mm-dd): test date
    'tr',   # integer: test result (negative=-1, positive=1, waiting for result=2)
    'tl',   # integer: test location (1=Dr office, 2=Hospital, 3=Urgent care, 4=Ad-hoc center, 5=Other)
]

# Medical care.
_medical_care_fields = [
    'mz',   # integer: received care (no-answer=0, no=-1, yes=1)
    'm1',   # boolean: doctor's office
    'm2',   # boolean: walk-in clinic
    'm3',   # boolean: virtual care
    'm4',   # boolean: hospital/ER
    'm5',   # boolean: other
    'mh',   # integer: hospitalized (no-answer=0, no=-1, yes=1)
    'mhs',  # date (yyyy-mm-dd): when admitted
    'mhe',  # date (yyyy-mm-dd): when discharged
    'mhn',  # boolean: still in hospital
]

# Quarantine.
_quarantine_fields = [
    'qz',   # integer: was quarantined (no-answer=0, no=-1, yes=1)
    'q1',   # boolean: due to symptoms
    'q2',   # boolean: voluntarily
    'q3',   # boolean: personally required
    'q4',   # boolean: general quarantine
    'qds',  # date (yyyy-mm-dd): when quarantine started
    'qde',  # date (yyyy-mm-dd): when quarantine ended
    'qdn',  # boolean: still quarantined
    'ql',   # integer: left quarantine temporarily (no-answer=0, no=-1, yes=1)
]

# Consent.
_consent_fields = [
    'consent',  # boolean: user's consent; mandatory, must be 'true'
]

headers = [
    *_record_fields,
    *_demographic_fields,
    *_symptom_fields,
    *_testing_fields,
    *_medical_care_fields,
    *_quarantine_fields,
    *_consent_fields,
]

In [2]:
# The PureStake API key is a credential and must not be stored in the notebook.
# Export it before starting the kernel, e.g.  export PURESTAKE_API_KEY=<your key>
# NOTE: a key was previously committed in this cell; treat that key as compromised and rotate it.
purestake_api_key = os.environ.get('PURESTAKE_API_KEY')
if not purestake_api_key:
    raise RuntimeError(
        "Set the PURESTAKE_API_KEY environment variable to your PureStake API key "
        "before running this cell."
    )

# Build the scraper with the key; the connection status it reports is shown in the cell output.
covidData_scraper = py_algorand.Algorand_IReportScrape(purestake_api_key)

algod last round: 5807195
algod time since last round: 1587840052
algod catchup: 0
algod latest version: https://github.com/algorandfoundation/specs/tree/4a9db6a25595c6fd097cf9cc137cc83027787eaa
####################
{'hash': 'CSAQV5WCW2MFSTSQATMNYBWHTQWXLDWNL2BPWH3WASCAOY3FOS4Q', 'previousBlockHash': 'P2URKRI7ATPOV3K25IUTIC3Q5XVQME7SIKSZJ7KPODHJI553EZXQ', 'seed': 'N6GSSH6MYXZSOZJLZOZA72SBMZMSGNIGL3X4GGCQO7TJ2I5G3LLA', 'proposer': 'FTXSKED23VEXNW442T2JKNPPNUC2WKFNRWBVQTFMT7HYX365IVLZXYILAI', 'round': 5807195, 'period': 0, 'txnRoot': 'WRS2VL2OQ5LPWBYLNBCZV3MEQ4DACSRDES6IUKHGOWYQERJRWC5A', 'reward': 114302, 'rate': 25999980, 'frac': 1159865138, 'txns': {}, 'timestamp': 1585763530, 'currentProtocol': 'https://github.com/algorandfoundation/specs/tree/4a9db6a25595c6fd097cf9cc137cc83027787eaa', 'nextProtocol': '', 'nextProtocolApprovals': 0, 'nextProtocolVoteBefore': 0, 'nextProtocolSwitchOn': 0, 'upgradePropose': '', 'upgradeApprove': False}

 total rounds: 161195
found 161500 transactions


In [116]:
# Fetch the transactions collected by the scraper. The decoding cell below iterates
# over the result and reads the 'tx' and 'noteb64' keys of each element.
txns = covidData_scraper.get_txns()

In [None]:
###### DECODING DATA
# Decode every transaction note into one flat record per survey response.
#
# Records are collected in a plain list and turned into a DataFrame once at the
# end. Appending to the DataFrame inside the loop copies the whole frame on every
# iteration (quadratic time), and DataFrame.append was removed in pandas 2.0.
records = []
for i, tx_dict in enumerate(txns):
    if i % 1000 == 0:
        print("{} transactions decoded".format(i))

    # The note is base64-encoded msgpack; the survey answers sit under the b'd' key.
    packed_note = algosdk.encoding.base64.b64decode(tx_dict['noteb64'])
    survey_answers = algosdk.encoding.msgpack.unpackb(packed_note)[b'd']

    # Start from every known attribute set to None so each record has the full
    # set of keys, then overlay the answers actually present in the note.
    # Keys and values that arrive as bytes are decoded to str; others pass through.
    record = dict.fromkeys(headers)
    record.update(
        (
            key.decode() if isinstance(key, bytes) else key,
            val.decode() if isinstance(val, bytes) else val,
        )
        for key, val in survey_answers.items()
    )
    # 'a' always carries the transaction id, overriding any 'a' found in the note.
    record['a'] = tx_dict['tx']
    records.append(record)

# dtype=object keeps each value exactly as decoded (ints stay ints, missing stays None).
data_df = pd.DataFrame(records, dtype=object)

# Keep the columns in alphabetical order of their attribute code: the `cols` list
# used for the positional rename further down is written in that order.
data_df = data_df.reindex(columns=sorted(data_df.columns, key=str))

0 transactions decoded
1000 transactions decoded
2000 transactions decoded
3000 transactions decoded
4000 transactions decoded
5000 transactions decoded
6000 transactions decoded
7000 transactions decoded
8000 transactions decoded
9000 transactions decoded
10000 transactions decoded
11000 transactions decoded
12000 transactions decoded
13000 transactions decoded
14000 transactions decoded
15000 transactions decoded
16000 transactions decoded
17000 transactions decoded
18000 transactions decoded
19000 transactions decoded
20000 transactions decoded
21000 transactions decoded
22000 transactions decoded
23000 transactions decoded
24000 transactions decoded
25000 transactions decoded
26000 transactions decoded
27000 transactions decoded
28000 transactions decoded
29000 transactions decoded
30000 transactions decoded


In [None]:
# Preview the first rows of the decoded survey table.
data_df.head()

In [None]:
# Readable column name for each attribute code.
# Keeping the pairing explicit means the rename does not depend on the order in
# which the DataFrame happens to hold its columns.
column_names = {
    "_t": "_t",
    "_v": "_v",
    "a": "tx_id",
    "consent": "consent",
    # general demographic data
    "ga": "age_group",
    "gc": "country_code",
    "gr": "region_code",
    "gs": "gender",
    "gzp": "3_dig_zip",
    # medical care
    "m1": "doctors_office",
    "m2": "walk_in_clinic",
    "m3": "virtual_care",
    "m4": "hospital_or_ER",
    "m5": "other",
    "mh": "hospitalized",
    "mhe": "when_discharged",
    "mhn": "still_in_hospital",
    "mhs": "when_admitted",
    "mz": "received_care",
    # quarantine
    "q1": "symptom_quarantine",
    "q2": "voluntary_quarantine",
    "q3": "personally_required_quarantine",
    "q4": "general_quarantine",
    "qde": "when_quarantine_ended",
    "qdn": "still_in_quarantine",
    "qds": "when_quarantine_started",
    "ql": "left_quarantine_temporarily",
    "qz": "was_quarantined",
    # symptoms
    "s1": "fever",
    "s2": "cough",
    "s3": "difficulty_breathing",
    "s4": "fatigue",
    "s5": "sore_throat",
    "sde": "when_symptoms_ended",
    "sdn": "still_symptomatic",
    "sds": "when_symptoms_started",
    "sz": "is_symptomatic",
    # tested
    "td": "test_date",
    "tl": "test_location",
    "tr": "test_results",
    "tt": "tried_to_get_tested",
    "tz": "tested",
}

# One readable name per existing column, in the DataFrame's own column order.
# A column with no entry in the mapping keeps its current name, so running this
# cell again after the rename leaves the names unchanged.
cols = [column_names.get(code, code) for code in data_df.columns]

# Sanity check: prints True when the frame has exactly one column per attribute code.
print(len(cols)==len(headers)==len(data_df.columns))

In [None]:
# Replace the column labels of data_df with the readable names in `cols`.
# The assignment is positional: cols[i] becomes the label of the i-th existing
# column, so `cols` must be in the same order as data_df.columns.
# This changes data_df in place.
data_df.columns = cols
# Show the table dimensions and the first rows under the new labels.
print(data_df.shape)
display(data_df.head())

In [None]:
# Show today's date as yyyy-mm-dd; the value is only displayed, not stored.
datetime.today().strftime('%Y-%m-%d')

In [None]:
# Write the table to covidData.csv (path relative to the working directory),
# leaving out the row index.
data_df.to_csv('covidData.csv',index=False)