# Algorand Covid-19 Project

## Data Processing
## Rahul Zalkikar | rz1567@nyu.edu

### Source documentation:
https://github.com/algorandfoundation/IReport-Covid/blob/master/js/retrieveData.js

In [9]:
import pandas as pd
# Disable truncation so every row and column is shown when a frame is rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [4]:
# Read the raw report export from the notebook's working directory.
data_df = pd.read_csv(filepath_or_buffer='covidData.csv')
# Report (rows, columns) first, then preview the leading records.
print(data_df.shape)
data_df.head(n=5)

(50000, 41)


Unnamed: 0,_t,_v,consent,age_group,country_code,region_code,gender,3_dig_zip,doctors_office,walk_in_clinic,...,sore_throat,when_symptoms_ended,still_symptomatic,when_symptoms_started,is_symptomatic,test_date,test_location,test_results,tried_to_get_tested,tested
0,report,1,True,30.0,US,PA,m,190.0,,,...,,,,,-1.0,,,,-1.0,-1.0
1,report,1,True,40.0,DE,,f,,,,...,,,,,-1.0,,,,-1.0,-1.0
2,report,1,True,20.0,IT,72,f,,,,...,,,,,-1.0,,,,-1.0,-1.0
3,report,1,True,55.0,IT,72,m,,,,...,,,,,-1.0,,,,-1.0,-1.0
4,report,1,True,20.0,GB,LAN,m,,,,...,True,2020-03-24,,2020-03-13,1.0,,,,-1.0,-1.0


In [5]:
def clean_df(df):
    """Check consent, drop bookkeeping columns, and encode boolean answers as 1/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw report table. Must contain the columns `_t`, `_v`, `consent` and
        every column listed in `bool_cols` below.

    Returns
    -------
    pandas.DataFrame
        A new frame without `_t`, `_v` and `consent`, in which True/False in
        the boolean answer columns are replaced by 1/0. Missing values are
        left as they are. The input frame is not modified.

    Notes
    -----
    Prints "consent given for all data" when `consent` equals True on every
    row of a non-empty frame, and "error" otherwise. Processing continues in
    both cases.
    """
    # `.eq(True)` counts NaN as "not consented". The emptiness guard sends an
    # empty frame down the "error" path instead of raising IndexError.
    all_consented = (not df.empty) and bool(df.consent.eq(True).all())
    if all_consented:
        print("consent given for all data")
    else:
        print("error")

    # `drop` returns a new frame, so the column assignments below never touch
    # the caller's data.
    df = df.drop(['_t', '_v', 'consent'], axis=1)

    bool_cols = ['doctors_office', 'walk_in_clinic', 'virtual_care', 'hospital_or_ER', 'other', 'still_in_hospital',
                 'symptom_quarantine', 'voluntary_quarantine', 'personally_required_quarantine', 'general_quarantine', 'still_in_quarantine',
                 "fever", "cough", "difficulty_breathing", "fatigue", "sore_throat", "still_symptomatic"]

    # A single replace call per column handles both True and False.
    for col in bool_cols:
        df[col] = df[col].replace({True: 1, False: 0})

    return df
    

In [6]:
# Use the anonymized 3-digit ZIP code to impute missing state abbreviations.
def _zip3_to_state(lat_long_df):
    """Map each zero-padded 3-digit ZIP prefix to the first state listed for it."""
    zips = pd.to_numeric(lat_long_df['Zip'], errors='coerce')
    valid = zips.notna()
    # Numeric ZIPs lose their leading zeros (01001 -> 1001), so pad back to
    # five digits before taking the prefix.
    prefixes = zips[valid].astype(int).astype(str).str.zfill(5).str[:3]
    lookup = {}
    for prefix, state in zip(prefixes, lat_long_df.loc[valid, 'State']):
        # Keep the first state seen for a prefix, in table order.
        lookup.setdefault(prefix, state)
    return lookup


def long_lat(df, zip_lookup=r'C:\Users\rayzc\Downloads\us-zip-code-latitude-and-longitude.csv'):
    """Add a `state_code` column and drop rows that carry no feature data.

    `state_code` is a copy of `region_code`, except that rows with a missing
    region code and a present `3_dig_zip` receive the state whose 5-digit ZIP
    codes start with that 3-digit prefix. No country filter is applied, even
    though the printed message refers to US ZIP codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `region_code` and `3_dig_zip`. It is not modified.
    zip_lookup : str, path-like or pandas.DataFrame, optional
        ZIP-to-state table with `Zip` and `State` columns. A path is read as a
        ';'-separated CSV. The default is the original author's local path, so
        pass your own path or a preloaded frame on any other machine.

    Returns
    -------
    pandas.DataFrame
        A new frame with `state_code` inserted at column position 3, minus the
        rows that are entirely NaN from column position 6 onward.
    """
    if isinstance(zip_lookup, pd.DataFrame):
        lat_long_df = zip_lookup
    else:
        lat_long_df = pd.read_csv(zip_lookup, sep=';')

    # Build the prefix table once; the per-row lookup is then a dict access
    # instead of a scan over the whole ZIP table.
    prefix_to_state = _zip3_to_state(lat_long_df)

    states = []
    match = 0
    for region, area in zip(df.region_code, df['3_dig_zip']):
        if pd.isna(region) and not pd.isna(area):
            # Exact prefix comparison: a substring test would let 190 match
            # 21900 and assign the wrong state.
            state = prefix_to_state.get(str(int(area)).zfill(3))
            if state is not None:
                region = state
                match += 1
        states.append(region)

    # Work on a copy so the caller's frame does not gain a column as a side effect.
    df = df.copy()
    df.insert(3, "state_code", states)
    print("{} US 3-dig-zipcodes w/ missing region codes matched with a state".format(match))

    all_nan_inds = df.index[df.iloc[:, 6:].isna().all(axis=1)]
    print("Removed {} rows with all nans in feature columns".format(len(all_nan_inds)))
    # Drop from the local frame, not from a notebook-level global.
    df = df.drop(all_nan_inds, axis=0)

    return df

In [7]:
# Stage 1: consent check, drop `_t`/`_v`/`consent`, turn True/False answers into 1/0.
# NOTE(review): keep this intermediate bound to `cleaner_data_df` and keep the two
# calls as separate statements; check long_lat's body for references to this name
# before renaming or merging them.
cleaner_data_df = clean_df(data_df)
# Stage 2: add `state_code` and drop rows that are entirely NaN from column
# position 6 onward.
cleaner_data_df = long_lat(cleaner_data_df)
# Shape after both stages, followed by a preview of the first ten rows.
print(cleaner_data_df.shape)
cleaner_data_df.head(10)

consent given for all data
7100 US 3-dig-zipcodes w/ missing region codes matched with a state
Removed 200 rows with all nans in feature columns
(49800, 39)


Unnamed: 0,age_group,country_code,region_code,state_code,gender,3_dig_zip,doctors_office,walk_in_clinic,virtual_care,hospital_or_ER,...,sore_throat,when_symptoms_ended,still_symptomatic,when_symptoms_started,is_symptomatic,test_date,test_location,test_results,tried_to_get_tested,tested
0,30.0,US,PA,PA,m,190.0,,,,,...,,,,,-1.0,,,,-1.0,-1.0
1,40.0,DE,,,f,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
2,20.0,IT,72,72,f,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
3,55.0,IT,72,72,m,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
4,20.0,GB,LAN,LAN,m,,,,,,...,1.0,2020-03-24,,2020-03-13,1.0,,,,-1.0,-1.0
5,50.0,IT,,,f,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
6,50.0,IT,,,f,,,,,,...,,,,,-1.0,,,,,-1.0
7,20.0,IT,72,72,m,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
8,20.0,IT,72,72,f,,,,,,...,,,,,-1.0,,,,-1.0,-1.0
9,40.0,IT,72,72,m,,,,,,...,,,,,-1.0,,,,-1.0,-1.0


In [8]:
# Persist the cleaned table next to the notebook, without writing the index.
cleaner_data_df.to_csv(path_or_buf='cleaner_covidData.csv', index=False)