# Covid19 Data Analysis

#### Overview
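This notebook loads the Johns Hopkins CSSE Covid19 files and the US Census Bureau International Database population data, normalizes their column names, and writes a Markdown EDA summary and a Parquet copy of each dataset.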

#### Data Sources
* **Covid19**
    * [Johns Hopkins University - Covid19 Data](https://github.com/CSSEGISandData/COVID-19)
* **Demographics**
    * [US Census Bureau - 2019 Annual Social and Economic Supplements](https://www.census.gov/content/census/en/data/datasets/2019/demo/cps/cps-asec-2019.html)
    * [US Census Bureau - 2019 Current Population Survey](https://www.census.gov/content/census/en/data/datasets/2019/demo/cps/cps-basic-2019.html)
    * [US Census Bureau - International Demographic Overview](https://www.census.gov/data-tools/demo/idb/region.php?T=13&RT=0&A=both&Y=2020&C=&R=1)
    * [The World Bank - World Development Indicators](https://datacatalog.worldbank.org/dataset/world-development-indicators)
    * [The World Bank - Population Estimates and Projections](https://datacatalog.worldbank.org/dataset/population-estimates-and-projections)
    * [IMF - World Economic Outlook](https://www.imf.org/external/pubs/ft/weo/2020/01/weodata/download.aspx)

In [1]:
import logging
import datetime

import os
import io
import re
import json
import numpy as np
import pandas as pd
# import scipy.stats
# import tensorflow
# import boto3
import pyarrow
import requests

In [2]:
def CONFIG():
    """Build the configuration dictionary used by the notebook.

    Every local path is rooted at the current working directory, read via
    ``os.getcwd()`` at call time.

    Returns:
        dict: with the top-level keys

        * ``'paths'``     - input / output / EDA / Parquet directories.
        * ``'refresh'``   - path of ``refresh.sh`` in the working directory.
        * ``'jhu-covid'`` - path templates for the JHU files. ``{DT}``,
          ``{STAT}`` and ``{RGN}`` are ``str.format`` placeholders that the
          caller fills in.
        * ``'cb-idb'``    - base URL and query parameters for the Census
          Bureau International Database API.
    """
    _cwd = os.getcwd()
    _in = os.path.join(_cwd, 'in')
    _out = os.path.join(_cwd, 'out')
    _cov = os.path.join(_in, 'covid')
    _jhu = os.path.join(_cov, 'jhu')

    # TODO: sections for 'wb-dev', 'wb-pop' and 'imf-econ' are not defined yet.
    return {
        'paths': {
            'in': _in,
            'covid': _cov,
            'out': _out,
            'eda': os.path.join(_cwd, 'eda'),
            # Parquet files are written directly under the output directory.
            'pq': _out,
        },
        'refresh': os.path.join(_cwd, 'refresh.sh'),
        'jhu-covid': {
            'csse-dly': os.path.join(_jhu, 'csse-dly', '{DT}.csv'),
            'csse-dly-us': os.path.join(_jhu, 'csse-dly-us', '{DT}.csv'),
            'csse-ts': {
                'base': os.path.join(
                    _jhu, 'csse-ts', 'time_series_covid19_{STAT}_{RGN}.csv'
                ),
                'options': {
                    'stat': ['confirmed', 'deaths', 'recovered'],
                    'rgn': ['global', 'US'],
                },
                'errata': os.path.join(_jhu, 'csse-ts', 'Errata.csv'),
            },
            'who-ts': os.path.join(
                _jhu, 'who-ts', 'who_covid_19_sit_rep_time_series.csv'
            ),
            'fips': os.path.join(_jhu, 'csse-fips-lkup.csv'),
        },
        # census bureau - international database
        'cb-idb': {
            'base': 'https://api.census.gov/data/timeseries/idb/1year?',
            'query': {
                'get': ['AREA_KM2', 'NAME', 'AGE', 'POP', 'FIPS', 'SEX'],
                'time': ['2019'],
            },
        },
    }

In [3]:
def LOGGER():
    """Attach an in-memory, INFO-level handler to the root logger.

    Returns:
        tuple: ``(log, stream)`` where ``log`` is the root logger and
        ``stream`` is the ``io.StringIO`` buffer that receives the formatted
        records (read it with ``stream.getvalue()``).

    Note:
        Each call adds one more handler to the root logger, with its own
        fresh buffer.
    """
    _lvl = logging.INFO
    _str = io.StringIO()

    _handler = logging.StreamHandler(_str)
    _handler.setLevel(_lvl)
    _handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))

    _log = logging.getLogger()
    # The root logger defaults to WARNING, which would drop INFO records
    # before they ever reach the handler. Lower it when needed, but never
    # raise a more verbose level someone else configured.
    if _log.level == logging.NOTSET or _log.level > _lvl:
        _log.setLevel(_lvl)
    _log.addHandler(_handler)

    return _log,_str

In [4]:
def CSSE_DLY(base,start=datetime.date(2020,4,12),end=datetime.date(2020,4,30)):
    """Load one CSV per day in ``[start, end]`` and stack them into one frame.

    Args:
        base: path template containing a ``{DT}`` placeholder, filled with
            each date formatted as ``MM-DD-YYYY``.
        start: first date to load (inclusive).
        end: last date to load (inclusive).

    Returns:
        pandas.DataFrame with the rows of every file that could be read, plus
        a ``data_dt`` column holding the date of the source file. Row indices
        of the individual files are kept as-is. Returns ``None`` when no file
        could be read at all.

    Days whose file is missing or unreadable are skipped and reported through
    a single logging warning.
    """
    _file = '%m-%d-%Y'
    frames = []
    gaps = []

    for i in range((end-start).days+1):
        dt = start+datetime.timedelta(days=i)
        try:
            frames.append(pd.read_csv(base.format(DT=dt.strftime(_file)),header=0).assign(data_dt=dt))
        except (OSError, ValueError) as e:
            # OSError: file missing / not readable.
            # ValueError: pandas parser, empty-file and decoding errors.
            gaps.append((dt,str(e)))

    if gaps:
        logging.getLogger(__name__).warning(
            'CSSE_DLY skipped %d of %d day(s) for %s: %s',
            len(gaps), len(gaps)+len(frames), base,
            ', '.join(d.isoformat() for d,_ in gaps),
        )

    if not frames:
        return None
    # One concat instead of repeated DataFrame.append: append was removed in
    # pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)

In [5]:
def CLEANER(df):
    """Normalise the column names of *df* in place and return it.

    Each column label is converted to ``str``, loses at most one leading and
    one trailing non-alphanumeric character (e.g. a byte-order mark or a
    stray underscore) and is upper-cased.

    Args:
        df: pandas.DataFrame whose ``columns`` are rewritten.

    Returns:
        The same DataFrame object, with the renamed columns.
    """
    def _COLS(x):
        """Return the cleaned, upper-cased form of one column label."""
        x = str(x)
        # The emptiness checks keep '' and single-symbol labels such as '_'
        # from raising IndexError.
        if x and not x[0].isalnum():
            x = x[1:]
        if x and not x[-1].isalnum():
            x = x[:-1]
        return x.upper()

    df.columns = [_COLS(c) for c in df.columns]

    return df

In [6]:
def PARQUET(df,fout):
    """Write *df* to *fout* as a Parquet file.

    Args:
        df: pandas.DataFrame to serialise (the index is not written).
        fout: destination file path; its parent directory is created when
            missing.

    The previous implementation serialised into a ``BytesIO``, read it back
    without rewinding and wrote the (empty) result in text mode, so the
    output file never contained the data. Parquet is binary, so pandas now
    writes the file directly.
    """
    parent = os.path.dirname(fout)
    if parent:
        os.makedirs(parent,exist_ok=True)
    df.to_parquet(fout,engine='pyarrow',index=False)

In [7]:
def EDA(df,fout,n=100):
    """Write a Markdown exploratory summary of *df* to *fout*.

    The report contains a title derived from the file name (the part before
    the first '.', upper-cased), one bullet per column showing its position,
    dtype and the values from the first and last row, and an HTML table with
    the first ``n`` and last ``n`` rows.

    Args:
        df: pandas.DataFrame to describe. An empty frame is reported with
            ``None`` as head / tail values.
        fout: path of the Markdown file to write (UTF-8).
        n: number of rows taken from each end for the sample table. Frames
            with fewer than ``2 * n`` rows show the overlapping rows twice.
    """
    name = os.path.basename(fout).split('.')[0]

    lb = '\n'
    lblb = '\n\n'

    dtypes = df.dtypes.to_dict()
    if len(df.index):
        head = df.head(1).T.iloc[:,0].to_list()
        tail = df.tail(1).T.iloc[:,0].to_list()
    else:
        # no rows: there is no first / last value to show
        head = tail = [None]*len(df.columns)

    # columns & types
    parts = [
        '# EDA - {} Files {}'.format(name.upper(),lblb),
        '#### Column Name [IDX] -  Dtype (Head / Tail) \n',
    ]
    for i,j in enumerate(df.columns):
        parts.append('- **{}** [{}] - {} ({} / {}) {}'.format(j, i, dtypes.get(j), head[i], tail[i], lb))

    # html sample; pd.concat replaces DataFrame.append (removed in pandas 2.0)
    parts.append('{}#### Head / Tail [n={}] Sample {}'.format(lb+lblb,n,lblb))
    parts.append(pd.concat([df.head(n),df.tail(n)]).to_html(index=True,header=True))

    # explicit encoding so non-ASCII values do not depend on the platform default
    with open(fout,'w',encoding='utf-8') as f:
        f.write(''.join(parts))

In [8]:
def DB_IDB(base,query,timeout=30):
    """Query a JSON API and return the result table as a DataFrame.

    Args:
        base: base URL, expected to end with ``?`` because the query string
            is appended directly to it.
        query: mapping of parameter name to a list of string values; each
            list is joined with ``,`` and the parameters with ``&``.
        timeout: seconds to wait for the server before giving up.

    Returns:
        pandas.DataFrame built from the JSON payload, using its first row as
        column names and the remaining rows as data.

    Raises:
        requests.HTTPError: the server answered with an error status.
        requests.RequestException: connection problems or a timeout.
    """
    url = base + '&'.join('{}={}'.format(k,','.join(v)) for k,v in query.items())
    # A timeout keeps the notebook from hanging on a stalled connection.
    response = requests.get(url,timeout=timeout)
    # Fail with the HTTP error itself instead of an obscure JSON decode error.
    response.raise_for_status()
    rows = response.json()
    df = pd.DataFrame(rows[1:],columns=rows[0])
    return df

In [11]:
if __name__=='__main__':
    # Driver: load every configured dataset, normalise its column names and
    # write a Markdown EDA report plus a Parquet copy for each one.
    ERRS = []
    CFG = CONFIG()
    LOG,FEED = LOGGER()

    # ---- covid segment: JHU CSSE / WHO files read from local disk ----
    try:
        segment='covid'
        src = 'jhu'
        # the report and Parquet writers need their target directories to exist
        os.makedirs(CFG['paths']['eda'],exist_ok=True)
        os.makedirs(os.path.join(CFG['paths']['pq'],segment),exist_ok=True)
        covid = {
            'csse-dly': CSSE_DLY(CFG['jhu-covid']['csse-dly']),
            'csse-dly-us': CSSE_DLY(CFG['jhu-covid']['csse-dly-us']),
            # only the confirmed / US series is loaded here
            'csse-ts': pd.read_csv(CFG['jhu-covid']['csse-ts']['base'].format(STAT='confirmed',RGN='US'),header=0),
            'who-ts': pd.read_csv(CFG['jhu-covid']['who-ts'],header=0),
            'csse-ts-err': pd.read_csv(CFG['jhu-covid']['csse-ts']['errata'],header=0),
            'fips': pd.read_csv(CFG['jhu-covid']['fips'],header=0)
        }
        for k,v in list(covid.items()):
            if v is None:
                # CSSE_DLY returns None when none of the daily files could be read
                LOG.warning('%s-%s-%s: no data loaded, skipping',segment,src,k)
                continue
            # clean df
            covid[k] = CLEANER(v)
            # md eda file
            _ = os.path.join(CFG['paths']['eda'],'{}-{}-{}.md'.format(segment,src,k))
            EDA(covid[k],_,n=20)
            # parquet clean file
            _ = os.path.join(CFG['paths']['pq'],segment,'{}-{}.parquet'.format(src,k))
            PARQUET(covid[k],_)

    except Exception as e:
        # keep the traceback in the in-memory log, show the log, then fail
        LOG.critical(str(e),exc_info=True)
        print(FEED.getvalue())
        raise

    # ---- demo segment: Census Bureau International Database (HTTP API) ----
    try:
        segment='demo'
        name = 'cb-idb'
        os.makedirs(CFG['paths']['eda'],exist_ok=True)
        os.makedirs(os.path.join(CFG['paths']['pq'],segment),exist_ok=True)
        df = DB_IDB(**CFG[name])
        # clean
        df = CLEANER(df)
        # md eda file
        _ = os.path.join(CFG['paths']['eda'],'{}-{}.md'.format(segment,name))
        EDA(df,_,n=20)
        # parquet clean file
        _ = os.path.join(CFG['paths']['pq'],segment,'{}.parquet'.format(name))
        PARQUET(df,_)

    except Exception as e:
        LOG.critical(str(e),exc_info=True)
        print(FEED.getvalue())
        raise