In [17]:
import datetime
import os
import io
import re
import json
import numpy as np
import pandas as pd
import scipy.stats
# import boto3

In [195]:
def CONFIG():
    """Build the dictionary of file-system locations used by this notebook.

    Every path is rooted at the current working directory (``os.getcwd()``);
    input files live under its ``data`` sub-directory.

    Returns
    -------
    dict
        - ``'data'``: the ``data`` directory.
        - ``'eda'``: the ``eda`` directory (sibling of ``data``).
        - ``'csse'`` / ``'csse-us'``: path templates under ``csse-dly`` /
          ``csse-dly-us`` with a ``{DT}`` placeholder for the file name.
        - ``'csse-ts'``: nested dict with ``'base'`` (path template with
          ``{STAT}`` and ``{RGN}`` placeholders), ``'options'`` (the values
          listed for ``stat`` and ``rgn``) and ``'errata'`` (path to
          ``Errata.csv``).
        - ``'who-ts'`` / ``'fips'``: paths to single CSV files.
    """
    _cwd = os.getcwd()
    _data = os.path.join(_cwd,'data')

    _cfg = {
        'data': _data
        ,'eda': os.path.join(_cwd,'eda')
        ,'csse': os.path.join(_data,'csse-dly','{DT}.csv')
        ,'csse-us': os.path.join(_data,'csse-dly-us','{DT}.csv')
        ,'csse-ts': {
            'base': os.path.join(_data,'csse-ts','time_series_covid19_{STAT}_{RGN}.csv')
            ,'options': {
                'stat': ['confirmed','deaths','recovered']
                ,'rgn': ['global','US']
            }
            ,'errata': os.path.join(_data,'csse-ts','Errata.csv')
        }
        ,'who-ts': os.path.join(_data,'who-ts','who_covid_19_sit_rep_time_series.csv')
        ,'fips': os.path.join(_data,'csse-fips-lkup.csv')
    }

    return _cfg

# Module-level configuration, resolved once when this cell runs.
CFG = CONFIG()

In [221]:
def CSSE_DLY(base,start=datetime.date(2020,4,12),end=datetime.date(2020,4,30)):
    """Load one CSV per day over an inclusive date range into a single frame.

    Parameters
    ----------
    base : str
        Path template containing a ``{DT}`` placeholder, which is filled with
        each date formatted as ``%m-%d-%Y``.
    start, end : datetime.date
        First and last date to load; both ends are included.

    Returns
    -------
    pandas.DataFrame or None
        Rows of every file that could be read, in date order, each tagged with
        a ``data_dt`` column holding that file's date. Row labels are kept as
        read from each file, so they repeat across days. ``None`` when no
        file could be read (or the range is empty).

    Notes
    -----
    Loading is best effort: a date whose file cannot be read is skipped, and
    all skipped dates are reported together in one ``UserWarning``.
    """
    import warnings  # local import: only used for the skipped-date report

    _file = '%m-%d-%Y'
    frames = []
    gaps = []
    n_days = (end-start).days+1

    for i in range(n_days):
        dt = start+datetime.timedelta(days=i)
        try:
            frames.append(
                pd.read_csv(base.format(DT=dt.strftime(_file)),header=0).assign(data_dt=dt)
            )
        except Exception as e:
            # Keep going: a missing or unreadable day must not abort the load.
            gaps.append((dt,str(e)))

    if gaps:
        warnings.warn(
            'CSSE_DLY: skipped {} of {} date(s) that could not be read: {}'.format(
                len(gaps), n_days, ', '.join(d.isoformat() for d,_ in gaps)
            ),
            stacklevel=2,
        )

    if not frames:
        return None
    # Concatenate once: DataFrame.append no longer exists in pandas 2.x, and
    # appending inside the loop re-copied the accumulated rows every day.
    return pd.concat(frames)

In [222]:
def CLEANER(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Each label is converted to ``str``, loses at most one leading and at most
    one trailing non-alphanumeric character, and is then upper-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are rewritten; the same object is returned.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the normalised column labels.
    """
    def _COLS(x):
        """Return the normalised form of a single column label."""
        x = str(x)  # non-string labels (e.g. ints) cannot be indexed or upper-cased
        # The `x and` guards keep empty labels, and labels that become empty
        # after the first strip (e.g. "_"), from raising IndexError.
        if x and not x[0].isalnum():
            x=x[1:]
        if x and not x[-1].isalnum():
            x=x[:-1]
        return x.upper()
    # cols
    df.columns = [_COLS(c) for c in df.columns]

    return df

In [227]:
def EDA(df,name,n=100,path=None):
    """Write a Markdown profile of ``df`` to ``<path>/<name>.md``.

    The file contains a title, one bullet per column (name, position, dtype
    and the values of the first and last row), and an HTML table sampling the
    first and last ``n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str
        Used for the output file name (``<name>.md``) and, upper-cased
        basename, for the title.
    n : int, default 100
        Number of rows taken from each end for the sample table. When the
        frame has at most ``2*n`` rows it is shown whole, so no row appears
        twice.
    path : str, optional
        Output directory; defaults to ``<cwd>/eda``. Created if missing.

    Returns
    -------
    None
    """
    if not path:
        path=os.path.join(os.getcwd(),'eda')
    # Create the target directory so a fresh checkout does not fail on open().
    os.makedirs(path,exist_ok=True)

    lb = '\n'
    lblb = '\n\n'

    # columns & types
    content = '# EDA - {} Files {}'.format(os.path.basename(name).upper(),lblb)
    content+='#### Column Name [IDX] -  Dtype (Head / Tail) \n'
    dtypes = df.dtypes.to_dict()
    if len(df):
        head = df.head(1).T.iloc[:,0].to_list()
        tail = df.tail(1).T.iloc[:,0].to_list()
    else:
        # An empty frame has no first/last row to transpose.
        head = tail = [None]*len(df.columns)
    for i,j in enumerate(df.columns):
        content+='- **{}** [{}] - {} ({} / {}) {}'.format(j, i, dtypes.get(j), head[i], tail[i], lb)

    # html
    content+='{}#### Head / Tail [n={}] Sample {}'.format(lb+lblb,n,lblb)
    if len(df) > 2*n:
        # pd.concat replaces DataFrame.append, which pandas 2.x no longer has.
        sample = pd.concat([df.head(n),df.tail(n)])
    else:
        sample = df
    content+=sample.to_html(None,index=True,header=True)

    # Explicit UTF-8 so non-ASCII cell values are written the same way on every platform.
    with open(os.path.join(path,'{}.md'.format(name)),'w',encoding='utf-8') as f:
        f.write(content)

    # html data sample
#     df.head(n).append(df.tail(n)).to_html(os.path.join(path,'{}.html'.format(name)),index=True,header=True)

In [228]:
if __name__=='__main__':
    # Kept as a script block (the MAIN() wrapper was never enabled), so
    # `data`, `ERRS` and `CFG` are module-level names once this cell has run.

    # One slot per dataset, pre-filled with None; insertion order is the
    # order in which the EDA reports are written below.
    data = dict.fromkeys((
        'csse',
        'csse-us',
        'csse-ts',
        'who-ts',
        'csse-ts-err',
        'fips',
    ))
    ERRS = []
    CFG = CONFIG()

    # Daily CSSE files: global and US.
    data['csse'] = CLEANER(CSSE_DLY(CFG['csse']))
    data['csse-us'] = CLEANER(CSSE_DLY(CFG['csse-us']))

    # CSSE time series: only the confirmed / US combination is loaded here.
    data['csse-ts'] = CLEANER(
        pd.read_csv(
            CFG['csse-ts']['base'].format(STAT='confirmed',RGN='US'),
            header=0,
        )
    )

    # Single-file inputs.
    data['who-ts'] = CLEANER(pd.read_csv(CFG['who-ts'],header=0))
    data['csse-ts-err'] = CLEANER(pd.read_csv(CFG['csse-ts']['errata'],header=0))
    data['fips'] = CLEANER(pd.read_csv(CFG['fips'],header=0))

    # Write one Markdown profile per dataset (20 rows from each end).
    for k in data:
        EDA(data[k],k,n=20)