In [37]:
import requests
import pandas as pd
#df = pd.read_csv('/home/will/notebooks/usage_data/cmg/2020-04-19__2020-04-26/Fire-MainService.raw.2020-04-19__2020-04-26.csv')

def add_time_cols(df):
    '''
    Returns a dataframe with added columns describing the time of each measurement.
    
    df     input dataframe
    '''
    df.loc[:,'hr'] = df.index.map(lambda d: d.timetuple().tm_hour)    
    df.loc[:,'month'] = df.index.map(lambda d: d.timetuple().tm_mon)
    df.loc[:,'year'] = df.index.map(lambda d: d.timetuple().tm_year)
    df.loc[:,'mday'] = df.index.map(lambda d: d.timetuple().tm_mday)
    df.loc[:,'yday'] = df.index.map(lambda d: d.timetuple().tm_yday)
    df.loc[:,'yrmonth'] = df['year']*100 + df['month']
    df.loc[:,'yryday'] = df['year']*1000 + df['yday']
    return df

def process_df(df):
    #df =df.set_index(pd.to_datetime(df.ts))
    #del df['ts']
    #df.columns = ['usage']
    return add_time_cols(df)

def load_df(key, hdf='dataset.h5'):
    df = pd.read_hdf(hdf, key)
    return process_df(df)

def see_available_datasets(hdf='dataset.h5'):
    store = pd.HDFStore(hdf)
    return store.keys()

def download_dataset(url='https://wtgtemp.s3.amazonaws.com/dataset.h5', hdf='dataset.h5', force=False):
    if os.path.exists(hdf):
        print("Dataset already exists.  Skipping.  Rename or delete the file to download again")
        return False
    print("Downloading...")
    resp = requests.get(url)
    with open(hdf, 'wb') as f:
        f.write(resp.content)

Some functions Ive found generally useful for slicing and dicing

In [35]:
#retrievers ====================
def get_item(df, name, value):
    '''
    Returns a subsection of the dataframe df constituted by all rows corresponding to an arbitrary column value
    
    df:        dataframe input
    name:      name of the column whose value is being examined. (Ex: "year")
    value:     column value whose rows will be included in the returned dataframe. (Ex: "2018")
    '''
    return df.ix[df[name]==value]

def get_day(df, yryday):
    '''
    Returns the subsection of the dataframe df corresponding to yryday
    
    df:          dataframe input
    yrmonth:     year and day (in "year_yday" format, ex: 201842 for 2/11) whose subsection is returned
    '''
    return get_item(df, 'yryday', yryday)

def get_month(df, yrmonth):
    '''
    Returns the subsection of the dataframe df corresponding to yrmonth
    
    df:          dataframe input
    yrmonth:     year and month (in "year_month" format, ex: 201803 for 3/2018) whose subsection is returned
    '''
    return get_item(df, 'yrmonth', yrmonth)

#iterators ==================
def get_list_dfs(df, name):
    '''
    Decomposes an input dataframe into separate dataframes, each with rows corresponding to a unique value on a specified column.
    
    df:      input dataframe
    name:    column whose values are examined
    
    '''
    return [df.ix[df[name]==value] for value in df[name].unique()]

def month_dfs(df):
    '''
    Returns a list of dfs, one for each month
    '''
    #return [df.ix[df.yrmonth==yrmonth] for yrmonth in df.yrmonth.unique()]
    return get_list_dfs(df, 'yrmonth')

def day_dfs(df):
    '''
    Returns a list of dfs, one for each day.
    '''
    return get_list_dfs(df, 'yryday')


In [38]:
download_dataset()
see_available_datasets()

NameError: name 'os' is not defined