In [15]:
import os
import boto3
import pickle
import pandas as pd

In [16]:
def maybe_pickle(data, filename, force=False):
    """Pickle ``data`` to ``filename`` unless that file already exists.

    Args:
        data: Any picklable object.
        filename: Destination path of the pickle file.
        force: When True, overwrite an existing file instead of skipping.

    Failures are reported with ``print`` rather than raised (best-effort).
    The pickle is first written to a temporary sibling file and only moved
    onto ``filename`` once it has been written completely. A failed dump
    therefore never leaves a truncated file that later calls would mistake
    for a valid cache, and never clobbers a previously good file when
    ``force=True``.
    """
    if os.path.exists(filename) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping pickling.' % filename)
        return

    print('Pickling %s.' % filename)
    # Sibling path, so the final rename stays within one directory.
    tmp_filename = '%s.tmp' % filename
    try:
        with open(tmp_filename, 'wb') as f:
            pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        # Swap the finished file into place in a single step.
        os.replace(tmp_filename, filename)
    except Exception as e:
        print('Unable to save data to', filename, ':', e)
        # Best-effort cleanup of the partial temporary file (it may not exist
        # at all if open() itself failed).
        try:
            os.remove(tmp_filename)
        except OSError:
            pass


def maybe_download(table, force=False):
    """Return the path of the local pickle cache for ``table``, downloading if needed.

    Args:
        table: Table name passed to ``data_retrieve``; also the stem of the
            cache file (``<table>.pkl``, relative to the working directory).
        force: When True, download again and overwrite an existing cache file.

    Returns:
        The cache file path. ``maybe_pickle`` only prints on failure, so the
        path is returned even if writing the cache did not succeed.
    """
    storage_file = table + ".pkl"
    if force or not os.path.exists(storage_file):
        data = data_retrieve(table)
        # Having decided to (re)download, always overwrite. Without force=True
        # a forced refresh would fetch fresh data and then skip writing it,
        # because the old cache file is still on disk.
        maybe_pickle(data, storage_file, force=True)
    return storage_file
        
def data_retrieve(table, page_size=50):
    """Scan the whole DynamoDB table ``table`` and return its items as a list.

    Args:
        table: Name of the table to scan.
        page_size: ``PageSize`` handed to the scan paginator.

    Returns:
        A list with the ``Items`` entries of every page, in the order the
        paginator yielded them. Items are returned exactly as received.
    """
    scan_pages = boto3.client('dynamodb').get_paginator('scan').paginate(
        TableName=table,
        PaginationConfig={"PageSize": page_size},
    )

    collected = []
    for page_number, page in enumerate(scan_pages, start=1):
        collected += page["Items"]
        # Progress line every 50 pages.
        if page_number % 50 == 0:
            print("next page. {}. Count {}. ScannedCount: {}".format(page_number, page["Count"], page["ScannedCount"]))

    return collected
    

In [17]:
# Fetch the "apthunt" table, or reuse the pickle cached by an earlier run.
data_file = maybe_download("apthunt")
# Load through a context manager so the file handle is closed right away
# instead of lingering until garbage collection. Only unpickle cache files
# that this notebook wrote itself.
with open(data_file, 'rb') as cache_fh:
    data = pickle.load(cache_fh)

In [18]:
# Convert each item from DynamoDB's format into plain Python values.
from boto3.dynamodb import types

deserializer = types.TypeDeserializer()
python_data = [
    {
        attr_name: deserializer.deserialize(attr_value)
        for attr_name, attr_value in item.items()
    }
    for item in data
]

In [19]:
# Build a DataFrame from the list of deserialized item dicts.
df = pd.DataFrame(python_data)

In [37]:
# Summary statistics of the parsed_district column.
df["parsed_district"].describe()

count        55351
unique         969
top       berkeley
freq          1036
Name: parsed_district, dtype: object