In [1]:
from dask.distributed import Client, progress
# Start a distributed client backed by 4 workers with one thread each.
client = Client(n_workers=4, threads_per_worker=1)
# Bare last expression so the notebook renders the client/cluster summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:51490,Cluster  Workers: 4  Cores: 4  Memory: 34.36 GB


In [3]:
import dask
import json
import os

# Make sure the data/ directory is present; exist_ok avoids an error on re-run.
os.makedirs('data', exist_ok=True)

# Build a bag of synthetic "people" records.
b = dask.datasets.make_people()

# Serialise every record to a JSON string, then write the strings to disk.
# The `*` in the pattern is replaced by a running index (data/0.json, ...).
encoded = b.map(json.dumps)
encoded.to_textfiles('data/*.json')

['data/0.json',
 'data/1.json',
 'data/2.json',
 'data/3.json',
 'data/4.json',
 'data/5.json',
 'data/6.json',
 'data/7.json',
 'data/8.json',
 'data/9.json']

In [4]:
# Shell escape: print the first two lines of the first generated file
# to inspect the raw JSON text that was written.
!head -n 2 data/0.json

{"age": 61, "name": ["Danny", "Hebert"], "occupation": "Payroll Clerk", "telephone": "+1-(446)-705-7967", "address": {"address": "705 Crane Promenade", "city": "Hesperia"}, "credit-card": {"number": "3726 548771 90391", "expiration-date": "09/24"}}
{"age": 65, "name": ["Reed", "Horn"], "occupation": "Theatre Technician", "telephone": "(297) 090-8128", "address": {"address": "543 Valdez Thruway", "city": "St. Joseph"}, "credit-card": {"number": "4690 1877 6875 3602", "expiration-date": "10/16"}}


In [5]:
import dask.bag as db
import json

# Load the text of every file matching the pattern, then parse each
# element with json.loads to get Python objects back.
raw_lines = db.read_text('data/*.json')
b = raw_lines.map(json.loads)
b

dask.bag<loads-d..., npartitions=10>

In [6]:
# Pull the first two parsed records to check what the data looks like.
b.take(2)

({'age': 61,
  'name': ['Danny', 'Hebert'],
  'occupation': 'Payroll Clerk',
  'telephone': '+1-(446)-705-7967',
  'address': {'address': '705 Crane Promenade', 'city': 'Hesperia'},
  'credit-card': {'number': '3726 548771 90391', 'expiration-date': '09/24'}},
 {'age': 65,
  'name': ['Reed', 'Horn'],
  'occupation': 'Theatre Technician',
  'telephone': '(297) 090-8128',
  'address': {'address': '543 Valdez Thruway', 'city': 'St. Joseph'},
  'credit-card': {'number': '4690 1877 6875 3602',
   'expiration-date': '10/16'}})

In [7]:
# Keep only records whose 'age' is strictly greater than 30, then show two of them.
b.filter(lambda record: record['age'] > 30).take(2)

({'age': 61,
  'name': ['Danny', 'Hebert'],
  'occupation': 'Payroll Clerk',
  'telephone': '+1-(446)-705-7967',
  'address': {'address': '705 Crane Promenade', 'city': 'Hesperia'},
  'credit-card': {'number': '3726 548771 90391', 'expiration-date': '09/24'}},
 {'age': 65,
  'name': ['Reed', 'Horn'],
  'occupation': 'Theatre Technician',
  'telephone': '(297) 090-8128',
  'address': {'address': '543 Valdez Thruway', 'city': 'St. Joseph'},
  'credit-card': {'number': '4690 1877 6875 3602',
   'expiration-date': '10/16'}})

In [8]:
# Project each record down to its 'occupation' value and show the first two.
b.map(lambda record: record['occupation']).take(2)

('Payroll Clerk', 'Theatre Technician')

In [9]:
# Count the records in the bag; .compute() runs the count and returns the number.
b.count().compute()

10000

In [12]:
# Build the pipeline step by step; displaying `result` shows a bag, not values.
over_30 = b.filter(lambda person: person['age'] > 30)           # age strictly above 30
occupations = over_30.map(lambda person: person['occupation'])  # keep the occupation only
occupation_counts = occupations.frequencies(sort=True)          # (occupation, count) pairs
result = occupation_counts.topk(15, key=1)                      # 15 largest, ranked by element 1 (the count)
result

dask.bag<topk-ag..., npartitions=1>

In [13]:
# Run the pipeline held in `result` and return the concrete (occupation, count) pairs.
result.compute()

[('Transport Officer', 15),
 ('Medical Physicist', 14),
 ('Judge', 14),
 ('Loans Manager', 14),
 ('Garage Manager', 13),
 ('Sales Manager', 13),
 ('Public Relations Officer', 13),
 ('Careers Advisor', 13),
 ('Playgroup Leader', 13),
 ('Taxidermist', 12),
 ('Aircraft Designer', 12),
 ('Pool Attendant', 12),
 ('Actor', 12),
 ('Travel Agent', 12),
 ('Technical Assistant', 12)]

In [14]:
# Select records of interest (age strictly above 30)
selected = b.filter(lambda person: person['age'] > 30)
# Convert Python objects to text
selected_text = selected.map(json.dumps)
# Write to local disk; the `*` is replaced by a running index
selected_text.to_textfiles('data/processed.*.json')


['data/processed.0.json',
 'data/processed.1.json',
 'data/processed.2.json',
 'data/processed.3.json',
 'data/processed.4.json',
 'data/processed.5.json',
 'data/processed.6.json',
 'data/processed.7.json',
 'data/processed.8.json',
 'data/processed.9.json']

In [15]:
# Show a single record to recall its nested layout: 'name' is a list,
# while 'address' and 'credit-card' are nested dicts.
b.take(1)

({'age': 61,
  'name': ['Danny', 'Hebert'],
  'occupation': 'Payroll Clerk',
  'telephone': '+1-(446)-705-7967',
  'address': {'address': '705 Crane Promenade', 'city': 'Hesperia'},
  'credit-card': {'number': '3726 548771 90391', 'expiration-date': '09/24'}},)

In [16]:
def flatten(record):
    """Turn one nested person record into a single-level dict.

    Parameters
    ----------
    record : dict
        Must provide the keys 'age', 'occupation', 'telephone', 'name'
        (an iterable of strings), 'credit-card' (with 'number' and
        'expiration-date') and 'address' (with 'address' and 'city').

    Returns
    -------
    dict
        A new dict with the keys 'age', 'occupation', 'telephone',
        'credit-card-number', 'credit-card-expiration', 'name',
        'street-address' and 'city', in that order. The name parts are
        joined with a single space. The input record is not modified.

    Raises
    ------
    KeyError
        If any of the required keys is missing.
    """
    # Scalar fields are copied over unchanged.
    flat = {field: record[field] for field in ('age', 'occupation', 'telephone')}

    # Lift the nested credit-card fields to the top level.
    card = record['credit-card']
    flat['credit-card-number'] = card['number']
    flat['credit-card-expiration'] = card['expiration-date']

    # The name parts become one space-separated string.
    flat['name'] = ' '.join(record['name'])

    # Lift the nested address fields to the top level.
    location = record['address']
    flat['street-address'] = location['address']
    flat['city'] = location['city']
    return flat

# Apply flatten to the records and show the first flattened result.
b.map(flatten).take(1)

({'age': 61,
  'occupation': 'Payroll Clerk',
  'telephone': '+1-(446)-705-7967',
  'credit-card-number': '3726 548771 90391',
  'credit-card-expiration': '09/24',
  'name': 'Danny Hebert',
  'street-address': '705 Crane Promenade',
  'city': 'Hesperia'},)

In [17]:
# Flatten every record first so each one is a single-level dict,
# then convert the bag of flat dicts into a dataframe.
flat_records = b.map(flatten)
df = flat_records.to_dataframe()
# Preview the first rows.
df.head()

Unnamed: 0,age,city,credit-card-expiration,credit-card-number,name,occupation,street-address,telephone
0,61,Hesperia,09/24,3726 548771 90391,Danny Hebert,Payroll Clerk,705 Crane Promenade,+1-(446)-705-7967
1,65,St. Joseph,10/16,4690 1877 6875 3602,Reed Horn,Theatre Technician,543 Valdez Thruway,(297) 090-8128
2,65,Fairhope,08/19,3414 746274 26567,Eldon Stanley,Taxidermist,980 Brooklyn Bay,1-949-426-0203
3,64,Grayslake,12/25,3703 305616 78130,Neida Ingram,Windscreen Fitter,1197 Tunnel Townline,979.626.8484
4,51,Trotwood,04/17,3447 873898 09457,Latoyia Swanson,Pest Controller,1381 Century Trail,(631) 561-7894
