In [5]:
import ujson, csv
import random
import time
from pathlib import Path
from faker import Faker
from tqdm import tqdm_notebook as tqdm

#### Global setup

In [2]:
# Directory that every benchmark file produced by this notebook is written to.
BENCHMARK_DIR = Path("/workspace/drepr/volumes/benchmark_data")
assert BENCHMARK_DIR.exists(), f"benchmark directory not found: {BENCHMARK_DIR}"

# Seed both random sources used below (the `random` module and Faker) so that
# re-running the notebook regenerates the same datasets.
SEED = 42
random.seed(SEED)

faker = Faker()
faker.seed_instance(SEED)

#### Generate datasets and their representation

**HR Dataset**

```
Company:
    + name: string
    + address: string
    + phone: string
    + employees: array of employee ids

Employee:
    + id: string
    + name: string
    + salary: integer
```

In [3]:
# Build the employee table: one record per employee, holding a zero-padded
# sequential id, a Faker-generated name and a random integer salary in [0, 1000].
n_employees = 1_000_000
employees = [
    dict(id=f"E{idx:09d}", name=faker.name(), salary=random.randint(0, 1000))
    for idx in tqdm(range(n_employees))
]

HBox(children=(IntProgress(value=0, max=1000000), HTML(value='')))




In [44]:
# Give each company a random, non-overlapping group of employees.
n_companies = 100
min_n_employees = 1
max_n_employees = 10

# Draw every company's head count first, then pick all employee indices with a
# single sampling call. random.sample() never repeats an element, so the
# per-company slices taken below are disjoint by construction.
# The population is a range (a sequence) because random.sample() no longer
# accepts a set (deprecated in Python 3.9, TypeError from 3.11); this also
# avoids rebuilding a set of all remaining employees on every iteration.
company_sizes = [
    random.randint(min_n_employees, max_n_employees) for _ in range(n_companies)
]
assigned = random.sample(range(len(employees)), sum(company_sizes))

companies = []
offset = 0
for i in tqdm(range(n_companies)):
    # Indices (into `employees`) of the people working for company i.
    company_employees = assigned[offset:offset + company_sizes[i]]
    offset += company_sizes[i]

    companies.append({
        "name": f"company-{i}",
        "address": faker.address(),
        "phone": faker.phone_number(),
        "employees": [employees[j]['id'] for j in company_employees]
    })

# Indices of the employees that were not assigned to any company.
employee_pools = set(range(len(employees))).difference(assigned)

HBox(children=(IntProgress(value=0), HTML(value='')))




In [7]:
# Disabled JSON exports, kept for reference; uncomment to regenerate them.
# with open(BENCHMARK_DIR / "hr_company.json", "w") as f:
#     ujson.dump(companies, f, indent=2)

# with open(BENCHMARK_DIR / "hr_employee.json", "w") as f:
#     ujson.dump(employees, f, indent=2)

# with open(BENCHMARK_DIR / "hr_company_employee.small.json", "w") as f:
#     ujson.dump({
#         "companies": companies,
#         "employees": employees,
#     }, f, indent=2)

# Export the employee table as CSV with a header row.
# The file is opened with newline="" as the csv module requires: the writer
# emits its own line terminators, and without newline="" they can be translated
# a second time on platforms that use "\r\n" line endings.
with open(BENCHMARK_DIR / "hr_employee.csv", "w", newline="") as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["id", "name", "salary"])
    writer.writerows([e['id'], e['name'], e['salary']] for e in employees)

In [36]:
# Write the YAML representation file for the combined company/employee JSON
# document: it declares a JSON resource and maps each variable to a path inside
# that document.
repr_path = BENCHMARK_DIR / "hr_company_employee.repr.yml"
repr_yaml = '''
resources: 
    default:
        type: JSON
variables:
    cname:
        location: ["companies", "..", "name"]
    caddress:
        location: ["companies", "..", "address"]
    cemployees:
        location: ["companies", "..", "employees", ".."]
    eid:
        location: ["employees", "..", "id"]
    ename:
        location: ["employees", "..", "name"]
alignments: []
'''
with repr_path.open("w") as repr_file:
    repr_file.write(repr_yaml)

In [14]:
# Measure the wall-clock time ujson needs to open and parse the combined
# company/employee JSON file.
# NOTE(review): no active cell visible in this notebook writes
# "hr_company_employee.json" (the disabled export targets a ".small.json"
# name) -- confirm the file exists before running this cell.
json_path = BENCHMARK_DIR / "hr_company_employee.json"

start = time.time()
with json_path.open("r") as f:
    data = ujson.load(f)
end = time.time()

elapsed = end - start
print("take", elapsed)

take 0.7629098892211914


In [13]:
# Measure the wall-clock time the csv module needs to read the whole employee
# CSV file into memory.

# `salary` is printed at the end but only updated by the disabled loop below,
# so it is initialised here; otherwise the cell raises NameError on a fresh
# kernel.
salary = 0

start = time.time()

# newline="" is what the csv module requires for files handed to csv.reader,
# so that newlines embedded in quoted fields are interpreted correctly.
with open(BENCHMARK_DIR / "hr_employee.csv", "r", newline="") as f:
    reader = csv.reader(f, delimiter=',', quoting=csv.QUOTE_ALL)
    rows = list(reader)
# Optional extra workload (disabled): sum the salary column, skipping the header.
#     for row in rows[1:]:
#         salary += int(row[2])

end = time.time()
print('salary', salary, 'time:', end - start)

salary 0 time: 1.7156329154968262
