-
Notifications
You must be signed in to change notification settings - Fork 9
/
dataset_builder.py
34 lines (27 loc) · 1014 Bytes
/
dataset_builder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import numpy as np
from random import shuffle
from faker import Faker
fake = Faker()
'''
If you'd like to use something other than ISBN numbers in your datasets,
faker provides good support for that. I encourage you to read the docs
to learn about the different providers that they have available:
https://faker.readthedocs.io/en/latest/providers.html
'''
# Dataset dimensions: NUM_BASKETS is the number of baskets (one per line)
# written to each file; NUM_UNIQUE_ITEMS is the size of the item catalogue
# the baskets are sampled from.
NUM_BASKETS = 1000
NUM_UNIQUE_ITEMS = 30

# The bound equals the starting value, so exactly one dataset is generated.
# Raise the bound to also emit 10x larger datasets on later iterations.
while NUM_BASKETS <= 1000:
    # Map each 1-based item rank to a freshly generated ISBN-13 string.
    item_dict = {
        rank: fake.isbn13(separator="-")
        for rank in range(1, NUM_UNIQUE_ITEMS + 1)
    }

    filename = 'dataset_{0}_baskets_{1}_objects.txt'.format(
        NUM_BASKETS, NUM_UNIQUE_ITEMS
    )
    with open(filename, 'w') as f:
        for _ in range(NUM_BASKETS):
            # The item of rank r is included with probability 1/r: rank 1
            # lands in every basket and higher ranks get progressively rarer.
            # The range covers every rank in item_dict, including the last
            # one, so the file really draws from NUM_UNIQUE_ITEMS items.
            curr_basket = [
                item_dict[rank]
                for rank in range(1, NUM_UNIQUE_ITEMS + 1)
                if np.random.uniform() < 1.0 / rank
            ]
            # Shuffle so an item's position in the line does not reveal its rank.
            shuffle(curr_basket)
            f.write(' '.join(curr_basket) + '\n')
    # No explicit close: the with-block closes the file on exit.

    NUM_BASKETS *= 10