## TEST DATASET INITIALISATION

In [1]:
import sys
# Add the parent directory to the module search path; the `datasets` imports below
# presumably resolve to a project-local package located there — TODO confirm.
sys.path.append('../')

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import datasets
import importlib
# Re-execute the already-imported `datasets` module so that edits to its source are
# picked up without restarting the kernel. Note that importlib.reload only reloads
# the module object it is given; submodules such as `datasets.utils` are not
# reloaded by this call.
importlib.reload(datasets)

from datasets.utils import dec2bin, dec2base
from datasets.random_hierarchy_model import sample_rules

In [41]:
# Hyper-parameters of the Random Hierarchy Model; the names in the trailing
# comments are the RandomHierarchyModel keyword arguments they are passed to.
n = 2  # num_classes
v = 2  # num_features (vocabulary size)
m = 2  # num_synonyms (features multiplicity)

L = 1  # num_layers
s = 2  # tuple_size (number of branches of the tree)

# Number of pixels; the actual input size is (input_size x num_features)
# because of one-hot encoding.
input_size = pow(s, L)
# Total number of data: n * m**((s**L - 1) // (s - 1)); input_size already holds s**L.
num_data = n * m ** ((input_size - 1) // (s - 1))
print(input_size, num_data)

2 4


## STANDARD SAMPLING (WITHOUT REPLACEMENT)

In [42]:
# --- Seeds ---------------------------------------------------------------
seed_rules = 12345678  # seed of the random hierarchy model (fixed)
# Re-seed Python's RNG from system entropy / current time, then draw a fresh
# 8-digit sampling seed; it is printed so a given draw can be reproduced later.
random.seed()
seed_sample = random.randrange(10_000_000, 99_999_999)
print('sampling seed:', seed_sample)

# --- Sizes and encoding ----------------------------------------------------
# NOTE(review): train_size = -1 looks like a "use everything" sentinel (the
# recorded run returned all num_data = 4 samples) — confirm against
# RandomHierarchyModel.
train_size = -1         # size of the training set
test_size = 0           # size of the test set
input_format = 'long'   # alternative: onehot
# To generate the full dataset explicitly: train_size=num_data, test_size=0.

dataset = datasets.RandomHierarchyModel(
    # hierarchy structure
    num_classes=n,               # number of classes
    num_features=v,              # vocabulary size
    num_synonyms=m,              # features multiplicity
    num_layers=L,                # number of layers
    tuple_size=s,                # number of branches of the tree
    # seeds
    seed_rules=seed_rules,
    seed_sample=seed_sample,
    # sampling
    train_size=train_size,
    test_size=test_size,
    replacement=False,
    # encoding
    input_format=input_format,
    whitening=0,                 # 1 to whiten the input
)

# List the attributes exposed by the dataset object.
print(dir(dataset))
# Input points live in dataset.features. With input_format='long' the recorded
# run shows shape (num_samples, input_size); the earlier note
# "train_size x num_features x input_size" presumably describes the one-hot
# format — TODO confirm.
print(dataset.features.size())
# Labels live in dataset.labels, one entry per sample.
print(dataset.labels.size())

23924332


In [43]:
# Show the shape of the sampled inputs, then a summary that depends on the encoding.
x = dataset.features
print(x.size())

if 'onehot' in input_format:
    # Reduce over dim 1 first, then average the result over everything that is left.
    mean_over_dim1 = x.mean(dim=1)
    norm_over_dim1 = x.norm(dim=1)
    print(mean_over_dim1.mean())
    print(norm_over_dim1.mean())

elif 'long' in input_format:
    # Integer-encoded inputs: print every sample next to its label.
    for i in range(len(x)):
        print(x[i, :], dataset.labels[i])

torch.Size([4, 2])
tensor([1, 1]) tensor(0)
tensor([2, 1]) tensor(0)
tensor([2, 2]) tensor(1)
tensor([1, 2]) tensor(1)


In [44]:
# Inspect the production rules stored on the dataset.
#
# The number of rule levels is kept in its own variable rather than assigned to
# `L`: `L` is the num_layers hyper-parameter set in the configuration cell and is
# passed to RandomHierarchyModel again further down, so overwriting it here would
# make later cells depend on this cell having been run.
num_rule_levels = len(dataset.rules)
print('rules: list of length ', num_rule_levels, ',')
print('first element of size ', dataset.rules[0].size(), ', (num_classes x num_synonyms x tuple_size)')

# Every level after the first is reported with the per-feature layout.
for level in range(1, num_rule_levels):
    print(f'{level+1}-th element of size ', dataset.rules[level].size(), ', (num_features x num_synonyms x tuple_size)')
print('rules[l][v,j] = j-th rep of the v-th level-(L-l) feature,')
print('e.g. list of tuples corresponding to class 0:')
# First index 0 selects class 0; the slice keeps all of its synonymous tuples.
print(dataset.rules[0][0,:])

rules: list of length  1 ,
first element of size  torch.Size([2, 2, 2]) , (num_classes x num_synonyms x tuple_size)
rules[l][v,j] = j-th rep of the v-th level-(L-l) feature,
e.g. list of tuples corresponding to class 0:
tensor([[0, 0],
        [1, 0]])


## SAMPLING WITH REPLACEMENT (REQUIRED FOR DATASETS LARGER THAN sys.maxsize)

In [60]:
# Sampling configuration: same settings as the standard-sampling run, but drawn
# with replacement.
train_size, test_size = -1, 0   # sizes of the training set and of the test set
input_format = 'long'           # alternative: onehot
# To generate the full dataset explicitly: train_size=num_data, test_size=0.
# NOTE(review): train_size = -1 is presumably a sentinel handled inside
# RandomHierarchyModel — confirm its meaning when replacement=True.

seed_rules = 12345678           # seed of the random hierarchy model

# Draw a new 8-digit sampling seed from a freshly re-seeded RNG and report it,
# so that this particular sample can be regenerated.
random.seed()
seed_sample = random.randrange(10_000_000, 99_999_999)
print('sampling seed:', seed_sample)

dataset = datasets.RandomHierarchyModel(
    replacement=True,
    input_format=input_format,
    whitening=0,                # 1 to whiten the input
    train_size=train_size,
    test_size=test_size,
    seed_sample=seed_sample,
    seed_rules=seed_rules,
    tuple_size=s,               # number of branches of the tree
    num_classes=n,              # number of classes
    num_layers=L,               # number of layers
    num_synonyms=m,             # features multiplicity
    num_features=v,             # vocabulary size
)

sampling seed: 37774993


In [61]:
# Inspect the inputs of the dataset sampled with replacement.
x = dataset.features
print(x.size())

if 'onehot' in input_format:
    # Print the mean and then the norm, each taken over dim 1 and averaged afterwards.
    for reduce_over_dim1 in (x.mean, x.norm):
        print(reduce_over_dim1(dim=1).mean())

elif 'long' in input_format:
    # One line per sample: the integer-encoded input followed by its label.
    for i in range(x.shape[0]):
        print(x[i, :], dataset.labels[i])

torch.Size([4, 2])
tensor([2, 1]) tensor(0)
tensor([2, 2]) tensor(1)
tensor([1, 1]) tensor(0)
tensor([1, 1]) tensor(0)
