In [None]:
import numpy as np

In [None]:
def load_dataset(path):
    """Read a sparse text dataset and return its feature indices per row.

    Every non-blank line is split on whitespace. The first token is ignored
    (presumably the row label - confirm against the data files). Each
    remaining token is expected to look like ``index:value``; only the part
    before the first ``:`` is used, converted from 1-based to 0-based.

    Blank lines are skipped. A line that carries only the first token yields
    an empty index list instead of raising.

    Args:
        path: Path of the text file to read.

    Returns:
        A tuple ``(points, n_features)``. ``points[i]`` is the list of
        0-based feature indices of the i-th non-blank line, and
        ``n_features`` is the largest index seen plus one (0 when no index
        was seen at all).

    Raises:
        ValueError: If the part of a token before ``:`` is not an integer.
    """
    points = []
    max_index = -1
    with open(path) as f:
        for line in f:
            # split() with no argument collapses runs of whitespace and drops
            # the trailing newline, so doubled or trailing spaces no longer
            # produce empty tokens that int() would choke on.
            tokens = line.split()
            if not tokens:
                continue
            indices = [int(token.split(':', 1)[0]) - 1 for token in tokens[1:]]
            points.append(indices)
            # default=-1 keeps rows without any feature from raising in max().
            max_index = max(max_index, max(indices, default=-1))

    return points, max_index + 1

In [None]:
# Number of contiguous row chunks the dataset is split into; get_column_perm
# assigns every feature to one of these chunks.
PARTS = 4

In [None]:
class Dataset:
    """A list of points together with its ``(rows, features)`` shape."""

    def __init__(self, points, features):
        """Keep ``points`` and record the dataset shape.

        Args:
            points: Sized collection of points; its length becomes
                ``shape[0]``.
            features: Value stored as ``shape[1]``.
        """
        n_points = len(points)
        self.points = points
        self.shape = (n_points, features)

In [None]:
def get_column_perm(data, perm, parts=None):
    """Group features by the row chunk in which they occur most often.

    The rows, visited in ``perm`` order, are cut into ``parts`` contiguous
    chunks of ``data.shape[0] // parts`` rows each (the last chunk absorbs
    the remainder). For every feature the occurrences per chunk are counted;
    the feature is owned by the chunk with the highest count (lowest chunk
    index on ties). Features that never occur get an owner drawn with
    ``np.random.randint``.

    Inside each chunk's group, features are ordered by decreasing share of
    their occurrences that fall into the owning chunk; ties are broken by
    the larger feature index first.

    Args:
        data: Object exposing ``shape`` (rows, features) and ``points``
            (per-row lists of feature indices).
        perm: Row order; ``perm[i]`` is the index into ``data.points`` of
            the row shown at position ``i``.
        parts: Number of row chunks. Defaults to the module-level ``PARTS``.

    Returns:
        A tuple ``(order, owner)``: ``order`` lists all feature indices,
        group after group, and ``owner`` maps each feature index to the
        index of its owning chunk.

    Raises:
        ValueError: If ``parts`` is smaller than 1.
    """
    if parts is None:
        parts = PARTS
    if parts < 1:
        raise ValueError("parts must be a positive integer")

    n_rows = data.shape[0]
    n_features = data.shape[1]

    counts = np.zeros((parts, n_features), dtype=int)
    # With fewer rows than parts the floor division yields 0, which used to
    # raise ZeroDivisionError below; clamp the chunk size to at least one row.
    per_part = max(1, n_rows // parts)
    for row in range(n_rows):
        part = min(parts - 1, row // per_part)
        for index in data.points[perm[row]]:
            counts[part, index] += 1

    totals = counts.sum(axis=0)
    best_counts = counts.max(axis=0)
    best_parts = counts.argmax(axis=0)

    ranked = []
    for feature in range(n_features):
        if totals[feature] == 0:
            # Unused feature: no chunk has a claim, so pick one at random.
            # Drawn one at a time, in feature order, to keep the sequence of
            # random calls the same for a given seed.
            ranked.append((0.0, int(np.random.randint(0, parts)), feature))
        else:
            purity = float(best_counts[feature] / totals[feature])
            ranked.append((purity, int(best_parts[feature]), feature))
    ranked.sort(reverse=True)

    groups = [[] for _ in range(parts)]
    for _, part, feature in ranked:
        groups[part].append(feature)

    order = [feature for group in groups for feature in group]
    owner = {feature: part for part, group in enumerate(groups) for feature in group}
    return order, owner


def show_dataset(dataset, perm=None, column_perm=None, alpha=0.01, beta=0.01, seed=None):
    """Draw a random sample of the sparse dataset as an RGB image and show it.

    A fraction ``alpha`` of the rows and a fraction ``beta`` of the features
    are sampled without replacement. Sampled rows are drawn top to bottom in
    ``perm`` order and sampled features left to right in the order given by
    ``column_perm[0]``. Every non-zero cell is painted with the colour of the
    chunk owning its feature (``column_perm[1]``); everything else is black.

    Args:
        dataset: Object exposing ``shape`` (rows, features) and ``points``
            (per-row lists of feature indices).
        perm: Row order; defaults to the identity order.
        column_perm: ``(order, owner)`` pair as returned by
            ``get_column_perm``; computed from ``dataset`` and ``perm`` when
            omitted.
        alpha: Fraction of rows to sample.
        beta: Fraction of features to sample.
        seed: If given, seeds NumPy's global random state before sampling.

    Returns:
        None. The image is displayed through Pillow's ``Image.show()``.
    """
    if seed is not None:
        np.random.seed(seed)
    if perm is None:
        perm = list(range(dataset.shape[0]))
    if column_perm is None:
        column_perm = get_column_perm(dataset, perm)
    column_order, column_owner = column_perm[0], column_perm[1]

    n_rows, n_features = dataset.shape[0], dataset.shape[1]
    v_index = np.random.choice(n_rows, int(n_rows * alpha), replace=False)
    v_index.sort()
    h_index = set(np.random.choice(n_features, int(n_features * beta), replace=False))

    # Horizontal pixel position of every sampled feature, following the
    # column order so that features of the same owner end up side by side.
    num = {
        index: position
        for position, index in enumerate(x for x in column_order if x in h_index)
    }

    # One distinct colour per chunk for the default of four parts. The palette
    # used to hold only three entries, so chunks 0 and 3 were painted alike.
    # The modulo below still wraps around if there are more owners than colours.
    colors = [[255, 255, 255], [255, 0, 0], [0, 255, 0], [0, 0, 255]]

    pic = np.zeros((len(v_index), len(h_index), 3), dtype=np.uint8)
    # The row's own chunk is not needed for colouring, so it is not computed;
    # the former unused `i // per_part` raised ZeroDivisionError whenever the
    # dataset had fewer rows than PARTS.
    for ii, i in enumerate(v_index):
        for index in dataset.points[perm[i]]:
            if index in h_index:
                pic[ii, num[index]] = colors[column_owner[index] % len(colors)]

    # Imported lazily so the rest of the notebook works without Pillow.
    from PIL import Image
    # A uint8 array of shape (H, W, 3) is recognised as RGB automatically, so
    # no explicit mode argument is passed.
    img = Image.fromarray(pic)
    img.show()



In [None]:
# Visualise the dataset after reordering its rows with the permutation stored
# in best-final.txt (one integer per line).
p, m = load_dataset("../data/rcv1")
with open("../permute-rcv1-2/best-final.txt") as f:
    permutation = [int(line) for line in f]
# Row `row` of the reordered dataset is original row permutation[row].
points = [p[permutation[row]] for row in range(len(p))]
p = points
d = Dataset(points=p, features=m)
show_dataset(dataset=d, alpha=0.01, beta=0.2, seed=42)

In [None]:
# Visualise the dataset in its original row order, with the same seed and
# sampling fractions as the permuted view.
p, m = load_dataset("../data/rcv1")
d = Dataset(points=p, features=m)
show_dataset(dataset=d, alpha=0.01, beta=0.2, seed=42)