# Minimizing variance with a genetic algorithm (simple greedy selection as the baseline)

In [1]:
import numpy as np
from tqdm.auto import tqdm
from simple.geneopt import GeneOpt

# Console formatting for NumPy arrays: plain decimals instead of scientific
# notation, four digits of precision, rows up to 140 characters wide, and five
# leading/trailing items per dimension when an array is summarized.
np.set_printoptions(
    precision=4,
    suppress=True,
    linewidth=140,
    edgeitems=5,
)

In [2]:
# Number of candidate variables: the column count of `data`, and the exclusive
# upper bound of the indexes the selection procedures may pick from.
k = 128

In [3]:
# Synthetic data set: 100 rows (samples) by k columns (variables) of standard
# normal draws. The legacy global generator is seeded so the matrix is the same
# on every run.
np.random.seed(0)
data = np.random.standard_normal(size=(100, k))
data

array([[ 1.7641,  0.4002,  0.9787,  2.2409,  1.8676, ...,  1.3264, -0.6946, -0.1496, -0.4352,  1.8493],
       [ 0.6723,  0.4075, -0.7699,  0.5392, -0.6743, ..., -0.5758,  0.142 , -0.3193,  0.6915,  0.6947],
       [-0.7256, -1.3834, -1.5829,  0.6104, -1.1889, ...,  0.8802, -1.6981,  0.3873, -2.2556, -1.0225],
       [ 0.0386, -1.6567, -0.9855, -1.4718,  1.6481, ..., -1.5678, -1.1792,  1.3014,  0.8953,  1.375 ],
       [-1.3322, -1.9686, -0.6601,  0.1758,  0.4987, ...,  0.4033, -0.918 ,  0.2525,  0.8203,  1.3599],
       ...,
       [ 0.0622,  1.3174, -0.59  ,  1.1107,  1.0483, ..., -0.2114,  1.1733,  0.5092, -0.1583,  0.6917],
       [-0.1102,  0.2098, -0.2684, -0.2211,  1.4088, ...,  0.4517, -1.9099,  0.25  , -0.8667,  0.8052],
       [-0.8144, -0.248 ,  0.1575,  0.2942, -0.1956, ...,  1.1981,  0.6364, -0.0728,  0.9758, -0.4174],
       [-0.0015, -0.762 , -0.5344,  0.4405, -1.324 , ...,  0.5922, -1.6977, -0.2491, -0.4006, -0.3961],
       [ 1.5913, -0.585 ,  0.9393,  0.4974, -0.4705,

In [5]:
# Greedy baseline: build the subset one variable at a time, each round taking
# the remaining column that gives the smallest variance of the row-wise sum
# when added to the columns chosen so far.
N = 10
selected = []
remaining = list(range(k))


def _variance_with(candidate):
    """Variance of the row sums over the currently selected columns plus ``candidate``."""
    return data[:, selected + [candidate]].sum(axis=1).var()


for _ in tqdm(range(N)):
    # min() keeps the first of several equal minima, i.e. the earliest entry of
    # `remaining`. With no candidates left, `pick` is None and the remove()
    # below raises ValueError.
    pick = min(remaining, key=_variance_with, default=None)
    selected.append(pick)
    remaining.remove(pick)

print(selected)

  0%|          | 0/10 [00:00<?, ?it/s]

[93, 87, 108, 25, 1, 48, 85, 72, 23, 49]


In [6]:
# Variance of the row-wise sum over the greedily selected columns (the baseline score).
np.var(data[:, selected].sum(axis=1))

2.964297069498361

In [7]:
def model(i0: int=(0, k-1), i1: int=(0, k-1), i2: int=(0, k-1), i3: int=(0, k-1), i4: int=(0, k-1),
          i5: int=(0, k-1), i6: int=(0, k-1), i7: int=(0, k-1), i8: int=(0, k-1), i9: int=(0, k-1)) -> float:
    """Score a choice of ten columns of ``data``.

    Each of ``i0`` .. ``i9`` is a column index into ``data``. The chosen
    columns are added up row by row and the variance of those row sums is
    taken. Nothing here prevents the same index from being passed more than
    once; such a column is then counted repeatedly in the sum.

    The ``(0, k-1)`` tuple defaults are presumably read by ``GeneOpt`` as the
    search range of each parameter -- TODO confirm against GeneOpt's docs.

    Returns:
        The variance with its sign flipped, so that a larger value means a
        smaller variance.
    """
    picked = [i0, i1, i2, i3, i4, i5, i6, i7, i8, i9]
    row_sums = data[:, picked].sum(axis=1)
    return -row_sums.var()

In [8]:
# Hand the target function to the optimizer. `model` returns the negated
# variance, so maximizing it is the same as minimizing the variance.
# NOTE(review): GeneOpt presumably derives the search space from model's
# signature (int parameters with (low, high) tuple defaults) -- confirm.
G = GeneOpt(model)
# NOTE(review): no random seed is passed here; whether this run is reproducible
# depends on how GeneOpt draws its random numbers -- confirm.
best = G.maximize(population_size=64000, generations=100)
# Display the result; per the output below it is a dict keyed by model's parameter names.
best

  0%|          | 0/100 [00:00<?, ?it/s]

{'i0': 34,
 'i1': 43,
 'i2': 50,
 'i3': 116,
 'i4': 90,
 'i5': 32,
 'i6': 80,
 'i7': 93,
 'i8': 25,
 'i9': 24}

In [9]:
# model() returns the negated variance; flip the sign back to get the variance
# of the row-wise sum for the indexes found by the optimizer.
-model(**best)

2.641366569857427