In [1]:
import json
import numpy as np
import torch
from scipy.cluster.vq import kmeans
from tqdm import tqdm
import random

In [2]:
with open("info.json", "r") as f:
    labels = np.array(json.load(f))

In [3]:
labels[:2]  # label_index, cx, cy, bwidth, bheight, imagewidth, imageheight

array([[8.00000000e+00, 5.87000012e-01, 7.33333349e-01, 1.23999998e-01,
        3.44000012e-01, 5.00000000e+02, 3.75000000e+02],
       [8.00000000e+00, 4.18000013e-01, 8.47999990e-01, 1.78000003e-01,
        2.90666670e-01, 5.00000000e+02, 3.75000000e+02]])

In [4]:
image_size = 640

In [5]:
labeld_image_width_height = labels[:, [5, 6]]

In [6]:
labeld_image_width_height.shape

(15662, 2)

In [7]:
labeld_image_width_height[:, 0].shape  # 是多少？

(15662,)

In [8]:
labeld_image_width_height[:, [0]].shape  # 是多少？

(15662, 1)

### 这句话做了过后，得到的std_image_width_height，里面长边就是640了（也就是image_size）

In [10]:
labeld_image_max_line = labeld_image_width_height.max(axis=1, keepdims=True)
std_image_width_height = image_size * labeld_image_width_height / labeld_image_max_line
labeld_image_width_height[0], std_image_width_height[0]

(array([500., 375.]), array([640., 480.]))

In [14]:
std_box_wh = std_image_width_height * labels[:, [3, 4]]

In [15]:
keep_index = (std_box_wh >= 2).any(axis=1)
keep_wh = std_box_wh[keep_index]
keep_wh.shape

(15662, 2)

In [16]:
num_anchor = 9   # 3(stage, level, stride=8, 16, 32) x 3(shape, wh ratio=0.5, 1.0, 2.0)

In [20]:
keep_wh_std = keep_wh.std(0)
whiten_wh = keep_wh / keep_wh_std
k, a = kmeans(whiten_wh, num_anchor, iter=30)
k

array([[3.04342701, 1.54575119],
       [1.5025807 , 1.15750661],
       [2.01987943, 3.74190682],
       [0.53331137, 0.73968643],
       [1.01056589, 2.31723734],
       [3.46754357, 2.88828802],
       [2.03268818, 2.43810222],
       [0.65804997, 1.37093969],
       [0.25855089, 0.33852129]])

In [21]:
new_anchor = k * keep_wh_std
new_anchor = new_anchor[new_anchor.prod(axis=1).argsort()]  # 用面积来排序，arg形状的总是返索引
new_anchor

array([[ 41.86714518,  46.55312883],
       [ 86.35910868, 101.72098018],
       [106.55802903, 188.53033281],
       [243.3128871 , 159.1792172 ],
       [163.64093154, 318.66429297],
       [492.8221249 , 212.57024574],
       [329.15318929, 335.28551823],
       [327.07906841, 514.58349718],
       [561.49931802, 397.19464499]])

In [22]:
import torch

In [23]:
keep_wh = torch.FloatTensor(keep_wh)
new_anchor = torch.FloatTensor(new_anchor)

# 分析保留条件
- 是否保留 = max(anchor.width / obj.width, obj.width / anchor.width, anchor.height / obj.height, obj.height / anchor.height) < anchor_t，这里anchor_t = 4
- 考虑一维度情况：
    - 是否保留 = max(anchor / obj, obj / anchor) < anchor_t，这里anchor_t = 4

In [24]:
anchor_size = 10
obj_size = 2
max(anchor_size / obj_size, obj_size / anchor_size) 

5.0

In [25]:
anchor_size = 3
obj_size = 2
max(anchor_size / obj_size, obj_size / anchor_size)

1.5

* 结论是：10 -> 2  不满足
* 结论是：3  -> 2  满足

# 转换条件
- 考虑一维度情况：
    - 是否保留 = max(anchor / obj, obj / anchor) < anchor_t，这里anchor_t = 4
- 转换后:
    - 是否保留 = min(anchor / obj, obj / anchor) > 1 / anchor_t，这里anchor_t = 4

In [26]:
anchor_t = 4
1 / anchor_t

0.25

In [27]:
anchor_size = 10
obj_size = 2
min(anchor_size / obj_size, obj_size / anchor_size) 

0.2

In [28]:
anchor_size = 3
obj_size = 2
min(anchor_size / obj_size, obj_size / anchor_size)

0.6666666666666666

* 结论是：10 -> 2  不满足
* 结论是：3  -> 2  满足

# 计算宽宽比，高高比

In [29]:
#keep_wh = torch.FloatTensor(keep_wh)
#new_anchor = torch.FloatTensor(new_anchor)

#keep_wh    ->  N x 2
#new_anchor  -> K x 2   K = 9

In [31]:
keep_wh[:, None].shape  # ->  N x 1 x 2
keep_wh[:, None].shape, new_anchor[None].shape  # ->  1 x K x 2

(torch.Size([15662, 1, 2]), torch.Size([1, 9, 2]))

In [32]:
ratio = keep_wh[:, None] / new_anchor[None]  # N x K x 2
ratio.shape

torch.Size([15662, 9, 2])

In [81]:
box_div_anchor = ratio
anchor_div_box = 1 / ratio

In [95]:
# 1. 先把a/b, b/a里面大的数挑出来：
merge_max = torch.max(box_div_anchor, anchor_div_box)  # 不改变数组维度，计算出几个数组里每一个相同位置的最大值
# 2. 再把大数里面的宽宽比和高高比两个值里面更大的值挑出来
max_ratio = merge_max.max(2)[0]  # 比较每一行里最大的值，max()返回的是元素和数组， 所以取[0]，要元素
torch.max(box_div_anchor, anchor_div_box).shape, max_ratio.shape  # ==>（n, k, 2), (n, k)
# 此时出来的就是每一条记录里相对9个anchor的最大的宽宽比或高高比（每1条记录有9个值）

min_ratio = max_ratio.min(1)[0]  # <== 这9个框里面最贴合的那个
# 不管是来自宽宽比还是高高比，min表示最贴合
#（不考虑分数？大于1取最小，小于1要取最大吧？）
# 当然不可能有比1小的“最大比例”了，因为前期已经取过max了，每一个格子上的取的都是大数/小数，显然是大于1的


# test
# a = torch.rand(2, 2)
# b = torch.rand(2, 2)
# torch.max(a, b).shape

In [102]:
matched_cond = min_ratio < anchor_t
matched_cond.shape  # N x K

tensor(0.9967)

In [109]:
bpr  = (max_ratio<anchor_t).any(1).float().mean() # 如果前面没有取最贴合的框，这里就要用any来找是否至少有一个满足条件的 
bpr1 = (min_ratio<anchor_t).float().mean() # 如果已经取过了最小的框，那么 min_ratio < anchor_t 表示直接拿最小的框来跟anchor_t比，如果连它都不满足条件，那么组anchor就是可以放弃的了。两种思路
fitness = ((1/min_ratio) / bpr).float().mean()
bpr, bpr1, fitness

# 拿着bool值去求mean，可见bpr不是返框的，是返指标的
# 0.9967表示宽宽高高比的大值里面，至少有一个是小于anchor_t的比例（在实际代码中，如果低于99.9%，就要重新训练了 

(tensor(0.9967), tensor(0.9967), tensor(0.7195))

# 适应度指标

In [100]:
ratio = keep_wh[:, None] / new_anchor[None]
box_div_anchor = ratio
anchor_div_box = 1 / ratio
min_ratio = torch.min(box_div_anchor, anchor_div_box).min(2)[0]

# min_ratio -> N x K
# 取每个box对9个anchor匹配度最好的那个
min_ratio = min_ratio.max(1)[0]
fitness = (min_ratio * (min_ratio > 1 / anchor_t)).float().mean()
fitness

tensor(0.7119)

In [113]:
def fitness(box_wh, anchor):
    ratio = box_wh[:, None] / anchor[None]
    box_div_anchor = ratio
    anchor_div_box = 1 / ratio
    min_ratio = torch.max(box_div_anchor, anchor_div_box).max(2)[0]

    # min_ratio -> N x K
    # 取每个box对9个anchor匹配度最好的那个
    min_ratio = min_ratio.min(1)[0]
    return ((1 / min_ratio) * (min_ratio < anchor_t).float()).mean()

def bpr(box_wh, anchor):
    ratio = box_wh[:, None] / anchor[None]
    box_div_anchor = ratio
    anchor_div_box = 1 / ratio
    max_ratio = torch.max(box_div_anchor, anchor_div_box).max(2)[0]

    # min_ratio -> N x K
    # 取每个box对9个anchor匹配度最好的那个
    min_ratio = max_ratio.min(1)[0]
    return (min_ratio < anchor_t).float().mean()

In [114]:
fitness(keep_wh, new_anchor)

tensor(0.7164)

In [115]:
bpr(keep_wh, new_anchor)

tensor(0.9967)

### 遗传算法

- 适应度评估（fitness）
- 突变，随机拿出几个数值，进行增加或减少

In [116]:
iter_count = 1000
anchor_shape = new_anchor.shape

current_fitness = fitness(keep_wh, new_anchor)
current_bpr = bpr(keep_wh, new_anchor)
print(f"current fitness = {current_fitness:.5f}, current bpr = {current_bpr:.5f}")

current fitness = 0.71640, current bpr = 0.99674


In [118]:
pbar = tqdm(range(iter_count), desc="求解中...")
for _ in pbar:
    
    # 随机选几个anchor的值，做突变
    mutate_coeff = torch.ones_like(new_anchor)
    while (mutate_coeff == 1).all():
        # 拿90%的数乘一个均匀分布再乘一个正态分布
        # 另外10%就是0了（因为是bool值了）
        mutate_range = (torch.rand(anchor_shape) < 0.9) * np.random.random() * torch.randn(anchor_shape)
        # 继续操，把范围调整下，这里因为加了1，所以为0的部分就变成1了（这就是while == 1的由来，如果全是1，说明这一轮没有变过
        mutate_coeff = (mutate_range * 0.1 + 1).clamp(0.3, 3.0)
    
    # 这里限了2.0，因为我们设了对2.0以下的框不感兴趣
    mutate_anchor = (new_anchor * mutate_coeff).clamp(2.0)
    mutate_fitness = fitness(keep_wh, mutate_anchor)
    pbar.desc = f"变异得到了: {current_fitness:.5f}"
    
    if mutate_fitness > current_fitness:
        current_fitness = mutate_fitness
        new_anchor = mutate_anchor  # 如果适配度更高，则替换anchor为当前的

变异得到了: 0.74184: 100%|██████████| 1000/1000 [00:04<00:00, 203.27it/s]


In [119]:
bpr(keep_wh, new_anchor)

tensor(0.9997)

In [120]:
current_fitness

tensor(0.7418)

In [121]:
new_anchor[new_anchor.prod(1).argsort()]

tensor([[ 25.5119,  38.2884],
        [ 49.2631,  56.2926],
        [ 54.5022, 129.4706],
        [103.6827,  79.3115],
        [120.0293, 157.0993],
        [148.7720, 293.6986],
        [261.9033, 172.0917],
        [281.1001, 368.2150],
        [506.8087, 350.5019]])