### 排序方法

#### 1. 一维数组分类

In [5]:
#### 将数组中相近的数据聚类在一起

import numpy as np
import pandas as pd


#### 根据标准差分类

def Zcluser(X, threshold = None):
    """Greedily group the values of a 1-D sequence that lie close to each other.

    The first value that has not been grouped yet becomes the seed of a new
    group. Every other ungrouped value whose absolute difference from the
    seed is at most ``threshold`` joins that group. This repeats until no
    values are left. Distances are measured against the seed only, not
    against the other members of the group.

    Args:
        X: 1-D array-like of numbers (list, tuple, ndarray, Series, ...).
        threshold: Largest absolute difference from the seed for a value to
            join its group. When ``None`` the standard deviation of ``X``
            (``np.std``) is used.

    Returns:
        A tuple ``(idxs, groups)`` of two parallel lists of lists:
        ``idxs[i]`` holds the positions in ``X`` of the members of group
        ``i`` and ``groups[i]`` holds the corresponding values. Both are
        empty when ``X`` is empty.
    """
    # np.array accepts any array-like, so inputs without a .copy() method
    # (e.g. tuples) work as well; the input itself is never modified.
    values = np.array(X)
    remaining = pd.Series(values)

    idxs, groups = [], []
    if remaining.empty:
        # Nothing to group; also avoids np.std warning on an empty array.
        return idxs, groups

    if threshold is None:
        threshold = np.std(values)

    while len(remaining):
        # The first ungrouped value seeds the next group.
        seed_idx = remaining.index[0]
        seed_val = remaining.iloc[0]
        remaining = remaining.iloc[1:]

        # Everything within `threshold` of the seed joins the group and is
        # removed from the pool; this selection may be empty.
        near = remaining[(remaining - seed_val).abs() <= threshold]
        remaining = remaining.drop(near.index)

        idxs.append([seed_idx] + list(near.index))
        groups.append([seed_val] + list(near))

    return idxs, groups

# Demo: group a small sample using the default threshold (np.std of X).
print()
X = [0.3, 0.35, 2.0, 2.1, 8, 25]

# Zcluser returns (idxs, groups); [-1] displays only the grouped values.
Zcluser(X)[-1]




[[0.3, 0.35, 2.0, 2.1, 8.0], [25.0]]

#### 2. 多维数组排序

In [12]:

import numpy as np

def get_sorted_top_k_d2(array, top_k=1, axis=-1, reverse=True):
    """Return the ``top_k`` largest (or smallest) entries along an axis, sorted.

    Works for arrays of one or more dimensions. The candidates are first
    selected with ``np.argpartition`` and only those ``top_k`` entries are
    then fully sorted.

    Args:
        array: Array-like with at least one dimension.
        top_k: Number of entries to keep along ``axis``. Values larger than
            the axis length are clamped to it (a notice is printed).
        axis: Axis along which to select and sort.
        reverse: If True, keep the largest entries in descending order;
            otherwise keep the smallest entries in ascending order.

    Returns:
        top_sorted_values: The selected values, sorted along ``axis``.
        top_sorted_indexes: Positions of those values along ``axis`` in the
            input array.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    array = np.asarray(array)
    axis_length = array.shape[axis]

    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if top_k > axis_length:
        print(f"top_k: {top_k} 超界, 将被设置为：{axis_length}")
        top_k = axis_length
    if top_k == 0:
        # Nothing to select: return empty results of the matching shape
        # instead of asking argpartition for an invalid pivot.
        empty_shape = list(array.shape)
        empty_shape[axis] = 0
        return (np.empty(empty_shape, dtype=array.dtype),
                np.empty(empty_shape, dtype=np.intp))

    if reverse:
        # Pivot at the first of the top_k largest; they occupy the tail.
        kth = axis_length - top_k
        keep = np.arange(axis_length - top_k, axis_length)
    else:
        # Pivot at the last of the top_k smallest (index top_k - 1). Using
        # top_k itself is out of bounds when top_k equals the axis length.
        kth = top_k - 1
        keep = np.arange(top_k)

    partition_index = np.take(np.argpartition(array, kth=kth, axis=axis), keep, axis=axis)
    top_values = np.take_along_axis(array, partition_index, axis)

    # The partition only separates the candidates; order them now.
    sorted_index = np.argsort(top_values, axis=axis)
    if reverse:
        sorted_index = np.flip(sorted_index, axis=axis)

    top_sorted_values = np.take_along_axis(top_values, sorted_index, axis)
    top_sorted_indexes = np.take_along_axis(partition_index, sorted_index, axis)
    return top_sorted_values, top_sorted_indexes



# Demo 1: 1-D input. With the defaults (axis=-1, reverse=True) this yields the
# three largest values in descending order together with their positions.
print("一维数组排序：")

X = np.array([0.81918824, 0.81611025, 0.8363614, 0.8174835, 0.81348515, 0.8403781 ])
res = get_sorted_top_k_d2(X, top_k=3)
print(f"排序后结果：{res}")

# Demo 2: 2-D input. Selection happens along the last axis, so each row is
# handled independently and gets its own top-2 values and positions.
print()
print("二维数组排序：")
X = np.array([
                [0.81918824, 0.81611025, 0.8363614, 0.8174835, 0.81348515, 0.8403781],
                [0.81918824, 0.8363614, 0.8174835, 0.81348515, 0.8403781, 0.81611025 ],
            ])

res = get_sorted_top_k_d2(X, top_k=2)
print(f"排序后结果：{res}")

一维数组排序：
排序后结果：(array([0.8403781 , 0.8363614 , 0.81918824]), array([5, 2, 0], dtype=int64))

二维数组排序：
排序后结果：(array([[0.8403781, 0.8363614],
       [0.8403781, 0.8363614]]), array([[5, 2],
       [4, 1]], dtype=int64))


#### 3. tensor 数组余弦相似度计算

In [24]:
import numpy as np
import torch
from torch import Tensor

def cos_sim(a: Tensor, b: Tensor):
    """Compute the cosine similarity cos_sim(a[i], b[j]) for all i and j.

    Inputs that are not tensors are converted with ``torch.tensor``. 1-D
    inputs are treated as a single row vector. Integer and boolean inputs are
    cast to the default floating point dtype, and if the two inputs have
    different dtypes both are promoted to their common dtype, so that the
    normalisation and the matrix product accept them.

    :param a: Tensor (or array-like) of shape (n, d) or (d,).
    :param b: Tensor (or array-like) of shape (m, d) or (d,).
    :return: Matrix of shape (n, m) with res[i][j] = cos_sim(a[i], b[j])
    """
    if not isinstance(a, torch.Tensor):
        a = torch.tensor(a)

    if not isinstance(b, torch.Tensor):
        b = torch.tensor(b)

    # L2 normalisation is only defined for floating point (or complex) data;
    # integer inputs such as Python lists of ints would raise otherwise.
    if not (a.is_floating_point() or a.is_complex()):
        a = a.to(torch.get_default_dtype())

    if not (b.is_floating_point() or b.is_complex()):
        b = b.to(torch.get_default_dtype())

    # torch.mm requires both operands to share one dtype.
    if a.dtype != b.dtype:
        common_dtype = torch.promote_types(a.dtype, b.dtype)
        a = a.to(common_dtype)
        b = b.to(common_dtype)

    # Treat a single vector as a batch containing one row.
    if len(a.shape) == 1:
        a = a.unsqueeze(0)

    if len(b.shape) == 1:
        b = b.unsqueeze(0)

    # With unit-length rows the dot product equals the cosine similarity.
    a_norm = torch.nn.functional.normalize(a, p=2, dim=1)
    b_norm = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(a_norm, b_norm.transpose(0, 1))

# Demo: reshape(-1, 1) turns the six numbers into six rows of one component
# each, and x1 becomes a single one-component row inside cos_sim.
# NOTE(review): the cosine similarity of one-component vectors depends only
# on their signs, which is why every score printed below is 1.0 -- confirm
# whether comparing multi-component vectors was intended instead.
X = np.array([0.81918824, 0.81611025, 0.8363614, 0.8174835, 0.81348515, 0.8403781 ]).reshape(-1, 1)
X = torch.tensor(X)
x1 = torch.tensor(np.array([0.01]))

# Row 0 of the result holds the similarity of x1 to every row of X.
cos_score = cos_sim(x1, X)[0]
top_result = torch.topk(cos_score, k=2)
index = top_result.indices.detach().cpu().numpy().tolist()  # positions (nid) of the most similar rows
scores = top_result.values.detach().cpu().numpy().tolist()  # their similarity scores
print("Tensor 内积：", cos_score)
scores

Tensor 内积： tensor([1., 1., 1., 1., 1., 1.], dtype=torch.float64)


[1.0, 1.0]

##### 余弦相似度

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

# Same comparison using scikit-learn's cosine_similarity. As before, X is six
# rows of one component each and x1 is a single one-component row, so every
# similarity shown below is 1.0.
X = np.array([0.81918824, 0.81611025, 0.8363614, 0.8174835, 0.81348515, 0.8403781 ]).reshape(-1, 1)
x1 = np.array([0.01]).reshape(-1, 1)

# Row 0 of the result holds the similarity of x1 to every row of X.
scores = cosine_similarity(x1, X)[0]
scores

array([1., 1., 1., 1., 1., 1.])