In [33]:
# find models on https://huggingface.co/models?library=sentence-transformers&language=zh&sort=downloads
from sentence_transformers import SentenceTransformer
from dipamkara.embedding import find_distance, Metric
from numpy import asarray

sentence_model = SentenceTransformer('moka-ai/m3e-small')

sentences1 = "我不是猪"
sentences2 = "不我是猪"
sentences3 = "不我不是猪"
sentences4 = "我是猪吗"

text_embeddings1 = asarray(sentence_model.encode(sentences1).tolist())
text_embeddings2 = asarray(sentence_model.encode(sentences2).tolist())
text_embeddings3 = asarray(sentence_model.encode(sentences3).tolist())
text_embeddings4 = asarray(sentence_model.encode(sentences4).tolist())

distance_between_12 = find_distance(text_embeddings1, text_embeddings2, metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_13 = find_distance(text_embeddings1, text_embeddings3, metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_14 = find_distance(text_embeddings1, text_embeddings4, metric=Metric.EUCLIDEAN_Z_SCORE)

print(f'{sentences1}:{sentences2} {distance_between_12:.3f}')
print(f'{sentences1}:{sentences3} {distance_between_13:.3f}')
print(f'{sentences1}:{sentences4} {distance_between_14:.3f}')

我不是猪:不我是猪 5.638
我不是猪:不我不是猪 5.321
我不是猪:我是猪吗 7.881


In [34]:
sentences5 = "苹果"
sentences6 = "这是水果还是科技公司"
sentences7 = "吴子豪有两个眼睛和一个嘴巴"

text_embeddings5 = asarray(sentence_model.encode(sentences5).tolist())
text_embeddings6 = asarray(sentence_model.encode(sentences6).tolist())
text_embeddings7 = asarray(sentence_model.encode(sentences7).tolist())

distance_between_5_6 = find_distance(text_embeddings5, text_embeddings6, metric=Metric.EUCLIDEAN)
distance_between_5_65 = find_distance(text_embeddings5, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN)
distance_between_7_5 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN)
distance_between_7_6 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN)
distance_between_7_65 = find_distance(text_embeddings7, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN)
{
    f'metric': Metric.EUCLIDEAN,
    f'{sentences5} vs {sentences6}': distance_between_5_6, 
    f'{sentences5} vs {sentences6} + {sentences5}': distance_between_5_65, 
    f'{sentences7} vs {sentences5}': distance_between_7_5, 
    f'{sentences7} vs {sentences6}': distance_between_7_6, 
    f'{sentences7} vs {sentences6} + {sentences5}': distance_between_7_65
}

{'metric': <Metric.EUCLIDEAN: 'euclidean'>,
 '苹果 vs 这是水果还是科技公司': np.float64(10.348851305443796),
 '苹果 vs 这是水果还是科技公司 + 苹果': np.float64(17.395088547223775),
 '吴子豪有两个眼睛和一个嘴巴 vs 苹果': np.float64(14.373382783011161),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司': np.float64(14.373382783011161),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司 + 苹果': np.float64(26.359705384131434)}

In [35]:
text_embeddings5 = asarray(sentence_model.encode(sentences5).tolist())
text_embeddings6 = asarray(sentence_model.encode(sentences6).tolist())
text_embeddings7 = asarray(sentence_model.encode(sentences7).tolist())

distance_between_5_6 = find_distance(text_embeddings5, text_embeddings6, metric=Metric.EUCLIDEAN_L2)
distance_between_5_65 = find_distance(text_embeddings5, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN_L2)
distance_between_7_5 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN_L2)
distance_between_7_6 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN_L2)
distance_between_7_65 = find_distance(text_embeddings7, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN_L2)
{
    f'metric': Metric.EUCLIDEAN_L2,
    f'{sentences5} vs {sentences6}': distance_between_5_6, 
    f'{sentences5} vs {sentences6} + {sentences5}': distance_between_5_65, 
    f'{sentences7} vs {sentences5}': distance_between_7_5, 
    f'{sentences7} vs {sentences6}': distance_between_7_6, 
    f'{sentences7} vs {sentences6} + {sentences5}': distance_between_7_65
}

{'metric': <Metric.EUCLIDEAN_L2: 'euclidean_l2'>,
 '苹果 vs 这是水果还是科技公司': np.float64(0.5733254565543235),
 '苹果 vs 这是水果还是科技公司 + 苹果': np.float64(0.28050320131787926),
 '吴子豪有两个眼睛和一个嘴巴 vs 苹果': np.float64(0.8481474896191218),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司': np.float64(0.8481474896191218),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司 + 苹果': np.float64(0.8111842003881923)}

In [36]:
text_embeddings5 = asarray(sentence_model.encode(sentences5).tolist())
text_embeddings6 = asarray(sentence_model.encode(sentences6).tolist())
text_embeddings7 = asarray(sentence_model.encode(sentences7).tolist())

distance_between_5_6 = find_distance(text_embeddings5, text_embeddings6, metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_5_65 = find_distance(text_embeddings5, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_7_5 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_7_6 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.EUCLIDEAN_Z_SCORE)
distance_between_7_65 = find_distance(text_embeddings7, (text_embeddings6 + text_embeddings5), metric=Metric.EUCLIDEAN_Z_SCORE)
{
    f'metric': Metric.EUCLIDEAN_Z_SCORE,
    f'{sentences5} vs {sentences6}': distance_between_5_6, 
    f'{sentences5} vs {sentences6} + {sentences5}': distance_between_5_65, 
    f'{sentences7} vs {sentences5}': distance_between_7_5, 
    f'{sentences7} vs {sentences6}': distance_between_7_6, 
    f'{sentences7} vs {sentences6} + {sentences5}': distance_between_7_65
}

{'metric': <Metric.EUCLIDEAN_Z_SCORE: 'euclidean_z_score'>,
 '苹果 vs 这是水果还是科技公司': np.float64(12.973061304265562),
 '苹果 vs 这是水果还是科技公司 + 苹果': np.float64(6.347148823625168),
 '吴子豪有两个眼睛和一个嘴巴 vs 苹果': np.float64(19.191669015214906),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司': np.float64(19.191669015214906),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司 + 苹果': np.float64(18.35527376172994)}

In [37]:
text_embeddings5 = asarray(sentence_model.encode(sentences5).tolist())
text_embeddings6 = asarray(sentence_model.encode(sentences6).tolist())
text_embeddings7 = asarray(sentence_model.encode(sentences7).tolist())

distance_between_5_6 = find_distance(text_embeddings5, text_embeddings6, metric=Metric.COSINE)
distance_between_5_65 = find_distance(text_embeddings5, (text_embeddings6 + text_embeddings5), metric=Metric.COSINE)
distance_between_7_5 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.COSINE)
distance_between_7_6 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.COSINE)
distance_between_7_65 = find_distance(text_embeddings7, (text_embeddings6 + text_embeddings5), metric=Metric.COSINE)
{
    f'metric': Metric.COSINE,
    f'{sentences5} vs {sentences6}': distance_between_5_6, 
    f'{sentences5} vs {sentences6} + {sentences5}': distance_between_5_65, 
    f'{sentences7} vs {sentences5}': distance_between_7_5, 
    f'{sentences7} vs {sentences6}': distance_between_7_6, 
    f'{sentences7} vs {sentences6} + {sentences5}': distance_between_7_65
}

{'metric': <Metric.COSINE: 'cosine'>,
 '苹果 vs 这是水果还是科技公司': np.float64(0.16435103956661168),
 '苹果 vs 这是水果还是科技公司 + 苹果': np.float64(0.039341022974789364),
 '吴子豪有两个眼睛和一个嘴巴 vs 苹果': np.float64(0.35967708207360927),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司': np.float64(0.35967708207360927),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司 + 苹果': np.float64(0.32900990347971537)}

In [38]:
text_embeddings5 = asarray(sentence_model.encode(sentences5).tolist())
text_embeddings6 = asarray(sentence_model.encode(sentences6).tolist())
text_embeddings7 = asarray(sentence_model.encode(sentences7).tolist())

distance_between_5_6 = find_distance(text_embeddings5, text_embeddings6, metric=Metric.CHEBYSHEV)
distance_between_5_65 = find_distance(text_embeddings5, (text_embeddings6 + text_embeddings5), metric=Metric.CHEBYSHEV)
distance_between_7_5 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.CHEBYSHEV)
distance_between_7_6 = find_distance(text_embeddings7, text_embeddings6, metric=Metric.CHEBYSHEV)
distance_between_7_65 = find_distance(text_embeddings7, (text_embeddings6 + text_embeddings5), metric=Metric.CHEBYSHEV)
{
    f'metric': Metric.CHEBYSHEV,
    f'{sentences5} vs {sentences6}': distance_between_5_6, 
    f'{sentences5} vs {sentences6} + {sentences5}': distance_between_5_65, 
    f'{sentences7} vs {sentences5}': distance_between_7_5, 
    f'{sentences7} vs {sentences6}': distance_between_7_6, 
    f'{sentences7} vs {sentences6} + {sentences5}': distance_between_7_65
}

{'metric': <Metric.CHEBYSHEV: 'chebyshev'>,
 '苹果 vs 这是水果还是科技公司': np.float64(1.5694004893302917),
 '苹果 vs 这是水果还是科技公司 + 苹果': np.float64(3.2744686603546143),
 '吴子豪有两个眼睛和一个嘴巴 vs 苹果': np.float64(2.012061893939972),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司': np.float64(2.012061893939972),
 '吴子豪有两个眼睛和一个嘴巴 vs 这是水果还是科技公司 + 苹果': np.float64(4.639104247093201)}