In [13]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer, util



In [14]:
from sentence_transformers import SentenceTransformer
# Load the LaBSE checkpoint by name; the resulting encoder is reused by every
# later cell that calls model.encode().
MODEL_NAME = 'sentence-transformers/LaBSE'
model = SentenceTransformer(MODEL_NAME)

In [15]:
import numpy as np
# One English and one Vietnamese sentence, embedded together as a quick check
# of the model before running it on the full files.
sentences = [
    "This is an example sentence",
    "Đây là câu dài",
]
embeddings = model.encode(sentences)
def cos_sim(a, b):
    """Calculates the cosine similarity between two vectors.

    Args:
        a: First vector (1-D array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product of ``a`` and ``b`` divided by the product of their
        Euclidean norms. When either vector has zero norm the similarity is
        undefined; 0.0 is returned in that case instead of dividing by zero
        (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Degenerate input: avoid 0/0 and report "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

# Score the two example embeddings against each other and report the result
# rounded to four decimal places.
first_vector = embeddings[0]
second_vector = embeddings[1]
similarity = cos_sim(first_vector, second_vector)

print(f"Cosine similarity: {similarity:.4f}")

Cosine similarity: 0.6602


In [24]:
import json
from collections import Counter

# 1. Read the Vietnamese sentences, one per line. Blank lines are skipped so
#    that empty strings are never embedded or paired with anything.
with open("LVT_vie.txt", "r", encoding="utf-8") as f_vie:
    lines = [text for text in (line.strip() for line in f_vie) if text]

# 2. Read the English sentences the same way.
with open("LVT_eng.txt", "r", encoding="utf-8") as f_eng:
    lines_eng = [text for text in (line.strip() for line in f_eng) if text]

# 3. Encode all sentences
embeddings_vie = model.encode(lines)       # Shape: [V, d]
embeddings_eng = model.encode(lines_eng)   # Shape: [E, d]

# 4. Compute similarity matrix (PyTorch tensor), shape: [V, E]
similarity_matrix = util.cos_sim(embeddings_vie, embeddings_eng)

# 5. Collect every (Vietnamese, English) pair whose similarity is strictly
#    above the threshold. Each row is converted to Python floats once with
#    tolist() instead of calling .item() on every cell; the comparison and the
#    stored scores are the same plain floats either way, and pairs are emitted
#    in row-major order (Vietnamese index first, then English index).
threshold = 0.55
pairs_above_threshold = [
    {
        "vie_sentence": vie_sentence,
        "eng_sentence": eng_sentence,
        "score": score,
    }
    for vie_sentence, score_row in zip(lines, similarity_matrix)
    for eng_sentence, score in zip(lines_eng, score_row.tolist())
    if score > threshold
]

# 6. Count how many above-threshold pairs each Vietnamese sentence appears in.
#    Counting is keyed on the sentence text, so identical Vietnamese lines are
#    pooled together.
counts = Counter(pair["vie_sentence"] for pair in pairs_above_threshold)

# 7. Remove all pairs if vie_sentence appears more than once, i.e. keep only
#    Vietnamese sentences that matched exactly one English sentence.
filtered_pairs = [
    pair for pair in pairs_above_threshold
    if counts[pair["vie_sentence"]] == 1
]

# 8. Save filtered pairs to JSON (ensure_ascii=False keeps the Vietnamese
#    text readable in the output file).
output_path = "pairs_unique_vie.json"
with open(output_path, "w", encoding="utf-8") as f_json:
    json.dump(filtered_pairs, f_json, ensure_ascii=False, indent=4)

print(f"Original pairs above threshold: {len(pairs_above_threshold)}")
print(f"Filtered pairs: {len(filtered_pairs)}")
print(f"Results saved to {output_path}")

Original pairs above threshold: 268
Filtered pairs: 216
Results saved to pairs_unique_vie.json
