In [2]:
import torch
from sentence_transformers import SentenceTransformer, util
import re
import unicodedata

In [3]:
# Identifier of the sentence-embedding model passed to SentenceTransformer.
MODEL_PATH = "sentence-transformers/all-MiniLM-L6-v2"

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [4]:
# Load the embedding model once for the whole notebook. A failed load is
# reported and leaves sim_model as None instead of stopping execution, so
# code that uses the model must check for None first.
print(f"⏳ Loading Similarity Model on {DEVICE}...")
try:
    sim_model = SentenceTransformer(MODEL_PATH, device=DEVICE)
except Exception as load_error:
    print(f"⚠️ Error loading Similarity Model: {load_error}")
    sim_model = None
else:
    print("✅ Similarity Model loaded.")

⏳ Loading Similarity Model on cpu...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Similarity Model loaded.


In [6]:
def preprocess_text(text):
    """Clean raw text before it is embedded.

    Steps, in order:
      1. Unicode-normalize to NFC so base letters and combining marks are
         composed into single code points where possible.
      2. Replace anything that looks like an ``<...>`` tag with a space.
      3. Replace runs of two or more ``@``/``#`` characters with a space.
      4. Replace every character outside the allowed set (digits, ASCII
         letters, the ``À-ỹ`` code-point range, basic punctuation and
         whitespace) with a space.
      5. Lowercase, collapse whitespace runs to one space and trim the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    cleaned = unicodedata.normalize("NFC", text)

    # Each pattern is blanked out with a single space, in this exact order.
    for pattern in (
        r"<[^>]*>",
        r"[@#]{2,}",
        r"[^0-9a-zA-ZÀ-ỹ.,!?;:()\-\s]",
    ):
        cleaned = re.sub(pattern, " ", cleaned)

    return re.sub(r"\s+", " ", cleaned.lower()).strip()

In [7]:
def calculate_similarity(cv_text: str, jd_text: str) -> float:
    """Score the similarity between a CV and a job description (JD).

    Both texts are cleaned with ``preprocess_text``, embedded with the
    module-level ``sim_model`` and compared with cosine similarity.

    Args:
        cv_text: Raw CV text.
        jd_text: Raw job-description text.

    Returns:
        The cosine similarity rounded to 4 decimal places. ``0.0`` is
        returned when the model is not loaded, or when either text is
        missing (``None``/empty) or becomes empty after preprocessing.
    """
    if sim_model is None:
        return 0.0

    # Missing input: there is nothing to compare, and preprocess_text would
    # raise on None, so fall back to the same 0.0 used when the model is absent.
    if not cv_text or not jd_text:
        return 0.0

    processed_cv = preprocess_text(cv_text)
    processed_jd = preprocess_text(jd_text)

    # Text made up only of stripped characters collapses to "". An empty
    # string carries no content, so any score computed for it would be
    # meaningless; report "no similarity" instead.
    if not processed_cv or not processed_jd:
        return 0.0

    jd_embedding = sim_model.encode(processed_jd, convert_to_tensor=True)
    cv_embedding = sim_model.encode(processed_cv, convert_to_tensor=True)

    similarity = util.cos_sim(jd_embedding, cv_embedding)
    return round(similarity.item(), 4)