# Thử embedding thuần để tìm job liên quan sử dụng multilingual-e5-large

In [2]:
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')


In [7]:
# Each input text should start with "query: " or "passage: ", even for non-English texts.
# For tasks other than retrieval, you can simply use the "query: " prefix.
input_texts = ['air commodore',
               'doctor']

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Nhận embeddings từ mô hình
outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# Chuẩn hóa embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)

# Tính cosine similarity giữa hai từ
score = (embeddings[0] @ embeddings[1].T).item() * 100  # Chuyển thành số duy nhất
print(f"Similarity Score: {score:.2f}")

Similarity Score: 79.31


# Test bản baseline

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import subprocess

  from tqdm.autonotebook import tqdm, trange


In [2]:
queries = r"D:\TalentCLEF\TaskA\Data\TaskA\validation\english\queries"
corpus_elements = r"D:\TalentCLEF\TaskA\Data\TaskA\validation\english\corpus_elements"

In [None]:
queries = pd.read_csv(queries,sep="\t")
corpus_elements = pd.read_csv(corpus_elements, sep="\t")
queries_ids = queries.q_id.to_list()
queries_texts = queries.jobtitle.to_list()
map_queries = dict(zip(queries_ids,queries_texts))

corpus_ids = corpus_elements.c_id.to_list()
corpus_texts = corpus_elements.jobtitle.to_list()
map_corpus = dict(zip(queries_ids,queries_texts))
model = SentenceTransformer("all-MiniLM-L6-v2")

query_embeddings = model.encode(queries_texts, convert_to_tensor=True)
corpus_embeddings = model.encode(corpus_texts, convert_to_tensor=True)
similarities = util.cos_sim(query_embeddings, corpus_embeddings).cpu().numpy()
import numpy as np
results = []
for q_idx, q_id in enumerate(queries_ids):
    print(q_idx)
    sorted_indices = np.argsort(-similarities[q_idx])  # Decrease order
    for rank, c_idx in enumerate(sorted_indices[:10]):  # For this tutorial consider only 10 relevant files
        doc_id = corpus_ids[c_idx]
        score = similarities[q_idx, c_idx]
        results.append(f"{str(q_id)} Q0 {str(doc_id)} {rank+1} {score:.4f} baseline_model")

with open("evaluation_baseline.trec", "w", encoding="utf-8") as f:
    f.write("\n".join(results))



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
