In [1]:
# For CPU version
!pip install faiss-cpu
# For GPU version
!pip install faiss-gpu
!pip install sentence_transformers
!pip install pytrec_eval
!pip install ir_datasets
!pip install cherche --upgrade -q
!pip install nltk



Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvr

In [2]:
import faiss
import torch
from sentence_transformers import SentenceTransformer
import ir_datasets
import numpy as np
import pytrec_eval
import ir_datasets
import nltk
import nltk
from nltk.corpus import wordnet as wn
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string



# Charger le dataset scidocs de BEIR
dataset = ir_datasets.load("vaswani")


  from tqdm.autonotebook import tqdm, trange


In [3]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace('_', ' '))
    return synonyms

def expand_query(query):
    words = query.split()
    expanded_queries = set()
    expanded_queries.add(query)

    for word in words:
        synonyms = get_synonyms(word)
        for synonym in synonyms:
            new_query = query.replace(word, synonym)
            expanded_queries.add(new_query)

    return list(expanded_queries)

def treat_querie(text):

  expanded_queries = expand_query(text)
  final_req=[]
  for eq in expanded_queries:
      final_req.append(eq)
  tokens = set()
  for sentence in final_req:
      words = word_tokenize(sentence)
      tokens.update(words)  # Add tokens to the set
      sentence = ' '.join(tokens)
      return sentence

queries = [treat_querie(query.text) for query in dataset.queries_iter()]


queries[10]


[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz
[INFO] [finished] http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: [00:01] [2.13MB] [1.34MB/s]


'OF CIRCUIT outcome CONTACTS BREAKER THE ON OXIDATION'

In [5]:
def create_vocabulary(queries):
    stop_words = set(stopwords.words('english'))
    vocabulary = set()

    for query in queries:
        # Tokeniser les mots de la query
        words = word_tokenize(query.lower())

        # Enlever les stopwords et la ponctuation
        words = [word for word in words if word not in stop_words and word not in string.punctuation]

        # Ajouter les mots à l'ensemble de vocabulaire
        vocabulary.update(words)

    return vocabulary

vocabulary = create_vocabulary(queries)

# Afficher le vocabulaire
print(vocabulary)



{'relationships', 'inductance', 'nets', 'parametric', 'prompt', 'emitted', 'gather', 'tack', 'compactness', 'honegh', 'showing', 'semiconductor', 'low', 'possibilities', 'circumference', 'determination', 'virtual', 'observations', 'temperampereture', 'error', 'hybridise', 'dig', 'synthesis', 'film', 'temperatures', 'pressures', 'distribution', 'devices', 'turn', 'indium', 'reactive', 'investigations', 'filter', 'work', 'element', 'units', 'loneke', 'capacitive', 'planet', 'collision', 'diagrams', 'microwave', 'richly', 'results', 'boundary', 'reflection', 'slab', 'rectification', 'masses', 'absorption', 'mechanisms', 'equator', 'abstracts', 'meteors', 'habituate', 'statonec', 'distortions', 'relay', 'machines', 'running', 'differential', 'techniques', 'printed', 'band', 'phase', 'superconductor', 'metallic', 'pass', 'coupled', 'shift', 'rates', 'active', 'minimal', 'inward', 'positive', 'servosystem', 'ray', 'narrow', 'resistance', 'states', 'resonator', 'block', 'one', 'information', 

In [6]:
def reduce_articles_to_vocabulary(articles, vocabulary):
        words = article.split()
        reduced_words = set([word for word in words if word in vocabulary])
        reduced_article = ' '.join(reduced_words)
        return reduced_article

documents = []
for item in dataset.docs_iter():
    if len(item) == 2:
        doc_id, article = item
        documents.append({'id': doc_id, 'article': reduce_articles_to_vocabulary(article, vocabulary)})
    elif len(item) >= 3:
        doc_id, title, article = item[:3]
        documents.append({'id': doc_id, 'title': title, 'article': reduce_articles_to_vocabulary(article, vocabulary)})

# Print the length of the documents list to ensure it was created correctly
print(len(documents))
documents[:10]




11429


[{'id': '1', 'article': 'random data digital'},
 {'id': '2',
  'article': 'mathematical analogue computer electronic stability equations amplifiers derivation systems'},
 {'id': '3',
  'article': 'construction transformer electronic circuit given details'},
 {'id': '4', 'article': 'computer'},
 {'id': '5',
  'article': 'transformer computer pulse transistor fast digital logical circuits efficiency coupled systems'},
 {'id': '6', 'article': 'logical circuits binary'},
 {'id': '7',
  'article': 'miniature units elements electronic electrical circuit using densities'},
 {'id': '8', 'article': 'loss element theory circuit explained'},
 {'id': '9',
  'article': 'nonlinear element adder binary transistor circuit using circuits design'},
 {'id': '10', 'article': 'microwave'}]

In [7]:
# Charger un modèle pré-entraîné pour générer des embeddings (utilisant le GPU)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
def tokenize_documents(documents):
    return [{'id': doc['id'], 'tokens': word_tokenize(doc['article'].lower())} for doc in documents]


In [9]:
tokenized_docs = tokenize_documents(documents)


In [10]:
def compute_embeddings(documents, model, batch_size=64):
    embeddings = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True, device=device)
        embeddings.append(batch_embeddings)
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings.cpu().numpy()

embeddings = compute_embeddings(documents, model)

In [11]:
dimension = embeddings.shape[1]

# Build a HNSW flat index on the CPU
cpu_index = faiss.IndexHNSWFlat(dimension, 32)  # 32 is the number of neighbors to consider

# Add embeddings to the index
cpu_index.add(embeddings)


In [12]:
!pip install rank-bm25


Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [13]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import faiss
import torch
def bm25_search(query, tokenized_docs, bm25, N=5):
    tokenized_query = word_tokenize(query.lower())
    doc_scores = bm25.get_scores(tokenized_query)
    top_n_doc_indices = sorted(range(len(tokenized_docs)), key=lambda i: doc_scores[i], reverse=True)[:N]
    results = {tokenized_docs[i]['id']: float(doc_scores[i]) for i in top_n_doc_indices}
    return top_n_doc_indices, results

# Initialize BM25
bm25 = BM25Okapi([doc['tokens'] for doc in tokenized_docs])

In [14]:
def refine_with_embeddings(query, top_n_indices, documents, embeddings, model, k=5):
    query_embedding = model.encode([query], convert_to_tensor=True, device=device).cpu().numpy()
    top_n_embeddings = embeddings[top_n_indices]
    faiss_index = faiss.IndexFlatL2(dimension)
    faiss_index.add(top_n_embeddings)
    distances, indices = faiss_index.search(query_embedding, k)
    refined_results = {documents[top_n_indices[doc_id]]['id']: float(distances[0][i]) for i, doc_id in enumerate(indices[0])}
    return refined_results

# Combined search function
def combined_search(query, bm25, tokenized_docs, documents, embeddings, model, N=5, k=5):
    top_n_indices, bm25_results = bm25_search(query, tokenized_docs, bm25, N)
    refined_results = refine_with_embeddings(query, top_n_indices, documents, embeddings, model, k)
    return refined_results

In [15]:
run = {}
for queryid, querytext in dataset.queries_iter():
        top_n_docs = combined_search(querytext, bm25, tokenized_docs, documents, embeddings, model, 100, 100)
        run[queryid] = {doc_id: rank + 1 for rank, (doc_id, _) in enumerate(top_n_docs.items())}

In [16]:
import pytrec_eval
import pandas as pd

qrel = {k: {kk:int(vv) for kk,vv in v[['doc_id','relevance']].values} for k, v in pd.DataFrame(dataset.qrels_iter()).groupby('query_id')[['doc_id','relevance']]}

evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'map', 'ndcg_cut' , 'P.10'})

evaluation_results = evaluator.evaluate(run)

evaluation_df = pd.DataFrame(evaluation_results).T
mean_scores = evaluation_df.mean()

print(mean_scores)

map              0.055623
P_10             0.077419
ndcg_cut_5       0.080067
ndcg_cut_10      0.078540
ndcg_cut_15      0.086175
ndcg_cut_20      0.089537
ndcg_cut_30      0.102663
ndcg_cut_100     0.224808
ndcg_cut_200     0.224808
ndcg_cut_500     0.224808
ndcg_cut_1000    0.224808
dtype: float64


In [19]:
print(combined_search(querytext, bm25, tokenized_docs, documents, embeddings, model, 100, 100))

{'9566': 61.80387878417969, '8336': 68.1569595336914, '7464': 69.256591796875, '5893': 70.0606689453125, '8049': 70.293701171875, '10810': 71.16964721679688, '7813': 71.45637512207031, '10083': 72.23856353759766, '3833': 72.68445587158203, '6713': 73.08155822753906, '9298': 73.39193725585938, '9108': 74.02996826171875, '3983': 74.26953125, '10514': 74.49614715576172, '659': 74.49893188476562, '1156': 74.51539611816406, '4733': 74.5890121459961, '6798': 74.75718688964844, '8019': 74.92147827148438, '6091': 74.92559814453125, '11022': 74.99189758300781, '7675': 75.0343246459961, '9297': 75.31538391113281, '8138': 75.59040069580078, '4195': 75.73612976074219, '2808': 76.13597106933594, '11347': 76.23736572265625, '8651': 76.461181640625, '3256': 76.51048278808594, '4735': 76.86026000976562, '11': 77.89012908935547, '8460': 77.94376373291016, '419': 78.34517669677734, '5049': 78.977783203125, '9560': 79.02005767822266, '443': 79.10749816894531, '5672': 79.41031646728516, '11094': 79.894302