In [1]:
# For CPU version
!pip install faiss-cpu

# For GPU version
!pip install faiss-gpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
!pip install sentence_transformers
!pip install pytrec_eval
!pip install ir_datasets
!pip install cherche --upgrade -q


Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m133.1/227.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [80]:
import faiss
import torch
from sentence_transformers import SentenceTransformer
import ir_datasets
import numpy as np
import pytrec_eval
import ir_datasets


# Charger le dataset scidocs de BEIR
dataset = ir_datasets.load("vaswani")
documents = [doc.text for doc in dataset.docs_iter()]


[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz
[INFO] [finished] http://ir.dcs.gla.ac.uk/resources/test_collections/npl/npl.tar.gz: [00:00] [2.13MB] [3.06MB/s]


In [81]:
print(dataset)
queries = [query.text for query in dataset.queries_iter()]
print(queries[:3])

Dataset(id='vaswani', provides=['docs', 'queries', 'qrels'])
['MEASUREMENT OF DIELECTRIC CONSTANT OF LIQUIDS BY THE USE OF MICROWAVE TECHNIQUES\n', 'MATHEMATICAL ANALYSIS AND DESIGN DETAILS OF WAVEGUIDE FED MICROWAVE RADIATIONS\n', 'USE OF DIGITAL COMPUTERS IN THE DESIGN OF BAND PASS FILTERS HAVING GIVEN PHASE AND ATTENUATION CHARACTERISTICS\n']


In [82]:
documents = []
for item in dataset.docs_iter():
    if len(item) == 2:
        doc_id, article = item
        documents.append({'id': doc_id, 'article': article})
    elif len(item) >= 3:
        doc_id, article, title = item[:3]
        documents.append({'id': doc_id, 'title': title, 'article': article})
documents[:1]


[{'id': '1',
  'article': 'compact memories have flexible capacities  a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described\n'}]

**REDUCE TEXT**

In [83]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import string

nltk.download('punkt')
nltk.download('stopwords')

def summarize_article(article, num_sentences=3):
    # Tokenize the article into sentences
    sentences = sent_tokenize(article)

    # Tokenize the article into words
    words = word_tokenize(article.lower())

    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words and word not in string.punctuation]

    # Get the frequency of each word
    word_freq = Counter(words)

    # Score each sentence based on word frequencies
    sentence_scores = {}
    for sentence in sentences:
        sentence_words = word_tokenize(sentence.lower())
        score = sum(word_freq[word] for word in sentence_words if word in word_freq)
        sentence_scores[sentence] = score

    # Select the top N sentences
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]

    # Join the top sentences to form the summary
    summary = ' '.join(top_sentences)

    return summary
for doc in documents:
    doc['article'] = summarize_article(doc['article'])

documents[:1]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[{'id': '1',
  'article': 'compact memories have flexible capacities  a digital data storage\nsystem with capacity up to bits and random and or sequential access\nis described'}]

**REDUCE QUERIES**

In [84]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')


def create_vocabulary(queries):
    stop_words = set(stopwords.words('english'))
    vocabulary = set()

    for query in queries:
        # Tokeniser les mots de la query
        words = word_tokenize(query.lower())

        # Enlever les stopwords et la ponctuation
        words = [word for word in words if word not in stop_words and word not in string.punctuation]

        # Ajouter les mots à l'ensemble de vocabulaire
        vocabulary.update(words)

    return vocabulary

vocabulary = create_vocabulary(queries)

# Afficher le vocabulaire
print(vocabulary)


{'articles', 'harmonic', 'breaker', 'obtained', 'states', 'available', 'oxidation', 'counters', 'back', 'relation', 'presence', 'whereby', 'spherical', 'digital', 'ferromagnetic', 'elements', 'near', 'integral', 'computer', 'absorption', 'sunrise', 'rates', 'adder', 'transfer', 'servo', 'nets', 'paths', 'canonical', 'electrode', 'fast', 'tunnel', 'made', 'wish', 'producing', 'conductors', 'send', 'instantaneous', 'gases', 'relays', 'loss', 'could', 'diurnal', 'arithmetic', 'electrical', 'drift', 'printed', 'frequency', 'predicting', 'disc', 'electron', 'possibilities', 'comparison', 'mechanical', 'analysis', 'effect', 'fields', 'behaviour', 'resistivity', 'inductance', 'components', 'pertinent', 'independent', 'discharges', 'meteors', 'noise', 'division', 'pressures', 'governing', 'problems', 'give', 'cosmic', 'tuned', 'systems', 'please', 'narrow', 'minimal', 'rays', 'splitting', 'filter', 'form', 'random', 'contacts', 'moving', 'pretreatment', 'approach', 'variables', 'sun', 'connect

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**KEEPING ONLY THE QUERIES VOCABULARY**

In [85]:
def reduce_articles_to_vocabulary(articles, vocabulary):
        # Tokenize the article into words
        words = article.split()

        # Filter out words not in the vocabulary
        reduced_words = set([word for word in words if word in vocabulary])

        reduced_article = ' '.join(reduced_words)

        # Append the reduced article to the list

        return reduced_article



In [86]:
documents = []
for item in dataset.docs_iter():
    if len(item) == 2:
        doc_id, article = item
        documents.append({'id': doc_id, 'article': reduce_articles_to_vocabulary(article, vocabulary)})
    elif len(item) >= 3:
        doc_id, article, title = item[:3]
        documents.append({'id': doc_id, 'title': title, 'article': reduce_articles_to_vocabulary(article, vocabulary)})

# Print the length of the documents list to ensure it was created correctly
print(len(documents))
documents[:10]


11429


[{'id': '1', 'article': 'random digital data'},
 {'id': '2',
  'article': 'systems computer equations electronic derivation analogue stability amplifiers linear mathematical'},
 {'id': '3',
  'article': 'calculating details construction circuit electronic transformer given'},
 {'id': '4', 'article': 'computer'},
 {'id': '5',
  'article': 'logical systems computer transistor fast digital pulse circuits transformer efficiency coupled'},
 {'id': '6', 'article': 'logical binary circuits'},
 {'id': '7',
  'article': 'circuit electronic units using miniature densities elements electrical'},
 {'id': '8', 'article': 'explained circuit loss theory'},
 {'id': '9',
  'article': 'transistor circuit design using circuits nonlinear binary adder switching'},
 {'id': '10', 'article': 'switching microwave'}]

Article: Off the Beaten Path: Let’s Replace Term-Based
Retrieval with k-NN Search

In [87]:
# Charger un modèle pré-entraîné pour générer des embeddings (utilisant le GPU)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)



In [88]:
# Convertir les documents en embeddings en utilisant le GPU
def compute_embeddings(documents, model, batch_size=64):
    embeddings = []
    for i in range(0, len(documents), batch_size):
        batch = documents[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True, device=device)
        embeddings.append(batch_embeddings)
    embeddings = torch.cat(embeddings, dim=0)
    return embeddings.cpu().numpy()

embeddings = compute_embeddings(documents, model)

In [89]:
dimension = embeddings.shape[1]

# Build a HNSW flat index on the CPU
cpu_index = faiss.IndexHNSWFlat(dimension, 32)  # 32 is the number of neighbors to consider

# Add embeddings to the index
cpu_index.add(embeddings)


In [90]:
# Fonction pour rechercher les k voisins les plus proches
def search(query, k=5):
    query_embedding = model.encode([query])
    distances, indices = cpu_index.search(query_embedding, k)
    results = [(documents[idx], distances[0][i]) for i, idx in enumerate(indices[0])]
    return results

# Exemple de recherche
query = "google"
results = search(query, k=3)

print("Query:", query)
print("Results:")
for result, distance in results:
    print(f"Document: {result}, Distance: {distance}")

Query: google
Results:
Document: {'id': '7066', 'article': 'information available'}, Distance: 75.23753356933594
Document: {'id': '7817', 'article': 'pulses input effect amplifiers amplifier pulse function'}, Distance: 75.23811340332031
Document: {'id': '8317', 'article': 'transmission infinite analysis'}, Distance: 75.97570037841797


# **Transformation de la fonction search pour l'évaluation**

In [91]:
qrels = {}
for qrel in dataset.qrels_iter():
    if qrel.query_id not in qrels:
        qrels[qrel.query_id] = {}
    qrels[qrel.query_id][qrel.doc_id] = qrel.relevance

# Print qrels to verify its structure
print("Qrels:", qrels)

Qrels: {'1': {'1239': 1, '1502': 1, '4462': 1, '4569': 1, '5472': 1, '5502': 1, '6471': 1, '6480': 1, '6664': 1, '6824': 1, '7923': 1, '8150': 1, '8172': 1, '8277': 1, '9219': 1, '9859': 1, '9988': 1, '10081': 1, '10588': 1}, '2': {'414': 1, '1894': 1, '3785': 1, '4720': 1, '5894': 1, '6736': 1, '7113': 1, '7555': 1, '7749': 1, '7808': 1, '8241': 1, '8383': 1, '9112': 1, '9835': 1, '10802': 1}, '3': {'141': 1, '148': 1, '813': 1, '1610': 1, '2429': 1, '3059': 1, '3272': 1, '3398': 1, '3614': 1, '3688': 1, '3708': 1, '4437': 1, '4710': 1, '4725': 1, '4833': 1, '5476': 1, '5662': 1, '5856': 1, '5976': 1, '6351': 1, '6885': 1, '6974': 1, '7086': 1, '7177': 1, '7304': 1, '7571': 1, '8007': 1, '8232': 1, '8957': 1, '9289': 1, '10174': 1, '10484': 1, '10486': 1}, '4': {'2042': 1, '2180': 1, '3595': 1, '4057': 1, '7985': 1}, '5': {'775': 1, '922': 1, '3484': 1, '4740': 1}, '6': {'402': 1, '2546': 1, '4180': 1, '4419': 1, '5435': 1, '5437': 1, '5440': 1, '8099': 1, '10162': 1, '11178': 1}, '7'

In [92]:
def search(query, k=5):
    query_embedding = model.encode([query])
    distances, indices = cpu_index.search(query_embedding, k)
    results = {
        str(doc_id): float(distances[0][i]) for i, doc_id in enumerate(indices[0])
    }
    return results
print(search("google",5))

{'7065': 75.23753356933594, '7816': 75.23811340332031, '8316': 75.97570037841797, '6816': 76.2222900390625, '7817': 76.47123718261719}


In [93]:
# Perform search and evaluate for each query in the dataset
run = {}
for query in dataset.queries_iter():
    query_id = query.query_id
    query_text = query.text
    results = search(query_text, k=5)
    run[query_id] = results

# Print run to verify its structure
print("Run:", run)


Run: {'1': {'10057': 59.56406784057617, '9547': 60.27980422973633, '9519': 60.564483642578125, '9557': 61.01860046386719, '9565': 62.32025146484375}, '2': {'9709': 56.01290512084961, '9757': 56.601070404052734, '10257': 56.70231628417969, '10284': 56.93160629272461, '9457': 57.18441390991211}, '3': {'3709': 53.807777404785156, '3784': 53.928932189941406, '5709': 54.21758270263672, '5065': 54.24217987060547, '4674': 54.44663619995117}, '4': {'7031': 63.74185562133789, '9832': 63.92003631591797, '9715': 66.17645263671875, '9332': 66.59835052490234, '9732': 66.76446533203125}, '5': {'2009': 64.98783874511719, '2002': 67.74758911132812, '2012': 69.640625, '3784': 70.51730346679688, '9715': 70.66012573242188}, '6': {'10107': 57.65153503417969, '10197': 57.69532012939453, '6832': 58.686378479003906, '10183': 58.68714904785156, '10188': 58.717247009277344}, '7': {'3822': 65.65267944335938, '3784': 65.75665283203125, '4784': 66.2749252319336, '3984': 67.447509765625, '3884': 67.74113464355469}

Transformation format to Dictionnaire car pytrc_eval

In [94]:
# Evaluate using pytrec_eval
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'ndcg', 'map'})
evaluation_results = evaluator.evaluate(run)

# Print evaluation results
for query_id, metrics in evaluation_results.items():
    for metric, value in metrics.items():
        print(f"Query ID: {query_id} - {metric}: {value}")

# Compute and print aggregated measure
try:
    aggregated_ndcg = pytrec_eval.compute_aggregated_measure(
        'ndcg',
        [metrics['ndcg'] for metrics in evaluation_results.values()]
    )
    print("Aggregated NDCG:", aggregated_ndcg)
except KeyError as e:
    print(f"KeyError: {e} not found in evaluation results.")


Query ID: 1 - map: 0.0
Query ID: 1 - ndcg: 0.0
Query ID: 2 - map: 0.0
Query ID: 2 - ndcg: 0.0
Query ID: 3 - map: 0.0
Query ID: 3 - ndcg: 0.0
Query ID: 4 - map: 0.0
Query ID: 4 - ndcg: 0.0
Query ID: 5 - map: 0.0
Query ID: 5 - ndcg: 0.0
Query ID: 6 - map: 0.0
Query ID: 6 - ndcg: 0.0
Query ID: 7 - map: 0.0
Query ID: 7 - ndcg: 0.0
Query ID: 8 - map: 0.0
Query ID: 8 - ndcg: 0.0
Query ID: 9 - map: 0.0
Query ID: 9 - ndcg: 0.0
Query ID: 10 - map: 0.0
Query ID: 10 - ndcg: 0.0
Query ID: 11 - map: 0.0
Query ID: 11 - ndcg: 0.0
Query ID: 12 - map: 0.0
Query ID: 12 - ndcg: 0.0
Query ID: 13 - map: 0.0
Query ID: 13 - ndcg: 0.0
Query ID: 14 - map: 0.0
Query ID: 14 - ndcg: 0.0
Query ID: 15 - map: 0.0
Query ID: 15 - ndcg: 0.0
Query ID: 16 - map: 0.0
Query ID: 16 - ndcg: 0.0
Query ID: 17 - map: 0.0
Query ID: 17 - ndcg: 0.0
Query ID: 18 - map: 0.0
Query ID: 18 - ndcg: 0.0
Query ID: 19 - map: 0.0
Query ID: 19 - ndcg: 0.0
Query ID: 20 - map: 0.0
Query ID: 20 - ndcg: 0.0
Query ID: 21 - map: 0.0
Query ID: 21 -

In [95]:
import pandas as pd
#qrles
qrel = {k: {kk:int(vv) for kk,vv in v[['doc_id','relevance']].values} for k, v in pd.DataFrame(dataset.qrels_iter()).groupby('query_id')[['doc_id','relevance']]}
evaluator = pytrec_eval.RelevanceEvaluator(
    qrel, {'map', 'ndcg_cut'})

In [96]:
pd.DataFrame(evaluator.evaluate(run)).T.mean()

map              0.0
ndcg_cut_5       0.0
ndcg_cut_10      0.0
ndcg_cut_15      0.0
ndcg_cut_20      0.0
ndcg_cut_30      0.0
ndcg_cut_100     0.0
ndcg_cut_200     0.0
ndcg_cut_500     0.0
ndcg_cut_1000    0.0
dtype: float64