# Colbert with Pyterrier 

## Setup

- install libraries (if necessary)
- all imports here
- connecting to tira & printoptions etc.

In [3]:
#!pip install transformers faiss-gpu faiss-cpu torch
#!pip install tira ir-datasets python-terrier
#!pip install sentence-transformers

#!pip install faiss-gpu==1.6.3  # version in colbert tutorial

[31mERROR: Could not find a version that satisfies the requirement faiss-gpu==1.6.3 (from versions: 1.7.1.post3, 1.7.2)[0m[31m
[0m[31mERROR: No matching distribution found for faiss-gpu==1.6.3[0m[31m
[0m

In [4]:
#!pip install --upgrade git+https://github.com/terrierteam/pyterrier_colbert.git

In [5]:
# ColBERTv2 checkpoint trained on MS MARCO Passage Ranking (388MB compressed)
!wget https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz -P downloads/
!tar -xvzf downloads/colbertv2.0.tar.gz -C downloads/

--2024-07-31 09:23:38--  https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405924985 (387M) [application/octet-stream]
Saving to: ‘downloads/colbertv2.0.tar.gz’


2024-07-31 09:24:52 (5.30 MB/s) - ‘downloads/colbertv2.0.tar.gz’ saved [405924985/405924985]

colbertv2.0/
colbertv2.0/artifact.metadata
colbertv2.0/vocab.txt
colbertv2.0/tokenizer.json
colbertv2.0/special_tokens_map.json
colbertv2.0/tokenizer_config.json
colbertv2.0/config.json
colbertv2.0/pytorch_model.bin


In [7]:
import os
import sys
import time
import json
import re
import importlib
import random

import numpy as np
import pandas as pd
import torch
import faiss
import pyterrier as pt

# Encoder and Tokenizer models
from transformers import AutoTokenizer, AutoModel
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizerFast, RobertaTokenizer
from transformers import DebertaModel, DebertaTokenizerFast, DebertaTokenizer
from transformers import DistilBertTokenizer, DistilBertModel
from sentence_transformers import SentenceTransformer

# Tira and Pyterrier Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.third_party_integrations import ir_datasets
from tira.rest_api_client import Client

In [8]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()


# colbert pyterrier
import pyterrier_colbert.indexing



# Print options for pandas
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.precision", 4)
pd.set_option("display.max_rows", None)
pd.set_option('display.float_format', '{:.5f}'.format)


# Use GPU if available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

# TODO: set seed!!


COLAB='google.colab' in sys.modules
if COLAB:
    # mount to drive
    pass


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


device: cpu


## The Dataset

### instanciate the dataset

In [9]:
checkpoint="http://www.dcs.gla.ac.uk/~craigm/colbert.dnn.zip"
dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

#!rm -rf /colbertindex

indexer = pyterrier_colbert.indexing.ColBERTIndexer(checkpoint, ".", "colbertindex", chunksize=3)
indexer.index(dataset.get_corpus_iter())


ir-lab-sose-2024/ir-acl-anthology-20240504-training documents:   0%|          | 0/126958 [00:00<?, ?it/s]

[Jul 31, 09:31:59] [0] 		 #> Local args.bsize = 128
[Jul 31, 09:31:59] [0] 		 #> args.index_root = .
[Jul 31, 09:31:59] [0] 		 #> self.possible_subset_sizes = [69905]


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ColBERT were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [4]:
# Mini-Test corpus - to use with the test-run cells
corpus = {
    "doc1": ["doc1", "Elephants are the largest living land animals. Three living species are currently recognised: the African bush elephant (Loxodonta africana), the African forest elephant (L. cyclotis), and the Asian elephant (Elephas maximus). They are the only surviving members of the family Elephantidae and the order Proboscidea; extinct relatives include mammoths and mastodons. Distinctive features of elephants include a long proboscis called a trunk, tusks, large ear flaps, pillar-like legs, and tough but sensitive grey skin. The trunk is prehensile, bringing food and water to the mouth and grasping objects. Tusks, which are derived from the incisor teeth, serve both as weapons and as tools for moving objects and digging. The large ear flaps assist in maintaining a constant body temperature as well as in communication. African elephants have larger ears and concave backs, whereas Asian elephants have smaller ears and convex or level backs."],
    "doc2": ["doc2", "Ants are eusocial insects of the family Formicidae and, along with the related wasps and bees, belong to the order Hymenoptera. Ants evolved from vespoid wasp ancestors in the Cretaceous period. More than 13,800 of an estimated total of 22,000 species have been classified. They are easily identified by their geniculate (elbowed) antennae and the distinctive node-like structure that forms their slender waists.\nAnts form colonies that range in size from a few dozen individuals often living in small natural cavities to highly organised colonies that may occupy large territories with sizeable nest that consist of millions of individuals or into the hundreds of millions in super colonies. Typical colonies consist of various castes of sterile, wingless females, most of which are workers (ergates), as well as soldiers (dinergates) and other specialised groups. Nearly all ant colonies also have some fertile males called \"drones\" and one or more fertile females called \"queens\" (gynes). The colonies are described as superorganisms because the ants appear to operate as a unified entity, collectively working together to support the colony."],
    "doc3": ["doc3", "Volkswagen (VW) is a German automobile manufacturer headquartered in Wolfsburg, Lower Saxony, Germany. Founded in 1937 by the German Labour Front under the Nazi Party and revived into the global brand it is known as today post-World War II by the British Army officer Ivan Hirst, it is known for the iconic Beetle and serves as the flagship brand of the Volkswagen Group, the largest automotive manufacturer by worldwide sales in 2016 and 2017.[1] The group's biggest market is China (including Hong Kong and Macau), which delivers 40 percent of its sales and profits.[2][3] Its name is derived from the German-language terms Volk and Wagen, translating to \"people's car\" when combined."],
    "doc4": ["doc4", "Bayerische Motoren Werke AG, commonly abbreviated to BMW (German pronunciation: [ˌbeːʔɛmˈveː]), is a German multinational manufacturer of luxury vehicles and motorcycles headquartered in Munich, Bavaria, Germany. The company was founded in 1916 as a manufacturer of aircraft engines, which it produced from 1917 to 1918 and again from 1933 to 1945 creating engines for aircraft that were used in the Second World War."],
    "doc5": ["doc5","Dragon Ball (Japanese: ドラゴンボール, Hepburn: Doragon Bōru) is a Japanese media franchise created by Akira Toriyama in 1984. The initial manga, written and illustrated by Toriyama, was serialized in Weekly Shōnen Jump from 1984 to 1995, with the 519 individual chapters collected in 42 tankōbon volumes by its publisher Shueisha. Dragon Ball was originally inspired by the classical 16th-century Chinese novel Journey to the West, combined with elements of Hong Kong martial arts films. Dragon Ball characters also use a variety of East Asian martial arts styles, including karate[1][2][3] and Wing Chun (kung fu).[2][3][4] The series follows the adventures of protagonist Son Goku from his childhood through adulthood as he trains in martial arts. He spends his childhood far from civilization until he meets a teen girl named Bulma, who encourages him to join her quest in exploring the world in search of the seven orbs known as the Dragon Balls, which summon a wish-granting dragon when gathered. Along his journey, Goku makes several other friends, becomes a family man, discovers his alien heritage, and battles a wide variety of villains, many of whom also seek the Dragon Balls."],
    "doc6": ["doc6", "The Matrix is a 1999 science fiction action film[5][6] written and directed by the Wachowskis.[a] It is the first installment in the Matrix film series, starring Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving and Joe Pantoliano, and depicts a dystopian future in which humanity is unknowingly trapped inside the Matrix, a simulated reality that intelligent machines have created to distract humans while using their bodies as an energy source.[7] When computer programmer Thomas Anderson, under the hacker alias \"Neo\", uncovers the truth, he joins a rebellion against the machines along with other people who have been freed from the Matrix."],
    "doc7": ["doc7", "In computer science, Backus–Naur form (/ˌbækəs ˈnaʊər/) (BNF or Backus normal form) is a notation used to describe the syntax of programming languages or other formal languages. It was developed by John Backus and Peter Naur. BNF can be described as a metasyntax notation for context-free grammars. Backus–Naur form is applied wherever exact descriptions of languages are needed, such as in official language specifications, in manuals, and in textbooks on programming language theory. BNF can be used to describe document formats, instruction sets, and communication protocols. Over time, many extensions and variants of the original Backus–Naur notation have been created; some are exactly defined, including extended Backus–Naur form (EBNF) and augmented Backus–Naur form (ABNF)."],
    "doc8": ["doc8", "Spaceflight (or space flight) is an application of astronautics to fly objects, usually spacecraft, into or through outer space, either with or without humans on board. Most spaceflight is uncrewed and conducted mainly with spacecraft such as satellites in orbit around Earth, but also includes space probes for flights beyond Earth orbit. Such spaceflight operate either by telerobotic or autonomous control. The more complex human spaceflight has been pursued soon after the first orbital satellites and has reached the Moon and permanent human presence in space around Earth, particularly with the use of space stations. Human spaceflight programs include the Soyuz, Shenzhou, the past Apollo Moon landing and the Space Shuttle programs. Other current spaceflight are conducted to the International Space Station and to China's Tiangong Space Station."],
    "doc9": ["doc9", "Pippi Longstocking (Swedish: Pippi Långstrump) is the fictional main character in an eponymous series of children's books by Swedish author Astrid Lindgren. Pippi was named by Lindgren's daughter Karin, who asked her mother for a get-well story when she was off school.  Pippi is red-haired, freckled, unconventional and superhumanly strong – able to lift her horse one-handed. She is playful and unpredictable. She often makes fun of unreasonable adults, especially if they are pompous and condescending. Her anger comes out in extreme cases, such as when a man mistreats his horse. Pippi, like Peter Pan, does not want to grow up. She is the daughter of a buccaneer captain and has adventure stories to tell about that, too. Her four best friends are her horse and monkey, and the neighbours' children, Tommy and Annika."],
    "doc10": ["doc10", "Food processing is the transformation of agricultural products into food, or of one form of food into other forms. Food processing takes many forms, from grinding grain into raw flour, home cooking, and complex industrial methods used in the making of convenience foods. Some food processing methods play important roles in reducing food waste and improving food preservation, thus reducing the total environmental impact of agriculture and improving food security."]
}

In [6]:
DATA_PATH = "."
CORPUS_PATH = os.path.join(DATA_PATH, "dataset_corpus.json")

# Load the dataset
if os.path.exists(CORPUS_PATH):
    with open(CORPUS_PATH, "r") as f:
        corpus = json.load(f)
else:
    dataset = ir_datasets.load("ir-lab-sose-2024/ir-acl-anthology-20240504-training")
    corpus = dataset.docs_store().docs
    with open(CORPUS_PATH, "w") as f:
        json.dump(obj=corpus, fp=f, indent=2, ensure_ascii=False)
    del dataset # Free space? or is this unnecessary??

print(f"{len(corpus)} documents.")

126958 documents.


#### create subset of corpus - only documents that appear in qrels

In [5]:
# Test corpus of only relevant document (+ a few nonrelevant)
dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

relevant_docnos = dataset.get_qrels()["docno"].unique()

# some random choice of non-relevant docs added to corpus subset
nonrelevant_docnos = list(corpus.keys() - set(relevant_docnos))

nonrelevant_docnos = np.random.choice(nonrelevant_docnos, size=100)
relevant_docnos = list(relevant_docnos) + list(nonrelevant_docnos)

corpus = {k: corpus[k] for k in relevant_docnos}
print(f"{len(corpus)} relevant documents. (relevant to dev-set)")

2405 relevant documents. (relevant to dev-set)


#### different corpus structures

In [5]:
# corpus is originally a dict: {"docno": ["docno", "text"], }
# now    like this:  records   [{"docno": "docno", "text": "text"}, ...] # easy to make a dataframe from this
#     or like this:  list      [["docno", "text"], ...] # same as original but as list
#     or like this:  dict      {"docno": "text", ...} # i think this is easiest to handle.

list_corpus = [v for v in corpus.values()]
record_corpus = [{"docno": v[0], "text": v[1]} for v in corpus.values()]
dict_corpus = {v[0]: v[1] for v in corpus.values()}

#### batch the corpus

In [7]:
# Hilfsfunktionen zum batchen, damit ich es nicht jedesmal umschreiben muss, wenn ich eine andere corpus struktur versuche
def batch_dict_corpus(corpus, batch_size):
    """ Hilfsfunktion um einen dict corpus {docno: ["docno", "text"], ...} zu batchen. """
    batches = []
    corpus_keys = list(corpus.keys())
    for anker in range(0, len(corpus), batch_size):
        batch_keys = corpus_keys[anker:anker+batch_size]
        batch = {k:corpus[k] for k in batch_keys}
        batches.append(batch)
    return batches

def batch_list_corpus(corpus, batch_size):
    """ Hilfsfunktion um einen list corpus [["docno", "text"], ...] oder [{"docno":"docno", "text":"text"}] zu batchen. """
    batches = [corpus[i:i+batch_size] for i in range(0, len(corpus), batch_size)]
    return batches

def batch_corpus(corpus, batch_size):
    if type(corpus) == dict:
        return batch_dict_corpus(corpus, batch_size)
    elif type(corpus) == list:
        return batch_list_corpus(corpus, batch_size)

In [8]:
batch_size = 500
batched_corpus = batch_corpus(dict_corpus, 500)
print(f"{len(batched_corpus)} batches.")

NameError: name 'dict_corpus' is not defined

### preprocessing

TODO: 
- outlier entfernen?
- split nach sätzen?

#### cleaning up

In [9]:
# Most common abbrevations in corpus and other small things to substitute
abbrevations = {
    "e.g.": "for example",
    "E.g.": "for example",
    "U.S.": "united states",
    "w.r.t.": "with respect to",
    "i.e.": "that is",
    "i.i.d.": "independent and identically distributed",
    "i.i.": "independent and identically",
    "v.s.": "versus", "vs.": "versus",
    "etc.": "and so on", #TODO: besser et cetera? oder ist das zu exotisch
    "1st": "first", "2nd": "second", "3rd": "third", "4th": "fourth", "5th": "fifth",
    "e2e": "end-to-end",
    "E2E": "end-to-end",
    "iii)": "", "ii)": "", "i)": "", "iv)": "", "v)": "",
    "?": ".", "!": ".",
    "a)": "", "b)": "", "c)": "", "d)": "", "e)": ""
}

# Very common letter-number-combinations that will not be substituted
letter_number_exceptions = ["L2","F1","L1","F2","seq2seq","Seq2Seq","word2vec","Word2Vec","2D"]

def preprocess_text(text, lower=False, years=False, percentages=False, numbers=False, 
                        letter_numbers=False, abbrev=False, special_characters=False):
    if lower:
        text = text.lower()

    # Substitue Years
    if years:
        text = re.sub(r'\b(19|20)\d{2}\b', 'YEAR', text)

    # Substitue Percentages
    if percentages:
        text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?%', "PERCENTAGE", text)

    # Substitute all remaining numbers
    if numbers:
        text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', 'NUMBER', text)

    # Remove words that are combinations of letters and numbers 
    # (except L2, F1, word2vec, ... common in corpus and probably important for context)
    if letter_numbers:
        #pattern = r'\b(?!(L2|F1|L1|F2|seq2seq|word2vec|Seq2Seq|Word2Vec|2D)\b)\w*\d+\w*\b'
        pattern = rf'\b(?!({"|".join(letter_number_exceptions)})\b)\w*\d+\w*\b'
        text = re.sub(pattern, '', text)

    # Substitute most common abbrevations
    if abbrev:
        for abbrevation, substitution in abbrevations.items():
            text = text.replace(abbrevation, substitution)

    # Remove all characters that are not normal text
    if special_characters:
        text = re.sub(r'[^a-zA-Z0-9\s\-\.\,]', '', text)

    # Punkt hinter Titel des papers setzen, falls bert genutzt wird, [SEP] token hinter titel setzten.?????
    #text = re.sub(r'\n\n', ". ", text)
    if len(text.split("\n\n")) < 2:
        text = re.sub(r'\n\n', ". ", text)  # if bert is used, sep is added at the end
    else:
        text = re.sub(r'\n\n', ". [SEP] " if bert else ". ", text)

    # Aufeinanderfolgende whitespaces durch einzelnes blank ersetzen.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

#### Splitting corpus into shorter passages

In [84]:
def split_corpus(corpus, split_size, overlap, tokens=True):
    """ corpus must be record_corpus """ # FIXME
    split_corpus = []

    for item in corpus:
        docno = item["docno"]
        text = item["text"]

        if tokens: # split auf token-ebene
            # preprocessing sollte vorher geschehen!
            #text = preprocess_text(text, *([False]*7)) # shortcut for writing False seven times. I need to change the parameters...
            text = text.split(" ") # text becomes list of tokens
        # else: split auf character-ebene -> string wird indiziert statt eine liste

        split_text = [text[max(0,i-overlap):i+split_size] 
                        for i in range(0, len(text), split_size)]

        for text_passage in split_text:
            split_corpus.append({"docno": docno, "text": text_passage})

    return split_corpus

In [85]:

#record_corpus = [{
#    "docno": item["docno"],
#    "text": preprocess_text(item["text"], *([False]*7))
#} for item in record_corpus ]

token_split_size = 200
token_overlap = 50
splitted_corpus = split_corpus(record_corpus, token_split_size, token_overlap, tokens=False)

print(f"Im Schnitt {round(len(splitted_corpus)/len(corpus))} Korpus-Einträge pro dokument")
batched_corpus = batch_corpus(splitted_corpus, batch_size=20)

Im Schnitt 7 Korpus-Einträge pro dokument


### little fast analysis

In [None]:
# Little Dataset-text-length analysis: 
# TODO: make a "train"-dataset, that has outliers removed (too long /short docs)

no_titles = []
lengths = []
for i, (docno, text) in enumerate(dict_corpus.items()):
    if len(text.split("\n\n")) < 2:
        no_titles.append(docno)
    else:
        length = len(preprocess_text(text, *([True]*7)).split(" "))
        lengths.append(length)

print("n docs without abstract", len(no_titles)) # too many


import matplotlib.pyplot as plt

lengths = np.array(lengths)

print(sum((lengths < 350)*(lengths > 50)))

plt.hist([l for l in lengths if l < 350 and l > 50], bins=100)
plt.show()

## The Model / The Retrieval System

### the model

In [10]:
# FIXME: das hier nur schnelle lösung! umbedingt besser machen!!
model_name_to_type_map = {
    "prajjwal1/bert-mini": [BertModel, BertTokenizer],
    "microsoft/deberta-base": [BertModel, BertTokenizer], # DebertaModel? oder sogar v2? was ist der unterschied??
    "intfloat/e5-small-v2": [AutoModel, AutoTokenizer],
    "intfloat/e5-base-v2": [AutoModel, AutoTokenizer],
    "thenlper/gte-small": [AutoModel, AutoTokenizer],
    "thenlper/gte-base": [AutoModel, AutoTokenizer],
    "olm/olm-roberta-base-dec-2022": [RobertaModel, RobertaTokenizer],
    "distilbert-base-uncased": [DistilBertModel, DistilBertTokenizer],
    "paraphrase-MiniLM-L6-v2": [SentenceTransformer, None],
    "allenai/specter": [AutoModel, AutoTokenizer],
}

def load_model(name):
    try:
        model_class, tokenizer_class = model_name_to_type_map[name]
    except Exception as e:
        print(f"model {name} not defined yet. ({e})")
        return None

    model = model_class.from_pretrained(name)
    if tokenizer_class is not None:
        tokenizer = tokenizer_class.from_pretrained(name)
    else:
        tokenizer = None

    return model, tokenizer



In [19]:
# Load the model (TinyBERT or another) 
# TODO: model config (model: "DeBerta", and pretrained: "microsoft/deberta-base" )

#model_name = "prajjwal1/bert-mini"
#model_name = "microsoft/deberta-base" # ACHTUNG FEHLER BEI TOKENIZER! FIXME
#model_name = 'intfloat/e5-base-v2' # add "query: " before queries and "passage: " before passages!
#model_name = 'intfloat/e5-small-v2' # add "query: " before queries and "passage: " before passages!
#model_name = "thenlper/gte-small"
model_name = "thenlper/gte-base"
#model_name = "olm/olm-roberta-base-dec-2022"  ## nicht so gut
#model_name = 'allenai/specter' # Mit average=False benutzen! und FlatIP statt FlatL2! # Spezialisiert auf Scientific Papers

model, tokenizer = load_model(model_name)
model = model.to(device)

# colbert model checkpoint
checkpoint = 'downloads/colbertv2.0'
doc_maxlen=300

### the embedding (of document corpus)

In [12]:
def average_pool(last_hidden_states, attention_mask):
    """ Calculates average pooling of hidden states (with attention mask) """
    # mask paddings with 0 -> ignore in average calculation
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) 
    #last_hidden = last_hidden_states
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def encode(model, tokenizer, texts, max_length=512, avg_pool=False): # avg. doc length = 144 (after preprocessing only those with abstract.)
    """ Encode texts with model """
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        last_hidden_states = outputs.hidden_states[-1]
        if avg_pool: 
            return average_pool(last_hidden_states, inputs["attention_mask"])
        else: # [CLS] embeddings
            return last_hidden_states[:,0,:]  # cls embeddings
    return None


def encode_with_sentence_transformer(model, texts, prompt):
    """ Encode texts with model """
    with torch.no_grad():
        embeddings = model(texts)#, prompt="Retrieve semantically relevant texts")
    return embeddings


In [13]:
def encode_documents(corpus, model, tokenizer, batched=True, avg_pool=True, 
                     normalize=True, preprocess=True, **preprocess_params):
    """ corpus is list-corpus """
    if not batched:
        corpus = [corpus]
    if type(corpus[0]) == list and type(corpus[0][0]) == list: # FIXME
        print("WRONG CORPUS STRUCTURE; ONLY LIST-CORPUS OR RECORD-CORPUS ALLOWEDD!")

    embeddings = None # will be np.array of shape [num_docs, embedding_size]
    docnos = []  # for embedding-vector index to docno translation
    for j, batch in enumerate(corpus):
        print(f"\rBatch {j+1:3d}/{len(corpus)} ", end="")

        if type(batch) == dict: # FIXME: unsauber!! irgendwie anders lösen, oder auf ein Format festlegen
            docnos += list(batch.keys())
            texts = list(batch.values())
        elif type(batch) == list:
            docnos += [i["docno"] for i in batch]
            texts = [i["text"] for i in batch]

        #if preprocess:
        #    texts = [preprocess_text(t, **preprocess_params) for t in texts]

        #if "e5" in model_name.lower():
        #    texts = ["passage: "+t for t in texts]
        
        batch_embeddings = encode(model=model, tokenizer=tokenizer, texts=texts, avg_pool=avg_pool)

        if embeddings is None:
            embeddings = batch_embeddings
        else:
            embeddings = torch.concatenate([embeddings, batch_embeddings], dim=0)

    if normalize:
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return docnos, embeddings # TODO: yield docnos, embeddings!? -> speicherschonender?


In [15]:
# Encode the document corpus
batched = True
avg_pool=True
normalize=False
preprocess = False
preprocess_params = {
    "lower": True,
    "numbers": True,
    "letter_numbers": True,
    "abbrev": True,
    "special_characters": True,
    "bert": False,
}

#docnos, embeddings = encode_documents(batched_corpus, model, tokenizer, batched=batched, normalize=normalize,
#                                      avg_pool=avg_pool, preprocess=preprocess, **preprocess_params)
#print("embeddings shape:", embeddings.shape)

with open("encoded_corpus/gte_base-vanilla_avgpool-docnos.txt", "r") as f:
    docnos = json.load(f)

with open("encoded_corpus/gte_base-vanilla_avgpool-embeddings.npy", "rb") as f:
    embeddings = np.load(f)

#embeddings = embeddings.cpu().numpy().astype(np.float32) # TODO: variante: gpu-index 
embedding_size = embeddings.shape[1]
if np.isnan(embeddings).any():
    print("WARNUNG: NaN-Werte in den Embeddings gefunden!")

### the index (faiss)

In [48]:
# Create a FAISS index with embeddings (and docnos for indices)
embedding_size = embeddings.shape[1]

index_type = "FlatIP"
#index_type = "LSH"
#index_params = {"d":embedding_size, "n_bits": 4*embedding_size}

if index_type == "FlatIP":
    index = faiss.IndexFlatIP(embedding_size)
elif index_type == "FlatL2":
    index = faiss.IndexFlatL2(embedding_size)
else:
    index = faiss.index_factory(embedding_size, index_type)#(**index_params) # TODO: wie kann man mit index factory parameter übergeben?????

#index = faiss.IndexHNSWFlat(embedding_size,8) 

faiss.normalize_L2(embeddings)
index.add(embeddings)

In [49]:
index_dir = "./indexe"
index_name = "tiny_bert-with_tokens.index"
faiss.write_index(index, os.path.join(index_dir, index_name))

In [16]:
index_dir = "./indexe"
index_name = "ivf_10000_IP-gte_base.index"
index = faiss.read_index(os.path.join(index_dir, index_name))

In [None]:
def search_index(embedding, index, k):
    d, c = index.search(embedding, k) # distances, candidates

## The Retrieval

In [17]:
# Kleine hilfsfunktionen
def distance2score(distances):
    return np.exp(-distances)

def softmax(scores):
    return np.exp(scores) / sum(np.exp(scores))

### test-run (with mini-corpus)

In [67]:
queries = [
    "I like animals",
    "Who builds Ferraris",
    "What do kids like?",
    "where is the moon",
#    "how far away is the sun from the earth",
    "what is nebukadnezar",
    "i am interested in maths",
    "is a hot-dog a sandwich",
    #"What is the difference between Indian and African elephants",
]

# for e5
if preprocess:
    queries = [preprocess_text(q, **preprocess_params) for q in queries]
#queries = ["querie: "+q for q in queries]


results_df = None
for query in queries:
    query_embedding = encode(model=model, tokenizer=tokenizer, texts=[query], avg_pool=avg_pool).cpu().numpy()
    faiss.normalize_L2(query_embedding)

    d, c = index.search(query_embedding, k=5) # d=distances, c=candidates
    if index_type == "FlatL2":
        d = distance2score(d)

    results = sorted(list(zip(d[0], c[0])), key=lambda x: x[0], reverse=True)
    results = [{"docno": docnos[candidate], "score": score, "rank": i+1, "query": query} 
                for i, (score,candidate) in enumerate(results)]

    for item in results:
        item_doc = item["docno"]
        item["doc-text"] = dict_corpus[item_doc][:20]+"..."

    if results_df is None:
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.concat([results_df, pd.DataFrame(results)])

#print(results_df)
#display(results_df)

In [68]:
display(results_df)

Unnamed: 0,docno,score,rank,query,doc-text
0,doc9,0.69713,1,I like animals,Pippi Longstocking (...
1,doc9,0.68957,2,I like animals,Pippi Longstocking (...
2,doc9,0.67891,3,I like animals,Pippi Longstocking (...
3,doc2,0.66673,4,I like animals,Ants are eusocial in...
4,doc2,0.64393,5,I like animals,Ants are eusocial in...
0,doc6,0.65441,1,Who builds Ferraris,The Matrix is a 1999...
1,doc6,0.65358,2,Who builds Ferraris,The Matrix is a 1999...
2,doc5,0.63777,3,Who builds Ferraris,Dragon Ball (Japanes...
3,doc6,0.637,4,Who builds Ferraris,The Matrix is a 1999...
4,doc5,0.63136,5,Who builds Ferraris,Dragon Ball (Japanes...


#### with sentence transformer

In [None]:
# WITH SENTENCE TRANSFORMER
queries = [ "I like animals", "Who builds Ferraris", "What do kids like?", "where is the moon", "what is nebukadnezar", "maths is interesting", "is a hot-dog a sandwich", ]
queries = [preprocess_text(q, True, *([False]*5), True) for q in queries]

results_df = None

for query in queries:
    query_embedding = encode_with_sentence_transformer(model, [query])
    similarities = index.search(query_embedding, k=20) # d=distances, c=candidates # TODO: mit model den score berechnen 

    print(Counter(d[0]).most_common(2))
    if index_type == "FlatL2":
        d = distance2score(d)

    results = sorted(list(zip(d[0], c[0])), key=lambda x: x[0], reverse=True)
    results = [{"docno": docnos[candidate], "score": score, "rank": i+1, "query": query} 
                for i, (score,candidate) in enumerate(results)]

    for item in results:
        item_doc = item["docno"]
        item["doc-text"] = dict_corpus[item_doc][:20]+"..."

    if results_df is None:
        results_df = pd.DataFrame(results)
    else:
        results_df = pd.concat([results_df, pd.DataFrame(results)])

#print(results_df)
#display(results_df)


In [None]:
frequencies = results_df.groupby(['query', 'docno']).size().reset_index(name='count')

# Schritt 2: Ermitteln der häufigsten docno für jede query
max_frequencies = frequencies.loc[frequencies.groupby('query')['count'].idxmax()]

# Schritt 3: Zusammenführen des ursprünglichen DataFrame mit den häufigsten docno pro query
result_df = pd.merge(results_df, max_frequencies[['query', 'docno']], on=['query', 'docno'])
display(result_df)

### RUN with real corpus

In [17]:
# Now with the dataset queries
dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')
#queries = dataset.get_topics(variant="description") # only the question / the user-query

In [20]:
# Run through queries and search the relevant documents

#name = "vanilla_mini-bert"  # name of this run
#name = "mini-bert_with-tokens"
name = "gte_base-vanilla-ivf_10000_IP"

run = []
for i, row in enumerate(dataset.get_topics(variant="description").to_dict(orient="records")):
    query = row["query"]
    #if preprocess:
    #    query = preprocess_text(query, **preprocess_params)
    # Encode the query
    query_embedding = encode(model, tokenizer, [query]).cpu().numpy() # TODO: gpu variant
    #query_embedding = query_embedding.astype(np.float32)  # brauch ich das wirklich für faiss???
    faiss.normalize_L2(query_embedding)

    # Search in the Index
    scores, candidates = index.search(query_embedding, k=10)

    # Case FlatL2 -> returns distances, lowest distance is best -> translate it to scores where highest is best.
    #if index_type == "FlatL2":
    #    scores = distance2score(scores)

    # Ergebnisse sollten bereits sortiert sein, nur nochmal zur Sicherheit:
    results = sorted(list(zip(scores[0], candidates[0])), key=lambda x: x[0], reverse=True)

    for j, (score, candidate) in enumerate(results):
        run.append({ "qid": row["qid"], "docno": docnos[candidate],
                     "rank": j+1, "score": score, "name": name})

In [21]:
rundir = "./runs"

runfile = os.path.join(rundir, name+"_run.txt")
with open(runfile, "w") as f:
    for item in run:
        # schreibt die selben sachen, die persist_and_normalize_run() schreibt
        f.write(f"{item['qid']} 0 {item['docno']} {item['rank']} {item['score']} {name}\n") 

## Evaluation

In [22]:
# Some baselines that were executed in TIRA
bm25_baseline = tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 (tira-ir-starter-pyterrier)', dataset)
sparse_cross_encoder = tira.pt.from_submission('ir-benchmarks/fschlatt/sparse-cross-encoder-4-512', dataset)
rank_zephyr = tira.pt.from_submission('workshop-on-open-web-search/fschlatt/rank-zephyr', dataset)

In [23]:
import glob
# This assumes we have execited the ../baseline-retrieval-system/baseline-retrieval-system.ipynb notebook before.
run_files = sorted(list(glob.glob(os.path.join(rundir, "*.txt"))))
methods = [pt.io.read_results(run_file_path) for run_file_path in run_files]
run_names = [name.split("/")[-1].split(".")[0] for name in run_files]

pt.Experiment(
    [bm25_baseline, sparse_cross_encoder, rank_zephyr] + methods,
    dataset.get_topics(),
    dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_100", "map"],
    names=["BM 25 (Baseline)", "Sparse Cross Encoder", "RankZephyr"]+run_names
)

There are multiple query fields available: ('text', 'title', 'query', 'description', 'narrative'). To use with pyterrier, provide variant or modify dataframe to add query column.


Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_100,map
0,BM 25 (Baseline),0.37404,0.57988,0.60133,0.26231
1,Sparse Cross Encoder,0.36646,0.61298,0.60133,0.24126
2,RankZephyr,0.34707,0.56841,0.60133,0.26749
3,gte_base-vanilla-ivf_10000_IP_run,0.10077,0.20662,0.0617,0.04235
4,mini-bert_with-tokens_run,0.06224,0.14412,0.02821,0.01344
5,mini_bert-full_preprocessing_run,0.05907,0.15809,0.02533,0.01325
6,vanilla_mini-bert_run,0.07913,0.19789,0.02745,0.01708


In [53]:
del corpus
del dataset
del index
del model
del tokenizer