## Setup

In [1]:
#!pip install transformers faiss-gpu faiss-cpu torch
#!pip install tira ir-datasets python-terrier
#!pip install sentence-transformers

In [2]:
import os
import time
import json
import re
import importlib
import random
import gc

import numpy as np
import pandas as pd
import torch
import pyterrier as pt
import faiss

# Encoder and Tokenizer models
from transformers import AutoTokenizer, AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Tira and Pyterrier Imports
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.third_party_integrations import ir_datasets
from tira.rest_api_client import Client

In [3]:
# Create a REST client to the TIRA platform for retrieving the pre-indexed data.
ensure_pyterrier_is_loaded()
tira = Client()

# Print options for pandas
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.precision", 4)
pd.set_option("display.max_rows", None)
pd.set_option('display.float_format', '{:.5f}'.format)


# Use GPU if available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

# TODO: set seed!!

os.makedirs("./indexe", exist_ok=True)

PyTerrier 0.10.1 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7



device: cuda


## The Dataset

### initialize the dataset

In [4]:
DATA_PATH = "."
CORPUS_PATH = os.path.join(DATA_PATH, "dataset_corpus.json")

# Load the dataset
if os.path.exists(CORPUS_PATH):
    with open(CORPUS_PATH, "r") as f:
        corpus = json.load(f)
else:
    dataset = ir_datasets.load("ir-lab-sose-2024/ir-acl-anthology-20240504-training")
    corpus = dataset.docs_store().docs
    with open(CORPUS_PATH, "w") as f:
        json.dump(obj=corpus, fp=f, indent=2, ensure_ascii=False)
    del dataset # Free space? or is this unnecessary??

print(f"{len(corpus)} documents.")

126958 documents.


In [None]:
# corpus is originally a dict: {"docno": ["docno", "text"], }
#               now like this: {"docno": "text", ...}  #  easier to handle.

dict_corpus = {v[0]: v[1] for v in corpus.values()}

#### batch the corpus

In [None]:
def batch_corpus(corpus, batch_size):
    corpus_keys = list(corpus.keys())
    for anker in range(0, len(corpus), batch_size):
        batch_keys = corpus_keys[anker:anker+batch_size]
        yield {k: corpus[k] for k in batch_keys}

### preprocessing
TOdo: outlier entfernen?

In [None]:
# Most common abbrevations in corpus and other small things to substitute
abbrevations = {
    "e.g.": "for example",
    "E.g.": "for example",
    "U.S.": "united states",
    "w.r.t.": "with respect to",
    "i.e.": "that is",
    "i.i.d.": "independent and identically distributed",
    "i.i.": "independent and identically",
    "v.s.": "versus", "vs.": "versus",
    "etc.": "and so on", #TODO: besser et cetera? oder ist das zu exotisch
    "1st": "first", "2nd": "second", "3rd": "third", "4th": "fourth", "5th": "fifth",
    "e2e": "end-to-end",
    "E2E": "end-to-end",
    "iii)": "", "ii)": "", "i)": "", "iv)": "", "v)": "",
    "?": ".", "!": ".",
    "a)": "", "b)": "", "c)": "", "d)": "", "e)": ""
}

# Very common letter-number-combinations that will not be substituted
letter_number_exceptions = ["L2","F1","L1","F2","seq2seq","Seq2Seq","word2vec","Word2Vec","2D"]

def preprocess_text(text, lower=False, years=False, percentages=False, numbers=False, 
                        letter_numbers=False, abbrev=False, special_characters=False):
    # reihenfolge ist wichtig!
    if lower:
        text = text.lower()
    if years:
        text = re.sub(r'\b(19|20)\d{2}\b', 'YEAR', text)
    if percentages:
        text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?%', "PERCENTAGE", text)

    # all remaining numbers
    if numbers:
        text = re.sub(r'\b\d{1,3}(?:,\d{3})*(?:\.\d+)?\b', 'NUMBER', text)

    # Remove words that are combinations of letters and numbers 
    # (except L2, F1, word2vec, ... common in corpus and probably important for context)
    if letter_numbers:
        #pattern = r'\b(?!(L2|F1|L1|F2|seq2seq|word2vec|Seq2Seq|Word2Vec|2D)\b)\w*\d+\w*\b'
        pattern = rf'\b(?!({"|".join(letter_number_exceptions)})\b)\w*\d+\w*\b'
        text = re.sub(pattern, '', text)

    # Substitute most common abbrevations
    if abbrev:
        for abbrevation, substitution in abbrevations.items():
            text = text.replace(abbrevation, substitution)

    # Remove all characters that are not normal text
    if special_characters:
        text = re.sub(r'[^a-zA-Z0-9\s\-\.\,]', '', text)

    # Punkt hinter Titel des papers setzen, falls bert genutzt wird, [SEP] token hinter titel setzten.?????
    #text = re.sub(r'\n\n', ". ", text)
    if len(text.split("\n\n")) < 2:
        text += "."
    else:
        text = re.sub(r'\n\n', ". ", text)

    # Aufeinanderfolgende whitespaces durch einzelnes blank ersetzen.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

## Encoding the Corpus

### the model

In [None]:
# FIXME: das hier nur schnelle lösung! umbedingt besser machen!!
model_name_to_type_map = { # map for models that do not use AutoModel
    "paraphrase-MiniLM-L6-v2": [SentenceTransformer, None],
}

def load_model(name):
    if name in model_name_to_type_map.keys():
        model_class, tokenizer_class = model_name_to_type_map[name]
    else:
        model_class, tokenizer_class = AutoModel, AutoTokenizer

    model = model_class.from_pretrained(name)
    tokenizer = tokenizer_class.from_pretrained(name) if tokenizer_class is not None else None
    return model, tokenizer

In [None]:
# Load the model (TinyBERT or another) 
# TODO: model config (model: "DeBerta", and pretrained: "microsoft/deberta-base" )

#model_name = "prajjwal1/bert-mini"
#model_name = "microsoft/deberta-base" # ACHTUNG FEHLER BEI TOKENIZER! FIXME
#model_name = 'intfloat/e5-base-v2' # add "query: " before queries and "passage: " before passages!
#model_name = 'intfloat/e5-small-v2' # add "query: " before queries and "passage: " before passages!
#model_name = "thenlper/gte-small"
model_name = "thenlper/gte-base"
#model_name = "olm/olm-roberta-base-dec-2022"  ## nicht so gut
#model_name = 'allenai/specter' # Mit average=False benutzen! und FlatIP statt FlatL2! # Spezialisiert auf Scientific Papers

model, tokenizer = load_model(model_name)
model = model.to(device)

In [None]:
model = AutoModel.from_pretrained("gte-small-ft-mlm")
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-small")
model = model.to(device)

### the embedding (of document corpus)

In [11]:
def average_pool(last_hidden_states, attention_mask):
    """ Calculates average pooling of hidden states (with attention mask) """
    # mask paddings with 0 -> ignore in average calculation
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) 
    #last_hidden = last_hidden_states # without using mask (is this even worth considering?)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


def encode(model, tokenizer, texts, max_length=512, avg_pool=False): # avg. doc length = 144 (after preprocessing only those with abstract.)
    """ Encode texts with model """
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        last_hidden_states = outputs.hidden_states[-1]
        if avg_pool: 
            result = average_pool(last_hidden_states, inputs["attention_mask"])
        else: # [CLS] embeddings
            result = last_hidden_states[:,0,:]

    # GPU RAM leeren (weil es immer voll gelaufen ist)
    result.cpu()
    del outputs
    del last_hidden_states
    del inputs
    torch.cuda.empty_cache()
    gc.collect()
    return result

In [12]:

def encode_documents(corpus, model, tokenizer, batch_size=100, avg_pool=False,
                     normalize=False, preprocess=False, **preprocess_params):

    embeddings = None # will be np.array of shape [num_docs, embedding_size]
    docnos = []  # for embedding-vector index to docno translation
    for j, batch in enumerate(batch_corpus(corpus, batch_size)):
        print(f"\rBatch {j+1:3d}/{len(corpus)} ", end="")

        docnos += list(batch.keys())
        texts = list(batch.values())

        if preprocess:
            texts = [preprocess_text(t, **preprocess_params) for t in texts]

        #if "e5" in model_name.lower(): # FIXME
        #    texts = ["passage: "+t for t in texts]

        batch_embeddings = encode(model=model, tokenizer=tokenizer, texts=texts, avg_pool=avg_pool)

        if embeddings is None:
            embeddings = batch_embeddings
        else:
            embeddings = torch.concatenate([embeddings, batch_embeddings], dim=0)

    if normalize:
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
    return docnos, embeddings # TODO: yield docnos, embeddings!? -> speicherschonender? macht generator überhaupt sinn???


In [13]:
# Encode the document corpus
batch_size = 500
avg_pool   = True
normalize  = False
preprocess = False
preprocess_params = {
    "lower": True,
    "numbers": True,
    "letter_numbers": True,
    "abbrev": True,
    "special_characters": True,
}

docnos, embeddings = encode_documents(corpus, model, tokenizer, batch_size, normalize=normalize,
                                      avg_pool=avg_pool, preprocess=preprocess, **preprocess_params)
print("embeddings shape:", embeddings.shape)

embedding_size = embeddings.shape[1]
if np.isnan(embeddings).any():
    print("WARNUNG: NaN-Werte in den Embeddings gefunden!")

Batch 1270/1270 embeddings shape: torch.Size([126958, 768])


In [14]:
# SAVE
with open("gte_base-preprocessed_avgpool-embeddings.npy", "wb") as f:
    np.save(f, embeddings)

with open("gte_base-preprocessed_avgpool-docnos.txt", "w") as f:
    json.dump(docnos, f)
    #f.write("\n".join(docnos))

In [None]:
# LOAD
with open("gte_base-preprocessed_avgpool-docnos.txt", "r") as f:
    docnos = json.load(f)

with open("gte_base-preprocessed_avgpool-embeddings.npy", "rb") as f:
    embeddings = np.load(f)

## the Index

### configuration & initialisation

In [None]:
def create_faiss_index(embedding_size, index_type, **index_params):
    """ Create a FAISS index using the index factory. 
    index_type (str): The type of index to create (e.g., "Flat", "HNSW", "IVF", "PQ", etc)
    **index_params: Additional parameters for the index
    Returns the created FAISS index with embeddings added
    """
    
    metric = index_params.get("metric", "IP")
    
    # Construct the index string
    if index_type == "Flat":
        index_string = f"Flat{metric}"
    elif index_type == "HNSW":
        M = index_params.get('M', 16)
        index_string = f"HNSW{M},{metric}"
    elif index_type == "IVF":
        nlist = index_params.get('nlist', 100)
        index_string = f"IVF{nlist},Flat{metric}"
    elif index_type == "PQ":
        m = index_params.get('m', 8)
        bits = index_params.get('bits', 8)
        index_string = f"PQ{m}x{bits},{metric}"
    elif index_type == "IVFPQ":
        nlist = index_params.get('nlist', 100)
        m = index_params.get('m', 8)
        bits = index_params.get('bits', 8)
        index_string = f"IVF{nlist},PQ{m}x{bits},{metric}"
    else:
        raise ValueError(f"Unsupported index type: {index_type}")
    
    # Create the index
    index = faiss.index_factory(embedding_size, index_string)
    
    # Set additional parameters
    if index_type == "HNSW":
        if hasattr(index, "hnsw"):
            index.hnsw.efConstruction = index_params.get("efConstruction", 150)
            index.hnsw.efSearch = index_params.get("efSearch", 50)
    elif index_type in ["IVF", "IVFPQ"]:
        index.nprobe = index_params.get("nprobe", 10)


def add_embeddings(index, embeddings, metric, train):
    """ 
    metric: "IP" or "L2"
    train: bool
    """
    # Normalize for IP similarity
    if metric == "IP":
        faiss.normalize_L2(embeddings)
    
    if train:
        index.train(embeddings)
    
    index.add(embeddings)
    return index



In [None]:
# Create a FAISS index with embeddings (and docnos for indices)
embedding_size = embeddings.shape[1]

#index_type = "Flat"
#index_type = "LSH"
index_type = "IVF"
#index_type = "IVFPQ"
#index_type = "HNSW"

params = {
    "metric": "IP",
    "nlist" : 5000,  # Cluster in IVFFlat
    #"nbits" : 128, # LSH
    #"M": 32,
    #"efContruction": 1000,
    #"efSearch": 500
}

#params = { # IVFPQ
#    "nlist": 1000,
#    "m": 8,
#    "bits": 8,
#    "nprobe": 10
#}

index = create_faiss_index(embedding_size, index_type, **params)

### training

In [19]:
index = add_embeddings(index, embeddings, metric=params["metric"], train=True)

In [20]:
index_dir = "./indexe"
os.makedirs(index_dir, exist_ok=True)

index_name = f"ivf_{nlist}_IP-gte_base-prepr_avgpool.index"
faiss.write_index(index, os.path.join(index_dir, index_name))

## (Colab)

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
from shutil import copyfile

quelle = f"/content/indexe/ivf_{nlist}_IP-gte_small_ft-mlm.index"
#quelle = "/content/gte_small_ft-mlm-embeddings.npy"
#quelle = "/content/gte_small_ft-mlm-docnos.txt"
ziel = "/content/drive/My Drive/" + quelle.split("/")[-1]

copyfile(quelle, ziel)

'/content/drive/My Drive/gte_base-preprocessed_avgpool-docnos.txt'