Hier läuft unser fertiges Retrieval-System.

Language-Model-Retrieval (Query Likelihood) mit Dirichlet-Smoothing:

In [None]:
!pip3 install tira ir-datasets python-terrier

from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
from tira.rest_api_client import Client
import pyterrier as pt

# Must run before any `pt.*` call below (presumably starts/initialises PyTerrier).
ensure_pyterrier_is_loaded()
# TIRA REST client; not used again in the cells visible here.
tira = Client()

# Training dataset of the IR lab, resolved through PyTerrier's "irds:" dataset prefix.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

In [None]:
corpus = pt_dataset
from textanalysis.preprocessing import PreprocessorSpacy as Preprocessor
from indexing.indexing import Index

preprocessor = Preprocessor()

# A PyTerrier dataset object is not iterable itself; its documents are
# exposed through get_corpus_iter(), which yields one dict per document.
# NOTE(review): assumes every document dict carries 'docno' and 'text'
# fields, as is usual for ir_datasets-backed corpora - confirm for this corpus.
# A single pass builds the (doc_id, preprocessed_text) pairs the Index is
# constructed from, so the corpus does not have to be read twice.
corpus_preprocessed = [
    (doc['docno'], preprocessor.preprocess(doc['text']))
    for doc in corpus.get_corpus_iter()
]

index = Index(corpus_preprocessed)

In [None]:
def document_probability(index, term, doc_id):
    """
    Estimates the probability of a term given a single document.

    The document length is obtained by adding up the frequencies of all
    index terms within the document.

    :param index: index to get frequency data from
    :param term: term to calculate the probability for
    :param doc_id: document to calculate the probability for
    :return: term frequency divided by document length, or 0 if the
        document contains no index terms
    """
    term_count = index.get_term_frequency(term, doc_id)
    doc_length = sum(
        index.get_term_frequency(candidate, doc_id)
        for candidate in index.get_index_terms()
    )
    # An empty document provides no evidence for any term.
    if doc_length != 0:
        return term_count / doc_length
    return 0
    

In [None]:
def collection_probability(index, term):
    """
    Calculates the probability of a term given the whole document collection.

    :param index: index to get frequency data from
    :param term: term to calculate the probability for
    :return: occurrences of the term in all documents divided by the total
        number of term occurrences in the collection; 0 if the collection
        contains no term occurrences at all
    """
    term_total = 0
    collection_length = 0

    for doc_id in index.get_document_ids():
        term_total += index.get_term_frequency(term, doc_id)
        collection_length += sum(
            index.get_term_frequency(t, doc_id)
            for t in index.get_index_terms()
        )

    # Same convention as document_probability: an empty collection yields 0
    # instead of raising ZeroDivisionError.
    if collection_length == 0:
        return 0
    return term_total / collection_length

In [None]:
def weight(index, doc_id, alpha):
    """
    Computes the Dirichlet smoothing weight alpha / (|d| + alpha) of a document.

    The document length |d| is the sum of the frequencies of all index terms
    in the document.

    :param index: index to get frequency data from
    :param doc_id: document to calculate the weight factor for
    :param alpha: alpha-prior for the dirichlet smoothing
    :return: share of the probability mass assigned to the collection model
    """
    doc_length = sum(
        index.get_term_frequency(entry, doc_id)
        for entry in index.get_index_terms()
    )
    return alpha / (doc_length + alpha)

In [None]:
def dirichlet_term_probability(index, term, doc_id, alpha):
    """
    Calculates the Dirichlet-smoothed probability of a term given a document.

    The document estimate and the collection estimate are linearly
    interpolated; the collection share is the weight returned by weight().

    :param index: index to get frequency data from
    :param term: term to calculate the probability for
    :param doc_id: document to calculate the probability for
    :param alpha: alpha-prior for the dirichlet interpolation
    :return: interpolated term probability
    """
    collection_share = weight(index, doc_id, alpha)
    in_document = document_probability(index, term, doc_id)
    in_collection = collection_probability(index, term)

    document_part = (1 - collection_share) * in_document
    collection_part = collection_share * in_collection
    return document_part + collection_part

In [None]:
from math import log


def dirichlet_score(index, query, doc_id, alpha):
    """
    Calculates the relevance of a document given a query using Dirichlet smoothing.

    The score is the log-likelihood of the query under the smoothed document
    model, i.e. the sum of the log term probabilities.

    :param index: index to get relevance data from
    :param query: iterable of (preprocessed) query terms
    :param doc_id: document to calculate the relevance for
    :param alpha: alpha parameter for Dirichlet smoothing
    :return: sum of log term probabilities (0 for an empty query);
        ``-inf`` if smoothing is disabled (alpha <= 0) and a query term has
        zero probability
    """
    # A sum of logarithms starts at log(1) == 0, the neutral element.
    score = 0.0
    for term in query:
        probability = dirichlet_term_probability(index, term, doc_id, alpha)
        if probability > 0:
            score += log(probability)
        elif alpha > 0:
            # With smoothing enabled a zero probability means the collection
            # estimate is zero as well, i.e. the term occurs nowhere in the
            # collection. Such a term cannot discriminate between documents,
            # and log(0) would raise ValueError, so it is skipped.
            continue
        else:
            # Without smoothing the document simply cannot generate the term.
            return float("-inf")
    return score

In [None]:
# TODO: adapt the output here: also include the query and the doc id?
def dirichlet_query(index, preprocessor, text, alpha=1000, topK=-1):
    """
    Queries a given text against the given index using a Dirichlet smoothed language model

    :param index: the index data to query against
    :param preprocessor: preprocessor instance to process the query with
    :param text: query text
    :param alpha: alpha-parameter for Dirichlet smoothing
    :param topK: number of top results to return; ``None`` or a negative
        value (the default) returns all documents
    :return: list of (doc_id, score) tuples descending by score
    """
    query = preprocessor.preprocess(text)
    scores = {
        doc_id: dirichlet_score(index, query, doc_id, alpha=alpha)
        for doc_id in index.get_document_ids()
    }
    ranking = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    # Slicing with [:-1] would silently drop the last document, so the
    # "return everything" case has to be handled explicitly.
    if topK is None or topK < 0:
        return ranking
    return ranking[:topK]

In [None]:
# Rank the indexed documents for one ad-hoc example query and keep the ten best hits.
run = dirichlet_query(index, preprocessor, "information retrieval", topK=10)
# NOTE(review): `run` is a plain list of (doc_id, score) tuples without a query id;
# persist_and_normalize_run presumably expects a run table (qid, docno, score, ...) -
# confirm against the TIRA documentation before relying on the written run file.
persist_and_normalize_run(run, system_name='retrieval_system', default_output='../runs')