In [None]:
import os

# The presence of the TIRA_DATASET_ID environment variable is used as the marker
# for running inside the TIRA sandbox.
# Outside the sandbox the required packages are installed with pip3; note that
# only `tira` is pinned to a version, the other packages resolve to whatever is current.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets trectools
else:
    print('We are in the TIRA sandbox.')

In [None]:
# Import the TIRA helpers and bootstrap PyTerrier.
# The two print calls only bracket this step so its progress is visible in the cell output.
print('importing libraries...')
# NOTE(review): persist_and_normalize_run is imported here but not referenced in the
# visible cells of this notebook -- confirm whether it is still needed.
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
# Presumably initialises PyTerrier so that later cells can use `pt` -- see the tira docs.
ensure_pyterrier_is_loaded()

print('Done. Libraries imported.')



In [None]:
def getRetrievedDocumentIds(qrels, run, k, is_relevant):
    """Collect the ids of judged documents with a given label that the run ranks in its top k.

    Args:
        qrels: Iterable of qrel rows. Index 0 of a row is read as the query id,
            index 2 as the document id and index 3 as the relevance label.
        run: Object exposing ``get_top_documents(query_id, k)``, which returns the
            ids of the top ``k`` documents retrieved for ``query_id``.
        k: Rank cut-off forwarded to ``run.get_top_documents``.
        is_relevant: Relevance label to select; rows are kept when their label
            compares equal (``==``) to this value.

    Returns:
        A list of document ids in qrels order. A document id appears once for
        every matching qrel row, so it can occur several times when it is judged
        for more than one query.
    """
    queryIdIndex = 0
    docIdIndex = 2
    relevanceIndex = 3

    # The top-k lookup is done once per query and kept as a set, instead of being
    # repeated for every single qrel row of that query.
    top_documents_by_query = {}
    retrieved_document_ids = []

    for item in qrels:
        if not (item[relevanceIndex] == is_relevant):
            continue

        query_id, doc_id = item[queryIdIndex], item[docIdIndex]
        if query_id not in top_documents_by_query:
            top_documents_by_query[query_id] = set(run.get_top_documents(query_id, k))

        if doc_id in top_documents_by_query[query_id]:
            retrieved_document_ids.append(doc_id)

    return retrieved_document_ids



In [None]:
def transformDocumentIdsToDocuments(document_id_list, original_documents):
    """Select the documents whose ``doc_id`` is contained in ``document_id_list``.

    Args:
        document_id_list: Iterable of document ids to keep. Duplicates are
            irrelevant; each matching document is returned once.
        original_documents: Iterable of document objects that expose a
            ``doc_id`` attribute.

    Returns:
        A list of the matching documents, in the order in which they occur in
        ``original_documents``.
    """
    # A set gives constant-time membership tests; with a list every document of
    # the corpus would trigger a linear scan over all wanted ids.
    wanted_document_ids = set(document_id_list)
    return [
        document
        for document in original_documents
        if document.doc_id in wanted_document_ids
    ]

In [None]:
import pyterrier as pt

def createSortedTermFrequency(relevant_retrieved_documents, index_path="/tmp/index"):
    """Index the given documents and return their terms sorted by per-document frequency.

    A throw-away PyTerrier index (no stemming) is built from the documents. For
    every lexicon entry, the entry's frequency is divided by the number of
    indexed documents.

    Args:
        relevant_retrieved_documents: Iterable of document objects exposing
            ``doc_id`` and ``text`` attributes.
        index_path: Directory in which the index is written. An existing index at
            that location is overwritten. Defaults to ``"/tmp/index"``.

    Returns:
        A list of ``(term, frequency / number_of_documents)`` tuples, sorted by the
        second element in descending order. An empty list is returned when no
        documents are given.
    """
    # Materialise once: the documents are iterated for indexing and counted
    # afterwards, which would not work with a one-shot iterator.
    documents = list(relevant_retrieved_documents)
    if not documents:
        # Nothing to index, hence no terms; this also avoids building an empty index.
        return []

    indexer = pt.IterDictIndexer(
        index_path,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stemmer=None
    )
    index_ref = indexer.index({'docno': document.doc_id, 'text': document.text} for document in documents)
    index = pt.IndexFactory.of(index_ref)

    document_count = len(documents)
    # NOTE(review): getFrequency() is taken to be the total number of occurrences of
    # the term in the index (not its document frequency) -- confirm against the Terrier API.
    term_frequencies = [
        (term, entry.getFrequency() / document_count)
        for term, entry in index.getLexicon()
    ]
    return sorted(term_frequencies, key=lambda pair: pair[1], reverse=True)


In [None]:
def get_relevant_retrieved_documents(training_qrels, run, original_documents, top_k_documents):
    """Return the sorted term frequencies of the relevant documents found in the run's top k.

    Args:
        training_qrels: Object whose ``qrels_data.values.tolist()`` yields the qrel rows.
        run: Run object handed on to ``getRetrievedDocumentIds``.
        original_documents: Documents from which the retrieved ones are picked.
        top_k_documents: Rank cut-off handed on to ``getRetrievedDocumentIds``.

    Returns:
        The result of ``createSortedTermFrequency`` for the documents that carry
        the relevance label ``1`` and are retrieved within the cut-off.
    """
    qrel_rows = training_qrels.qrels_data.values.tolist()
    # Label 1 selects the rows judged as relevant.
    document_ids = getRetrievedDocumentIds(qrel_rows, run, top_k_documents, 1)
    return createSortedTermFrequency(
        transformDocumentIdsToDocuments(document_ids, original_documents)
    )

In [None]:
def get_non_relevant_retrieved_documents(training_qrels, run, original_documents, top_k_documents):
    """Return the sorted term frequencies of the non-relevant documents found in the run's top k.

    Args:
        training_qrels: Object whose ``qrels_data.values.tolist()`` yields the qrel rows.
        run: Run object handed on to ``getRetrievedDocumentIds``.
        original_documents: Documents from which the retrieved ones are picked.
        top_k_documents: Rank cut-off handed on to ``getRetrievedDocumentIds``.

    Returns:
        The result of ``createSortedTermFrequency`` for the documents that carry
        the relevance label ``0`` and are retrieved within the cut-off.
    """
    qrel_rows = training_qrels.qrels_data.values.tolist()
    # Label 0 selects the rows judged as not relevant.
    document_ids = getRetrievedDocumentIds(qrel_rows, run, top_k_documents, 0)
    return createSortedTermFrequency(
        transformDocumentIdsToDocuments(document_ids, original_documents)
    )

In [None]:
from tira.third_party_integrations import ir_datasets
def get_all_query_terms(training_dataset='ir-lab-jena-leipzig-wise-2023/training-20231104-training'):
    """Return the set of distinct terms that occur in the queries of a dataset.

    Args:
        training_dataset: Identifier passed to ``ir_datasets.load``. Defaults to
            the training dataset used throughout this notebook.

    Returns:
        A set with every token obtained by splitting each query's
        ``default_text()`` on single spaces. Tokens are taken verbatim: there is
        no lower-casing or further normalisation, and consecutive spaces in a
        query produce the empty string as a token.
    """
    dataset = ir_datasets.load(training_dataset)

    # Accumulate directly into a set; rebuilding a list with unpacking for every
    # query copies all previously collected terms again and again.
    query_terms = set()
    for query in dataset.queries_iter():
        query_terms.update(query.default_text().split(' '))

    return query_terms

In [None]:
def get_filtered(all_stopwords, threshold=1):
    """Keep only the entries whose score is strictly greater than ``threshold``.

    Args:
        all_stopwords: Iterable of sequences whose second element (index 1) is a
            numeric score, e.g. ``(term, score)`` tuples.
        threshold: Exclusive lower bound for the score. Defaults to ``1``.

    Returns:
        A list with the entries that pass the threshold, in their original order.
    """
    return [entry for entry in all_stopwords if entry[1] > threshold]

In [None]:
def get_without_query_terms(all_stopwords):
    """Drop every entry whose term is one of the terms returned by ``get_all_query_terms``.

    Args:
        all_stopwords: Iterable of sequences whose first element (index 0) is the term.

    Returns:
        A list with the remaining entries, in their original order.
    """
    query_terms = get_all_query_terms()

    kept_entries = []
    for entry in all_stopwords:
        if entry[0] in query_terms:
            continue
        kept_entries.append(entry)
    return kept_entries

In [None]:
def get_filtered_and_without_query_terms(all_stopwords):
    """Apply ``get_filtered`` first and ``get_without_query_terms`` to its result.

    Args:
        all_stopwords: Entries handed to ``get_filtered``.

    Returns:
        Whatever ``get_without_query_terms`` returns for the filtered entries.
    """
    return get_without_query_terms(get_filtered(all_stopwords))

In [None]:
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
from glob import glob
import pandas as pd
tira = Client()

from load_dataset import load_dataset

def load_qrels(dataset):
    """Download the truth data of ``dataset`` via the TIRA client and load its qrels.

    Args:
        dataset: Dataset name within the ``ir-lab-jena-leipzig-wise-2023`` task.

    Returns:
        A ``TrecQrel`` built from the ``qrels.txt`` file below the path returned
        by ``tira.download_dataset``.
    """
    truth_directory = tira.download_dataset(
        'ir-lab-jena-leipzig-wise-2023',
        dataset,
        truth_dataset=True,
    )
    qrels_path = truth_directory + '/qrels.txt'
    return TrecQrel(qrels_path)

# Rank cut-offs for which stopword lists are generated.
TOP_Ks = [10, 50]

# Directory that receives the generated stopword lists.
STOPWORD_LIST_DIR = './stopwordlists/joined-relevant-and-non-relevant-based-stopwords-index'


def _write_terms(file_path, scored_terms):
    """Write the term of every (term, score) pair to file_path, one term per line."""
    with open(file_path, 'w', encoding='utf-8') as file:
        for term, _ in scored_terms:
            file.write(term + '\n')


# None of these inputs depends on the cut-off, so they are loaded once instead of
# twice per loop iteration.
training_qrels = load_qrels('training-20231104-training')
run = TrecRun('./runs/standard_stopwords/run.txt')
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
# Materialised as a list because the documents are iterated several times below.
documents = list(load_dataset(training_dataset)['documents'])

# Opening a file for writing does not create missing parent directories.
os.makedirs(STOPWORD_LIST_DIR, exist_ok=True)

for top_k in TOP_Ks:
    # Despite their names, both variables hold (term, frequency) lists computed from
    # the relevant / non-relevant documents retrieved within the top_k ranks.
    relevant_documents = get_relevant_retrieved_documents(training_qrels, run, documents, top_k)
    non_relevant_documents = get_non_relevant_retrieved_documents(training_qrels, run, documents, top_k)

    # Join the two lists on the term itself. Both lists are sorted by frequency
    # independently, so pairing them by position would only keep terms that happen
    # to hold the same rank in both lists.
    non_relevant_frequency_by_term = dict(non_relevant_documents)
    result = [
        (term, frequency / non_relevant_frequency_by_term[term])
        for term, frequency in relevant_documents
        if term in non_relevant_frequency_by_term
    ]

    # Highest ratio of relevant to non-relevant frequency first.
    sorted_result = sorted(result, key=lambda x: x[1], reverse=True)

    stopwords_filtered_over_threshold = get_filtered(sorted_result)
    stopwords_without_query_terms = get_without_query_terms(sorted_result)
    stopwords_filtered_and_without_query_terms = get_filtered_and_without_query_terms(sorted_result)

    # File name prefix -> list variant written under that prefix.
    stopword_lists = {
        'single': sorted_result,
        'filtered': stopwords_filtered_over_threshold,
        'without_query_terms': stopwords_without_query_terms,
        'filtered_and_without_query_terms': stopwords_filtered_and_without_query_terms,
    }
    for prefix, scored_terms in stopword_lists.items():
        _write_terms(f'{STOPWORD_LIST_DIR}/{prefix}_top{top_k}.txt', scored_terms)