In [None]:
import os

# Detect if we are in the TIRA sandbox
# Install the required dependencies if we are not in the sandbox.
if 'TIRA_DATASET_ID' not in os.environ:
    !pip3 install python-terrier tira==0.0.88 ir_datasets
else:
    print('We are in the TIRA sandbox.')

In [None]:
# Import the required libraries
print('importing libraries...')
# Two helpers from the TIRA client package: the first is invoked right below,
# the second is called further down with a run, a run name and an output directory.
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run
# NOTE(review): presumably this makes PyTerrier usable in this kernel before the
# later cells do `import pyterrier as pt` — confirm against the tira package.
ensure_pyterrier_is_loaded()

print('Done. Libraries imported.')


In [None]:
# create index

import pyterrier as pt
import os

def create_index(documents, indexLocation=False):
    """Build a PyTerrier index from ``documents`` and return it opened.

    Args:
        documents: Iterable of objects exposing ``doc_id`` and ``text``
            attributes. It is consumed lazily while indexing.
        indexLocation: Directory to write the index to. Any falsy value
            (the default) selects ``/tmp/index``.

    Returns:
        Whatever ``pt.IndexFactory.of`` returns for the freshly built index.
    """
    target_dir = indexLocation or "/tmp/index"

    # Each document becomes a dict carrying exactly the two fields that are
    # declared in ``meta`` below.
    records = ({'docno': doc.doc_id, 'text': doc.text} for doc in documents)

    builder = pt.IterDictIndexer(
        target_dir,
        overwrite=True,
        meta={'docno': 100, 'text': 20480},
        stemmer=None,
    )
    built_ref = builder.index(records)
    return pt.IndexFactory.of(built_ref)

In [None]:
# load dataset

from tira.third_party_integrations import ir_datasets
import pyterrier as pt

def load_dataset(training_dataset):
    """Load queries, documents and topics for a TIRA dataset id.

    Args:
        training_dataset: Dataset identifier passed to both
            ``ir_datasets.topics_file`` and ``ir_datasets.load``.

    Returns:
        A dict with three entries:
        ``'documents'`` - result of ``docs_iter()`` on the loaded dataset,
        ``'queries'``   - topics file parsed by PyTerrier in ``trecxml`` format,
        ``'topics'``    - result of ``queries_iter()`` on the loaded dataset.
    """
    topics_path = ir_datasets.topics_file(training_dataset)
    parsed_queries = pt.io.read_topics(topics_path, format='trecxml')

    corpus = ir_datasets.load(training_dataset)

    bundle = {}
    bundle['documents'] = corpus.docs_iter()
    bundle['queries'] = parsed_queries
    bundle['topics'] = corpus.queries_iter()
    return bundle

In [None]:
# create model

import pyterrier as pt

def create_model(index):
    """Create a retrieval model over ``index`` using the BM25 weighting model.

    Args:
        index: The index handed through unchanged to ``pt.BatchRetrieve``.

    Returns:
        The ``pt.BatchRetrieve`` instance configured with ``wmodel="BM25"``.
    """
    weighting_model = "BM25"
    retriever = pt.BatchRetrieve(index, wmodel=weighting_model)
    return retriever

In [None]:
import glob

import shutil

# Produce one retrieval run per custom stopword list: for each list, point
# Terrier at the file, rebuild the index, retrieve with BM25 and persist the run.
training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
queries = load_dataset(training_dataset)['queries']

stopword_names = [
    './stopwordlists/stopwords_english_long.txt',
    './stopwordlists/stopwords_improved_merged_top10.txt',
    './stopwordlists/stopwords_improved_single_top10.txt',
    './stopwordlists/stopwords_improved_single_top50.txt'
]

# All runs are written below this directory, one sub-directory per stopword list.
output_dir = 'runs/applied-custom-stopwords'

# Fail fast: verify every stopword file before any (expensive) index is built,
# so a missing file is reported immediately instead of after earlier lists
# have already been indexed.
missing_files = []
for stopwords_path in stopword_names:
    if os.path.exists(stopwords_path):
        print(f"The file '{stopwords_path}' exists.")
    else:
        print('missing file', stopwords_path)
        missing_files.append(stopwords_path)
if missing_files:
    raise ValueError('stopwords file does not exist')

for stopwords_path in stopword_names:
    # './stopwordlists/stopwords_english_long.txt' -> 'english_long'
    run_name = stopwords_path.replace('./stopwordlists/stopwords_', '').replace('.txt', '')
    run_output_dir = output_dir + '/' + run_name

    pt.set_property("stopwords.filename", stopwords_path)

    # The dataset is loaded again on every iteration to obtain a fresh
    # 'documents' iterator; the previous one has been consumed by indexing.
    new_index = create_index(load_dataset(training_dataset)['documents'])
    print("index created")

    improved_model = create_model(new_index)
    print("model created")

    run = improved_model(queries)

    # Start from an empty output directory. Pure-Python equivalent of
    # `rm -Rf` followed by `mkdir -p`, independent of the shell and safe for
    # paths containing spaces.
    shutil.rmtree(run_output_dir, ignore_errors=True)
    os.makedirs(run_output_dir, exist_ok=True)

    persist_and_normalize_run(run, run_name, run_output_dir)





In [None]:
# Code for Significance Test Used Locally

import glob
from create_index import create_index
from load_dataset import load_dataset
from datetime import datetime
import json
import pyterrier as pt
from trectools import TrecRun, TrecQrel, TrecEval
from tira.rest_api_client import Client
import pandas as pd
# Compare a BM25 model indexed with the standard stopword list against one
# indexed with the improved list, and report MAP / reciprocal rank with a
# Bonferroni-corrected significance test against the first model.
tira = Client()

training_dataset = 'ir-lab-jena-leipzig-wise-2023/training-20231104-training'
load_dataset_result = load_dataset(training_dataset)

# Relevance judgments shipped with the truth part of the dataset.
qrels_path = tira.download_dataset(
    'ir-lab-jena-leipzig-wise-2023',
    'training-20231104-training',
    truth_dataset=True,
) + '/qrels.txt'
# Raw string: "\s" is an invalid escape sequence in a normal string literal.
qrels_data = pd.read_csv(qrels_path, sep=r"\s+", names=["qid", "q0", "docno", "relevance"])
# Enforce string type on the identifier columns (all three always exist
# because the column names are supplied explicitly above).
qrels_data = qrels_data.astype({"qid": str, "q0": str, "docno": str})
# Keep only rows with a non-negative relevance label; negative labels are
# treated as "not judged" here.
qrels_data = qrels_data[qrels_data["relevance"] >= 0]

#training_qrels = load_qrels('training-20231104-training')

# Each index gets its own directory. create_index() defaults to /tmp/index with
# overwrite=True, so building the second index at the default location would
# replace the files backing the first model before the experiment runs.
# NOTE(review): "stopwords.filename" is a global property; whether query-time
# processing of the first model still uses its own list once the property has
# been changed for the second one should be confirmed.
improved_stopwords = "./stopwordlists/auswertung/stopwords_english_long.txt"
pt.set_property("stopwords.filename", improved_stopwords)
normal_index = create_index(
    load_dataset(training_dataset)['documents'],
    indexLocation="/tmp/index-normal-stopwords",
)
print("index1 created")
normal_stopword_model = create_model(normal_index)
print("model1 created")

improved_stopwords = "./stopwordlists/auswertung/stopwords_improved_merged_top10.txt"
pt.set_property("stopwords.filename", improved_stopwords)
improved_index = create_index(
    load_dataset(training_dataset)['documents'],
    indexLocation="/tmp/index-improved-stopwords",
)
print("index2 created")
improved_model = create_model(improved_index)
print("model2 created")

print(pt.Experiment(
    [normal_stopword_model, improved_model],
    load_dataset_result['queries'],
    qrels_data,
    eval_metrics=["map", "recip_rank"],
    names=["normal_stopword_model", "improved_model"],
    baseline=0,
    correction='bonferroni',
))
