In [1]:
import requests
import pyterrier as pt
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, AutoTokenizer, AutoModelForSequenceClassification
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from pathlib import Path
import os
import pandas as pd
import re
import json
import spacy
from spacy.matcher import Matcher
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.tokens import Token


# Fetch the NLTK resources used by the tokenizer and stop-word filter below;
# quiet=True keeps the downloader from printing progress.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource, quiet=True)

# File paths for persisted query data
original_queries_path = 'original_queries.json'
rewritten_queries_path = 'rewritten_queries.json'


In [2]:
# Functions to save and load data
def save_data_to_file(data, file_path):
    """Write ``data`` to ``file_path`` as JSON, replacing any existing file.

    :param data: A JSON-serialisable object.
    :param file_path: Destination path.
    """
    with open(file_path, mode='w') as out_file:
        json.dump(data, fp=out_file)

def load_data_from_file(file_path):
    """Read JSON from ``file_path``.

    :param file_path: Path of the JSON file to read.
    :return: The decoded JSON content, or ``None`` when the path does not exist.
    """
    if not Path(file_path).exists():
        return None
    with open(file_path, 'r') as source:
        return json.load(source)

def remove_redundancy(preprocessed_query):
    """Drop repeated whitespace-separated tokens, keeping each token's first occurrence.

    :param preprocessed_query: A string of whitespace-separated tokens.
    :return: The distinct tokens, in original order, joined by single spaces.
    """
    kept = []
    already_seen = set()
    for token in preprocessed_query.split():
        if token in already_seen:
            continue
        already_seen.add(token)
        kept.append(token)
    return ' '.join(kept)


# Small English spaCy pipeline; preprocess_query runs every query through it
nlp = spacy.load("en_core_web_sm")

# Common but low-information words, merged into spaCy's STOP_WORDS set
extended_stop_words = {
    "define", "meaning", "example", "describe", "use", "refer", "relate",
    "involve", "include", "give", "take", "make", "see", "want", "get",
    "say", "ask", "tell", "be", "know", "do", "have", "would", "should",
    "could", "about",
}
STOP_WORDS.update(extended_stop_words)

# Per-token flag (False by default) marking tokens that should be kept;
# force=True lets the cell be re-run without an "already registered" error
Token.set_extension("is_important", default=False, force=True)

def preprocess_query(query):
    """
    Condense a query to the lowercased lemmas of its content-bearing tokens.

    Whitespace is normalised and the text is parsed with the module-level spaCy
    pipeline ``nlp``. A token's lemma is kept when the token is flagged important
    (it belongs to a named entity or is tagged PROPN/NOUN/VERB) or its lowercased
    text is in ``extended_stop_words`` -- provided the token is neither a spaCy
    stop word nor punctuation.

    :param query: Raw query string.
    :return: The kept lemmas joined by single spaces.
    """
    # Newlines first, then any remaining whitespace runs, become single spaces
    normalised = re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', query)).strip()

    doc = nlp(normalised)

    # Flag every member of a named entity
    for entity in doc.ents:
        for member in entity:
            member._.is_important = True

    # Flag content-bearing parts of speech
    content_pos = {"PROPN", "NOUN", "VERB"}
    for token in doc:
        if token.pos_ in content_pos:
            token._.is_important = True

    kept_lemmas = []
    for token in doc:
        # Stop words and punctuation are always dropped
        if token.is_stop or token.pos_ == "PUNCT":
            continue
        if token._.is_important or token.text.lower() in extended_stop_words:
            kept_lemmas.append(token.lemma_.lower())

    return " ".join(kept_lemmas)


def clean_text(text):
    """Strip apostrophes and parentheses, then collapse every whitespace run to one space.

    :param text: Raw text.
    :return: The cleaned text without leading or trailing whitespace.
    """
    without_marks = re.sub(r"[\'\(\)]", '', text)
    single_line = re.sub(r'\n+', ' ', without_marks)
    collapsed = re.sub(r'\s+', ' ', single_line)
    return collapsed.strip()

def preprocess_query_final(query, max_tokens=10):
    """Tokenize, normalise and stop-word-filter a query, keeping at most ``max_tokens`` tokens.

    The query is cleaned with ``clean_text``, split with NLTK's ``word_tokenize``,
    lowercased, stripped of non-word characters, and filtered against NLTK's
    English stop-word list.

    :param query: Raw query string.
    :param max_tokens: Maximum number of tokens kept in the result.
    :return: The surviving tokens joined by single spaces.
    :raises ValueError: If ``query`` is empty or otherwise falsy.
    """
    if not query:
        raise ValueError("Input query must be a non-empty string")

    # Build the stop-word set once; the list was previously re-fetched for every
    # token, and a set gives constant-time membership tests.
    english_stopwords = set(stopwords.words('english'))

    tokens = []
    for raw_token in word_tokenize(clean_text(query)):
        # Normalise each token a single time: lowercase, drop non-word characters
        normalised = re.sub(r'\W+', '', raw_token.lower())
        if normalised and normalised not in english_stopwords:
            tokens.append(normalised)

    return " ".join(tokens[:max_tokens])


def preprocess_rewritten_queries(rewritten_queries):
    """
    Extract one concise query from each generated rewrite.

    For every entry the echoed instruction block (``<s> [INST] ... [/INST]</s>``)
    is removed, the remainder is split on the ``"\\n\\nOR\\n\\n"`` and ``";"``
    dividers, and the first non-empty piece (with whitespace collapsed) is kept.

    Entries that yield no usable text are skipped, so the result can be shorter
    than the input.

    :param rewritten_queries: A list of dictionaries with 'generated_text' keys.
    :return: A list of cleaned, concise queries.
    """
    cleaned_queries = []

    for query in rewritten_queries:
        # A missing key or an explicit None both count as "no text"
        generated_text = query.get('generated_text', '') or ''

        # Remove instructional text and formatting tags. \s+ matches on the raw
        # text what a single space matched after whitespace collapsing.
        useful_text = re.sub(r"<s>\s+\[INST\].*?\[/INST\]</s>", "", generated_text, flags=re.DOTALL)

        # Split BEFORE normalising whitespace: once newlines are collapsed the
        # "\n\nOR\n\n" divider can no longer match.
        for candidate in re.split(r"\n\nOR\n\n|;", useful_text):
            candidate = re.sub(r'\s+', ' ', candidate).strip()
            if candidate:
                # Keep only the first non-empty alternative
                cleaned_queries.append(candidate)
                break

    return cleaned_queries

In [3]:
class DatasetLoader:
    """Load a PyTerrier dataset and expose its topics, qrels and corpus iterator.

    PyTerrier is initialised on construction if it has not been started yet.
    """

    def __init__(self, dataset_id):
        """
        :param dataset_id: Dataset identifier passed to ``pt.get_dataset``.
        """
        self.dataset_id = dataset_id
        if not pt.started():
            pt.init()
        self.dataset = pt.get_dataset(dataset_id)
        self.topics = self.dataset.get_topics()
        self.qrels = self.dataset.get_qrels()
        self.corpus_iter = self.dataset.get_corpus_iter()
        # NOTE(review): if corpus_iter is itself a one-shot iterator, iter() returns
        # the same object and get_first_doc() would consume documents from
        # corpus_iter as well -- confirm against PyTerrier.
        self.corpus_iterator = iter(self.corpus_iter)

    def get_first_doc(self):
        """Return the next document from the corpus iterator.

        Only the first call returns the first document; each later call advances
        the iterator by one.
        """
        return next(self.corpus_iterator)

    def get_original_queries(self):
        """Return the topics as a list of ``(qid, query)`` tuples.

        Reads the 'qid' and 'query' columns of ``self.topics``.
        """
        # Pair the two columns directly instead of materialising a Series per row
        return list(zip(self.topics['qid'], self.topics['query']))


class QueryEvaluator:
    """Score sentences with a pretrained sequence-classification model."""

    def __init__(self, tokenizer_model, model_name):
        """
        :param tokenizer_model: Name or path passed to ``AutoTokenizer.from_pretrained``.
        :param model_name: Name or path passed to
            ``AutoModelForSequenceClassification.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)
        classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Inference only: switch the model to evaluation mode
        classifier.eval()
        self.model = classifier

    def evaluate_queries(self, sentences):
        """Return the first logit column of the model output, one value per sentence.

        :param sentences: Sentences to score, tokenized together as one padded,
            truncated batch.
        :return: A tensor holding column 0 of the model's logits.
        """
        encoded = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        # No gradients are needed for scoring
        with torch.no_grad():
            output = self.model(**encoded)
        return output.logits[:, 0]

# TODO: We need to adjust this class so that it gives reasonable outputs for the queries
class RewriteQueries:
    """Rewrite queries through the Hugging Face Inference API (Mixtral-8x7B-Instruct)."""

    API_URL = "https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1"

    def __init__(self, auth_token, timeout=60):
        """
        :param auth_token: API token sent as a Bearer token in the Authorization header.
        :param timeout: Seconds to wait for the API before ``requests`` raises a
            timeout error. Without a timeout a stalled request blocks forever.
        """
        self.headers = {"Authorization": f"Bearer {auth_token}"}
        self.timeout = timeout

    def query(self, query):
        """Ask the model to rewrite ``query`` and return the decoded JSON response.

        HTTP error statuses are not raised here; whatever JSON body the API
        returns is handed back to the caller for inspection.

        :param query: The query text to rewrite.
        :return: The JSON-decoded body of the API response.
        """
        prompt = f"<s> [INST] Concisely rewrite this into a search engine query: '{query}'. Aim for brevity and clarity without further explanation. [/INST]</s>"
        payload = {"inputs": prompt}
        response = requests.post(self.API_URL, headers=self.headers, json=payload, timeout=self.timeout)
        return response.json()

In [29]:
# Load the TREC DL 2020 passage dataset and collect its topics as (qid, query) tuples
dataset_loader = DatasetLoader('irds:msmarco-passage/trec-dl-2020')
original_queries = dataset_loader.get_original_queries()
# The same pairs as a DataFrame with 'qid' and 'query' columns
original_queries_df = pd.DataFrame(original_queries, columns=['qid', 'query'])

msmarco-passage/trec-dl-2020 documents:   0%|          | 0/8841823 [00:18<?, ?it/s]


In [5]:
# Queries scoring below this well-formedness threshold are sent for rewriting
selected_threshold = 0.4

# The same checkpoint provides both the tokenizer and the scoring model
query_evaluator = QueryEvaluator("Ashishkr/query_wellformedness_score", "Ashishkr/query_wellformedness_score")

# Score only the query text; the qid is carried along separately
query_texts = [text for _qid, text in original_queries]
well_formed_scores = query_evaluator.evaluate_queries(query_texts)

# Keep ((qid, query_text), score) for every query under the threshold
queries_to_rewrite = []
for original_query, score in zip(original_queries, well_formed_scores):
    if score < selected_threshold:
        queries_to_rewrite.append((original_query, score))

print(f"We have {len(queries_to_rewrite)} queries to rewrite.")
print(queries_to_rewrite[:5])

Some weights of the model checkpoint at Ashishkr/query_wellformedness_score were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


We have 96 queries to rewrite.
[(('1037496', 'who is rep scalise'), tensor(0.1841)), (('1051399', 'who sings monk theme song'), tensor(0.3793)), (('1103791', 'definition of endorsing'), tensor(0.2150)), (('1105792', 'define geon'), tensor(0.2947)), (('1105860', 'where can the amazon rainforest is located'), tensor(0.1853)), (('1106979', 'define pareto chart in statistics'), tensor(0.1922)), (('1108450', 'define define gallows'), tensor(0.1133)), (('1108651', 'what the best way to get clothes white'), tensor(0.1986)), (('1108729', 'what temperature and humidity to dry sausage'), tensor(0.1304)), (('1109699', 'what mental illnesses'), tensor(0.0420)), (('1110678', 'what is the un fao'), tensor(0.2620)), (('1113042', 'what is shakespeare s theatre called'), tensor(0.2335)), (('1113256', 'what is reba mcentire s net worth'), tensor(0.3470)), (('1114993', 'conformative definition'), tensor(0.1109)), (('1118370', 'what does provisions mean'), tensor(0.2031)), (('1120588', 'caries detection s

In [6]:
# Rewrite the poorly formed queries, keeping each rewrite tied to its original query.
# Results are cached on disk, so the API is only called when no cache file exists.
rewritten_queries = load_data_from_file(rewritten_queries_path)
if rewritten_queries is None:
    # SECURITY: never hard-code the API token in the notebook. Read it from the
    # environment instead; any token that was previously committed here should
    # be revoked.
    hf_api_token = os.environ.get("HF_API_TOKEN")
    if not hf_api_token:
        raise RuntimeError(
            "Set the HF_API_TOKEN environment variable to a Hugging Face API token "
            "before rewriting queries."
        )

    rewritten_queries = []
    query_rewriter = RewriteQueries(hf_api_token)
    for original_query, _ in queries_to_rewrite:  # original_query is a tuple (qid, query_text)
        _qid, query_text = original_query
        response = query_rewriter.query(query_text)  # Only the text is sent for rewriting
        # A successful call returns a list whose first item holds 'generated_text';
        # anything else (e.g. an error payload) is skipped.
        if response and isinstance(response, list) and 'generated_text' in response[0]:
            generated_text = response[0]['generated_text']
            rewritten_queries.append({'original_query': original_query, 'generated_text': generated_text})
    save_data_to_file(rewritten_queries, rewritten_queries_path)


In [7]:
# Clean each rewrite while keeping it paired with its original query.
# preprocess_rewritten_queries returns an empty list when a rewrite holds no usable
# text; such entries are skipped instead of raising IndexError on [0].
cleaned_queries = []
for query in rewritten_queries:
    cleaned = preprocess_rewritten_queries([query])
    if cleaned:
        cleaned_queries.append((query['original_query'], cleaned[0]))

with open('cleaned_queries_with_original.json', 'w') as f:
    json.dump(cleaned_queries, f)

# Condense every cleaned rewrite and drop repeated tokens. The results are collected
# here; previously they were computed and discarded, and the cleaned queries were
# written to final_queries.json instead.
final_queries = [
    (original_query, remove_redundancy(preprocess_query(cleaned_query)))
    for original_query, cleaned_query in cleaned_queries
]

# Output the final queries to a JSON file
with open('final_queries.json', 'w') as f:
    json.dump(final_queries, f)


# Load the final queries with original and rewritten versions
with open('final_queries.json', 'r') as f:
    queries_data = json.load(f)

# Load the cleaned and potentially rewritten queries
# (after the JSON round trip the tuples come back as lists)
with open('cleaned_queries_with_original.json', 'r') as f:
    cleaned_queries = json.load(f)

In [33]:
# Split the ((qid, original_text), rewritten_text) records into parallel sequences.
# NOTE: this rebinds original_queries and rewritten_queries, which earlier cells
# defined with different shapes (a list of tuples and a list of dicts).
id_text_pairs = [(q_id, query) for (q_id, query), _ in cleaned_queries]
q_ids, original_queries = zip(*id_text_pairs)
rewritten_queries = [rewritten for _, rewritten in cleaned_queries]

# One row per rewritten query: its id, the original text and the rewrite
column_data = {
    'qid': q_ids,
    'original': original_queries,
    'rewritten': rewritten_queries,
}
df_queries = pd.DataFrame(column_data)

# Show the first few rows
df_queries.head()

Unnamed: 0,qid,original,rewritten
0,1037496,who is rep scalise,Representative Steve Scalise biography or back...
1,1051399,who sings monk theme song,```sql Monk theme song singer ``` or ``` Monk ...
2,1103791,definition of endorsing,"""Endorsing definition"" or ""define endorsing"""
3,1105792,define geon,"""Geon definition"" or ""What is a geon?"""
4,1105860,where can the amazon rainforest is located,"""Amazon rainforest location"" This query is bri..."


In [30]:
original_queries_df.head()

Unnamed: 0,qid,query
0,1030303,who is aziz hashim
1,1037496,who is rep scalise
2,1043135,who killed nicholas ii of russia
3,1045109,who owns barnhart crane
4,1049519,who said no one can make you feel inferior


In [42]:
# Left-join the rewrites onto the full topic list by 'qid';
# topics without a rewrite get NaN in 'original' and 'rewritten'
merged_df = original_queries_df.merge(df_queries, on='qid', how='left')

# Fall back to the original query text wherever no rewrite exists
merged_df['rewritten'] = merged_df['rewritten'].fillna(merged_df['query'])

# Only 'qid' and 'rewritten' are needed from here on (absent columns are ignored)
merged_df = merged_df.drop(columns=['original', 'query'], errors='ignore')

# Expose the rewrite under the 'query' column name
rewritten_queries_df = merged_df.rename(columns={'rewritten': 'query'})

# Show the first few rows of the result
rewritten_queries_df.head()

Unnamed: 0,qid,query
0,1030303,who is aziz hashim
1,1037496,Representative Steve Scalise biography or back...
2,1043135,who killed nicholas ii of russia
3,1045109,who owns barnhart crane
4,1049519,who said no one can make you feel inferior


In [44]:
# The index lives in ./index; data.properties is used as the marker that an
# index has already been built there.
index_location = str(Path("index").absolute())
index_exists = os.path.isfile(
    os.path.join(index_location, "data.properties"))

# Build the index from the corpus iterator created in DatasetLoader.__init__,
# or reuse the index already on disk
if not index_exists:
    corpus_iter = dataset_loader.corpus_iter 
    indexer = pt.IterDictIndexer(index_location)
    index_ref = indexer.index(corpus_iter)
    print("Indexing completed.")
else:
    print("Index already exists, loading from disk.")
    # The index directory path is passed to IndexFactory.of below
    index_ref = index_location

# Relevance judgments fetched by DatasetLoader
qrels = dataset_loader.qrels

# Two retrievers over the same index, differing only in weighting model
index = pt.IndexFactory.of(index_ref)
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
tf_idf = pt.BatchRetrieve(index, wmodel="TF_IDF")

# Measures reported by the experiments below.
# NOTE(review): MRR looks like it may duplicate RR(rel=1) -- confirm against ir_measures.
eval_metrics = [
    pt.measures.RR(rel=1),
    pt.measures.nDCG @ 10,
    pt.measures.MAP(rel=1),
    pt.measures.Precision @ 5,  # Precision at rank 5
    pt.measures.Recall @ 100,   # Recall at rank 100
    pt.measures.MRR             # Mean Reciprocal Rank
]

Index already exists, loading from disk.


In [None]:
# Baseline: score both retrievers on the unmodified topics
print("Evaluating Original Queries with BM25 and TF-IDF:")
original_topics = original_queries_df[['qid', 'query']]
original_system_names = ["BM25 Original", "TF-IDF Original"]
results_original = pt.Experiment(
    [bm25, tf_idf],       # retrieval systems under comparison
    original_topics,      # topics: one row per query
    qrels,                # relevance judgments
    eval_metrics,
    names=original_system_names,
)

# Print the result table and persist it as CSV
print(f"Results for Original Queries:\n{results_original}")
results_original.to_csv('results_original.csv', index=False)

Evaluating Original Queries with BM25 and TF-IDF:


In [None]:
# Same experiment on the topic set whose poorly formed queries were replaced by rewrites
print("Evaluating Rewritten Queries with BM25 and TF-IDF:")
rewritten_topics = rewritten_queries_df[['qid', 'query']]
rewritten_system_names = ["BM25 Rewritten", "TF-IDF Rewritten"]
simple_results = pt.Experiment(
    [bm25, tf_idf],       # retrieval systems under comparison
    rewritten_topics,     # topics: one row per query
    qrels,                # relevance judgments
    eval_metrics,
    names=rewritten_system_names,
)

# Print the result table and persist it as CSV
print(f"Results for Rewritten Queries:\n{simple_results}")
simple_results.to_csv('simple_results.csv', index=False)