# 1. Imports and Config

In [2]:
import pyterrier as pt
import pandas as pd
import os
import numpy as np

In [3]:
# Bring up PyTerrier's Java backend once per kernel; re-running this cell
# after it has started is a no-op.
jvm_running = pt.java.started()
if not jvm_running:
    pt.java.init()

Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


## Loading index

In [4]:
INDEX = "./formal_run/terrierindex/1"

# Resolve against the current working directory so the location reported on
# failure is unambiguous.
index_path = os.path.abspath(INDEX)

try:
    index = pt.IndexFactory.of(index_path)
except Exception as e:
    # Every later cell needs `index`, so fail loudly here. Raising (rather than
    # calling exit(), which ends the notebook kernel session) stops execution at
    # this cell, keeps the kernel usable, and preserves the original traceback.
    raise RuntimeError(
        f"Error loading index. Please check the path.\nPath: {index_path}"
    ) from e

# 2. Visualizing the index

When `indexer.index(...)` runs, it builds three structures:

- **Lexicon**: the vocabulary of unique terms;
- **Inverted File**: for every term, the list of documents in which it appears and how many times it occurs in each;
- **Document Index**: stores the length of each document.

In [5]:
print("\n=== INDEX STATISTICS ===")
# Collection-wide counters reported by the loaded index.
stats = index.getCollectionStatistics()
summary_lines = [
    f"Total Documents: {stats.getNumberOfDocuments()}",
    f"Total Unique Terms: {stats.getNumberOfUniqueTerms()}",
    f"Total Tokens: {stats.getNumberOfTokens()}",
]
print("\n".join(summary_lines))


=== INDEX STATISTICS ===
Total Documents: 630
Total Unique Terms: 20210
Total Tokens: 131237


In [6]:
print("\n=== PEEKING INTO THE LEXICON (Top 5 terms) ===")
# The lexicon iterates as (term, entry) pairs. Pairing it with range(5) caps
# the walk at the first five entries in iteration order, without reading a
# sixth one.
lexicon = index.getLexicon()
for slot, (term, entry) in zip(range(5), lexicon):
    doc_freq = entry.getDocumentFrequency()
    print(f"Term: '{term}' | Appears in {doc_freq} docs")


=== PEEKING INTO THE LEXICON (Top 5 terms) ===
Term: '0' | Appears in 257 docs
Term: '00' | Appears in 80 docs
Term: '000' | Appears in 31 docs
Term: '0002' | Appears in 1 docs
Term: '0005' | Appears in 3 docs


In [None]:
# Look up a single term in the lexicon and report its two frequency counters.
termo_alvo = "space"
lexicon = index.getLexicon()

try:
    entry = lexicon.getLexiconEntry(termo_alvo)

    if entry is None:
        # The lookup returned nothing for this term.
        print(f"The term '{termo_alvo}' does not exist in the index.")
    else:
        print(f"--- Stats for: '{termo_alvo}' ---")

        # Number of documents that contain the term.
        doc_freq = entry.getDocumentFrequency()
        print(f"Document Frequency (Nt): {doc_freq}")

        # Frequency counter stored on the lexicon entry (labelled as the
        # total frequency of the term).
        total_freq = entry.getFrequency()
        print(f"Total Frequency (TF):    {total_freq}")

except Exception as e:
    print(f"Error fetching term or term does not exist: {e}")

--- Estatísticas para: 'space' ---
Document Frequency (Nt): 9
Total Frequency (TF):    12


# 3. Visualizing Search

When a search is executed, the pipeline is:

- **Pre-processing**: applies to the query the same preprocessing techniques used during indexing;
- **Lexicon Lookup**: fetches the list of matching documents for each query term;
- **BM25 Scoring**: for each document containing at least one query term, computes a score from:
    * **TF (Term Frequency)**: how many times the term appears in the document;
    * **IDF (Inverse Document Frequency)**: how rare the term is across the collection;
    * **Document Length Normalization**: whether the document is long or short relative to the average length;
- **Ranking**: the engine sums the per-term scores for each document and sorts the documents by total score.

In [None]:
# BM25 retriever over the loaded index: returns at most 10 results per query
# and requests the docno, summary and title metadata fields with each hit.
retriever_options = {
    "wmodel": "BM25",
    "metadata": ["docno", "summary", "title"],
    "num_results": 10,
}
bm25 = pt.terrier.Retriever(index, **retriever_options)

In [13]:
def explain_term_importance(index_obj, user_query):
    """
    Print a per-term table showing which query words carry the most weight (IDF).

    High IDF = rare word = high impact on ranking.

    The query is only lower-cased and split on whitespace, so each word is
    looked up in the lexicon exactly as typed.
    NOTE(review): if the index was built with stemming or stopword removal,
    raw query words may not match lexicon entries - confirm against the
    indexing configuration.

    Parameters
    ----------
    index_obj
        Index object exposing ``getLexicon()`` and ``getCollectionStatistics()``.
    user_query : str
        Free-text query.

    Returns
    -------
    None
        The table is printed, sorted by IDF (strongest first). Terms that are
        missing from the lexicon, or whose lookup fails, are listed with
        0 documents and 0 strength. A query with no terms prints a short
        notice instead of a table.
    """
    lexicon = index_obj.getLexicon()
    terms = user_query.lower().split()
    N = index_obj.getCollectionStatistics().getNumberOfDocuments()

    print(f"\n--- Term Analysis for: '{user_query}' ---")

    data = []
    for t in terms:
        try:
            entry = lexicon.getLexiconEntry(t)
            # A missing term yields no entry; treat it as "appears in 0 docs".
            nt = entry.getDocumentFrequency() if entry is not None else 0
        except Exception:
            # Best-effort: a failed lookup is reported as an unknown term
            # rather than aborting the whole analysis.
            nt = 0

        if nt > 0:
            # Smoothed IDF used for visualization:
            #   IDF = log((N - nt + 0.5) / (nt + 0.5))
            # This goes negative for terms found in more than half the docs.
            idf = round(np.log((N - nt + 0.5) / (nt + 0.5)), 4)
        else:
            idf = 0

        data.append({'Term': t, 'Docs (Nt)': nt, 'Strength (IDF)': idf})

    if data:
        df = pd.DataFrame(data).sort_values('Strength (IDF)', ascending=False)
        print(df.to_string(index=False))
    else:
        # Empty / whitespace-only query: there is no column to sort on.
        print("(no terms to analyze)")
    print("---------------------------------------------")

In [19]:
# Ask for a query interactively. Surrounding whitespace is stripped so that a
# blank or whitespace-only answer is treated as "no query" and the analysis
# is skipped instead of being run with zero terms.
query = input("Enter your query (e.g., future space missions): ").strip()

if query:
    explain_term_importance(index, query)


--- Term Analysis for: 'space mission' ---
   Term  Docs (Nt)  Strength (IDF)
  space          9          4.1808
mission         41          2.6536
---------------------------------------------
