# Imports

In [None]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer, util
import json
import random

  from .autonotebook import tqdm as notebook_tqdm


# SNC Text Preprocessing

In [2]:
def clean_scope_note(text):
    if not isinstance(text, str): return ""
    
    # Normalize
    text = text.lower()
    
    # Phrases that add noise to the vector (instructions vs content)
    noise_phrases = [
        r"use for materials on",
        r"use for",
        r"subdivide by.*",
        r"includes?",
        r"if volume warrants",
        r"general policy",
        r"see also",
        r"filed under",
        r"restricted to"
    ]
    
    for pattern in noise_phrases:
        text = re.sub(pattern, " ", text)
    
    return text

# Main Workflow

In [None]:
def run_embedding_ranking(
    queries_df,       # columns: ['topic_id', 'query_text']
    labels_df,        # columns: ['label_id', 'label_text'] (Unique Labels)
    folder_map_df,    # columns: ['folder_id', 'label_id'] (Which folder has which label)
    qrels_df,         # columns: ['topic_id', 'folder_id', 'relevance']
    seed
):
    """
    Ranks folder labels by semantic similarity to query, expands to folders,
    and checks ground truth.
    """
    # --- A. Setup Models & Data ---
    print("Loading Model...")
    model = SentenceTransformer('all-mpnet-base-v2')
    
    # Create Qrels Lookup Dict for O(1) access: {(topic, folder): rel}
    print("Building Qrels Lookup...")
    qrels_map = qrels_df.set_index(['topic_id', 'folder_id'])['relevance'].to_dict()
    
    # Group Folders by Label for fast expansion
    # dict: { 'POL18': ['FolderA', 'FolderB'], ... }
    label_to_folders = folder_map_df.groupby('label_id')['folder_id'].apply(list).to_dict()

    # --- B. Pre-compute Label Embeddings ---
    print("Cleaning and Encoding Folder Labels...")
    labels_df['cleaned_text'] = labels_df['folder_label'].apply(clean_scope_note)
    
    # Create lists for ordering
    unique_label_ids = labels_df['label_id'].tolist()
    unique_label_texts = labels_df['cleaned_text'].tolist()
    
    # Encode all labels once (Tensor format for GPU/CPU acceleration)
    label_embeddings = model.encode(unique_label_texts, convert_to_tensor=True)

    results_data = []

    # --- C. Loop Per Query ---
    print(f"Processing {len(queries_df)} queries...")

    random.seed(seed)
    
    for idx, row in queries_df.iterrows():
        topic_id = row['ID']
        query_text = row['TITLE'] + row['DESCRIPTION']
        
        # 1. Encode Query
        query_embedding = model.encode(query_text, convert_to_tensor=True)
        
        # 2. Calculate Cosine Similarity (Query vs All Labels)
        cosine_scores = util.cos_sim(query_embedding, label_embeddings)[0]
        
        # 3. Rank Labels
        # We zip scores with IDs to keep track
        scores_and_labels = zip(cosine_scores.tolist(), unique_label_ids)
        # Sort by score descending
        ranked_labels = sorted(scores_and_labels, key=lambda x: x[0], reverse=True)
        
        # 4. Expansion Logic (Get folders until count >= 5)
        final_selection = []
        target_count = 5
        
        for score, label_id in ranked_labels:
            # Calculate how many slots are left
            slots_left = target_count - len(final_selection)
            
            if slots_left <= 0:
                break
            
            # Get candidates from this label
            candidates = label_to_folders.get(label_id, [])
            
            if not candidates:
                continue
            
            if len(candidates) <= slots_left:
                # Case A: We can take ALL folders from this label
                selected_subset = candidates.copy()
                random.shuffle(selected_subset)
            else:
                # Case B: This label has MORE folders than we have space.
                selected_subset = random.sample(candidates, slots_left)
            
            # Add to selection
            for folder_id in selected_subset:
                relevance = qrels_map.get((topic_id, folder_id), 0)
                final_selection.append({
                    "topic": topic_id,
                    "folder_label_id": label_id,
                    "similarity_score": score,
                    "folder": folder_id,
                    "qrel": relevance
                })
        
        results_data.extend(final_selection)

    # --- D. Final Output ---
    final_df = pd.DataFrame(results_data)
    
    # Reorder columns as requested
    final_df = final_df[['topic', 'folder_label_id', 'folder', 'qrel']]
    
    return final_df

# Creating DataFrames

In [4]:
with open("./src/data_creation/topics_output.txt", 'r', encoding='utf-8') as file:
    queries = list(json.load(file).values())

queries_df = pd.DataFrame.from_dict(queries)
queries_df.head()

Unnamed: 0,TITLE,DESCRIPTION,NARRATIVE,ID
0,Future space missions,Find documents that describe plans for one or ...,Space missions are defined as objects of any k...,T18Eval-00001
1,Brazilian submarines,Find documents that mention the operation of s...,Submarines are any watercraft that can operate...,T18Eval-00002
2,Alagoas flood,I'm interested in learning about damage from f...,I'm looking for documents about flood damage i...,T18Eval-00003
3,Grain sorghum,Tell me about grain sorghum operations in Nort...,Sorghum is a drought-tolerant species of grass...,T18Eval-00004
4,Beef cattle industry,Locate documents containing information on Bra...,The beef cattle industry is a sector of agricu...,T18Eval-00005


In [None]:
with open("./data/folders_metadata/FoldersV1.2.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

folder_map_df = pd.DataFrame.from_dict(data, orient='index').reset_index().rename(columns={'index': 'folder_id'})

labels_df = pd.DataFrame(folder_map_df['folder_label'].unique(), columns=['folder_label'])

labels_df['label_id'] = ['LABEL' + str(i + 1) for i in range(len(labels_df))]

folder_map_df = folder_map_df.merge(labels_df, on='folder_label', how='left')

folder_map_df.head()

Unnamed: 0,folder_id,box,snc,label,date,endDate,rg,folder_label,label_id
0,A99990001,A0001,LAB 3,LAB 3 Organizations & Conferences 1964 (Classi...,01/01/1964,01/01/1969,84,LABOR & MANPOWER: ORGANIZATIONS & CONFERENCES,LABEL1
1,A99990026,A0001,POL 18,POL 18 Pernambuco 1964 (Classified),01/01/1964,01/01/1967,84,"POLITICAL AFFAIRS & RELATIONS: PROVINCIAL, MUN...",LABEL2
2,A99990070,A0001,AID,AID - General 1964 (Classified),01/01/1964,01/01/1968,84,AID,LABEL3
3,A99990073,A0001,DEF,DEF Defense Affairs 1964 (Classifed),01/01/1964,01/01/1969,84,DEFENSE AFFAIRS,LABEL4
4,A99990103,A0001,AV,AV - Aviation (Civil) 1964 (Classified),01/01/1964,01/01/1969,84,AVIATION (CIVIL),LABEL5


In [6]:
labels_df.head()

Unnamed: 0,folder_label,label_id
0,LABOR & MANPOWER: ORGANIZATIONS & CONFERENCES,LABEL1
1,"POLITICAL AFFAIRS & RELATIONS: PROVINCIAL, MUN...",LABEL2
2,AID,LABEL3
3,DEFENSE AFFAIRS,LABEL4
4,AVIATION (CIVIL),LABEL5


In [None]:
qrels_df = pd.read_csv(
    './qrels/formal-folder-qrel.txt', 
    sep='\t',
    header=None,
    names=['topic_id', 'iteration', 'folder_id', 'relevance'],
    dtype={'topic_id': str, 'folder_id': str}
)

qrels_df.head()

Unnamed: 0,topic_id,iteration,folder_id,relevance
0,T18Eval-00001,0,N23813085,0
1,T18Eval-00001,0,B99990565,3
2,T18Eval-00001,0,M99990649,0
3,T18Eval-00001,0,N23813036,0
4,T18Eval-00001,0,N23813033,0


# Running Code

In [9]:
# Run the Evaluation
for seed in [1, 100, 333, 777, 999, 1000, 12345, 55555, 98876, 101010]:
    df_results = run_embedding_ranking(queries_df=queries_df, labels_df=labels_df,
                                    folder_map_df=folder_map_df, qrels_df=qrels_df, seed=seed)

    tsv_content = df_results[['topic', 'folder']].to_csv(sep='\t', index=False, header=False)

    tsv_content = tsv_content.rstrip()

    with open(f'./all_runs/F_EMB_TD/Random{seed}.csv', 'w') as f:
        f.write(tsv_content)
    #df_results.head(10)

Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Cleaning and Encoding Folder Labels...
Processing 45 queries...
Loading Model...
Building Qrels Lookup...
Clea

In [None]:
from src.lastest_runs.run_refactor import evaluateSearchResults

for seed in [1, 100, 333, 777, 999, 1000, 12345, 55555, 98876, 101010]:
    qrels_folder = '../qrels/formal-folder-qrel.txt'
    qrels_box = '../qrels/formal-box-qrel.txt'
    evaluateSearchResults(f'./all_runs/F_EMB_TD/Random{seed}.csv', qrels_folder, qrels_box, './results/metrics/OfficialResultsTopicsResultsEvaluation.txt', "", f"F_EMB_TD/Random{seed}", True)

          Folder          Box
NDCG@5: 0.093±0.05    0.223±0.07
   MAP: 0.035±0.02    0.118±0.05
   MRR: 0.149±0.08    0.393±0.12
   S@1: 0.067±0.07    0.267±0.13
          Folder          Box
NDCG@5: 0.099±0.05    0.220±0.07
   MAP: 0.036±0.02    0.114±0.05
   MRR: 0.156±0.08    0.385±0.12
   S@1: 0.067±0.07    0.244±0.13
          Folder          Box
NDCG@5: 0.105±0.05    0.247±0.08
   MAP: 0.043±0.03    0.136±0.06
   MRR: 0.174±0.09    0.398±0.12
   S@1: 0.089±0.08    0.267±0.13
          Folder          Box
NDCG@5: 0.093±0.05    0.213±0.07
   MAP: 0.035±0.02    0.112±0.04
   MRR: 0.148±0.08    0.371±0.11
   S@1: 0.067±0.07    0.222±0.12
          Folder          Box
NDCG@5: 0.092±0.05    0.226±0.07
   MAP: 0.035±0.02    0.115±0.04
   MRR: 0.145±0.08    0.398±0.12
   S@1: 0.067±0.07    0.267±0.13
          Folder          Box
NDCG@5: 0.096±0.05    0.219±0.07
   MAP: 0.036±0.02    0.106±0.04
   MRR: 0.157±0.08    0.365±0.11
   S@1: 0.067±0.07    0.222±0.12
          Folder          Bo

In [33]:
def detailed_relevance_breakdown(df_results):
    """
    Analyzes the df_results table to show distribution of relevance per topic.
    """
    # 1. Define custom aggregations
    # We want to count how many times specific relevance scores appear
    aggregations = {
        'total_retrieved': ('folder', 'count'),
        'relevant_any':    ('qrel', lambda x: (x > 0).sum()),      # Any relevance > 0
        'highly_relevant': ('qrel', lambda x: (x == 3).sum()),     # Count of Score 3
        'relevant':        ('qrel', lambda x: (x == 1).sum()),     # Count of Score 1
        'not_relevant':    ('qrel', lambda x: (x == 0).sum()),     # Count of Score 0
        'max_relevance':   ('qrel', 'max')                         # Best doc found
    }

    # 2. Group by Topic and Apply
    analysis_df = df_results.groupby('topic').agg(**aggregations)

    # 3. Calculate Precision (Accuracy of the retrieved set)
    analysis_df['precision'] = analysis_df['relevant_any'] / analysis_df['total_retrieved']

    # 4. Add a "Status" column for quick reading
    def get_status(row):
        if row['highly_relevant'] > 0:
            return "✅ High Hit"
        elif row['relevant_any'] > 0:
            return "⚠️ Weak Hit"
        else:
            return "❌ Miss"
            
    analysis_df['status'] = analysis_df.apply(get_status, axis=1)

    # 5. Reorder columns for readability
    analysis_df = analysis_df[[
        'status', 'total_retrieved', 'relevant_any', 
        'highly_relevant', 'relevant', 'not_relevant', 'precision'
    ]]

    return analysis_df

# --- RUNNING THE ANALYSIS ---
# Assuming 'df_results' is the dataframe returned from the previous step
detailed_stats = detailed_relevance_breakdown(df_results)

print("\n--- DETAILED TOPIC ANALYSIS ---")
print(detailed_stats)

# --- GLOBAL SUMMARY METRICS ---
print("\n--- GLOBAL SUMMARY ---")
total_topics = len(detailed_stats)
success_topics = len(detailed_stats[detailed_stats['relevant_any'] > 0])
high_quality_topics = len(detailed_stats[detailed_stats['highly_relevant'] > 0])

print(f"Total Topics Evaluated: {total_topics}")
print(f"Success Rate (Found at least 1 relevant folder): {success_topics / total_topics:.1%} ({success_topics}/{total_topics})")
print(f"High Quality Rate (Found a 'Score 3' folder):    {high_quality_topics / total_topics:.1%} ({high_quality_topics}/{total_topics})")
print(f"Average Precision per Topic: {detailed_stats['precision'].mean():.3f}")


--- DETAILED TOPIC ANALYSIS ---
                    status  total_retrieved  relevant_any  highly_relevant  \
topic                                                                        
T18Eval-00001   ✅ High Hit                7             5                4   
T18Eval-00002       ❌ Miss                5             0                0   
T18Eval-00003       ❌ Miss                5             0                0   
T18Eval-00004   ✅ High Hit               11             2                1   
T18Eval-00005   ✅ High Hit               43             4                3   
T18Eval-00006       ❌ Miss                8             0                0   
T18Eval-00007  ⚠️ Weak Hit               10             1                0   
T18Eval-00008       ❌ Miss                6             0                0   
T18Eval-00009       ❌ Miss                6             0                0   
T18Eval-00010       ❌ Miss               42             0                0   
T18Eval-00011       ❌ Miss     