In [6]:
import os
import pandas as pd

In [7]:
# Function to parse the QREL file
def parse_qrels(qrels_file):
    """Parse a QREL file into a nested relevance dictionary.

    Every non-blank line must contain exactly four whitespace-separated
    fields: ``query_id  <ignored>  doc_id  relevance``. Blank lines (for
    example a trailing newline at the end of the file) are skipped.

    Args:
        qrels_file: Path of the QREL file to read.

    Returns:
        dict: ``{query_id: {doc_id: relevance}}`` where both ids are strings
        and ``relevance`` is an int. If a (query, doc) pair occurs more than
        once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its relevance field is not an integer.
    """
    qrels = {}
    with open(qrels_file, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Blank / whitespace-only line: nothing to record.
                continue
            if len(fields) != 4:
                raise ValueError(
                    f"{qrels_file}: line {line_number}: expected 4 fields, got {len(fields)}"
                )
            query_id, _, doc_id, relevance = fields
            qrels.setdefault(query_id, {})[doc_id] = int(relevance)
    return qrels

# Function to calculate precision at k
def precision_at_k(relevant, retrieved, k=10):
    """Compute precision over the top-k retrieved documents.

    Args:
        relevant: Mapping ``doc_id -> relevance``; a document counts as
            relevant only when its value is greater than 0.
        retrieved: Ranked list of retrieved doc ids, best first.
        k: Cutoff rank. When fewer than ``k`` documents were retrieved the
            cutoff is reduced to ``len(retrieved)``, so the denominator is
            the number of documents actually inspected.

    Returns:
        float: Fraction of the inspected documents that are relevant, or
        ``0.0`` when nothing can be inspected (empty ``retrieved`` or
        ``k <= 0``) instead of dividing by zero.
    """
    cutoff = min(k, len(retrieved))
    if cutoff <= 0:
        # Nothing retrieved (or a non-positive cutoff): precision is defined as 0.
        return 0.0
    true_positives = sum(
        1
        for doc_id in retrieved[:cutoff]
        if doc_id in relevant and relevant[doc_id] > 0
    )
    return true_positives / cutoff

# Function to calculate average precision at k
def average_precision_at_k(relevant, retrieved, k=100):
    """Compute average precision over the top-k retrieved documents.

    Precision is taken at every rank (1-based, within the first ``k``
    positions) that holds a relevant document, and those values are averaged.
    Note that the average is normalised by the number of relevant documents
    found in the top-k, not by the total number of relevant documents.

    The ranking is walked once with a running hit counter instead of
    recomputing precision from scratch at every relevant rank.

    Args:
        relevant: Mapping ``doc_id -> relevance``; a document counts as
            relevant only when its value is greater than 0.
        retrieved: Ranked list of retrieved doc ids, best first.
        k: Cutoff rank; a non-positive value inspects nothing.

    Returns:
        float: The average of the precision values, or ``0.0`` when no
        relevant document appears in the inspected prefix.
    """
    precisions = []
    hits = 0
    # max(k, 0) keeps a negative k from slicing off the tail of the list.
    for rank, doc_id in enumerate(retrieved[:max(k, 0)], start=1):
        if doc_id in relevant and relevant[doc_id] > 0:
            hits += 1
            precisions.append(hits / rank)
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

# Function to calculate average precision at k for a single query (reported as MAP@k; the mean over queries is not taken here)
def mean_average_precision_at_k(relevant, retrieved, k=100):
    """Average the precision values observed at each relevant rank in the top-k.

    Args:
        relevant: Mapping ``doc_id -> relevance``; only values greater than 0
            count as relevant.
        retrieved: Ranked list of retrieved doc ids, best first.
        k: Cutoff; only ``retrieved[:k]`` is inspected.

    Returns:
        float: Mean of ``hits_so_far / rank`` over the ranks holding a
        relevant document, or ``0.0`` if there is no such rank.
    """
    # 1-based ranks, within the inspected prefix, that hold a relevant document.
    hit_ranks = [
        rank
        for rank, doc_id in enumerate(retrieved[:k], start=1)
        if doc_id in relevant and relevant[doc_id] > 0
    ]
    if len(hit_ranks) == 0:
        return 0.0

    # The n-th relevant document sits at hit_ranks[n - 1], so precision there is n / rank.
    precision_values = [n_hits / rank for n_hits, rank in enumerate(hit_ranks, start=1)]
    return sum(precision_values) / len(precision_values)


# Function to calculate P@10, AP@100 and MAP@100 for every topic
def calculate_metrics(df, qrel_dict, k1=10, k2=100):
    """Score every topic of one run against the relevance judgments.

    Args:
        df: DataFrame with at least the columns ``topicId``, ``ranking`` and
            ``docId``.
        qrel_dict: ``{topic_id_as_str: {doc_id: relevance}}``.
        k1: Cutoff passed to ``precision_at_k``.
        k2: Cutoff passed to ``average_precision_at_k`` and
            ``mean_average_precision_at_k``.

    Returns:
        list[dict]: One record per topic with the keys ``topicId``, ``P@10``,
        ``AP@100`` and ``MAP@100`` (key names are fixed regardless of
        ``k1``/``k2``).
    """
    def _score_topic(topic, rows):
        # qrel keys are strings, while the grouped topic id may be numeric.
        judgments = qrel_dict.get(str(topic), {})
        ranked_ids = rows.sort_values('ranking')['docId'].tolist()
        return {
            'topicId': topic,
            'P@10': precision_at_k(judgments, ranked_ids, k1),
            'AP@100': average_precision_at_k(judgments, ranked_ids, k2),
            'MAP@100': mean_average_precision_at_k(judgments, ranked_ids, k2),
        }

    return [_score_topic(topic, rows) for topic, rows in df.groupby('topicId')]

In [8]:
# Define paths.
# Raw strings keep the Windows backslashes literal: sequences such as "\V",
# "\I" or "\d" in a normal string literal are invalid escapes that newer
# Python versions warn about and will eventually reject.
qrel_file_path = r'D:\VSCODE PROJECT\IR\dataset\qrels.trec8.csv'
input_dir = r'D:\VSCODE PROJECT\IR\cleaned'  # Replace with your input files directory
output_dir = r'D:\VSCODE PROJECT\IR\scores'
output_file_name = 'testerror_result.csv'
output_path = os.path.join(output_dir, output_file_name)

# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)

In [9]:
# Load the relevance judgments into {topic: {doc_id: relevance}}
qrel_dict = parse_qrels(qrel_file_path)

# Sanity check: show a short sample of the judgments parsed for every topic
print("QREL Dictionary:")
for topic, judged in qrel_dict.items():
    sample = list(judged.items())[:5]  # first 5 (doc_id, relevance) pairs keep the output short
    print(f"Topic: {topic}, Docs: {sample}")


QREL Dictionary:
Topic: 401, Docs: [('FBIS3-10009', 0), ('FBIS3-10059', 0), ('FBIS3-10142', 0), ('FBIS3-1026', 0), ('FBIS3-10502', 0)]
Topic: 402, Docs: [('FBIS3-10134', 0), ('FBIS3-10279', 0), ('FBIS3-10291', 0), ('FBIS3-10855', 0), ('FBIS3-10954', 0)]
Topic: 403, Docs: [('FBIS3-10291', 0), ('FBIS3-10690', 0), ('FBIS3-11193', 0), ('FBIS3-1164', 0), ('FBIS3-11691', 0)]
Topic: 404, Docs: [('FBIS3-10109', 0), ('FBIS3-10402', 0), ('FBIS3-1048', 0), ('FBIS3-1067', 0), ('FBIS3-11594', 0)]
Topic: 405, Docs: [('FBIS3-1045', 0), ('FBIS3-1067', 0), ('FBIS3-11379', 0), ('FBIS3-11396', 0), ('FBIS3-1166', 0)]
Topic: 406, Docs: [('FBIS3-10133', 0), ('FBIS3-1411', 0), ('FBIS3-1674', 0), ('FBIS3-1790', 0), ('FBIS3-18526', 0)]
Topic: 407, Docs: [('FBIS3-10216', 0), ('FBIS3-14832', 0), ('FBIS3-16551', 0), ('FBIS3-16631', 0), ('FBIS3-16684', 0)]
Topic: 408, Docs: [('FBIS3-10291', 0), ('FBIS3-12406', 0), ('FBIS3-13345', 0), ('FBIS3-13375', 0), ('FBIS3-13962', 0)]
Topic: 409, Docs: [('FBIS3-10392', 0), ('

In [10]:
# Get the list of input files.
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of the saved CSV reproducible from run to run.
input_files = sorted(
    os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.csv')
)

# One per-topic metrics DataFrame per successfully processed input file
all_results = []

# Process each input file
for file in input_files:
    try:
        # names= supplies the column labels, so the first line of each file is read as data
        input_df = pd.read_csv(file, delimiter=',', names=['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName'])

        # Calculate P@10, AP@100 and MAP@100 for every topic of this system
        results = calculate_metrics(input_df, qrel_dict, k1=10, k2=100)

        # Convert to DataFrame and label the rows with the file name up to its first dot
        results_df = pd.DataFrame(results)
        results_df['system'] = os.path.basename(file).split('.')[0]

        # Append to all results
        all_results.append(results_df)
    except Exception as e:
        # Best effort: report the failing file and carry on with the remaining ones
        print(f"Error processing file {file}: {e}")

# Concatenate all results into a single DataFrame
if all_results:
    final_results = pd.concat(all_results, ignore_index=True)
    # Save the final results to a file
    final_results.to_csv(output_path, index=False)

    # One topic-by-system table per metric
    table_format_p10 = final_results.pivot(index='topicId', columns='system', values='P@10')
    table_format_ap100 = final_results.pivot(index='topicId', columns='system', values='AP@100')
    table_format_map100 = final_results.pivot(index='topicId', columns='system', values='MAP@100')

    print("\nTable for P@10:")
    print(table_format_p10)

    # The AP@100 and MAP@100 tables are built above but not printed; uncomment to show them.
    # print("\nTable for AP@100:")
    # print(table_format_ap100)
    # print("\nTable for MAP@100:")
    # print(table_format_map100)

else:
    print("No valid data found to concatenate.")



Table for P@10:
system   Dm8Nbn  Flab8at  Flab8ax  GE8MTD2  MITSLStd  Mer8Adtd2  UB99T  \
topicId                                                                  
401         0.0      0.2      0.3      1.0       0.3        0.2    0.0   
402         0.6      0.6      0.9      0.6       0.7        0.6    0.0   
403         0.6      0.9      1.0      1.0       0.9        0.7    0.5   
404         0.1      0.2      0.3      0.4       0.4        0.4    0.1   
405         0.2      0.5      0.4      0.6       0.2        0.3    0.1   
406         0.2      0.5      0.5      0.4       0.4        0.4    0.0   
407         0.3      0.9      1.0      0.6       0.7        0.8    0.1   
408         0.1      0.3      0.4      0.7       0.4        0.4    0.0   
409         0.2      0.2      0.2      0.4       0.4        0.3    0.1   
410         0.5      1.0      1.0      1.0       1.0        1.0    0.0   
411         0.1      0.5      0.7      0.8       0.5        0.5    0.1   
412         0.0      