In [52]:
import os
import pandas as pd

In [53]:
# Function to parse the QREL file
def parse_qrels(qrels_file):
    qrels = {}
    with open(qrels_file, 'r') as file:
        for line in file:
            topicId, _, docId, relevance = line.split()
            relevance = int(relevance)
            if topicId not in qrels:
                qrels[topicId] = {}
            qrels[topicId][docId] = relevance
    return qrels

In [54]:
# Function to calculate precision at k
def precision_at_k(relevant, retrieved, k):
    retrieved_at_k = retrieved[:k]
    true_positives = sum([1 for docId in retrieved_at_k if docId in relevant])
    return true_positives / k


In [55]:
# Function to calculate average precision at k
def average_precision(relevant, retrieved, k):
    precisions = [precision_at_k(relevant, retrieved, i + 1) for i in range(k) if retrieved[i] in relevant]
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [56]:
# Function to calculate P@10 and MAP@10
def calculate_metrics(df, qrels, k=10):
    p_at_10 = []
    ap_at_10 = []
    grouped = df.groupby('topicId')
    
    for topicId, group in grouped:
        relevant_docs = qrels.get(topicId, {})
        retrieved_docs = group.sort_values('ranking')['docId'].tolist()
        p_at_10.append((topicId, precision_at_k(relevant_docs, retrieved_docs, k)))
        ap_at_10.append((topicId, average_precision(relevant_docs, retrieved_docs, k)))
    
    return p_at_10, ap_at_10

In [57]:
# Define paths
qrel_file_path = 'D:\VSCODE PROJECT\IR\dataset\qrels.trec8.adhoc'
input_dir = 'D:\VSCODE PROJECT\IR\cleaned'
output_dir = 'D:\VSCODE PROJECT\IR\scores'

In [58]:
# Parse the QREL file
qrels = parse_qrels(qrel_file_path)

# Get the list of input files
input_files = [os.path.join(input_dir, file) for file in os.listdir(input_dir) if file.endswith('.txt')]

# Initialize a DataFrame to store all results
all_results = pd.DataFrame()


In [59]:
# Process each input file
for file in input_files:
    # Read the input dataset
    input_df = pd.read_csv(file, delimiter='\t', names=['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName'])
    
    # Calculate P@10 and AP@10
    p_at_10, ap_at_10 = calculate_metrics(input_df, qrels)
    
    # Convert to DataFrame
    p_at_10_df = pd.DataFrame(p_at_10, columns=['topicId', 'P@10'])
    ap_at_10_df = pd.DataFrame(ap_at_10, columns=['topicId', 'AP@10'])
    
    # Merge results
    results_df = pd.merge(p_at_10_df, ap_at_10_df, on='topicId')
    results_df['systemName'] = os.path.basename(file).split('.')[0]
    
    # Append to all results
    all_results = pd.concat([all_results, results_df])


In [60]:
# Save the final results to a file
output_path = os.path.join(output_dir, 'scores.csv')
all_results.to_csv(output_path, index=False)


In [61]:

# Display the final results
print(all_results)

    topicId  P@10  AP@10     systemName
0       401   0.0    0.0  input_ric8dpn
1       402   0.0    0.0  input_ric8dpn
2       403   0.0    0.0  input_ric8dpn
3       404   0.0    0.0  input_ric8dpn
4       405   0.0    0.0  input_ric8dpn
..      ...   ...    ...            ...
45      446   0.0    0.0        plt8ah2
46      447   0.0    0.0        plt8ah2
47      448   0.0    0.0        plt8ah2
48      449   0.0    0.0        plt8ah2
49      450   0.0    0.0        plt8ah2

[250 rows x 4 columns]
