In [21]:
import pandas as pd

def parse_qrels(qrels_file):
    qrels = {}
    with open(qrels_file, 'r') as file:
        for line in file:
            topic, _, doc_id, relevance = line.split()
            relevance = int(relevance)
            if topic not in qrels:
                qrels[topic] = {}
            qrels[topic][doc_id] = relevance
    return qrels

qrels_file_path = 'D:\VSCODE PROJECT\IR\dataset\qrels.trec8.csv'
qrels = parse_qrels(qrels_file_path)


In [22]:
def precision_at_k(relevant, retrieved, k=10):
    if k > len(retrieved):
        k = len(retrieved)
    relevant_at_k = list(relevant)[:k]
    retrieved_at_k = retrieved[:k]
    true_positives = len(set(relevant_at_k) & set(retrieved_at_k))
    return true_positives / k

def average_precision(relevant, retrieved, k=100):
    precisions = []
    for i in range(1, k + 1):
        if i <= len(retrieved) and retrieved[i - 1] in relevant:
            precisions.append(precision_at_k(relevant, retrieved, i))
    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

def calculate_metrics_for_system(system_file, qrel_dict):
    system_results = pd.read_csv(system_file)
    system_results.sort_values(by=['topicId', 'ranking'], inplace=True)
    
    p_at_10_results = []
    ap_at_100_results = []
    
    grouped_results = system_results.groupby('topicId')
    for topic_id, group in grouped_results:
        topic_id = str(topic_id)
        relevant = qrel_dict.get(topic_id, set())
        retrieved = group['docId'].tolist()
        
        p_at_10 = precision_at_k(relevant, retrieved, 10)
        ap_at_100 = average_precision(relevant, retrieved, 100)
        
        p_at_10_results.append({'topicId': topic_id, 'P@10': p_at_10})
        ap_at_100_results.append({'topicId': topic_id, 'AP@100': ap_at_100})
        
    p_at_10_df = pd.DataFrame(p_at_10_results)
    ap_at_100_df = pd.DataFrame(ap_at_100_results)
    
    return p_at_10_df, ap_at_100_df


In [25]:
import glob
import pandas as pd

# Ensure correct file path pattern
input_files = glob.glob(r'D:\VSCODE PROJECT\IR\cleaned\*.csv')

# Print matched files to verify
print(input_files)

# Check the columns of the first file
if input_files:
    test_file = input_files[0]
    test_df = pd.read_csv(test_file)
    print("Columns in the test file:", test_df.columns)
else:
    print("No files found. Please check the directory and file pattern.")


['D:\\VSCODE PROJECT\\IR\\cleaned\\ok8amxc.ok8amxc.csv']
Columns in the test file: Index(['401\tQ0\tFBIS4-18182\t1\t3.59032\tok8amxc'], dtype='object')


In [26]:
import glob

input_files = glob.glob('D:\VSCODE PROJECT\IR\cleaned\*.csv')
system_names = [f'System {i+1}' for i in range(len(input_files))]

all_p_at_10 = pd.DataFrame()
all_ap_at_100 = pd.DataFrame()

for system_file, system_name in zip(input_files, system_names):
    p_at_10_df, ap_at_100_df = calculate_metrics_for_system(system_file, qrels)
    
    p_at_10_df.columns = ['topicId', system_name]
    ap_at_100_df.columns = ['topicId', system_name]
    
    if all_p_at_10.empty:
        all_p_at_10 = p_at_10_df
        all_ap_at_100 = ap_at_100_df
    else:
        all_p_at_10 = pd.merge(all_p_at_10, p_at_10_df, on='topicId')
        all_ap_at_100 = pd.merge(all_ap_at_100, ap_at_100_df, on='topicId')

# Calculate Mean Precision@10 and Mean Precision@100 for each system
mean_p_at_10 = all_p_at_10.mean(axis=0)
mean_ap_at_100 = all_ap_at_100.mean(axis=0)

mean_metrics = pd.DataFrame({
    'System': system_names,
    'Mean Precision@10': mean_p_at_10[1:],
    'Mean Precision@100': mean_ap_at_100[1:]
})

# Save results to a CSV file (optional)
all_p_at_10.to_csv('precision_at_10_results.csv', index=False)
all_ap_at_100.to_csv('ap_at_100_results.csv', index=False)
mean_metrics.to_csv('mean_metrics_results.csv', index=False)


KeyError: 'topicId'

In [None]:
print("Precision@10")
print(all_p_at_10)

print("\nAverage Precision@100")
print(all_ap_at_100)

print("\nMean Precision@10 and Mean Precision@100")
print(mean_metrics)


Precision@10
Empty DataFrame
Columns: []
Index: []

Average Precision@100
Empty DataFrame
Columns: []
Index: []

Mean Precision@10 and Mean Precision@100
Empty DataFrame
Columns: [System, Mean Precision@10, Mean Precision@100]
Index: []
