In [3]:
import os
import pandas as pd

# Define the path to your dataset directory
data_dir = 'D:\VSCODE PROJECT\IR\cleaned_dataset'

# Get the list of files in the directory
files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.txt')]

# Define a threshold for similarityScore to determine relevance
similarity_threshold = 0.5  # Adjust this value as needed

# Define the function to calculate P@10 and AP@10
def calculate_p10_ap10(df):
    # Filter top 10 documents for each topic
    top_10_docs = df[df['ranking'] < 10]
    
    # Calculate P@10 for each topic
    p_at_10 = top_10_docs.groupby('topicId')['relevance'].apply(lambda x: x.sum() / 10)
    
    # Calculate AP@10 for each topic
    def average_precision_at_10(group):
        relevant = group['relevance'].values
        precisions = [relevant[:i+1].sum() / (i+1) for i in range(len(relevant))]
        return sum(precisions) / min(len(relevant), 10)
    
    ap_at_10 = top_10_docs.groupby('topicId').apply(average_precision_at_10)
    
    return p_at_10, ap_at_10

# Initialize a list to store the results
results = []

# Process each file
for file in files:
    try:
        # Read the dataset
        df = pd.read_csv(file, delimiter='\t', names=['topicId', 'Q0', 'docId', 'ranking', 'similarityScore', 'system'])
        
        # Debug: Print the first few rows of the dataframe
        print(f"Processing file: {file}")
        print(df.head())
        
        # Ensure ranking is sorted
        df = df.sort_values(by=['topicId', 'ranking'])
        
        # Create the 'relevance' column based on the similarity threshold
        df['relevance'] = (df['similarityScore'] > similarity_threshold).astype(int)
        
        # Calculate P@10 and AP@10
        p_at_10, ap_at_10 = calculate_p10_ap10(df)
        
        # Append the results with the system name
        system_name = os.path.basename(file).split('.')[0]
        result_df = pd.DataFrame({
            'topicId': p_at_10.index,
            'P@10': p_at_10.values,
            'AP@10': ap_at_10.values,
            'system': system_name
        })
        
        results.append(result_df)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Concatenate all results into a single DataFrame
if results:
    final_results = pd.concat(results, ignore_index=True)
    # Debug: Print the final results
    print(final_results.head())
else:
    print("No valid data found to concatenate.")

# Save the final results to a file
output_dir = 'D:\VSCODE PROJECT\IR\scores'
os.makedirs(output_dir, exist_ok=True)  # Ensure the directory exists
output_path = os.path.join(output_dir, 'p_at_10_scores.csv')
if not final_results.empty:
    final_results.to_csv(output_path, index=False)


Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.1.1.txt
   topicId  Q0             docId  ranking  similarityScore  system
0      401  Q0       FBIS4-20472        1              320       1
1      401  Q0       FBIS4-68893        2              251       1
2      401  Q0  FR941117-2-00158        3              221       1
3      401  Q0       FBIS3-37947        4              203       1
4      401  Q0          FBIS4-29        5              186       1
Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.ok8amxc.txt
   topicId  Q0        docId  ranking  similarityScore   system
0      401  Q0  FBIS4-18182        1          3.59032  ok8amxc
1      401  Q0  FBIS3-18916        2          3.44936  ok8amxc
2      401  Q0  FBIS3-18833        3          3.40886  ok8amxc
3      401  Q0  FBIS3-39117        4          3.25332  ok8amxc
4      401  Q0  FBIS3-17077        5          3.15430  ok8amxc
Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_in

In [4]:
import os
import pandas as pd

# Define the path to your dataset directory
data_dir = 'D:\VSCODE PROJECT\IR\cleaned_dataset'

# Get the list of files in the directory
files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.txt')]

# Define a threshold for similarityScore to determine relevance
similarity_threshold = 0.5  # Adjust this value as needed

# Define the function to calculate P@10 and AP@10
def calculate_p10_ap10(df):
    # Filter top 10 documents for each topic
    top_10_docs = df[df['ranking'] < 10]
    
    # Calculate P@10 for each topic
    p_at_10 = top_10_docs.groupby('topicId')['relevance'].apply(lambda x: x.sum() / 10)
    
    # Calculate AP@10 for each topic
    def average_precision_at_10(group):
        relevant = group['relevance'].values
        precisions = [relevant[:i+1].sum() / (i+1) for i in range(len(relevant))]
        return sum(precisions) / min(len(relevant), 10)
    
    ap_at_10 = top_10_docs.groupby('topicId').apply(average_precision_at_10)
    
    return p_at_10, ap_at_10

# Initialize a list to store the results
results = []

# Process each file
for file in files:
    try:
        # Read the dataset
        df = pd.read_csv(file, delimiter='\t', names=['topicId', 'Q0', 'docId', 'ranking', 'similarityScore', 'system'])
        
        # Debug: Print the first few rows of the dataframe
        print(f"Processing file: {file}")
        print(df.head())
        
        # Ensure ranking is sorted
        df = df.sort_values(by=['topicId', 'ranking'])
        
        # Create the 'relevance' column based on the similarity threshold
        df['relevance'] = (df['similarityScore'] > similarity_threshold).astype(int)
        
        # Calculate P@10 and AP@10
        p_at_10, ap_at_10 = calculate_p10_ap10(df)
        
        # Append the results with the system name
        system_name = os.path.basename(file).split('.')[0]
        result_df = pd.DataFrame({
            'topicId': p_at_10.index,
            'P@10': p_at_10.values,
            'AP@10': ap_at_10.values,
            'system': system_name
        })
        
        results.append(result_df)
    except Exception as e:
        print(f"Error processing file {file}: {e}")

# Concatenate all results into a single DataFrame
if results:
    final_results = pd.concat(results, ignore_index=True)
    # Debug: Print the final results
    print(final_results.head())
else:
    print("No valid data found to concatenate.")

# Save the final results to a file
output_path = 'D:\VSCODE PROJECT\IR\scores'
if not final_results.empty:
    final_results.to_csv(output_path, index=False)


Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.1.1.txt
   topicId  Q0             docId  ranking  similarityScore  system
0      401  Q0       FBIS4-20472        1              320       1
1      401  Q0       FBIS4-68893        2              251       1
2      401  Q0  FR941117-2-00158        3              221       1
3      401  Q0       FBIS3-37947        4              203       1
4      401  Q0          FBIS4-29        5              186       1
Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.ok8amxc.txt
   topicId  Q0        docId  ranking  similarityScore   system
0      401  Q0  FBIS4-18182        1          3.59032  ok8amxc
1      401  Q0  FBIS3-18916        2          3.44936  ok8amxc
2      401  Q0  FBIS3-18833        3          3.40886  ok8amxc
3      401  Q0  FBIS3-39117        4          3.25332  ok8amxc
4      401  Q0  FBIS3-17077        5          3.15430  ok8amxc
Processing file: D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_in

PermissionError: [Errno 13] Permission denied: 'D:\\VSCODE PROJECT\\IR\\scores'