In [1]:
import pandas as pd

# Define the path to your QREL file
data_path = 'D:\VSCODE PROJECT\IR\dataset\input.ok8amxc'

# Define column names for the DataFrame
column_names = ['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName']

# Read the QREL file into a DataFrame
df = pd.read_csv(data_path, sep='\t', header=None, names=column_names)

# Display the first few rows of the DataFrame to verify
print(df)


       topicId identifier        docId  ranking  similarityScore systemName
0          401         Q0  FBIS4-18182        0         3.590320    ok8amxc
1          401         Q0  FBIS3-18916        1         3.449360    ok8amxc
2          401         Q0  FBIS3-18833        2         3.408860    ok8amxc
3          401         Q0  FBIS3-39117        3         3.253320    ok8amxc
4          401         Q0  FBIS3-17077        4         3.154300    ok8amxc
...        ...        ...          ...      ...              ...        ...
49995      450         Q0  FBIS3-33693      995         0.855519    ok8amxc
49996      450         Q0   FT933-6130      996         0.855423    ok8amxc
49997      450         Q0  FBIS4-35729      997         0.855387    ok8amxc
49998      450         Q0  FBIS3-34652      998         0.854756    ok8amxc
49999      450         Q0  FT944-16013      999         0.854586    ok8amxc

[50000 rows x 6 columns]


In [2]:
# Check for duplicates in 'Ranking' within each 'topicId' group
has_duplicates = df.groupby('topicId')['ranking'].apply(lambda x: x.duplicated()).any()

# Check if 'Ranking' is in ascending order within each 'topicId' group
is_ascending_ranking = df.groupby('topicId')['ranking'].apply(lambda x: x.is_monotonic_increasing).all()

# Check if 'SimilarityScore' is in descending order within each 'topicId' group
is_descending_similarity = df.groupby('topicId')['similarityScore'].apply(lambda x: x.is_monotonic_decreasing).all()

# Print the results
print("Ranking has duplicates within each topicId:", has_duplicates)
print("Ranking is in ascending order within each topicId:", is_ascending_ranking)
print("SimilarityScore is in descending order within each topicId:", is_descending_similarity)

Ranking has duplicates within each topicId: False
Ranking is in ascending order within each topicId: True
SimilarityScore is in descending order within each topicId: True


In [3]:
# Count the number of rows per topicid
topicId_counts = df['topicId'].value_counts().sort_index()
print(topicId_counts)

topicId
401    1000
402    1000
403    1000
404    1000
405    1000
406    1000
407    1000
408    1000
409    1000
410    1000
411    1000
412    1000
413    1000
414    1000
415    1000
416    1000
417    1000
418    1000
419    1000
420    1000
421    1000
422    1000
423    1000
424    1000
425    1000
426    1000
427    1000
428    1000
429    1000
430    1000
431    1000
432    1000
433    1000
434    1000
435    1000
436    1000
437    1000
438    1000
439    1000
440    1000
441    1000
442    1000
443    1000
444    1000
445    1000
446    1000
447    1000
448    1000
449    1000
450    1000
Name: count, dtype: int64


In [4]:
# Check data types of all columns
print("Data type of each column: ")
print(df.dtypes)

# Check null value
print( "Count of null values in each column: ")
print(df.isnull().sum())

Data type of each column: 
topicId              int64
identifier          object
docId               object
ranking              int64
similarityScore    float64
systemName          object
dtype: object
Count of null values in each column: 
topicId            0
identifier         0
docId              0
ranking            0
similarityScore    0
systemName         0
dtype: int64


In [5]:
# Function to check if the rank starts from 0 and is in ascending order
def is_rank_ascending_from_zero(group):
    ranks = group['ranking'].values
    return (ranks == range(len(ranks))).all()

# Group by topicid and check the rank order
rank_ascending_from_zero = df.groupby('topicId').apply(is_rank_ascending_from_zero)
print("\nRank in ascending order starting from 0 per topicid:")
print(rank_ascending_from_zero)



Rank in ascending order starting from 0 per topicid:
topicId
401    True
402    True
403    True
404    True
405    True
406    True
407    True
408    True
409    True
410    True
411    True
412    True
413    True
414    True
415    True
416    True
417    True
418    True
419    True
420    True
421    True
422    True
423    True
424    True
425    True
426    True
427    True
428    True
429    True
430    True
431    True
432    True
433    True
434    True
435    True
436    True
437    True
438    True
439    True
440    True
441    True
442    True
443    True
444    True
445    True
446    True
447    True
448    True
449    True
450    True
dtype: bool


In [6]:
# Function to check if a series is in descending order
def is_descending(series):
    return series.is_monotonic_decreasing

# Check if similarityscore is in descending order for each topicid
similarity_descending = df.groupby('topicId')['similarityScore'].apply(is_descending)
print("\nSimilarity score in descending order per topicid:")
print(similarity_descending)


Similarity score in descending order per topicid:
topicId
401    True
402    True
403    True
404    True
405    True
406    True
407    True
408    True
409    True
410    True
411    True
412    True
413    True
414    True
415    True
416    True
417    True
418    True
419    True
420    True
421    True
422    True
423    True
424    True
425    True
426    True
427    True
428    True
429    True
430    True
431    True
432    True
433    True
434    True
435    True
436    True
437    True
438    True
439    True
440    True
441    True
442    True
443    True
444    True
445    True
446    True
447    True
448    True
449    True
450    True
Name: similarityScore, dtype: bool


In [7]:
# Reassign the rank within each topicid group
df['ranking'] = df.groupby('topicId').cumcount() + 1

print(df)

       topicId identifier        docId  ranking  similarityScore systemName
0          401         Q0  FBIS4-18182        1         3.590320    ok8amxc
1          401         Q0  FBIS3-18916        2         3.449360    ok8amxc
2          401         Q0  FBIS3-18833        3         3.408860    ok8amxc
3          401         Q0  FBIS3-39117        4         3.253320    ok8amxc
4          401         Q0  FBIS3-17077        5         3.154300    ok8amxc
...        ...        ...          ...      ...              ...        ...
49995      450         Q0  FBIS3-33693      996         0.855519    ok8amxc
49996      450         Q0   FT933-6130      997         0.855423    ok8amxc
49997      450         Q0  FBIS4-35729      998         0.855387    ok8amxc
49998      450         Q0  FBIS3-34652      999         0.854756    ok8amxc
49999      450         Q0  FT944-16013     1000         0.854586    ok8amxc

[50000 rows x 6 columns]


In [8]:
# Save the cleaned DataFrame back to a file
save_path = "D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.ok8amxc.txt"

df.to_csv(save_path, sep="\t", header=False, index=False)

In [9]:
# # Sort the DataFrame by 'topicId' and 'SimilarityScore' in descending order
# df_sorted = df.sort_values(by=['topicId', 'similarityScore'], ascending=[True, False])

# # Calculate the expected ranking based on sorted 'SimilarityScore'
# df_sorted['ExpectedRanking'] = df_sorted.groupby('topicId').cumcount()

# # Check if 'Ranking' aligns with 'ExpectedRanking'
# is_aligned = (df_sorted['ranking'] == df_sorted['ExpectedRanking']).all()

# # Print the result
# print("SimilarityScore aligns with Ranking:", is_aligned)
# print(df_sorted)


In [10]:
# Define a function to calculate P@10 and AP@10 using average similarity score as the threshold
def calculate_p10_ap10(df):
    results = []
    
    for topic_id, group in df.groupby('topicId'):
        min_similarity = group['similarityScore'].min()
        max_similarity = group['similarityScore'].max()
        average_similarity = (min_similarity + max_similarity) / 2
        
        # Create the 'relevance' column based on the average similarity score
        group['relevance'] = (group['similarityScore'] > average_similarity).astype(int)
        
        # Filter top 10 documents for the current topic
        top_10_docs = group[group['ranking'] < 10]
        
        # Calculate P@10 for the current topic
        p_at_10 = top_10_docs['relevance'].sum() / 10
        
        # Calculate AP@10 for the current topic
        def average_precision_at_10(group):
            relevant = group['relevance'].values
            precisions = [relevant[:i+1].sum() / (i+1) for i in range(len(relevant))]
            return sum(precisions) / min(len(relevant), 10)
        
        ap_at_10 = average_precision_at_10(top_10_docs)
        
        results.append({'topicId': topic_id, 'P@10': p_at_10, 'AP@10': ap_at_10})
    
    return pd.DataFrame(results)

# Initialize a list to store the final results for all systems
final_results = []


In [11]:
results_df = calculate_p10_ap10(df)
print(results_df)

    topicId  P@10     AP@10
0       401   0.9  1.000000
1       402   0.9  1.000000
2       403   0.9  1.000000
3       404   0.9  1.000000
4       405   0.9  1.000000
5       406   0.8  0.987654
6       407   0.9  1.000000
7       408   0.9  1.000000
8       409   0.9  1.000000
9       410   0.9  1.000000
10      411   0.5  0.858686
11      412   0.9  1.000000
12      413   0.9  1.000000
13      414   0.9  1.000000
14      415   0.9  1.000000
15      416   0.9  1.000000
16      417   0.9  1.000000
17      418   0.4  0.775838
18      419   0.9  1.000000
19      420   0.9  1.000000
20      421   0.9  1.000000
21      422   0.7  0.961420
22      423   0.9  1.000000
23      424   0.9  1.000000
24      425   0.9  1.000000
25      426   0.9  1.000000
26      427   0.9  1.000000
27      428   0.9  1.000000
28      429   0.9  1.000000
29      430   0.9  1.000000
30      431   0.6  0.919312
31      432   0.9  1.000000
32      433   0.3  0.665212
33      434   0.9  1.000000
34      435   0.9  1

In [12]:
# Initialize a list to store all similarity scores
all_similarity_scores = []

# Load all data and collect similarity scores
data_frames = []
data_frames.append(df)
all_similarity_scores.extend(df['similarityScore'].values)

# Calculate global min, max, and average similarity scores
global_min_similarity = min(all_similarity_scores)
global_max_similarity = max(all_similarity_scores)
global_avg_similarity = (global_min_similarity + global_max_similarity) / 2

# Define a function to calculate P@10 and AP@10 using the global similarity score threshold
def calculate_p10_ap10_global(df, threshold):
    # Create the 'relevance' column based on the global average similarity score
    df['relevance'] = (df['similarityScore'] > threshold).astype(int)
    
    results = []
    
    for topic_id, group in df.groupby('topicId'):
        # Filter top 10 documents for the current topic
        top_10_docs = group[group['ranking'] <= 10]
        
        # Calculate P@10 for the current topic
        p_at_10 = top_10_docs['relevance'].sum() / 10
        
        # Calculate AP@10 for the current topic
        def average_precision_at_10(group):
            relevant = group['relevance'].values
            precisions = [relevant[:i+1].sum() / (i+1) for i in range(len(relevant))]
            return sum(precisions) / min(len(relevant), 10)
        
        ap_at_10 = average_precision_at_10(top_10_docs)
        
        results.append({'topicId': topic_id, 'P@10': p_at_10, 'AP@10': ap_at_10})
    
    return pd.DataFrame(results)

# Initialize a list to store the final results for all systems
final_results = []

In [13]:
results_df = calculate_p10_ap10_global(df, global_avg_similarity)
print(results_df)

    topicId  P@10     AP@10
0       401   0.0  0.000000
1       402   0.0  0.000000
2       403   1.0  1.000000
3       404   0.0  0.000000
4       405   1.0  1.000000
5       406   0.3  0.628690
6       407   0.2  0.485794
7       408   0.5  0.822817
8       409   0.6  0.887381
9       410   0.8  0.968889
10      411   0.8  0.968889
11      412   0.0  0.000000
12      413   0.0  0.000000
13      414   0.0  0.000000
14      415   0.9  0.990000
15      416   0.2  0.485794
16      417   0.0  0.000000
17      418   0.4  0.738254
18      419   0.1  0.292897
19      420   0.2  0.485794
20      421   0.0  0.000000
21      422   0.1  0.292897
22      423   0.3  0.628690
23      424   0.4  0.738254
24      425   0.2  0.485794
25      426   0.0  0.000000
26      427   1.0  1.000000
27      428   0.0  0.000000
28      429   0.5  0.822817
29      430   1.0  1.000000
30      431   0.1  0.292897
31      432   0.0  0.000000
32      433   0.1  0.292897
33      434   0.0  0.000000
34      435   0.0  0