In [8]:
from pathlib import Path

import pandas as pd

# Path to the tab-separated ranked-results file to load. The six columns named
# below (topic, identifier, document, rank, score, system) describe retrieval
# output, not relevance judgments.
# Raw string: backslashes in a Windows path must not be parsed as escape sequences.
data_path = r'D:\VSCODE PROJECT\IR\dataset\input.pir9Aa1'

# Column names in file order; the file is read without a header row
column_names = ['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName']

# Read the results file into a DataFrame
df = pd.read_csv(data_path, sep='\t', header=None, names=column_names)

# Print the DataFrame to verify the load (pandas elides the middle rows of a long frame)
print(df)


       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0     FBIS4-9582        0           4.1810    pir9Aa1
1          401         Q0    FBIS4-31715        1           4.0127    pir9Aa1
2          401         Q0    FT942-15501        2           3.4143    pir9Aa1
3          401         Q0     FBIS3-4201        3           3.3311    pir9Aa1
4          401         Q0    FBIS4-18182        4           3.3238    pir9Aa1
...        ...        ...            ...      ...              ...        ...
49995      450         Q0    FBIS3-13360      995           3.0238    pir9Aa1
49996      450         Q0    FBIS4-17507      996           3.0236    pir9Aa1
49997      450         Q0    FBIS3-19376      997           3.0234    pir9Aa1
49998      450         Q0    FT921-14974      998           3.0221    pir9Aa1
49999      450         Q0  LA092089-0141      999           3.0218    pir9Aa1

[50000 rows x 6 columns]


In [9]:
# Tally the rows belonging to each topicId, then order the tally by topic id
rows_per_topic = df['topicId'].value_counts()
topicId_counts = rows_per_topic.sort_index()
print(topicId_counts)

topicId
401    1000
402    1000
403    1000
404    1000
405    1000
406    1000
407    1000
408    1000
409    1000
410    1000
411    1000
412    1000
413    1000
414    1000
415    1000
416    1000
417    1000
418    1000
419    1000
420    1000
421    1000
422    1000
423    1000
424    1000
425    1000
426    1000
427    1000
428    1000
429    1000
430    1000
431    1000
432    1000
433    1000
434    1000
435    1000
436    1000
437    1000
438    1000
439    1000
440    1000
441    1000
442    1000
443    1000
444    1000
445    1000
446    1000
447    1000
448    1000
449    1000
450    1000
Name: count, dtype: int64


In [10]:
# Gather both summaries first: the dtype of every column and its null count
column_types = df.dtypes
null_counts = df.isnull().sum()

# Report the column dtypes
print("Data type of each column: ")
print(column_types)

# Report how many nulls each column holds
print( "Count of null values in each column: ")
print(null_counts)

Data type of each column: 
topicId              int64
identifier          object
docId               object
ranking              int64
similarityScore    float64
systemName          object
dtype: object
Count of null values in each column: 
topicId            0
identifier         0
docId              0
ranking            0
similarityScore    0
systemName         0
dtype: int64


In [11]:
# One groupby handle shared by the three per-topic checks
by_topic = df.groupby('topicId')

# True if any rank value is repeated inside a single topicId group
has_duplicates = by_topic['ranking'].apply(lambda ranks: ranks.duplicated()).any()

# True only if, in every topicId group, ranks never decrease from row to row
is_ascending_ranking = by_topic['ranking'].apply(
    lambda ranks: ranks.is_monotonic_increasing
).all()

# True only if, in every topicId group, scores never increase from row to row
is_descending_similarity = by_topic['similarityScore'].apply(
    lambda scores: scores.is_monotonic_decreasing
).all()

# Report the three findings
for label, outcome in (
    ("Ranking has duplicates within each topicId:", has_duplicates),
    ("Ranking is in ascending order within each topicId:", is_ascending_ranking),
    ("SimilarityScore is in descending order within each topicId:", is_descending_similarity),
):
    print(label, outcome)

Ranking has duplicates within each topicId: False
Ranking is in ascending order within each topicId: False
SimilarityScore is in descending order within each topicId: True


In [12]:
# Helper: locate rows whose rank fails to increase relative to the previous row
def find_non_increasing_ranks(group):
    """Return the index labels of rows whose 'ranking' did not go up.

    Args:
        group: DataFrame with a 'ranking' column (one topicId group when
            used with groupby).

    Returns:
        Index of the labels where ranking minus the previous row's ranking
        is zero or negative. The first row has no predecessor (its
        difference is NaN), so it is never included.
    """
    step = group['ranking'].diff()
    flagged = step.loc[step.le(0)]
    return flagged.index

# Run the helper on every topicId group, then discard the topicId level of the result's index
flagged_per_topic = df.groupby('topicId').apply(find_non_increasing_ranks)
flagged_per_topic = flagged_per_topic.reset_index(level=0, drop=True)

# Flatten to one index label per entry; empty results explode to NaN and are dropped
non_increasing_rows = flagged_per_topic.explode().dropna().astype(int)

# Show the rows whose ranking does not increase over the previous row
print("Rows with non-increasing ranks:")
offending_rows = df.loc[non_increasing_rows]
print(offending_rows)

Rows with non-increasing ranks:
       topicId identifier          docId  ranking  similarityScore systemName
97         401         Q0    FBIS3-30284       96           2.5129    pir9Aa1
183        401         Q0    FBIS4-55873      182           2.3493    pir9Aa1
185        401         Q0     FT943-3436      184           2.3480    pir9Aa1
219        401         Q0     FT934-5606      218           2.2994    pir9Aa1
247        401         Q0    FBIS3-36648      246           2.2659    pir9Aa1
...        ...        ...            ...      ...              ...        ...
49801      450         Q0    FBIS4-50537      800           3.1839    pir9Aa1
49826      450         Q0    FBIS4-12153      825           3.1626    pir9Aa1
49873      450         Q0    FBIS3-12320      872           3.1313    pir9Aa1
49899      450         Q0  LA030889-0043      898           3.1112    pir9Aa1
49931      450         Q0  LA112190-0110      930           3.0879    pir9Aa1

[2144 rows x 6 columns]


In [13]:
# Order rows by topicId (ascending) and, within a topic, by similarityScore (descending);
# then overwrite 'ranking' with each row's 1-based position inside its topicId group
df = (
    df.sort_values(by=["topicId", "similarityScore"], ascending=[True, False])
      .assign(ranking=lambda ordered: ordered.groupby('topicId').cumcount() + 1)
)

print(df)

       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0     FBIS4-9582        1           4.1810    pir9Aa1
1          401         Q0    FBIS4-31715        2           4.0127    pir9Aa1
2          401         Q0    FT942-15501        3           3.4143    pir9Aa1
3          401         Q0     FBIS3-4201        4           3.3311    pir9Aa1
4          401         Q0    FBIS4-18182        5           3.3238    pir9Aa1
...        ...        ...            ...      ...              ...        ...
49995      450         Q0    FBIS3-13360      996           3.0238    pir9Aa1
49996      450         Q0    FBIS4-17507      997           3.0236    pir9Aa1
49997      450         Q0    FBIS3-19376      998           3.0234    pir9Aa1
49998      450         Q0    FT921-14974      999           3.0221    pir9Aa1
49999      450         Q0  LA092089-0141     1000           3.0218    pir9Aa1

[50000 rows x 6 columns]


In [14]:
# Destination for the cleaned DataFrame.
# Raw string: backslashes in a Windows path must not be parsed as escape sequences.
save_path = r"D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.pir9Aa1.txt"

# to_csv does not create missing folders, so make sure the target directory exists
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# Write tab-separated, with no header row and no index column
df.to_csv(save_path, sep="\t", header=False, index=False)