In [9]:
import pandas as pd

# Location of the input file to load.
# NOTE(review): the original comment called this a QREL file, but the columns
# below (ranking, similarityScore, systemName) look like a retrieval run file
# -- confirm and rename if needed.
# A raw string keeps every backslash of the Windows path literal. In a normal
# string, sequences such as "\V" or "\d" are invalid escapes (they trigger a
# SyntaxWarning on recent Python versions), and a folder starting with "t" or
# "n" would silently become a tab or newline.
data_path = r'D:\VSCODE PROJECT\IR\dataset\input.ric8dpn'

# Column names, in file order (the file is read without a header row)
column_names = ['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName']

# Read the tab-separated file into a DataFrame
df = pd.read_csv(data_path, sep='\t', header=None, names=column_names)

# Print the DataFrame (pandas truncates the middle rows) to verify the load
print(df)


       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0  LA052590-0090        1         0.518393    ric8dpn
1          401         Q0    FBIS3-19951        2         0.447785    ric8dpn
2          401         Q0    FBIS3-59436        3         0.439529    ric8dpn
3          401         Q0    FBIS4-68774        4         0.434339    ric8dpn
4          401         Q0     FBIS4-9582        5         0.424880    ric8dpn
...        ...        ...            ...      ...              ...        ...
49995      450         Q0    FBIS3-34583      996         0.327118    ric8dpn
49996      450         Q0  LA112090-0166      997         0.327041    ric8dpn
49997      450         Q0  LA110490-0183      998         0.326964    ric8dpn
49998      450         Q0    FBIS4-35559      999         0.326920    ric8dpn
49999      450         Q0    FBIS4-38038     1000         0.326914    ric8dpn

[50000 rows x 6 columns]


In [10]:
# Tally how many rows each topic contributes, then list the tallies in topic-id order
rows_per_topic = df['topicId'].value_counts()
topicId_counts = rows_per_topic.sort_index()
print(topicId_counts)

topicId
401    1000
402    1000
403    1000
404    1000
405    1000
406    1000
407    1000
408    1000
409    1000
410    1000
411    1000
412    1000
413    1000
414    1000
415    1000
416    1000
417    1000
418    1000
419    1000
420    1000
421    1000
422    1000
423    1000
424    1000
425    1000
426    1000
427    1000
428    1000
429    1000
430    1000
431    1000
432    1000
433    1000
434    1000
435    1000
436    1000
437    1000
438    1000
439    1000
440    1000
441    1000
442    1000
443    1000
444    1000
445    1000
446    1000
447    1000
448    1000
449    1000
450    1000
Name: count, dtype: int64


In [11]:
# Report the dtype pandas inferred for every column
column_types = df.dtypes
print("Data type of each column: ")
print(column_types)

# Report how many missing values each column contains
null_counts = df.isnull().sum()
print( "Count of null values in each column: ")
print(null_counts)

Data type of each column: 
topicId              int64
identifier          object
docId               object
ranking              int64
similarityScore    float64
systemName          object
dtype: object
Count of null values in each column: 
topicId            0
identifier         0
docId              0
ranking            0
similarityScore    0
systemName         0
dtype: int64


In [12]:
# Group once by topic and reuse the grouping for all three checks
by_topic = df.groupby('topicId')
rank_groups = by_topic['ranking']
score_groups = by_topic['similarityScore']

# True if any topic contains the same rank value more than once
has_duplicates = rank_groups.apply(pd.Series.duplicated).any()

# True only if every topic lists its ranks in non-decreasing row order
is_ascending_ranking = rank_groups.apply(
    lambda ranks: ranks.is_monotonic_increasing
).all()

# True only if every topic lists its similarity scores in non-increasing row order
is_descending_similarity = score_groups.apply(
    lambda scores: scores.is_monotonic_decreasing
).all()

# Print the results
print("Ranking has duplicates within each topicId:", has_duplicates)
print("Ranking is in ascending order within each topicId:", is_ascending_ranking)
print("SimilarityScore is in descending order within each topicId:", is_descending_similarity)

Ranking has duplicates within each topicId: False
Ranking is in ascending order within each topicId: False
SimilarityScore is in descending order within each topicId: True


In [13]:
def is_ascending(series):
    """Tell whether the values of ``series`` are in ascending order.

    The answer comes from the pandas ``is_monotonic_increasing`` property,
    so equal neighbouring values still count as ascending.
    """
    in_ascending_order = series.is_monotonic_increasing
    return in_ascending_order

# Run the ascending-order check on each topic's ranking column and show one flag per topic
ranking_by_topic = df.groupby('topicId')['ranking']
rank_ascending = ranking_by_topic.apply(is_ascending)
print("\nRank in ascending order per topicid:")
print(rank_ascending)


Rank in ascending order per topicid:
topicId
401     True
402     True
403     True
404     True
405    False
406     True
407    False
408     True
409     True
410    False
411     True
412    False
413     True
414     True
415    False
416    False
417     True
418     True
419    False
420     True
421     True
422    False
423    False
424     True
425    False
426     True
427     True
428     True
429    False
430     True
431     True
432     True
433    False
434     True
435     True
436    False
437     True
438     True
439    False
440     True
441     True
442    False
443    False
444     True
445     True
446     True
447    False
448     True
449    False
450     True
Name: ranking, dtype: bool


In [14]:
def is_descending(series):
    """Tell whether the values of ``series`` are in descending order.

    The answer comes from the pandas ``is_monotonic_decreasing`` property,
    so equal neighbouring values still count as descending.
    """
    in_descending_order = series.is_monotonic_decreasing
    return in_descending_order

# Run the descending-order check on each topic's similarity scores and show one flag per topic
scores_by_topic = df.groupby('topicId')['similarityScore']
similarity_descending = scores_by_topic.apply(is_descending)
print("\nSimilarity score in descending order per topicid:")
print(similarity_descending)


Similarity score in descending order per topicid:
topicId
401    True
402    True
403    True
404    True
405    True
406    True
407    True
408    True
409    True
410    True
411    True
412    True
413    True
414    True
415    True
416    True
417    True
418    True
419    True
420    True
421    True
422    True
423    True
424    True
425    True
426    True
427    True
428    True
429    True
430    True
431    True
432    True
433    True
434    True
435    True
436    True
437    True
438    True
439    True
440    True
441    True
442    True
443    True
444    True
445    True
446    True
447    True
448    True
449    True
450    True
Name: similarityScore, dtype: bool


In [15]:
# Renumber the ranks 1, 2, 3, ... inside every topic, following the rows'
# current order in the DataFrame (cumcount numbers each group's rows from 0)
position_in_topic = df.groupby('topicId').cumcount()
df['ranking'] = position_in_topic + 1

print(df)

       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0  LA052590-0090        1         0.518393    ric8dpn
1          401         Q0    FBIS3-19951        2         0.447785    ric8dpn
2          401         Q0    FBIS3-59436        3         0.439529    ric8dpn
3          401         Q0    FBIS4-68774        4         0.434339    ric8dpn
4          401         Q0     FBIS4-9582        5         0.424880    ric8dpn
...        ...        ...            ...      ...              ...        ...
49995      450         Q0    FBIS3-34583      996         0.327118    ric8dpn
49996      450         Q0  LA112090-0166      997         0.327041    ric8dpn
49997      450         Q0  LA110490-0183      998         0.326964    ric8dpn
49998      450         Q0    FBIS4-35559      999         0.326920    ric8dpn
49999      450         Q0    FBIS4-38038     1000         0.326914    ric8dpn

[50000 rows x 6 columns]


In [16]:
# Destination for the cleaned DataFrame.
# A raw string keeps every backslash of the Windows path literal. In a normal
# string, "\V", "\I" and "\c" are invalid escape sequences (they trigger a
# SyntaxWarning on recent Python versions), and a folder starting with "t" or
# "n" would silently become a tab or newline.
save_path = r"D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.ric8dpn.txt"

# Write tab-separated rows with no header line and no index column
df.to_csv(save_path, sep="\t", header=False, index=False)