In [10]:
import pandas as pd

# Path to the input run file (one ranked result list per topic, not relevance judgments).
# Raw string: a Windows path is full of backslashes, and in a normal literal
# sequences such as "\V" or "\d" are invalid escapes (a warning today, an error
# in future Python), while something like "\n" or "\t" would silently change the path.
data_path = r'D:\VSCODE PROJECT\IR\dataset\input.UB99T'

# Column names for the six fields of each result line
column_names = ['topicId', 'identifier', 'docId', 'ranking', 'similarityScore', 'systemName']

# Read the tab-separated file (it has no header row) into a DataFrame
df = pd.read_csv(data_path, sep='\t', header=None, names=column_names)

# Display the DataFrame (pandas truncates the middle rows) to verify the load
print(df)


       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0    FBIS4-25684        0         177.0670      UB99T
1          401         Q0    FBIS4-22981        1         139.5210      UB99T
2          401         Q0    FBIS4-68773        2         120.6760      UB99T
3          401         Q0    FBIS3-61091        3         110.9000      UB99T
4          401         Q0     FT941-3931        4          85.2391      UB99T
...        ...        ...            ...      ...              ...        ...
49995      450         Q0  LA051090-0085      803           0.0700      UB99T
49996      450         Q0  LA051090-0102      971           0.0700      UB99T
49997      450         Q0  LA051190-0035      703           0.0700      UB99T
49998      450         Q0  LA051290-0032      720           0.0700      UB99T
49999      450         Q0  LA051489-0029      882           0.0700      UB99T

[50000 rows x 6 columns]


In [11]:
# Tally how many result rows each topic contributes, listed in topic-id order
topicId_counts = (
    df['topicId']
    .value_counts()
    .sort_index()
)
print(topicId_counts)

topicId
401    1000
402    1000
403    1000
404    1000
405    1000
406    1000
407    1000
408    1000
409    1000
410    1000
411    1000
412    1000
413    1000
414    1000
415    1000
416    1000
417    1000
418    1000
419    1000
420    1000
421    1000
422    1000
423    1000
424    1000
425    1000
426    1000
427    1000
428    1000
429    1000
430    1000
431    1000
432    1000
433    1000
434    1000
435    1000
436    1000
437    1000
438    1000
439    1000
440    1000
441    1000
442    1000
443    1000
444    1000
445    1000
446    1000
447    1000
448    1000
449    1000
450    1000
Name: count, dtype: int64


In [12]:
# Report the dtype pandas inferred for every column
print("Data type of each column: ", df.dtypes, sep="\n")

# Report how many missing values each column holds
print("Count of null values in each column: ", df.isna().sum(), sep="\n")

Data type of each column: 
topicId              int64
identifier          object
docId               object
ranking              int64
similarityScore    float64
systemName          object
dtype: object
Count of null values in each column: 
topicId            0
identifier         0
docId              0
ranking            0
similarityScore    0
systemName         0
dtype: int64


In [13]:
# Function to check if the rank starts from 0 and is in ascending order
def is_rank_ascending_from_zero(group):
    """Tell whether a group's ranks are exactly 0, 1, 2, ... in row order.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must contain a 'ranking' column.

    Returns
    -------
    bool
        True only when the i-th row (counting from 0) has ranking == i
        for every row of the group.
    """
    observed = group['ranking'].values
    expected = range(len(observed))
    # Element-wise comparison of each rank against its row position
    position_matches = observed == expected
    return position_matches.all()

# Evaluate the 0-based consecutive-rank check once per topic
rank_ascending_from_zero = (
    df.groupby('topicId')
    .apply(is_rank_ascending_from_zero)
)
print("\nRank in ascending order starting from 0 per topicid:", rank_ascending_from_zero, sep="\n")



Rank in ascending order starting from 0 per topicid:
topicId
401    False
402    False
403    False
404    False
405    False
406    False
407    False
408    False
409    False
410    False
411    False
412    False
413    False
414    False
415    False
416    False
417    False
418    False
419    False
420    False
421    False
422    False
423    False
424    False
425    False
426    False
427    False
428    False
429    False
430    False
431    False
432    False
433    False
434    False
435    False
436    False
437    False
438    False
439    False
440    False
441    False
442    False
443    False
444    False
445    False
446    False
447    False
448    False
449    False
450    False
dtype: bool


In [14]:
# Function to identify non-increasing ranks within each group
def find_non_increasing_ranks(group):
    """Return the index labels of rows whose rank does not exceed the previous row's rank.

    Parameters
    ----------
    group : DataFrame
        Rows of one group; must contain a 'ranking' column.

    Returns
    -------
    Index
        Labels of the rows where ranking minus the preceding ranking is <= 0.
        The first row has no predecessor (its difference is NaN) and is never flagged.
    """
    step = group['ranking'].diff()
    # NaN compares False, so the first row of the group is never selected
    not_rising = step.le(0).to_numpy()
    return step.index[not_rising]

# Collect, across all topics, the row labels flagged by find_non_increasing_ranks:
# one Index per topic -> drop the topic level -> one label per row -> drop the
# NaN placeholders left by topics with nothing flagged -> integer labels
non_increasing_rows = (
    df.groupby('topicId')
    .apply(find_non_increasing_ranks)
    .reset_index(level=0, drop=True)
    .explode()
    .dropna()
    .astype(int)
)

# Show the offending rows themselves
print("Rows with non-increasing ranks:")
print(df.loc[non_increasing_rows])

Rows with non-increasing ranks:
       topicId identifier             docId  ranking  similarityScore  \
242        401         Q0     LA032889-0062      241          9.57709   
418        401         Q0     LA033090-0141      416          4.93642   
535        401         Q0         FT911-225      533          3.03557   
590        401         Q0  FR941025-2-00062      587          2.43828   
601        401         Q0       FBIS3-10838      597          2.32113   
...        ...        ...               ...      ...              ...   
49988      450         Q0     LA041790-0124      633          0.07000   
49990      450         Q0     LA042289-0143      772          0.07000   
49993      450         Q0     LA042590-0123      850          0.07000   
49995      450         Q0     LA051090-0085      803          0.07000   
49997      450         Q0     LA051190-0035      703          0.07000   

      systemName  
242        UB99T  
418        UB99T  
535        UB99T  
590        UB99

In [15]:
# Function to check if a series is in descending order
def is_descending(series):
    """Tell whether the values of `series` never increase from one row to the next.

    Equal neighbouring values are allowed (the check is non-strict).

    Parameters
    ----------
    series : Series
        Values to inspect, in their current row order.

    Returns
    -------
    bool
        The series' `is_monotonic_decreasing` property.
    """
    never_increases = series.is_monotonic_decreasing
    return never_increases

# For every topic, test whether its similarity scores are already in descending order
similarity_descending = (
    df.groupby('topicId')['similarityScore']
    .apply(is_descending)
)
print("\nSimilarity score in descending order per topicid:", similarity_descending, sep="\n")


Similarity score in descending order per topicid:
topicId
401    False
402    False
403     True
404     True
405     True
406     True
407    False
408     True
409     True
410    False
411     True
412     True
413     True
414     True
415     True
416     True
417    False
418     True
419     True
420     True
421     True
422     True
423     True
424     True
425     True
426     True
427     True
428     True
429     True
430     True
431     True
432     True
433     True
434     True
435     True
436     True
437     True
438     True
439     True
440     True
441     True
442     True
443     True
444     True
445     True
446     True
447     True
448     True
449     True
450     True
Name: similarityScore, dtype: bool


In [16]:
# Does any topic contain the same 'ranking' value more than once?
has_duplicates = (
    df.groupby('topicId')['ranking']
    .apply(pd.Series.duplicated)
    .any()
)

# Is 'ranking' non-decreasing in row order within every topic?
is_ascending_ranking = (
    df.groupby('topicId')['ranking']
    .is_monotonic_increasing
    .all()
)

# Is 'similarityScore' non-increasing in row order within every topic?
is_descending_similarity = (
    df.groupby('topicId')['similarityScore']
    .is_monotonic_decreasing
    .all()
)

# Print the results
print("Ranking has duplicates within each topicId:", has_duplicates)
print("Ranking is in ascending order within each topicId:", is_ascending_ranking)
print("SimilarityScore is in descending order within each topicId:", is_descending_similarity)

Ranking has duplicates within each topicId: False
Ranking is in ascending order within each topicId: False
SimilarityScore is in descending order within each topicId: False


In [17]:
# Ensure the scores are numeric so the sort below compares numbers, not text
df['similarityScore'] = pd.to_numeric(df['similarityScore'])

# Order rows by topic (ascending) and, within a topic, by score (highest first)
sort_keys = ["topicId", "similarityScore"]
sort_ascending = [True, False]
df = df.sort_values(by=sort_keys, ascending=sort_ascending)

# Renumber ranks 1, 2, 3, ... within each topic, following the new row order.
# NOTE(review): the ranks loaded from the input file started at 0, whereas this
# numbering starts at 1 - confirm that the downstream consumer expects 1-based ranks.
df['ranking'] = df.groupby('topicId').cumcount().add(1)

print(df)

       topicId identifier          docId  ranking  similarityScore systemName
0          401         Q0    FBIS4-25684        1         177.0670      UB99T
1          401         Q0    FBIS4-22981        2         139.5210      UB99T
2          401         Q0    FBIS4-68773        3         120.6760      UB99T
3          401         Q0    FBIS3-61091        4         110.9000      UB99T
4          401         Q0     FT941-3931        5          85.2391      UB99T
...        ...        ...            ...      ...              ...        ...
49995      450         Q0  LA051090-0085      996           0.0700      UB99T
49996      450         Q0  LA051090-0102      997           0.0700      UB99T
49997      450         Q0  LA051190-0035      998           0.0700      UB99T
49998      450         Q0  LA051290-0032      999           0.0700      UB99T
49999      450         Q0  LA051489-0029     1000           0.0700      UB99T

[50000 rows x 6 columns]


In [18]:
# Save the cleaned DataFrame back to a file.
# Raw string: in a normal literal the backslashes of this Windows path form
# invalid escape sequences ("\V", "\I", "\c"), which Python only tolerates
# with a warning; the raw form holds exactly the same path without relying on that.
save_path = r"D:\VSCODE PROJECT\IR\cleaned_dataset\cleaned_input.UB99T.txt"

# Write tab-separated rows with no header line and no index column,
# matching the layout of the file that was read in
df.to_csv(save_path, sep="\t", header=False, index=False)