In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


# Load the data
# data = pd.read_csv('data/output/relish-ground-truth/RELISH.tsv', delimiter='\t', header=None)
data = pd.read_csv('RELISH.tsv', delimiter='\t', header=None)
print('Initial pairs in relevance matrix:', len(data))

# Get the unique reference articles
refDocs = np.unique(data[0])  # First column
print('Length of unique reference articles:', len(refDocs))

# Get the unique assessed articles
asdDocs = np.unique(data[1])
print('Length of unique assessed articles:', len(asdDocs))

# Find reference articles if they do not exist in PID2
onlyRefDocs = [pid for pid in refDocs if pid not in asdDocs]  # Second column
print('Length of reference articles that do not exist as assessed articles:',len(onlyRefDocs))

# Assuming onlyRefDocs is the list of reference articles that do not exist as assessed articles
onlyRefDocs_data = data[data[0].isin(onlyRefDocs)]

# Find reference articles if they exist in PID2
notonlyRefDocs = [pid for pid in refDocs if pid in asdDocs]  # Second column
print('Length of reference articles that also exist as assessed articles:', len(notonlyRefDocs))

# Filter data based on onlyRefDocs
refRelMatrix = data[data[0].isin(onlyRefDocs)]  # Filter based on PID1
print('Total pairs after filtering:', len(refRelMatrix))

# Save the pairs being removed to 'valid.tsv'
valid_data = data[~data[0].isin(onlyRefDocs)]
valid_data.to_csv('valid.tsv', sep='\t', header=None, index=False)

# # Remove the corresponding pairs from data
# refRelMatrix = data[~data[1].isin(refDocs)]  # Filter based on PID2

# Initialize variables to store the best split
best_split = None
best_error = float('inf')
total_rows_initial = len(refRelMatrix)  # Total rows in the original refRelMatrix

# Loop for 1000 iterations
for i in range(1000):
    # Generate a different seed for each iteration
    np.random.seed(i)

    # Split onlyRefDocs into 80/20
    # train_onlyRef, test_onlyRef = train_test_split(onlyRefDocs, test_size=0.2)

    # Use 'Relevance' as the stratification variable
    train_data, test_data = train_test_split(onlyRefDocs_data, test_size=0.2, random_state=42, stratify=onlyRefDocs_data[2])

    # Filter refRelMatrix based on train_onlyRef
    ref_rel_train = refRelMatrix[refRelMatrix[0].isin(train_onlyRef)]  # Filter based on PID1

    # Update refRelMatrix by removing rows corresponding to train_onlyRef
    ref_rel_test = refRelMatrix[~refRelMatrix[0].isin(train_onlyRef)]  # Update based on PID1

    # Print out the size of train_onlyRef, ref_rel_train, and total rows in train and test for debugging
    total_rows_train_test = len(train_onlyRef) + len(test_onlyRef)
    print(f"Iteration {i+1}: Train OnlyRefDocs size: {len(train_onlyRef)}, Ref Rel Train size: {len(ref_rel_train)}, "
          f"Total rows in train and test: {total_rows_train_test}", f"Total pairs: {total_rows_initial}")

    # Find the percentage of those pairs in the 80-split
    train_percentage = len(ref_rel_train) / total_rows_initial

    # Calculate the error from 80%
    error = abs(train_percentage - 0.8)

    # Check if this split gives the closest pairs split to 80%
    if error < best_error:
        best_error = error
        best_split = (train_onlyRef, test_onlyRef, ref_rel_train, ref_rel_test)

    # Break the loop if all pairs have been exhausted
    if len(refRelMatrix) == 0:
        print("All pairs have been exhausted.")
        break

# Report best results
if best_split:
    best_train_onlyRef, best_test_onlyRef, ref_rel_train, ref_rel_test = best_split
    print("Best Split Found:")
    print(f"Train Data Size by PMID: {len(best_train_onlyRef)}, Test Data Size by PMID: {len(best_test_onlyRef)}")
    print(f"Train Data Size Pairs: {len(ref_rel_train)}, Test Data Size Pairs: {len(ref_rel_test)}")
    print(f"Percentage of Pairs in Train Data: {len(ref_rel_train) / total_rows_initial}")
    print(f"Percentage of Pairs in Test Data: {len(ref_rel_test) / total_rows_initial}")
    print(f"Error from 80%: {best_error}")

    # # Save the best train and test splits to separate files
    df = pd.DataFrame(ref_rel_train).to_csv('train_split.tsv', sep='\t', index=False)
    df = pd.DataFrame(ref_rel_test).to_csv('test_split.tsv', sep='\t', index=False)
else:
    print("No iterations performed.")



Initial pairs in relevance matrix: 189884
Length of unique reference articles: 3194
Length of unique assessed articles: 159792
Length of reference articles that do not exist as assessed articles: 2108
Length of reference articles that also exist as assessed articles: 1086
Total pairs after filtering: 125307
Iteration 1: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 2: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 3: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 4: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 5: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 6: Train OnlyRefDocs size: 1686, Ref Rel Train size

Iteration 67: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 68: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 69: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 70: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 71: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 72: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 73: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 74: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Tota

Iteration 136: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 137: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 138: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 139: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 140: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 141: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 142: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 143: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 205: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 206: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 207: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 208: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 209: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 210: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 211: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 212: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 275: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 276: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 277: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 278: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 279: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 280: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 281: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 282: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 345: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 346: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 347: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 348: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 349: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 350: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 351: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 352: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 414: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 415: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 416: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 417: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 418: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 419: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 420: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 421: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 483: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 484: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 485: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 486: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 487: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 488: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 489: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 490: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 552: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 553: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 554: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 555: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 556: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 557: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 558: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 559: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 619: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 620: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 621: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 622: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 623: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 624: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 625: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 626: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 688: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 689: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 690: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 691: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 692: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 693: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 694: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 695: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 757: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 758: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 759: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 760: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 761: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 762: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 763: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 764: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 825: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 826: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 827: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 828: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 829: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 830: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 831: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 832: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 895: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 896: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 897: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 898: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 899: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 900: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 901: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 902: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2

Iteration 964: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 965: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 966: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 967: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 968: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 969: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 970: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2108 Total pairs: 125307
Iteration 971: Train OnlyRefDocs size: 1686, Ref Rel Train size: 100126, Total rows in train and test: 2