# Data Splitting

## Prerequisites
1. A TSV file which contains PID1, PID2 and the corresponding relevance value for each pair of PIDs.
2. The `RELISH_tokenized.npy` file with the tokenized PID, title and abstract of each document.


## Loading Necessary Modules

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from numpy import array


## Load the dataset

In [None]:

# Relevance table: one row per (PID1, PID2) pair together with its relevance value.
df_relish = pd.read_csv('/content/drive/MyDrive/Bonn/ZB Med/data/RELISH.tsv', sep='\t')
df_relish.columns = ['PID1', 'PID2', 'Value']

# Tokenized documents. NOTE: allow_pickle=True lets NumPy unpickle arbitrary
# Python objects, which can execute code - only load files from a trusted source.
data = np.load('data/RELISH_tokenized.npy', allow_pickle=True)

# Build the frame once, directly with the final column names.
df_npy = pd.DataFrame(data, columns=['PID', 'Title', 'Abstract'])

## Converting PIDs to a list

In [None]:
# Pull both PID columns of the relevance table out as plain Python lists.
dfr_pid1, dfr_pid2 = (df_relish[column].tolist() for column in ('PID1', 'PID2'))

In [None]:

# PIDs of the tokenized documents, each coerced to a built-in int.
gt_pid = list(map(int, df_npy['PID'].tolist()))

## Create an initial 80/20 train/test split and save both parts as TSV files.

In [None]:
# Shuffle the tokenized documents into an 80/20 split and persist both parts
# as tab-separated files without the index column.
train, test = train_test_split(df_npy, test_size=0.20, shuffle=True)
train.to_csv(
    path_or_buf='split/RELISH_NPY_Training_Dataset.tsv',
    sep='\t',
    index=False,
)
test.to_csv(
    path_or_buf='split/RELISH_NPY_Test_Dataset.tsv',
    sep='\t',
    index=False,
)

## Load the split dataset

In [None]:

# Reload both halves of the split from the 'split/' directory they were saved to.
dfnpy_train = pd.read_csv('split/RELISH_NPY_Training_Dataset.tsv', sep='\t')
# Fixed path typo: the test set lives under 'split/', not 'plit/'.
dfnpy_test = pd.read_csv('split/RELISH_NPY_Test_Dataset.tsv', sep='\t')

In [None]:
# PID columns of the reloaded train and test splits as plain Python lists.
dfnpy_train_pid, dfnpy_test_pid = (
    frame['PID'].tolist() for frame in (dfnpy_train, dfnpy_test)
)

 ## Find matching pairs of PID1 and PID2 between two datasets.

In [None]:
def matching_pairs(df_rel, df_npy):
    """Return the percentage of relevance pairs fully covered by a document set.

    A row of ``df_rel`` counts as matching when both its ``PID1`` and its
    ``PID2`` occur in the ``PID`` column of ``df_npy``.

    Args:
        df_rel: DataFrame with ``PID1`` and ``PID2`` columns.
        df_npy: DataFrame with a ``PID`` column.

    Returns:
        Matching rows as a percentage (0-100) of all rows in ``df_rel``;
        ``0.0`` when ``df_rel`` has no rows.
    """
    total_pairs = len(df_rel)
    if total_pairs == 0:
        # Nothing to match; avoid dividing by zero.
        return 0.0

    known_pids = df_npy['PID']

    # Vectorised membership test replaces the row-by-row iterrows() loop.
    both_known = df_rel['PID1'].isin(known_pids) & df_rel['PID2'].isin(known_pids)

    return (int(both_known.sum()) / total_pairs) * 100

In [None]:
# Report, for each half of the split, the percentage of relevance pairs whose
# two PIDs both occur in that half.
print("Matching Pairs in training : ",matching_pairs(df_relish,dfnpy_train))
print("Matching Pairs in testing : ",matching_pairs(df_relish,dfnpy_test))

Matching Pairs in training :  63.6376084220283
Matching Pairs in testing :  4.014050757571768


## Iterate to find the best train and test datasets based on matching pairs, and save the best train and test datasets as TSV files.

In [None]:
# Random search over 1000 shuffled 80/20 splits, keeping the split whose
# training part covers the largest percentage of relevance pairs.
perc = 0
best_train_set = None
best_test_set = None
list_perc = []
for i in range(1000):
    dfnpy_train, dfnpy_test = train_test_split(df_npy, test_size=0.20, shuffle=True)

    # Round-trip the candidate split through TSV files and reload it.
    # NOTE(review): presumably this lets pandas re-parse the PID column into a
    # type comparable with df_relish - confirm before removing the round trip.
    dfnpy_train.to_csv('split/RELISH_NPY_Training_Dataset.tsv', sep='\t', index=False)
    dfnpy_test.to_csv('split/RELISH_NPY_Test_Dataset.tsv', sep='\t', index=False)
    dfnpy_train = pd.read_csv('split/RELISH_NPY_Training_Dataset.tsv', sep='\t')
    dfnpy_test = pd.read_csv('split/RELISH_NPY_Test_Dataset.tsv', sep='\t')

    train_perc = matching_pairs(df_relish, dfnpy_train)
    list_perc.append(train_perc)
    if train_perc > perc:
        perc = train_perc
        best_train_set = dfnpy_train.copy()
        best_test_set = dfnpy_test.copy()

if best_train_set is None or best_test_set is None:
    # No candidate ever covered more than 0% of the pairs; fail with a clear
    # message instead of an AttributeError on None below.
    raise RuntimeError("No split produced any matching pairs; nothing to save.")

# Fixed: write into 'split/' (was 'plit/'), and write real tab-separated files
# without the index column so they can be read back with sep='\t'.
best_train_set.to_csv('split/best_train_80_new.tsv', sep='\t', index=False)
best_test_set.to_csv('split/best_test_20_new.tsv', sep='\t', index=False)
print("Best Match Percentage : ",perc)

# Keep the training-coverage percentage of every candidate split for inspection.
with open('split/percentage.txt', 'w') as file:
    for index, item in enumerate(list_perc):
        file.write(f"Index {index}: {item}\n")

In [None]:
# Load a previously saved best test split (tab-separated).
# NOTE(review): the search cell in this notebook saves 'best_test_20_new.tsv';
# confirm that 'best_test_20.tsv' is the intended file.
dfnpy_test = pd.read_csv('split/best_test_20.tsv', sep='\t')

In [None]:
def matching_pairs(df_rel, df_npy, output_tsv='split/matching_pairs_train_80_20.tsv'):
    """Save the relevance pairs covered by a document set and return their share.

    A row of ``df_rel`` counts as matching when both its ``PID1`` and its
    ``PID2`` occur in the ``PID`` column of ``df_npy``. The matching
    ``(PID1, PID2)`` pairs are written to ``output_tsv`` as a tab-separated
    file without an index column.

    Args:
        df_rel: DataFrame with ``PID1`` and ``PID2`` columns.
        df_npy: DataFrame with a ``PID`` column.
        output_tsv: Path of the TSV file that receives the matching pairs.

    Returns:
        Matching rows as a percentage (0-100) of all rows in ``df_rel``;
        ``0.0`` when ``df_rel`` has no rows.
    """
    known_pids = df_npy['PID']

    # Vectorised membership test replaces the row-by-row iterrows() loop.
    both_known = df_rel['PID1'].isin(known_pids) & df_rel['PID2'].isin(known_pids)

    # Positional boolean mask, so an unusual index on df_rel cannot interfere.
    matching_pairs_df = df_rel.loc[both_known.to_numpy(), ['PID1', 'PID2']]
    matching_pairs_df.to_csv(output_tsv, sep='\t', index=False)

    total_pairs = len(df_rel)
    if total_pairs == 0:
        # Nothing to match; avoid dividing by zero.
        return 0.0
    return (int(both_known.sum()) / total_pairs) * 100

In [None]:
# Report pair coverage for dfnpy_train; with the matching_pairs variant that
# takes output_tsv, this call also writes the matching pairs to its default path.
# NOTE(review): dfnpy_train is whatever was assigned last (the search loop
# reassigns it on every iteration), not best_train_set - confirm the intended frame.
print("Matching Pairs in training : ", matching_pairs(df_relish,dfnpy_train))

Matching Pairs in testing :  3.5290152356977718


In [None]:
import pandas as pd

def matching_pairs(df_rel, df_npy, output_tsv='split/matching_pairs_relish_train.tsv'):
    """Save the relevance rows covered by a document set and return their share.

    A row of ``df_rel`` counts as matching when both its ``PID1`` and its
    ``PID2`` occur in the ``PID`` column of ``df_npy``. The matching rows are
    written to ``output_tsv`` as a tab-separated file with the columns
    ``PID1``, ``PID2`` and ``Value`` and no index column.

    Args:
        df_rel: DataFrame with ``PID1``, ``PID2`` and ``Value`` columns.
        df_npy: DataFrame with a ``PID`` column.
        output_tsv: Path of the TSV file that receives the matching rows.

    Returns:
        Matching rows as a percentage (0-100) of all rows in ``df_rel``;
        ``0.0`` when ``df_rel`` has no rows.
    """
    known_pids = df_npy['PID']

    # Vectorised membership test replaces the row-by-row iterrows() loop.
    both_known = df_rel['PID1'].isin(known_pids) & df_rel['PID2'].isin(known_pids)

    # Positional boolean mask, so an unusual index on df_rel cannot interfere.
    matching_pairs_df = df_rel.loc[both_known.to_numpy(), ['PID1', 'PID2', 'Value']]
    matching_pairs_df.to_csv(output_tsv, sep='\t', index=False)

    total_pairs = len(df_rel)
    if total_pairs == 0:
        # Nothing to match; avoid dividing by zero.
        return 0.0
    return (int(both_known.sum()) / total_pairs) * 100

In [None]:
# Load a previously saved best training split (tab-separated).
# NOTE(review): the search cell in this notebook saves 'best_train_80_new.tsv';
# confirm that 'best_train_20.tsv' is the intended file.
df_best_train=pd.read_csv('split/best_train_20.tsv', sep='\t')

In [None]:
# Report the percentage of relevance pairs whose two PIDs both occur in the
# loaded best training split.
print("Matching Pairs in relish train : ",matching_pairs(df_relish,df_best_train))