In [1]:
import pandas as pd
# Location of the VulRepair fine-tuning CSVs. The path goes through
# `refs/heads/main`, so it follows the repository's main branch rather than a
# pinned commit. Kept in one place so the three split URLs cannot drift apart.
DATA_BASE_URL = "https://raw.githubusercontent.com/awsm-research/VulRepair/refs/heads/main/data/fine_tune_data"

urlTrain = f"{DATA_BASE_URL}/train.csv"
urlTest = f"{DATA_BASE_URL}/test.csv"
urlVal = f"{DATA_BASE_URL}/val.csv"

In [7]:
# Read the three CSV splits from their URLs, in train / test / val order.
train_df, test_df, val_df = (
    pd.read_csv(split_url) for split_url in (urlTrain, urlTest, urlVal)
)

In [8]:
def remove_inter_duplicates(source_df, *target_dfs):
    """
    Removes rows from source_df that are present in any of the target_dfs.

    A source row counts as "present" when its values in every column of
    source_df match some target row. Columns that exist only in a target are
    ignored for the comparison and never appear in the result.

    Parameters
    ----------
    source_df : pd.DataFrame
        Frame to filter. It is not modified.
    *target_dfs : pd.DataFrame
        Frames whose rows must not remain in the result.

    Returns
    -------
    pd.DataFrame
        The rows of source_df with no match in any target, in their original
        order and with their original index labels and columns. Duplicates
        inside source_df itself are kept. When no targets are given,
        source_df is returned as-is (same object).

    Raises
    ------
    KeyError
        If a column of source_df is missing from every target.
    """
    if not target_dfs:
        return source_df

    key_columns = list(source_df.columns)

    # Project the targets onto the source's columns *before* de-duplicating.
    # Otherwise two target rows that differ only in an extra column would both
    # match the same source row, multiplying it in the merge and dragging the
    # extra column into the output.
    combined_target = (
        pd.concat(target_dfs, ignore_index=True)[key_columns].drop_duplicates()
    )

    # The right side is unique on the join keys, so the left merge yields
    # exactly one output row per source row, in source order. That lets the
    # indicator column act as a positional mask over source_df.
    merged = source_df.merge(
        combined_target, on=key_columns, how='left', indicator=True
    )
    is_unique_to_source = (merged['_merge'] == 'left_only').to_numpy()

    # Filter source_df directly so the caller keeps its own index labels
    # instead of the positional index the merge would produce.
    return source_df.loc[is_unique_to_source]

In [9]:
# Snapshot the row counts before any cleaning so they can be reported later.
original_sizes = {
    split_name: len(split_df)
    for split_name, split_df in (
        ('train', train_df),
        ('val', val_df),
        ('test', test_df),
    )
}

# Train: drop exact repeats within the split, then discard every row that
# also occurs in val or test.
train_df = train_df.drop_duplicates().pipe(remove_inter_duplicates, val_df, test_df)

# Val: drop exact repeats within the split, then discard every row that also
# occurs in test.
val_df = val_df.drop_duplicates().pipe(remove_inter_duplicates, test_df)

# Test: only exact repeats within the split are dropped.
test_df = test_df.drop_duplicates()

In [10]:
# Row counts of each split after deduplication.
new_sizes = {
    'train': len(train_df),
    'val': len(val_df),
    'test': len(test_df),
}

# Total number of rows left across all splits (kept as a notebook-level name).
total_cleaned = sum(new_sizes.values())


def _print_split_sizes(header, sizes):
    """Print `header`, then one "<name>: <rows> rows (<share>)" line per split.

    `sizes` maps a split name to its row count. The share is the split's
    fraction of the summed counts, formatted as a percentage with two
    decimals. The total is computed once, and an all-zero mapping prints
    0.00% instead of raising ZeroDivisionError.
    """
    total = sum(sizes.values())
    print(header)
    for dataset, size in sizes.items():
        share = size / total if total else 0.0
        print(f"{dataset}: {size} rows ({share:.2%})")


# Print original and new proportions
_print_split_sizes("Original sizes:", original_sizes)
_print_split_sizes("\nNew sizes after deduplication:", new_sizes)

Original sizes:
train: 5937 rows (70.00%)
val: 839 rows (9.89%)
test: 1706 rows (20.11%)

New sizes after deduplication:
train: 3777 rows (61.88%)
val: 713 rows (11.68%)
test: 1614 rows (26.44%)
