In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV file
text_1_path = "csv files/text_1.csv"  
text_2_path = "csv files/text_2.csv"  
text_3_path = "csv files/text_3.csv"  
text_4_path = "csv files/text_4.csv"  
text_5_path = "csv files/text_5.csv"  
text_6_path = "csv files/text_6.csv"  

df1, df2, df3 = pd.read_csv(text_1_path), pd.read_csv(text_2_path), pd.read_csv(text_3_path)
df4, df5, df6 = pd.read_csv(text_4_path), pd.read_csv(text_5_path), pd.read_csv(text_6_path)

data = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Ensure the necessary column exists
if "preprocessed_text" not in data.columns:
    raise ValueError("The column 'preprocessed_text' is missing from the CSV file.")
    
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['preprocessed_text'])

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

In [69]:
thresholdLowerBound = 0.95  # Adjust this as needed
thresholdUpperBound = 1 # Adjust this as needed

similar_pairs = [] # store file pairs with similarity above the threshold

# Mask similarity matrix to find pairs above threshold
mask = (similarity_matrix > thresholdLowerBound) & (similarity_matrix < thresholdUpperBound)
indices = np.argwhere(mask)
fileNames = data["file_name"].to_numpy()

# Collect file names for pairs
similar_pairs = [(fileNames[i], fileNames[j], similarity_matrix[i, j]) 
                 for i, j in indices if i < j]  # Only upper triangle, since similarityMatrix is symmetrical (i.e. sm[i,j] = sm[j,i])

duplicate_files = set(file1 for file1, file2, _ in similar_pairs).union(file2 for file1, file2, _ in similar_pairs)

print(f"There are {len(duplicate_files)} files with a duplicate. There are {len(fileNames) -len(duplicate_files)} unique files.\n")

# Display the results
"""
print(f"There are {len(similar_pairs)} pairs of files with similarity above {thresholdLowerBound} and below {thresholdUpperBound} out of {num_files} total files:\n")
for file1, file2, score in similar_pairs[:10]:
    print(f"{file1} and {file2} - Similarity: {score:.2f}")
"""
uniqueFiles = set(file for file in fileNames if file not in duplicate_files)
print(len(uniqueFiles))

There are 449 files with a duplicate. There are 6332 unique files.

6332


In [81]:
# Initialize a set to keep selected files
selected_files = set()

# Keep track of files already represented
represented_files = set()

# Sort similar pairs to ensure consistent results
similar_pairs_sorted = sorted(similar_pairs, key=lambda x: -x[2])  # Sort by similarity score descending

for file1, file2, score in similar_pairs_sorted:
        # If neither file is already represented, add one of them
    if file1 not in represented_files and file2 not in represented_files:
        selected_files.add(file1)  # Arbitrarily choose file1
    represented_files.add(file1)
    represented_files.add(file2)    

# Display results
uniqueFiles = set(file for file in fileNames if file not in represented_files)
print(len(represented_files), "files have >= 1 duplicate")
duplicatesRemoved = selected_files.union(uniqueFiles)
filesToRemove = set(file for file in fileNames if file not in duplicatesRemoved)

print(f"Number of selected files: {len(selected_files)} out of {len(similar_pairs_sorted)} pairs.\n"+
      f"Number of files once duplicates were removed: {len(duplicatesRemoved)}.\n"+
      f"Originally {len(data)} files.")

assert duplicate_files.issubset(represented_files), "Not all duplicates are represented!"
assert selected_files.issubset(represented_files), "Selected files are not correctly represented!"

449 files have >= 1 duplicate
Number of selected files: 118 out of 10807 pairs.
Number of files once duplicates were removed: 6450.
Originally 6781 files.


### Below is the final dataset with one copy of each duplicate

In [84]:
dataNoDuplicates = data[~data['file_name'].isin(filesToRemove)].reset_index(drop=True)
len(dataNoDuplicates)

6450