In [50]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the CSV files
# The corpus is split over six part files; they are read in a fixed order so
# that row positions in `data` are the same on every run.
text_paths = [f"csv files/text_{part}.csv" for part in range(1, 7)]
# The individual names are kept so any cell that still refers to them keeps working.
text_1_path, text_2_path, text_3_path, text_4_path, text_5_path, text_6_path = text_paths

df1, df2, df3, df4, df5, df6 = (pd.read_csv(path) for path in text_paths)

# ignore_index=True gives `data` a fresh 0..n-1 index, so a row's label equals
# its position in the similarity matrix built below.
data = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)

# Ensure the necessary column exists
if "preprocessed_text" not in data.columns:
    raise ValueError("The column 'preprocessed_text' is missing from the CSV file.")

# Number of documents. Later cells read `num_files`; defining it here lets the
# notebook run top to bottom on a fresh kernel.
num_files = len(data)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['preprocessed_text'])

# Compute cosine similarity matrix: entry [i, j] compares rows i and j of `data`
similarity_matrix = cosine_similarity(tfidf_matrix)

In [49]:
# Similarity window (both bounds exclusive) used to flag near-duplicate pairs.
thresholdLowerBound = 0.95  # Adjust this as needed
thresholdUpperBound = 1 # Adjust this as needed
# NOTE(review): because the upper bound is strict, a pair whose score is exactly
# 1.0 (e.g. two identical texts) is left out of the report — confirm that this
# is intended before relying on the counts below.

fileNames = data["file_name"].to_numpy()
# Derived locally instead of relying on a variable left over from another cell,
# so this cell also works after Restart Kernel -> Run All.
num_files = len(fileNames)

# Mask similarity matrix to find pairs inside the window
mask = (similarity_matrix > thresholdLowerBound) & (similarity_matrix < thresholdUpperBound)
indices = np.argwhere(mask)

# Collect (file, file, score) for each pair. Only the upper triangle (i < j) is
# kept: the matrix is symmetric, so [j, i] would repeat the same pair, and the
# diagonal is a file compared with itself.
similar_pairs = [(fileNames[i], fileNames[j], similarity_matrix[i, j])
                 for i, j in indices if i < j]

# Every file name that appears on either side of at least one pair
duplicate_files = {name for file1, file2, _ in similar_pairs for name in (file1, file2)}

print(f"There are {len(duplicate_files)} files with a duplicate. There are {num_files -len(duplicate_files)} unique files.\n")

# Display the results (first ten pairs only, to keep the output short)
print(f"There are {len(similar_pairs)} pairs of files with similarity above {thresholdLowerBound} and below {thresholdUpperBound} out of {num_files} total files:\n")
for file1, file2, score in similar_pairs[:10]:
    print(f"{file1} and {file2} - Similarity: {score:.2f}")

There are 449 files with a duplicate. There are 6332 unique files.

There are 10807 pairs of files with similarity above 0.95 and below 1 out of 6781 total files:

8-11-20 Patterson_embedded.txt and 9-30-20 Patterson_embedded.txt - Similarity: 1.00
8-13-20 Schmitt-Chan_embedded.txt and 8-12-20 Fearn_embedded.txt - Similarity: 0.96
8-24-20 Parenteau_embedded.txt and 9-1-20 Cohen David_embedded.txt - Similarity: 0.98
8-12-20 Gurin_embedded.txt and 9-6-20 Gurin_embedded.txt - Similarity: 1.00
8-13-20 Loi-On Attachment 2_embedded.txt and 9-30-20 Loi-On Attachment 3_embedded.txt - Similarity: 0.99
8-12-20 Niku_embedded.txt and 8-11-20 Niku_embedded.txt - Similarity: 0.99
8-11-20 Lynn Attachment_embedded.txt and 8-10-20 Lynn Attachment_embedded.txt - Similarity: 1.00
8-12-20 Fearn_embedded.txt and 8-12-20 Jamakatt_embedded.txt - Similarity: 0.97
8-12-20 Fearn_embedded.txt and 8-11-20 Fearn_embedded.txt - Similarity: 0.97
8-10-20 Locker_embedded.txt and 8-13-19 Wolman_embedded.txt - Similarit

In [33]:
# Two files scoring above this cosine similarity are treated as duplicates.
# (This is deliberately separate from the 0.95 window used for the pair report.)
DUPLICATE_THRESHOLD = 0.85

# Taken from the matrix itself so the cell does not depend on leftover kernel state.
num_files = similarity_matrix.shape[0]

# Greedy pass: walk the files in order; each file that is still kept marks every
# LATER file that is too similar to it as a duplicate. A file that has already
# been marked never marks others, so the earliest member of a group survives.
is_duplicate = np.zeros(num_files, dtype=bool)
for i in range(num_files):
    if is_duplicate[i]:
        continue  # Skip already marked files
    # One vectorised comparison of row i against all later columns replaces the
    # inner Python loop; the resulting marks are the same.
    is_duplicate[i + 1:] |= similarity_matrix[i, i + 1:] > DUPLICATE_THRESHOLD

# Row positions of the files judged to be duplicates
files_to_remove = set(np.flatnonzero(is_duplicate).tolist())

# Create a DataFrame of unique files
unique_files = data.loc[~is_duplicate].reset_index(drop=True)
len(unique_files)

5793

In [None]:
import openai
import os

# Set your OpenAI API key
# The key is read from the environment so that it never lives in the notebook.
# A key that was ever saved in a notebook or committed to version control must
# be treated as compromised: revoke it and issue a new one.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    # Fail here, once, instead of printing one authentication error per file later.
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
openai.api_key = _api_key

def send_to_openai(file_content):
    """
    Ask the chat model how many people a comment file represents.

    Args:
        file_content: Text of one file; it is embedded verbatim in the user message.

    Returns:
        The text of the model's first reply, or None if the request (or reading
        the reply) raised an exception. The exception is printed, not re-raised.
    """
    instructions = (
        "You are a helpful assistant tasked with analyzing text files. "
        "Each file contains comments submitted by the public. Some files "
        "include a header that specifies how many people the file represents, "
        "in terms of 'submissions', 'copies', or similar words. Your task is to:\n"
        "1. Identify and extract the number of people represented based on the text in the header.\n"
        "2. Return the full sentence from the file that contains this number.\n"
        "If no such information is found, return 1 as the default number and note that no header was identified."
    )
    user_message = (
        f"Here is the content of the file:\n\n{file_content}\n\n"
        "Please extract the number of people represented and the corresponding sentence."
    )
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_message},
    ]

    try:
        # Use "gpt-3.5-turbo" if GPT-4 is not available
        completion = openai.ChatCompletion.create(model="gpt-4", messages=conversation)
        first_choice = completion['choices'][0]
        return first_choice['message']['content']
    except Exception as err:
        # Best effort: report the failure and let the caller skip this file.
        print(f"Error with OpenAI API: {err}")
        return None

def process_files(file_list):
    """
    Read each text file and send its content to OpenAI.

    Args:
        file_list: Iterable of paths to UTF-8 text files.

    Returns:
        A list of {"file": path, "result": reply} dicts, in input order.
        A file is left out when it cannot be read (the error is printed) or
        when send_to_openai returns an empty or None result.
    """
    results = []
    for file_path in file_list:
        print(f"Processing {file_path}...")
        # Only the read is guarded here, so the message below is accurate, and
        # the file is closed before the slow network call starts.
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            continue

        # send_to_openai reports its own failures and returns None for them.
        result = send_to_openai(content)
        if result:
            results.append({"file": file_path, "result": result})

    return results

def save_results_to_file(results, output_path):
    """
    Write the collected API replies to a UTF-8 text file.

    Each entry becomes a "File: ..." line, a "Result: ..." line and a rule of
    80 dashes. An existing file at output_path is overwritten.

    Args:
        results: Iterable of dicts with the keys "file" and "result".
        output_path: Destination path of the report.
    """
    rule = "-" * 80
    with open(output_path, 'w', encoding='utf-8') as report:
        for record in results:
            # print() appends the trailing newline that each line needs
            print(f"File: {record['file']}", file=report)
            print(f"Result: {record['result']}", file=report)
            print(rule, file=report)
# Replace with the directory containing your .txt files
directory = "path_to_your_text_files"
output_path = "output_results.txt"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order, and therefore the report, the same on every run.
file_list = sorted(
    os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".txt")
)

# Process files and save results
results = process_files(file_list)
save_results_to_file(results, output_path)

print(f"Processing complete. Results saved to {output_path}.")
