In [2]:
#!/usr/bin/env python3
import os
import re
import shutil
import csv

#####################
# PART 1: Build set of q_ids from cost data CSVs
#####################

# Directory with the Phase 2 cost data CSV files, one sub-directory per cluster.
cost_data_base = "/Users/vk365/Dropbox/interconnections_data/data/ic_studies/raw/04_intermediate_scraped_data/phase_2_cost_data"
# Every sub-directory is treated as a cluster; ones that lack a
# "02_intermediate" folder are skipped (with a message) in the loop below.
cluster_dirs = [entry for entry in os.listdir(cost_data_base) if os.path.isdir(os.path.join(cost_data_base, entry))]
print("DEBUG: Found cluster directories in cost data:")
for d in cluster_dirs:
    print(" -", d)

# Allowed style markers: only CSV files with these substrings will be processed.
# The comparison below is case-insensitive.
allowed_styles = ["_style_H_", "_style_J_", "_style_M_", "_style_others_"]

# Set of project numbers (q_ids) that appear in the Phase 2 cost data CSVs.
# PART 2 only processes project folders whose name is in this set.
att2_qids = set()

# For each cluster directory, go into its "02_intermediate" folder and process CSV files.
for cluster in cluster_dirs:
    inter_dir = os.path.join(cost_data_base, cluster, "02_intermediate")
    # isdir (not exists): a regular file with this name would make listdir raise.
    if not os.path.isdir(inter_dir):
        print(f"DEBUG: Skipping {cluster} because missing folder: 02_intermediate")
        continue

    # Process each CSV file that has one of the allowed style markers in its name.
    for fname in os.listdir(inter_dir):
        fname_lower = fname.lower()
        if not fname_lower.endswith(".csv"):
            continue
        # Check that one of the allowed style markers is present.
        if not any(style.lower() in fname_lower for style in allowed_styles):
            print(f"DEBUG: Skipping CSV file {fname} as it does not contain an allowed style marker.")
            continue

        csv_path = os.path.join(inter_dir, fname)
        try:
            # utf-8-sig transparently drops a leading BOM; with plain utf-8 the
            # first header would become "\ufeffq_id" and no q_id would be found.
            with open(csv_path, newline='', encoding='utf-8-sig') as csvfile:
                reader = csv.DictReader(csvfile)
                # Expect a column called "q_id"; say so when it is absent
                # instead of silently contributing nothing.
                if "q_id" not in (reader.fieldnames or []):
                    print(f"DEBUG: CSV {csv_path} has no 'q_id' column; no q_ids read from it.")
                    continue
                for row in reader:
                    # Short rows yield None for missing fields; strip before
                    # testing so whitespace-only cells do not add "" to the set.
                    qid = (row.get("q_id") or "").strip()
                    if qid:
                        att2_qids.add(qid)
        except (OSError, UnicodeError, csv.Error) as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"DEBUG: Error processing CSV {csv_path}: {e}")

print("DEBUG: Unique q_ids (projects with a Phase 2 Attachment 2 PDF) from cost data:")
print(att2_qids)

#####################
# PART 2: Process each project in 03_data using the cost data information
#####################

# Base directory for the project folders
base_dir = "/Users/vk365/Dropbox/interconnections_data/data/ic_studies/raw/03_data"

# Case-insensitive filename patterns for Attachment 2 and Attachment 3.
# Each alternative allows optional whitespace before the number, and the
# trailing (?!\d) stops "Attachment 21" or "Att 30" from being mistaken for
# Attachment 2 / Attachment 3. The lookahead sits outside the capture group, so
# group(0) and group(1) are still exactly the matched marker text.
att2_regex = re.compile(r'(Att\s*2|Attachment\s*2|Attach\s*2|Att#\s*2|Attachment#\s*2)(?!\d)', re.IGNORECASE)
att3_regex = re.compile(r'(Att\s*3|Attachment\s*3|Attach\s*3|Att#\s*3|Attachment#\s*3)(?!\d)', re.IGNORECASE)

# A Phase 1 marker is "PI" or "PHI" that is NOT just the start of "PII"/"PHII".
_PHASE1_MARKER = re.compile(r"PH?I(?!I)")
# Substrings that identify a Phase 2 study file.
_PHASE2_TOKENS = ("P2", "PII", "PH2", "PHII")


def is_phase1(fname):
    """
    Returns True if the filename qualifies as a Phase 2 study file.

    The name is historical: this script works on the Phase 2 study folder, and
    the function accepts files carrying a Phase 2 marker (P2, PII, PH2, PHII)
    while rejecting any file that carries a Phase 1 marker (PI, PHI).

    The check is a case-insensitive substring test on the whole filename, so
    "PI"/"PHI" occurring anywhere (even inside an ordinary word) counts as a
    Phase 1 marker unless it is immediately followed by another "I".

    Args:
        fname: File name (not a full path is required; any string works).

    Returns:
        True when a Phase 2 marker is present and no Phase 1 marker is;
        False otherwise, including when the name carries no marker at all.
    """
    fname_up = fname.upper()
    # A plain '"PI" in name' test would also fire on "PII"/"PHII" and make the
    # Phase 2 tokens below unreachable, hence the negative lookahead.
    if _PHASE1_MARKER.search(fname_up):
        return False
    return any(token in fname_up for token in _PHASE2_TOKENS)

# Rows destined for the CSV report, followed by the running totals that feed
# the end-of-run summary.
csv_rows = []
count_with_att3 = 0
count_with_att2_and_att3 = 0
count_with_att2_without_att3 = 0

# Names of the directories found directly under base_dir (one per project).
project_dirs = [child.name for child in os.scandir(base_dir) if child.is_dir()]

print("\nDEBUG: Found project directories in 03_data:")
for proj in project_dirs:
    print(" -", proj)

# Process each project folder only if its q_id is in our att2_qids set.
# For every such project: look for Attachment 2 / Attachment 3 PDFs in its
# 03_phase_2_study folder and, when Attachment 3 is missing, try to move a
# matching PDF over from its document_dump folder. One row per processed
# project is appended to csv_rows and the summary counters are updated.
for proj in project_dirs:
    # Only process if the project folder name is in our att2_qids set.
    if proj not in att2_qids:
        print(f"DEBUG: Skipping project {proj} because it is not in the cost data q_id set.")
        continue

    proj_path = os.path.join(base_dir, proj)
    # NOTE: the "phase1" variable names are historical; this is the Phase 2 study folder.
    phase1_dir = os.path.join(proj_path, "03_phase_2_study")
    doc_dump_dir = os.path.join(proj_path, "document_dump")

    print(f"\nDEBUG: Processing project: {proj}")

    # isdir (not exists): a regular file with that name would make listdir raise.
    if not os.path.isdir(phase1_dir):
        print(f"DEBUG: Skipping {proj} because missing folder: 03_phase_2_study")
        continue
    if not os.path.isdir(doc_dump_dir):
        print(f"DEBUG: Skipping {proj} because missing folder: document_dump")
        continue

    phase1_files = os.listdir(phase1_dir)
    doc_dump_files = os.listdir(doc_dump_dir)
    print(f"DEBUG: Files in 03_phase_2_study for {proj}: {phase1_files}")
    print(f"DEBUG: Files in document_dump for {proj}: {doc_dump_files}")

    att3_present = False
    att2_present = False
    # Marker text (e.g. "Att 2") of the first qualifying Attachment 2 file; it
    # is used below to look for an Attachment 3 that follows the same naming.
    att2_token = None

    # Check for existing attachments in the study folder. Only PDFs accepted by
    # is_phase1() count.
    for fname in phase1_files:
        if not fname.lower().endswith(".pdf"):
            continue
        if not is_phase1(fname):
            continue
        if att3_regex.search(fname):
            att3_present = True
        att2_match = att2_regex.search(fname)
        if att2_match:
            att2_present = True
            # Remember the token from the same file that set att2_present, so
            # the derived pattern never comes from a file the filter rejected.
            if att2_token is None:
                att2_token = att2_match.group(0)

    csv_row = {"q_id": proj, "Att2_present": "yes" if att2_present else "no", "Att3_present": "yes" if att3_present else "no"}

    if att3_present:
        print(f"DEBUG: {proj} already has an Attachment 3 in the 03_phase_2_study folder.")
    else:
        if att2_token:
            # Remove whitespace from the token, then replace its digit "2" with
            # a pattern that allows optional whitespace before "3". The
            # trailing (?!\d) keeps "Att 30" from passing as Attachment 3.
            token_no_space = re.sub(r'\s+', '', att2_token)
            search_pattern = re.sub(r'2', r'\\s*3', re.escape(token_no_space), count=1) + r'(?!\d)'
            search_regex = re.compile(search_pattern, re.IGNORECASE)
        else:
            # No Attachment 2 to copy the naming style from: fall back to the
            # generic Attachment 3 pattern.
            search_regex = att3_regex

        print(f"DEBUG: Using search pattern for Attachment 3 for project {proj}: {search_regex.pattern}")

        # Take the first PDF in document_dump that passes both filters.
        candidate = None
        for fname in doc_dump_files:
            if not fname.lower().endswith(".pdf"):
                continue
            if not is_phase1(fname):
                print(f"DEBUG: Skipping {fname} in document_dump as it doesn't match phase 2 criteria")
                continue
            if not search_regex.search(fname):
                print(f"DEBUG: Skipping {fname} in document_dump as it doesn't match attachment 3 pattern")
                continue
            candidate = fname
            break

        if candidate:
            src = os.path.join(doc_dump_dir, candidate)
            dst = os.path.join(phase1_dir, candidate)
            try:
                shutil.move(src, dst)
            except OSError as e:
                # Best effort: a failed move is reported and the project is
                # still recorded with Att3_present = "no".
                print(f"DEBUG: Error moving file '{candidate}' for project {proj}: {e}")
            else:
                att3_present = True
                csv_row["Att3_present"] = "yes"
                print(f"DEBUG: Moved file '{candidate}' from document_dump to 03_phase_2_study for project {proj}.")
        else:
            print(f"DEBUG: No matching Attachment 3 PDF found in document_dump for project {proj}.")

    # Update the summary counters from the final state (after any move).
    if att3_present:
        count_with_att3 += 1
        if att2_present:
            count_with_att2_and_att3 += 1
    if att2_present and not att3_present:
        count_with_att2_without_att3 += 1

    csv_rows.append(csv_row)

# Print the totals gathered while processing the projects. The attachments
# counted are the ones found in (or moved into) each 03_phase_2_study folder.
print("\nSummary:")
print(f"Projects with Phase 2 Attachment 3: {count_with_att3}")
print(f"Projects with both Attachment 2 and 3: {count_with_att2_and_att3}")
print(f"Projects with Attachment 2 but no Attachment 3: {count_with_att2_without_att3}")

# Write the CSV summary file: one row per processed project, placed directly
# in base_dir. The encoding is explicit so the output does not depend on the
# platform default; newline="" is what the csv module expects for writing.
csv_file_path = os.path.join(base_dir, "phase_2_attachment_summary.csv")
with open(csv_file_path, mode="w", newline="", encoding="utf-8") as csvfile:
    fieldnames = ["q_id", "Att2_present", "Att3_present"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(csv_rows)

print(f"CSV summary written to {csv_file_path}")


DEBUG: Found cluster directories in cost data:
 - all_clusters
 - Cluster 14
DEBUG: Skipping all_clusters because missing folder: 02_intermediate
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_total_network_addendum.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_itemized_addendums.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_total.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_total_network.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_total_addendums.csv as it does not contain an allowed style marker.
DEBUG: Skipping CSV file costs_phase_2_cluster_14_style_Q_itemized.csv as it does not contain an allowed style marke