In [2]:
import pandas as pd
import json

# Location of the source CSV (adjust to wherever your copy lives).
csv_file = "../data/CUAD_v1/master_clauses.csv"

# Columns of the CSV that are kept as contract metadata.
metadata_columns = [
    "Filename",
    "Document Name-Answer",
    "Parties-Answer",
    "Agreement Date-Answer",
    "Effective Date-Answer",
    "Expiration Date-Answer",
    "Renewal Term-Answer",
    "Notice Period To Terminate Renewal- Answer",
    "Governing Law-Answer",
]

# Read the full CSV, then narrow it down to the metadata columns and
# replace missing cells with empty strings.
df = pd.read_csv(csv_file)
metadata_df = df[metadata_columns].fillna("")

# Filled in further down: one entry per filename.
contracts_metadata = dict()

def clean_date(date_str):
    """Normalize a date string that contains ``[]`` placeholders.

    Args:
        date_str: Raw cell value. Non-string values are passed through
            untouched.

    Returns:
        * ``date_str`` unchanged when it is not a string or does not
          contain the ``[]/[]`` placeholder pattern.
        * The four-digit year when the last ``/``-separated segment is a
          year (e.g. ``"[]/[]/2005"`` -> ``"2005"``).
        * ``""`` for every other placeholder date (e.g. ``"[]/[]/[]"``).
    """
    if not isinstance(date_str, str):
        return date_str
    if '[]/[]' not in date_str:
        return date_str

    # Only the trailing segment can hold a usable year. Surrounding
    # whitespace is stripped so a stray space does not discard a known year.
    last_segment = date_str.rsplit('/', 1)[-1].strip()
    if len(last_segment) == 4 and last_segment.isdigit():
        return last_segment
    return ''

# Normalize placeholder dates ("[]/[]/[]", "[]/[]/YYYY") in every date column.
for date_column in (
    'Effective Date-Answer',
    'Agreement Date-Answer',
    'Expiration Date-Answer',
):
    metadata_df[date_column] = metadata_df[date_column].apply(clean_date)

# Build one metadata record per row, keyed by filename. Rows that share a
# filename overwrite earlier ones.
for _, row in metadata_df.iterrows():
    contract_data = row.to_dict()
    # The filename becomes the dictionary key, so drop it from the record.
    filename = contract_data.pop("Filename")

    # Collapse runs of whitespace in the document name to single spaces.
    if "Document Name-Answer" in contract_data:
        contract_data["Document Name-Answer"] = " ".join(
            contract_data["Document Name-Answer"].split()
        )

    contracts_metadata[filename] = contract_data

# Write the metadata to a JSON file. The encoding is pinned to UTF-8 because
# ensure_ascii=False emits non-ASCII characters verbatim, which would fail
# or produce invalid JSON under a non-UTF-8 platform default encoding.
output_file = "../data/created/contracts_metadata_only.json"  # Output file path
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(contracts_metadata, json_file, indent=4, ensure_ascii=False)

print(f"Extracted metadata columns saved to {output_file}")


Extracted metadata columns saved to ../data/created/contracts_metadata_only.json
