In [1]:
import pandas as pd
import os
from openpyxl import load_workbook

# Prefixes whose IDs keep their original letter case when cleaned.
_CASE_PRESERVING_PREFIXES = ('VO_', 'PAG_', 'PO_')
# Columns cleaned like ID columns although their names carry no 'ID' token.
_EXTRA_ID_LIKE_COLUMNS = ("PLACE_OF_PUBLICATION_MANIFESTATION", "HYPOTHESIS_ABOUT_ID(S)")
# Column that carries an 'ID' token but is deliberately left as-is.
_UNTOUCHED_COLUMNS = ("EXTERNAL_ID_MANIFESTATION",)
# Workbook that is processed but never saved back to disk.
_SAVE_EXCLUDED_FILE = 'columns_trace.xlsx'


def _clean_and_format(value):
    """Normalise every ';'-separated part of a cell: strip it, lowercase it
    unless it starts with a case-preserving prefix, and turn spaces into '_'."""
    text = value if isinstance(value, str) else str(value)
    cleaned_parts = []
    for part in text.split(';'):
        part = part.strip()
        if not part.startswith(_CASE_PRESERVING_PREFIXES):
            part = part.lower()
        cleaned_parts.append(part.replace(' ', '_'))
    return ';'.join(cleaned_parts)


def _clean_relationship(value):
    """Normalise a RELATIONSHIP cell: strip, lowercase, spaces to '_'."""
    return value.strip().lower().replace(' ', '_')


def _is_id_like_column(col):
    """Return True when a column must be cleaned and written back to the sheet."""
    # Headers may be non-strings (e.g. a numeric header); those are never IDs.
    if not isinstance(col, str) or col in _UNTOUCHED_COLUMNS:
        return False
    return 'ID' in col.split('_') or col in _EXTRA_ID_LIKE_COLUMNS


def _is_usable_id(value):
    """Return True for a non-empty string ID that is not the literal 'nan'."""
    return isinstance(value, str) and value != '' and value != 'nan'


def _split_ids(cell):
    """Split a ';'-separated cell into stripped IDs, dropping empty fragments."""
    return [part.strip() for part in str(cell).split(';') if part.strip()]


def _read_link_sheets(xlsx_files):
    """Read the 'MENTIONS' and 'HYPOTHESIS' sheets of every file.

    Returns a ``(mentions_frames, hypothesis_frames)`` pair of lists. A file
    that cannot be read only produces a warning.
    """
    mentions_dfs = []
    hypothesis_dfs = []
    for file in xlsx_files:
        try:
            # The context manager closes the file handle, so the workbook can
            # be re-opened and saved later without a dangling reader.
            with pd.ExcelFile(file) as xls:
                if 'MENTIONS' in xls.sheet_names:
                    mentions_dfs.append(xls.parse('MENTIONS'))
                    print(f"  - Found and read 'MENTIONS' from {file}")
                if 'HYPOTHESIS' in xls.sheet_names:
                    hypothesis_dfs.append(xls.parse('HYPOTHESIS'))
                    print(f"  - Found and read 'HYPOTHESIS' from {file}")
        except Exception as e:
            print(f"Warning: Could not read sheets from {file}: {e}")
    return mentions_dfs, hypothesis_dfs


def _consolidate(frames, label, required_columns):
    """Concatenate frames, drop rows missing a required column, clean all cells."""
    master_df = pd.concat(frames, ignore_index=True)
    print(f"Successfully consolidated all '{label}' sheets.")
    master_df.dropna(subset=required_columns, inplace=True)
    for col in master_df.columns:
        master_df[col] = master_df[col].fillna('').astype(str).apply(_clean_and_format)
    return master_df


def _build_link_lookup(master_df, key_column, value_column, split_values=True):
    """Map each single ID found in ``key_column`` to the set of linked values.

    Key cells are always split on ';'. Value cells are split too unless
    ``split_values`` is False, in which case the stripped cell is one value.
    """
    lookup = {}
    for raw_keys, raw_value in zip(master_df[key_column], master_df[value_column]):
        if split_values:
            values = _split_ids(raw_value)
        else:
            whole_value = str(raw_value).strip()
            values = [whole_value] if whole_value else []
        for key in _split_ids(raw_keys):
            lookup.setdefault(key, set()).update(values)
    return lookup


def _linked_ids(cell_id, lookup):
    """Return a fresh set of the values linked to ``cell_id`` (empty if none)."""
    if not _is_usable_id(cell_id):
        return set()
    return set(lookup.get(cell_id, ()))


def _joined_links(cell_ids, lookup):
    """Return one sorted, ';'-joined link string per entry of ``cell_ids``."""
    return [';'.join(sorted(_linked_ids(cell_id, lookup))) for cell_id in cell_ids]


def _header_map(worksheet):
    """Map each non-empty header value of row 1 to its column index."""
    return {cell.value: cell.column for cell in worksheet[1] if cell.value}


def _write_link_column(worksheet, column_name, values):
    """Write ``values`` from row 2 down into ``column_name``, reusing the
    column when its header exists and appending a new column otherwise."""
    headers = _header_map(worksheet)
    if column_name in headers:
        col_idx = headers[column_name]
        print(f"  - Found existing column '{column_name}'. It will be updated.")
    else:
        col_idx = worksheet.max_column + 1
        worksheet.cell(row=1, column=col_idx, value=column_name)
        print(f"  - Creating new column '{column_name}'.")
    for row_idx, value in enumerate(values, start=2):
        worksheet.cell(row=row_idx, column=col_idx, value=value)


def _clean_sheet(sheet_name, sheet_df):
    """Clean the ID-like columns (and RELATIONSHIP, where applicable) in place."""
    for col in sheet_df.columns:
        if _is_id_like_column(col):
            print(f"  - Cleaning column '{col}' in sheet '{sheet_name}'...")
            sheet_df[col] = sheet_df[col].fillna('').astype(str).apply(_clean_and_format)

    if sheet_name == 'RELATIONSHIP' and 'RELATIONSHIP' in sheet_df.columns:
        print("  - Cleaning column 'RELATIONSHIP' in sheet 'RELATIONSHIP'...")
        sheet_df['RELATIONSHIP'] = sheet_df['RELATIONSHIP'].fillna('').astype(str).apply(_clean_relationship)


def _update_worksheet(worksheet, sheet_name, sheet_df,
                      mentioning_lookup, mentioned_by_lookup, hypothesis_lookup):
    """Write cleaned columns back to one worksheet and add its link columns."""
    # Only columns that were actually cleaned are written back, so untouched
    # columns keep their original cell values and types.
    headers = _header_map(worksheet)
    for col_name in sheet_df.columns:
        is_relationship = sheet_name == 'RELATIONSHIP' and col_name == 'RELATIONSHIP'
        if not (_is_id_like_column(col_name) or is_relationship):
            continue
        if col_name not in headers:
            continue
        col_idx = headers[col_name]
        print(f"  - Writing cleaned data back to column '{col_name}' in sheet '{sheet_name}'...")
        for row_idx, value in enumerate(sheet_df[col_name], start=2):
            worksheet.cell(row=row_idx, column=col_idx, value=value)

    # The sheet is linked through the single column named '{sheet_name}_ID...'.
    id_col_prefix = f"{sheet_name}_ID"
    matching_id_cols = [
        col for col in sheet_df.columns
        if isinstance(col, str) and col.startswith(id_col_prefix)
    ]
    if not matching_id_cols:
        print(f"  - Skipping sheet '{sheet_name}': No column found starting with '{id_col_prefix}'.")
        return
    if len(matching_id_cols) > 1:
        print(f"  - Skipping sheet '{sheet_name}': Ambiguous ID columns found: {matching_id_cols}.")
        return
    id_col_to_use = matching_id_cols[0]
    print(f"  - Found primary ID column for linking: '{id_col_to_use}'")
    print(f"  - Updating sheet '{sheet_name}' with links...")

    cell_ids = sheet_df[id_col_to_use].tolist()
    has_volume = sheet_name == "MANIFESTATION" and "MANIFESTATION_VOLUME_ID" in sheet_df.columns
    volume_ids = sheet_df["MANIFESTATION_VOLUME_ID"].tolist() if has_volume else [None] * len(cell_ids)

    # Hypotheses about a manifestation's volume ID also count for the row.
    hypothesis_values = []
    for cell_id, volume_id in zip(cell_ids, volume_ids):
        linked = _linked_ids(cell_id, hypothesis_lookup)
        if has_volume and _is_usable_id(volume_id) and hypothesis_lookup:
            print(f"    - Also checking 'MANIFESTATION_VOLUME_ID': {volume_id} for hypotheses...")
            linked |= hypothesis_lookup.get(volume_id, set())
        hypothesis_values.append(';'.join(sorted(linked)))

    if has_volume:
        print("  - Processing 'MANIFESTATION_VOLUME_ID' for mentions...")
        _write_link_column(worksheet, "MANIFESTATION_VOLUME_MENTIONING",
                           _joined_links(volume_ids, mentioning_lookup))
        _write_link_column(worksheet, "MANIFESTATION_VOLUME_MENTIONED_BY",
                           _joined_links(volume_ids, mentioned_by_lookup))

    _write_link_column(worksheet, f"{sheet_name}_MENTIONING",
                       _joined_links(cell_ids, mentioning_lookup))
    _write_link_column(worksheet, f"{sheet_name}_MENTIONED_BY",
                       _joined_links(cell_ids, mentioned_by_lookup))

    # The HYPOTHESIS sheet itself does not get a hypothesis link column.
    if sheet_name != 'HYPOTHESIS':
        _write_link_column(worksheet, f"{sheet_name}_HYPOTHESIS_ID", hypothesis_values)


def update_excel_files_with_mentions_and_hypotheses():
    """
    Link mentions and hypotheses across all .xlsx files of the current directory.

    Steps:
    1.  Find every .xlsx file whose name does not start with '~'.
    2.  Concatenate the "MENTIONS" and "HYPOTHESIS" sheets of all files and
        index them by single ID (cells holding several IDs are split on ';').
    3.  For every sheet of every workbook, clean the ID-like columns and write
        them back; 'EXTERNAL_ID_MANIFESTATION' is neither cleaned nor rewritten.
    4.  For every sheet with exactly one column starting with '{sheet_name}_ID',
        add or update:
        - '{sheet_name}_MENTIONING': IDs mentioned by the row's ID.
        - '{sheet_name}_MENTIONED_BY': IDs that mention the row's ID.
        - '{sheet_name}_HYPOTHESIS_ID': hypotheses about the row's ID
          (not written on the 'HYPOTHESIS' sheet).
        The 'MANIFESTATION' sheet additionally gets the two
        'MANIFESTATION_VOLUME_*' mention columns when it has a
        'MANIFESTATION_VOLUME_ID' column.
    5.  Save each workbook through openpyxl, except 'columns_trace.xlsx'.

    Link cells hold unique IDs, sorted and joined with ';'. Errors are reported
    on stdout; the function always returns None.
    """
    try:
        # 1. Find all .xlsx files, ignoring temporary files
        xlsx_files = [f for f in os.listdir('.') if f.endswith('.xlsx') and not f.startswith('~')]

        if not xlsx_files:
            print("No .xlsx files were found in the current directory.")
            return

        # 2. Read and consolidate all "MENTIONS" and "HYPOTHESIS" sheets
        print("Step 1: Consolidating all 'MENTIONS' and 'HYPOTHESIS' sheets...")
        mentions_dfs, hypothesis_dfs = _read_link_sheets(xlsx_files)

        if not mentions_dfs and not hypothesis_dfs:
            print("No 'MENTIONS' or 'HYPOTHESIS' sheets were found. Stopping.")
            return

        if mentions_dfs:
            master_mentions_df = _consolidate(
                mentions_dfs, 'MENTIONS', ['MENTIONING_ID', 'MENTIONED_ID'])
            # Both directions are split on ';' so each link is a single ID.
            mentioning_lookup = _build_link_lookup(
                master_mentions_df, 'MENTIONING_ID', 'MENTIONED_ID')
            mentioned_by_lookup = _build_link_lookup(
                master_mentions_df, 'MENTIONED_ID', 'MENTIONING_ID')
        else:
            mentioning_lookup = {}
            mentioned_by_lookup = {}
            print("No 'MENTIONS' sheets found to process.")

        if hypothesis_dfs:
            master_hypothesis_df = _consolidate(
                hypothesis_dfs, 'HYPOTHESIS', ['HYPOTHESIS_ID', 'HYPOTHESIS_ABOUT_ID(S)'])
            hypothesis_lookup = _build_link_lookup(
                master_hypothesis_df, 'HYPOTHESIS_ABOUT_ID(S)', 'HYPOTHESIS_ID',
                split_values=False)
        else:
            hypothesis_lookup = {}
            print("No 'HYPOTHESIS' sheets found to process.")

        # 3-5. Process each file again to clean columns and add link columns
        print("\nStep 2: Updating sheets in each file...")
        for file in xlsx_files:
            try:
                print(f"Processing {file}...")
                original_sheets = pd.read_excel(file, sheet_name=None)
                for sheet_name, sheet_df in original_sheets.items():
                    _clean_sheet(sheet_name, sheet_df)

                workbook = load_workbook(file)
                for sheet_name in workbook.sheetnames:
                    # A sheet pandas did not return has no tabular data to link.
                    if sheet_name not in original_sheets:
                        print(f"  - Skipping sheet '{sheet_name}': it could not be read as a data sheet.")
                        continue
                    _update_worksheet(
                        workbook[sheet_name], sheet_name, original_sheets[sheet_name],
                        mentioning_lookup, mentioned_by_lookup, hypothesis_lookup)

                if file.lower() != _SAVE_EXCLUDED_FILE:
                    workbook.save(file)
                    print(f"Successfully saved updates to {file}\n")
                else:
                    print(f"Skipping save for '{file}' as per exclusion rules.\n")

            except Exception as e:
                print(f"An error occurred while processing {file}: {e}\n")

    except Exception as e:
        print(f"An unexpected critical error occurred: {e}")

# Run the update only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    update_excel_files_with_mentions_and_hypotheses()

Step 1: Consolidating all 'MENTIONS' and 'HYPOTHESIS' sheets...
  - Found and read 'MENTIONS' from BA-EG.xlsx
  - Found and read 'HYPOTHESIS' from BA-EG.xlsx
  - Found and read 'MENTIONS' from BPC-AB.xlsx
  - Found and read 'HYPOTHESIS' from BPC-AB.xlsx
Successfully consolidated all 'MENTIONS' sheets.
Successfully consolidated all 'HYPOTHESIS' sheets.

Step 2: Updating sheets in each file...
Processing BA-EG.xlsx...


  master_hypothesis_df = pd.concat(hypothesis_dfs, ignore_index=True)


  - Cleaning column 'WORK_ID' in sheet 'WORK'...
  - Cleaning column 'AUTHOR_WORK_ID' in sheet 'WORK'...
  - Cleaning column 'EXPRESSION_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'WORK_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'TRANSLATOR_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'EDITOR_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'SCRIPTWRITER_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'COMPOSITOR_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'REVIEWER_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'OTHER_SECONDARY_ROLE_ID' in sheet 'EXPRESSION'...
  - Cleaning column 'MANIFESTATION_ID' in sheet 'MANIFESTATION'...
  - Cleaning column 'MANIFESTATION_VOLUME_ID' in sheet 'MANIFESTATION'...
  - Cleaning column 'PLACE_OF_PUBLICATION_MANIFESTATION' in sheet 'MANIFESTATION'...
  - Cleaning column 'EXPRESSION_ID' in sheet 'MANIFESTATION'...
  - Cleaning column 'PUBLISHER_MANIFESTATION_ID' in sheet 'MANIFESTATION'...
  - Cleaning column 'EDITOR_MANIFESTATIO