In [44]:
import os
import glob
import pandas as pd
import networkx as nx

def load_csv_files(folder_path):
    """
    Load all CSV files from a given folder into a dictionary.
    Keys are file names and values are DataFrames.
    """
    csv_files = glob.glob(os.path.join(folder_path, '*.'))
    dataframes = {}
    for file in csv_files:
        try:
            df = pd.read_csv(file)
            filename = os.path.basename(file)
            dataframes[filename] = df
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return dataframes

# def load_excel_files(folder_path):
#     """
#     Loads every XLSX file from a folder. If a file contains multiple sheets,
#     each sheet is loaded as a separate dataset.

#     Returns:
#         datasets (list): A list of dictionaries with keys:
#             'file'  - the file path,
#             'sheet' - the sheet name,
#             'data'  - the loaded DataFrame.
#     """
#     file_pattern = os.path.join(folder_path, '*.xlsx')
#     datasets = {}
#     for file in glob.glob(file_pattern):
#         try:
#             xls = pd.ExcelFile(file)
#             for sheet in xls.sheet_names:
#                 try:
#                     df = pd.read_excel(xls, sheet_name=sheet)
#                     filename = os.path.basename(file)
#                     datasets[filename] = {'file': file, 'sheet': sheet, 'data': df}
#                 except Exception as e:
#                     print(f"Error reading sheet '{sheet}' in file '{file}': {e}")
#         except Exception as e:
#             print(f"Error processing file '{file}': {e}")
#     return datasets


# def load_excel_files(folder_path):
#     """
#     Loads every XLSX file from a folder. If a file contains multiple sheets,
#     each sheet is loaded as a separate dataset.
    
#     Returns:
#         datasets (dict): A dictionary where keys are file names and values are lists 
#         of dictionaries. Each inner dictionary has:
#             'sheet' - the sheet name,
#             'data'  - the loaded DataFrame.
#     """
#     file_pattern = os.path.join(folder_path, '*.xlsx')
#     datasets = {}
#     for file in glob.glob(file_pattern):
#         filename = os.path.basename(file)
#         datasets[filename] = []  # Initialize list for multiple sheets
#         try:
#             xls = pd.ExcelFile(file)
#             for sheet in xls.sheet_names:
#                 try:
#                     df = pd.read_excel(xls, sheet_name=sheet)
#                     datasets[filename].append({'sheet': sheet, 'data': df})
#                 except Exception as e:
#                     print(f"Error reading sheet '{sheet}' in file '{file}': {e}")
#         except Exception as e:
#             print(f"Error processing file '{file}': {e}")
#     return datasets

def load_excel_files(folder_path):
    """
    Loads every XLSX file from a folder. If a file contains multiple sheets,
    each sheet is loaded as a separate dataset.
    
    Returns:
        datasets (dict): A dictionary where keys are file names and values are dictionaries.
        Each inner dictionary has keys as sheet names and values as the loaded DataFrame.
    """
    file_pattern = os.path.join(folder_path, '*.xlsx')
    datasets = {}
    for file in glob.glob(file_pattern):
        filename = os.path.basename(file)
        datasets[filename] = {}  # Initialize dictionary for sheets
        try:
            xls = pd.ExcelFile(file)
            for sheet in xls.sheet_names:
                try:
                    df = pd.read_excel(xls, sheet_name=sheet)
                    datasets[filename][sheet] = df
                except Exception as e:
                    print(f"Error reading sheet '{sheet}' in file '{file}': {e}")
        except Exception as e:
            print(f"Error processing file '{file}': {e}")
    return datasets

def get_candidate_signatures(df, threshold=100):
    """
    For a given DataFrame, return a mapping of candidate join column signatures.
    
    A candidate join column is one that is either of object type or has a limited
    number of unique values (<= threshold). The signature is defined as a tuple:
      (data type as string, frozenset of unique non-null values)
    
    Returns a dictionary mapping signature -> list of column names.
    """
    candidate_signatures = {}
    for col in df.columns:
        series = df[col]
        unique_vals = series.dropna().unique()
        if len(unique_vals) == 0:
            continue
        # if the column is object type or has limited unique values, consider it candidate
        if series.dtype == 'object' or len(unique_vals) <= threshold:
            sig = (str(series.dtype), frozenset(unique_vals))
            candidate_signatures.setdefault(sig, []).append(col)
    return candidate_signatures

def build_candidate_mapping(dataframes, threshold=100):
    """
    Build a global mapping for candidate join columns.
    
    Returns a dictionary mapping:
       candidate_signature -> list of tuples (file_name, column_name, unique_count)
    """
    mapping = {}
    for file_name, df in dataframes.items():
        cand_sig = get_candidate_signatures(df, threshold=threshold)
        for sig, cols in cand_sig.items():
            for col in cols:
                unique_count = len(df[col].dropna().unique())
                mapping.setdefault(sig, []).append((file_name, col, unique_count))
    return mapping

def build_join_graph(candidate_mapping):
    """
    Build a graph where each node is a CSV file and an edge connects two files
    if they share at least one candidate join column signature.
    
    The edge attribute 'candidate_keys' is a list of tuples:
      (candidate_signature, column_in_file1, column_in_file2)
    """
    G = nx.Graph()
    # add nodes (file names)
    files = set()
    for candidates in candidate_mapping.values():
        for (file_name, col, _) in candidates:
            files.add(file_name)
    for file in files:
        G.add_node(file)
    # add edges for each candidate signature shared across files
    for sig, candidates in candidate_mapping.items():
        if len(candidates) > 1:
            for i in range(len(candidates)):
                for j in range(i+1, len(candidates)):
                    file1, col1, _ = candidates[i]
                    file2, col2, _ = candidates[j]
                    if G.has_edge(file1, file2):
                        G[file1][file2]['candidate_keys'].append((sig, col1, col2))
                    else:
                        G.add_edge(file1, file2, candidate_keys=[(sig, col1, col2)])
    return G

def find_common_join_key(df1, df2, threshold=100):
    """
    Identify a common join key between two DataFrames.
    
    It computes candidate signatures for each and returns a tuple:
      (column_name_in_df1, column_name_in_df2, candidate_signature)
    for the first candidate signature that both dataframes share.
    If no candidate is found, returns None.
    """
    cand1 = get_candidate_signatures(df1, threshold)
    cand2 = get_candidate_signatures(df2, threshold)
    common_keys = set(cand1.keys()).intersection(set(cand2.keys()))
    if common_keys:
        # if several keys are common, choose one based on lower unique count in df1
        best_key = None
        best_count = None
        for key in common_keys:
            col1 = cand1[key][0]
            col2 = cand2[key][0]
            count1 = len(df1[col1].dropna().unique())
            if best_count is None or count1 < best_count:
                best_count = count1
                best_key = (col1, col2, key)
        return best_key
    return None

def join_dataframes(dataframes, candidate_mapping, threshold=100):
    """
    Using the candidate mapping, build a join graph and for each connected component,
    iteratively join the dataframes. For each connected component, the file with the fewest
    rows (a proxy for lowest resolution) is chosen as the base.
    
    Returns a dictionary where keys are base file names (or component identifiers) and
    values are the merged DataFrames.
    """
    G = build_join_graph(candidate_mapping)
    components = list(nx.connected_components(G))
    merged_dfs = {}
    processed_files = set()
    
    for comp in components:
        if len(comp) == 1:
            # No join possible; keep the file as is.
            file = list(comp)[0]
            merged_dfs[file] = dataframes[file]
            processed_files.add(file)
        else:
            comp_files = list(comp)
            # choose base file as one with the smallest number of rows
            base_file = min(comp_files, key=lambda f: len(dataframes[f]))
            merged_df = dataframes[base_file]
            merged_files = {base_file}
            remaining_files = set(comp_files) - merged_files

            # Iteratively try to join any remaining file with the merged dataframe
            while remaining_files:
                joined = False
                for file in list(remaining_files):
                    join_key = find_common_join_key(merged_df, dataframes[file], threshold)
                    if join_key:
                        col_merged, col_new, sig = join_key
                        df_to_join = dataframes[file].copy()
                        # if the join column names differ, rename the new dataframe’s column
                        if col_new != col_merged:
                            df_to_join = df_to_join.rename(columns={col_new: col_merged})
                        # outer join to preserve information
                        merged_df = pd.merge(merged_df, df_to_join, on=col_merged, how='outer',
                                             suffixes=('', '_' + file))
                        merged_files.add(file)
                        remaining_files.remove(file)
                        joined = True
                        break
                if not joined:
                    # If no join key is found for any remaining file, leave them separate.
                    break
            # Record the merged DataFrame from this component.
            merged_dfs[base_file] = merged_df
            processed_files.update(comp)
    
    # For any files not connected in the join graph, add them as individual outputs.
    for file in dataframes:
        if file not in processed_files:
            merged_dfs[file] = dataframes[file]
    return merged_dfs

def process_csv_folder(folder_path, threshold=100):
    """
    Wrapper function that:
      1. Loads all CSV files from the given folder.
      2. Analyzes each DataFrame’s columns for candidate join keys.
      3. Identifies join relationships and builds a join graph.
      4. Chooses a base dataset (lowest resolution) in each joinable group.
      5. Performs iterative joins and returns a minimal set of merged DataFrames.
      
    The 'threshold' parameter controls the maximum unique value count for a column to be
    considered as a candidate join column.
    
    Returns a dictionary of DataFrames keyed by an identifier.
    """
    dataframes = load_csv_files(folder_path)
    candidate_mapping = build_candidate_mapping(dataframes, threshold)
    merged_dfs = join_dataframes(dataframes, candidate_mapping, threshold)
    return merged_dfs

In [45]:
folder = "../data/poc_data_and_similar_paper/"
excel_files = load_excel_files(folder)

In [47]:
excel_files["fourth_corner_functional_diversity.xlsx"]["wild_bee_traits"]

Unnamed: 0,Ecological traits of wild bees,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,,,,,
1,Species code,Family,Social behavior,Nesting place,Floral specificity,Flight beginning period,End of flight period,Lifespan [month],Voltinism,Pollen carrying-structure,Mean body size,Rarity
2,and_alf,Andrenidae,solit,soil,poly,May/June,July/August,4,biv,scopa on leg,small,rare
3,and_bar,Andrenidae,solit,soil,poly,March/April,July/August,4,biv,scopa on leg,medium,common
4,and_bic,Andrenidae,solit,soil,poly,March/April,July/August,5,biv,scopa on leg,medium,common
...,...,...,...,...,...,...,...,...,...,...,...,...
185,sph_ruf,Halictidae,clep,clep,clep,May/June,July/August,4,mon,clep,medium,rare
186,sph_sca,Halictidae,clep,clep,clep,May/June,July/August,4,mon,clep,medium,rare
187,ste_bre,Megachilidae,clep,clep,clep,May/June,July/August,3,mon,clep,small,common
188,ste_pun,Megachilidae,clep,clep,clep,May/June,July/August,3,mon,clep,medium,common


In [None]:
for item in excel_files.items():
    print(item.keys())
    print()

AttributeError: 'tuple' object has no attribute 'keys'

In [None]:
build_candidate_mapping(excel_files["fourth_corner_functional_diversity.xlsx"]["wild_bee_traits"], threshold=100)

In [4]:
result

{}