In [13]:
import os
import pandas as pd
from tqdm import tqdm
from itertools import combinations
from IPython.display import display

def read_csv_to_dict(csv_file):
    df = pd.read_csv(csv_file)
    return df.set_index('Locus Tag').T.to_dict('dict')

def find_best_match(query_seq, db_seqs):
    best_match = None
    best_score = 0
    best_details = {}
    for tag, details in db_seqs.items():
        db_seq = details['NT Seq']
        match_score = sum(1 for a, b in zip(query_seq, db_seq) if a == b)
        min_length = min(len(query_seq), len(db_seq))
        normalized_score = match_score / min_length
        if normalized_score > best_score:
            best_score = normalized_score
            best_match = tag
            best_details = details
    return best_match, best_score, best_details

def map_sequences(query_csv, db_csv):
    query_seqs = read_csv_to_dict(query_csv)
    db_seqs = read_csv_to_dict(db_csv)
    
    mappings = []
    for query_tag, query_details in tqdm(query_seqs.items(), desc=f"Mapping {os.path.basename(query_csv)} → {os.path.basename(db_csv)}"):
        query_seq = query_details['NT Seq']
        best_match, normalized_score, best_match_details = find_best_match(query_seq, db_seqs)
        mappings.append({
            'Query Locus Tag': query_tag,
            'Query Gene': query_details.get('Gene', 'N/A'),
            'Query EC Number': query_details.get('EC Number', 'N/A'),
            'Best Match Locus Tag': best_match,
            'Best Match Gene': best_match_details.get('Gene', 'N/A'),
            'Best Match EC Number': best_match_details.get('EC Number', 'N/A'),
            'Match Score': round(normalized_score, 3)
        })
    return pd.DataFrame(mappings)

def map_all_pairs_in_folder(folder_path, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    
    csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]
    file_pairs = list(combinations(csv_files, 2))  # Only one direction
    
    for query_file, db_file in file_pairs:
        query_name = os.path.splitext(os.path.basename(query_file))[0]
        db_name = os.path.splitext(os.path.basename(db_file))[0]
        
        print(f"Processing Pair: {query_name} → {db_name}")
        mapped_results = map_sequences(query_file, db_file)
        
        output_csv = os.path.join(output_folder, f"{query_name}_to_{db_name}_mapping.csv")
        mapped_results.to_csv(output_csv, index=False)
        print(f"Results saved: {output_csv}")

        # Optional: high score summary
        high_score_results = mapped_results[mapped_results['Match Score'] > 0.8]
        success_rate = (len(high_score_results) / len(mapped_results)) * 100 if len(mapped_results) > 0 else 0
        print(f"{len(high_score_results)} high-confidence matches (>{0.8}), success rate: {success_rate:.2f}%")

def color_row_based_on_score(val):
    try:
        score = float(val)
    except ValueError:
        return ''
    color = 'background-color: '
    if score > 0.9:
        color += 'lightgreen'
    elif score > 0.8:
        color += 'lightyellow'
    else:
        color += 'lightcoral'
    return color

def display_and_save_mapped_results(csv_file, html_output_path):
    df = pd.read_csv(csv_file)
    df = df.fillna('-')
    df['Match Score'] = df['Match Score'].apply(lambda x: f'{float(x):.3f}' if isinstance(x, (float, int, str)) and x != '-' else x)
    df_sorted = df.sort_values(by='Match Score', ascending=False, key=lambda col: pd.to_numeric(col, errors='coerce'))
    df_sorted.index = range(1, len(df_sorted) + 1)

    styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
    html = styled_df.to_html()

    with open(html_output_path, 'w', encoding='utf-8') as f:
        f.write(html)
    print(f"Saved HTML: {html_output_path}")

def generate_html_for_all_results(mapping_folder, html_output_folder):
    os.makedirs(html_output_folder, exist_ok=True)

    csv_files = [f for f in os.listdir(mapping_folder) if f.endswith('.csv')]
    for csv_file in csv_files:
        csv_path = os.path.join(mapping_folder, csv_file)
        html_file = os.path.splitext(csv_file)[0] + '.html'
        html_path = os.path.join(html_output_folder, html_file)
        display_and_save_mapped_results(csv_path, html_path)

def number_of_results(folder_path):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    num_files = len(csv_files)
    num_pairs = (num_files * (num_files - 1)) // 2  # combinations(n, 2)
    print(f"{num_files} CSV files found in '{folder_path}'.")
    print(f"{num_pairs} result files will be generated (one-way mapping for each pair).")

In [14]:
input_folder = 'Input'        # Folder with all CSV files
output_folder = 'Output'  # Where results will be saved
number_of_results(input_folder)
map_all_pairs_in_folder(input_folder, output_folder)

6 CSV files found in 'Input'
15 result files will be generated (one-way mapping for each pair)
Processing Pair: GCA_000932055.2 → GCA_000953275.1


Mapping GCA_000932055.2.csv → GCA_000953275.1.csv: 100%|█| 3913/3913 [02:43<00:0
  return df.set_index('Locus Tag').T.to_dict('dict')


Results saved: Output/GCA_000932055.2_to_GCA_000953275.1_mapping.csv
3439 high-confidence matches (>0.8), success rate: 87.89%
Processing Pair: GCA_000932055.2 → S0253


Mapping GCA_000932055.2.csv → S0253.csv: 100%|█| 3913/3913 [02:30<00:00, 26.02it


Results saved: Output/GCA_000932055.2_to_S0253_mapping.csv
1644 high-confidence matches (>0.8), success rate: 42.01%
Processing Pair: GCA_000932055.2 → GCA_000085225.1 


Mapping GCA_000932055.2.csv → GCA_000085225.1 .csv: 100%|█| 3913/3913 [02:33<00:


Results saved: Output/GCA_000932055.2_to_GCA_000085225.1 _mapping.csv
2649 high-confidence matches (>0.8), success rate: 67.70%
Processing Pair: GCA_000932055.2 → GCA_000027105.1 


Mapping GCA_000932055.2.csv → GCA_000027105.1 .csv: 100%|█| 3913/3913 [02:35<00:


Results saved: Output/GCA_000932055.2_to_GCA_000027105.1 _mapping.csv
2668 high-confidence matches (>0.8), success rate: 68.18%
Processing Pair: GCA_000932055.2 → GCA_000009205.2


Mapping GCA_000932055.2.csv → GCA_000009205.2.csv: 100%|█| 3913/3913 [02:44<00:0
  return df.set_index('Locus Tag').T.to_dict('dict')


Results saved: Output/GCA_000932055.2_to_GCA_000009205.2_mapping.csv
3438 high-confidence matches (>0.8), success rate: 87.86%
Processing Pair: GCA_000953275.1 → S0253


Mapping GCA_000953275.1.csv → S0253.csv: 100%|█| 3950/3950 [02:32<00:00, 25.95it


Results saved: Output/GCA_000953275.1_to_S0253_mapping.csv
1539 high-confidence matches (>0.8), success rate: 38.96%
Processing Pair: GCA_000953275.1 → GCA_000085225.1 


Mapping GCA_000953275.1.csv → GCA_000085225.1 .csv: 100%|█| 3950/3950 [02:35<00:


Results saved: Output/GCA_000953275.1_to_GCA_000085225.1 _mapping.csv
2665 high-confidence matches (>0.8), success rate: 67.47%
Processing Pair: GCA_000953275.1 → GCA_000027105.1 


Mapping GCA_000953275.1.csv → GCA_000027105.1 .csv: 100%|█| 3950/3950 [02:37<00:


Results saved: Output/GCA_000953275.1_to_GCA_000027105.1 _mapping.csv
2684 high-confidence matches (>0.8), success rate: 67.95%
Processing Pair: GCA_000953275.1 → GCA_000009205.2


Mapping GCA_000953275.1.csv → GCA_000009205.2.csv: 100%|█| 3950/3950 [17:58<00:0
  return df.set_index('Locus Tag').T.to_dict('dict')


Results saved: Output/GCA_000953275.1_to_GCA_000009205.2_mapping.csv
3946 high-confidence matches (>0.8), success rate: 99.90%
Processing Pair: S0253 → GCA_000085225.1 


Mapping S0253.csv → GCA_000085225.1 .csv: 100%|█| 3715/3715 [02:22<00:00, 26.01i
  return df.set_index('Locus Tag').T.to_dict('dict')


Results saved: Output/S0253_to_GCA_000085225.1 _mapping.csv
1367 high-confidence matches (>0.8), success rate: 36.80%
Processing Pair: S0253 → GCA_000027105.1 


Mapping S0253.csv → GCA_000027105.1 .csv: 100%|█| 3715/3715 [02:24<00:00, 25.64i
  return df.set_index('Locus Tag').T.to_dict('dict')


Results saved: Output/S0253_to_GCA_000027105.1 _mapping.csv
1365 high-confidence matches (>0.8), success rate: 36.74%
Processing Pair: S0253 → GCA_000009205.2


Mapping S0253.csv → GCA_000009205.2.csv: 100%|█| 3715/3715 [02:32<00:00, 24.28it


Results saved: Output/S0253_to_GCA_000009205.2_mapping.csv
1579 high-confidence matches (>0.8), success rate: 42.50%
Processing Pair: GCA_000085225.1  → GCA_000027105.1 


Mapping GCA_000085225.1 .csv → GCA_000027105.1 .csv: 100%|█| 3488/3488 [02:28<00


Results saved: Output/GCA_000085225.1 _to_GCA_000027105.1 _mapping.csv
3460 high-confidence matches (>0.8), success rate: 99.20%
Processing Pair: GCA_000085225.1  → GCA_000009205.2


Mapping GCA_000085225.1 .csv → GCA_000009205.2.csv: 100%|█| 3488/3488 [02:34<00:


Results saved: Output/GCA_000085225.1 _to_GCA_000009205.2_mapping.csv
2646 high-confidence matches (>0.8), success rate: 75.86%
Processing Pair: GCA_000027105.1  → GCA_000009205.2


Mapping GCA_000027105.1 .csv → GCA_000009205.2.csv: 100%|█| 3546/3546 [02:37<00:

Results saved: Output/GCA_000027105.1 _to_GCA_000009205.2_mapping.csv
2660 high-confidence matches (>0.8), success rate: 75.01%





In [15]:
mapping_folder = 'Output'            # Folder with the results .csv files
html_output_folder = 'Output'    # Where to save the .html files
generate_html_for_all_results(mapping_folder, html_output_folder)

  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000953275.1_to_GCA_000009205.2_mapping.html
Saved HTML: Output/S0253_to_GCA_000027105.1 _mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000932055.2_to_GCA_000085225.1 _mapping.html
Saved HTML: Output/GCA_000953275.1_to_S0253_mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000932055.2_to_GCA_000953275.1_mapping.html
Saved HTML: Output/GCA_000953275.1_to_GCA_000085225.1 _mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000085225.1 _to_GCA_000027105.1 _mapping.html
Saved HTML: Output/GCA_000027105.1 _to_GCA_000009205.2_mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000932055.2_to_S0253_mapping.html
Saved HTML: Output/S0253_to_GCA_000009205.2_mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000953275.1_to_GCA_000027105.1 _mapping.html
Saved HTML: Output/GCA_000085225.1 _to_GCA_000009205.2_mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000932055.2_to_GCA_000009205.2_mapping.html
Saved HTML: Output/S0253_to_GCA_000085225.1 _mapping.html


  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])
  styled_df = df_sorted.style.applymap(color_row_based_on_score, subset=['Match Score'])


Saved HTML: Output/GCA_000932055.2_to_GCA_000027105.1 _mapping.html
