In [1]:
import pandas as pd
from tqdm import tqdm

def read_csv_to_dict(csv_file):
    #Read a CSV file 
    df = pd.read_csv(csv_file)
    return df.set_index('Locus Tag').T.to_dict('dict')

def find_best_match(query_seq, db_seqs):
    #Find the best match for a query sequence
    best_match = None
    best_score = 0
    best_details = {}

    for tag, details in db_seqs.items():
        db_seq = details['NT Seq']
        # Calculate the match score
        match_score = sum(1 for a, b in zip(query_seq, db_seq) if a == b)
        
        # Normalize
        min_length = min(len(query_seq), len(db_seq))
        normalized_score = match_score / min_length
        
        if normalized_score > best_score:
            best_score = normalized_score
            best_match = tag
            best_details = details
    
    # Return normalized score
    return best_match, best_score, best_details

def map_sequences(query_csv, db_csv):
    # Map sequences
    query_seqs = read_csv_to_dict(query_csv)
    db_seqs = read_csv_to_dict(db_csv)
    
    mappings = []
    
    for query_tag, query_details in tqdm(query_seqs.items(), desc="Mapping Information From S-0253 To 630"):
        query_seq = query_details['NT Seq']
        best_match, normalized_score, best_match_details = find_best_match(query_seq, db_seqs)
        
        # Results list
        mappings.append({
            'Query Locus Tag': query_tag,
            'Query Gene': query_details.get('Gene', 'N/A'),
            'Query EC Number': query_details.get('EC Number', 'N/A'),
            'Best Match Locus Tag': best_match,
            'Best Match Gene': best_match_details.get('Gene', 'N/A'),
            'Best Match EC Number': best_match_details.get('EC Number', 'N/A'),
            'Match Score': normalized_score
        })
    
    df = pd.DataFrame(mappings)
    
    df['Match Score'] = df['Match Score'].round(3)
    
    return df

In [2]:
# Inputs
species1_query_csv = 'S0253_Feature Table_with_NT_Sequences_demo.csv'
species2_db_csv = '630_Feature Table_with_NT_Sequences.csv'

# Map sequences
mapped_results = map_sequences(species1_query_csv, species2_db_csv)

# Save results
mapped_results.to_csv('mapped_locus_tags.csv', index=False)
print(f"\nMapping results saved to 'mapped_locus_tags.csv'!\n")

# Filter above 0.8
high_score_results = mapped_results[mapped_results['Match Score'] > 0.8].sort_values(by='Match Score', ascending=False)

# Results
print(f"--------- High Confidence Results ---------")
for _, row in high_score_results.iterrows():
    print(f"Query Locus Tag: {row['Query Locus Tag']} --> Best Match Locus Tag: {row['Best Match Locus Tag']} | Match Score: {row['Match Score']:.3f}")

# Summary
total_queries = len(mapped_results)
high_score_count = len(high_score_results)
success_rate = (high_score_count / total_queries) * 100
print(f"\nSummary: {high_score_count} out of {total_queries} query requests have a Match Score above 0.99.")
print(f"The success rate is {success_rate:.2f}%.\n")

  return df.set_index('Locus Tag').T.to_dict('dict')
  return df.set_index('Locus Tag').T.to_dict('dict')
Mapping Information From S-0253 To 630: 100%|███| 49/49 [00:05<00:00,  8.89it/s]


Mapping results saved to 'mapped_locus_tags.csv'!

--------- High Confidence Results ---------
Query Locus Tag: KNZ77_00010 --> Best Match Locus Tag: CDIF630_00002 | Match Score: 1.000
Query Locus Tag: KNZ77_00150 --> Best Match Locus Tag: CDIF630_00033 | Match Score: 1.000
Query Locus Tag: KNZ77_00715 --> Best Match Locus Tag: CDIF630_00149 | Match Score: 1.000
Query Locus Tag: KNZ77_00710 --> Best Match Locus Tag: CDIF630_00148 | Match Score: 1.000
Query Locus Tag: KNZ77_00705 --> Best Match Locus Tag: CDIF630_00147 | Match Score: 1.000
Query Locus Tag: KNZ77_00700 --> Best Match Locus Tag: CDIF630_00146 | Match Score: 1.000
Query Locus Tag: KNZ77_00695 --> Best Match Locus Tag: CDIF630_00145 | Match Score: 1.000
Query Locus Tag: KNZ77_00690 --> Best Match Locus Tag: CDIF630_00144 | Match Score: 1.000
Query Locus Tag: KNZ77_00620 --> Best Match Locus Tag: CDIF630_00130 | Match Score: 1.000
Query Locus Tag: KNZ77_00615 --> Best Match Locus Tag: CDIF630_00129 | Match Score: 1.000
Quer




In [3]:
import pandas as pd
from IPython.display import display, HTML

def color_row_based_on_score(val):
    """Color rows based on the Match Score value."""
    try:
        score = float(val)
    except ValueError:
        return ''  # Return empty string if conversion fails

    color = 'background-color: '
    if score > 0.9:
        color += 'lightgreen'
    elif score > 0.8:
        color += 'lightyellow'
    else:
        color += 'lightcoral'
    return color

def display_mapped_results(csv_file):
    """Read and display the mapped locus tags results from a CSV file with improved aesthetics."""
    df = pd.read_csv(csv_file)
    
    # Replace NaN with '-'
    df = df.fillna('-')
    
    # Format Match Score to 3 decimal places
    df['Match Score'] = df['Match Score'].apply(lambda x: f'{float(x):.3f}' if isinstance(x, (float, int, str)) and x != '-' else x)
    
    # Sort DataFrame by Match Score in descending order
    df_sorted = df.sort_values(by='Match Score', ascending=False, key=lambda col: pd.to_numeric(col, errors='coerce'))
    
    # Reset index and start from 1
    df_sorted.index = range(1, len(df_sorted) + 1)

    # Prepare the DataFrame for styling
    def apply_styles(df):
        return df.style.applymap(color_row_based_on_score, subset=['Match Score'])
    
    # Display a summary first
    total_queries = len(df)
    high_score_results = df_sorted[df_sorted['Match Score'].apply(lambda x: pd.to_numeric(x, errors='coerce')) > 0.8]
    high_score_count = len(high_score_results)
    success_rate = (high_score_count / total_queries) * 100
    
    #print(f"Summary: {high_score_count} out of {total_queries} query requests have a Match Score above 0.8. The success rate is {success_rate:.2f}%.\n")

    # Display the styled DataFrame
    styled_df = apply_styles(df_sorted)
    display(styled_df)
    
    # Optionally, save the formatted HTML table to a file
    html = styled_df.to_html()  # Updated method to convert to HTML
    with open('formatted_mapped_locus_tags.html', 'w') as file:
        file.write(html)

In [4]:
mapped_results_csv = 'mapped_locus_tags.csv'
# Display the mapped results
display_mapped_results(mapped_results_csv)

Unnamed: 0,Query Locus Tag,Query Gene,Query EC Number,Best Match Locus Tag,Best Match Gene,Best Match EC Number,Match Score
1,KNZ77_00150,-,-,CDIF630_00033,-,-,1.0
2,KNZ77_00605,rplK,-,CDIF630_00126,rplK,-,1.0
3,KNZ77_00140,-,-,CDIF630_00031,ybaB,-,1.0
4,KNZ77_00010,dnaN,2.7.7.7,CDIF630_00002,dnaN,2.7.7.7,1.0
5,KNZ77_00175,-,-,CDIF630_00038,-,-,1.0
6,KNZ77_00180,-,-,CDIF630_00015,-,-,1.0
7,KNZ77_00185,-,-,CDIF630_00040,-,-,1.0
8,KNZ77_00190,-,-,CDIF630_00041,-,-,1.0
9,KNZ77_00595,secE,-,CDIF630_00124,secE,-,1.0
10,KNZ77_00615,rplJ,-,CDIF630_00129,rplJ,-,1.0
