# 08 OpenFold Geometric Nearest Neighbours
# This notebook identifies overlapping chains between the User dataset and OpenFold,
# and performs a hierarchical geometric nearest neighbour search (Mean -> Full).


In [None]:
from google.colab import drive
import os

drive.mount('/content/drive')
# Change directory to the working folder if necessary
try:
    os.chdir('/content/drive/MyDrive/BRI Analysis')
    print("Changed directory to /content/drive/MyDrive/BRI Analysis")
except:
    print("Could not change directory. Please check the path.")


Mounted at /content/drive
Changed directory to /content/drive/MyDrive/BRI Analysis


In [None]:
import pandas as pd
import numpy as np
import os
import json
from scipy.spatial import cKDTree
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import gc

# Paths
DATA_DIR = './data'
PLOTS_DIR = './plotting/nearest_neighbours_08'
BRI_DATA_DIR = './data/bri_computations' # Adjust if necessary

os.makedirs(PLOTS_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)
print("Environment Setup Complete.")


Environment Setup Complete.


In [None]:
# @title 1. Load Data Mapping
# Load User Data Mapping (Label -> Author Chain ID) to bridge datasets.

user_chain_map_path = os.path.join(DATA_DIR, 'cleaned_connective_chains_auth_chain_id.csv')
print(f"Loading User Data Mapping from {user_chain_map_path}...")

if os.path.exists(user_chain_map_path):
    df_user = pd.read_csv(user_chain_map_path)
    # Ensure columns are strings
    df_user['pdb_id'] = df_user['pdb_id'].astype(str).str.upper()
    df_user['author_chain_id'] = df_user['author_chain_id'].astype(str)
    df_user['author_chain_id'] = df_user['author_chain_id'].fillna('NA')
    print(f"Loaded {len(df_user)} user chains.")
else:
    raise FileNotFoundError(f"Mapping file not found: {user_chain_map_path}")


Loading User Data Mapping from ./data/cleaned_connective_chains_auth_chain_id.csv...
Loaded 478074 user chains.


In [None]:
# @title 2. Generate mean invariants with corresponding batch number

import os
import pandas as pd
import tqdm
import gc

# @title 1. Generate Mean Invariants with Batch Numbers
inv_dir = "./data/bri_computations"
output_file = "./data/PDB727K_mean_invariants_with_batch.csv"

if os.path.exists(output_file):
    os.remove(output_file)

# Get files and sort them to ensure deterministic order (optional but good practice)
files = sorted([f for f in os.listdir(inv_dir) if f.endswith('.parquet')])

def calculate_means(df):
    # Calculate means for unique group in 'pdb_id', 'model_id', 'chain_id'
    bri_cols = ['x(N)', 'y(N)', 'z(N)', 'x(A)', 'y(A)', 'z(A)', 'x(C)', 'y(C)',
       'z(C)']

    means = df.groupby(['pdb_id', 'model_id', 'chain_id', 'start_residue', 'chain_length'])[bri_cols].mean().reset_index()
    return means


for i, filename in enumerate(tqdm.tqdm(files, desc="Processing Batches")):
    try:
        # Extract batch number from filename "batch_123.parquet"
        # Adjust split logic if your naming convention differs
        try:
            batch_num = int(filename.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            # Fallback if filename is weird, though user stated "batch_i.parquet"
            print(f"Warning: Could not parse batch number from {filename}. assigning {i}.")
            batch_num = i

        inv_data = pd.read_parquet(os.path.join(inv_dir, filename))

        inv_data['chain_id'] = inv_data['chain_id'].astype(str)
        inv_data['chain_id'] = inv_data['chain_id'].fillna('NA')

        # Calculate means
        mean_data = calculate_means(inv_data)

        # --- ADD BATCH NUMBER ---
        mean_data['batch_number'] = batch_num

        # Write incrementally
        mode = 'w' if i == 0 else 'a'
        header = (i == 0)
        mean_data.to_csv(output_file, index=False, mode=mode, header=header)

        del inv_data, mean_data
        if i % 10 == 0: gc.collect()

    except Exception as e:
        print(f"Skipping {filename} due to error: {e}")

Processing Batches: 100%|██████████| 146/146 [05:48<00:00,  2.39s/it]


In [5]:
# @title 3. Load OpenFold Data
# Load Duplicate list and Chain Cache to identify valid OpenFold chains.

of_cache_path = os.path.join(DATA_DIR, 'openfold_chain_data_cache.json')
of_duplicates_path = os.path.join(DATA_DIR, 'openfold_duplicate_chains.txt')

of_duplicates = set()
if os.path.exists(of_duplicates_path):
    print("Loading OpenFold duplicate chains...")
    with open(of_duplicates_path, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) > 1:
                # Exclude representative (first item) and keep duplicates
                for dup in parts[1:]:
                    of_duplicates.add(dup.upper())
    print(f"Found {len(of_duplicates)} OpenFold duplicate chains.")
else:
    print("Warning: OpenFold duplicate chains file not found. Skipping duplicate filter.")

print("Loading OpenFold Chain Cache...")
if os.path.exists(of_cache_path):
    with open(of_cache_path, 'r') as f:
        of_data = json.load(f)

    of_chains = set()
    for key in of_data:
        if key.upper() not in of_duplicates:
            # key format is '1ABC_A'
            parts = key.split('_')
            if len(parts) >= 2:
                pdb_id = parts[0].upper()
                chain_id = parts[1]
                # if chain_id is np.nan then convert to 'NA'
                if pd.isna(chain_id):
                    chain_id = 'NA'

                of_chains.add((pdb_id, chain_id))
    print(f"Loaded {len(of_chains)} unique OpenFold chains.")
else:
    raise FileNotFoundError(f"OpenFold cache not found: {of_cache_path}")


Loading OpenFold duplicate chains...
Found 472464 OpenFold duplicate chains.
Loading OpenFold Chain Cache...
Loaded 128793 unique OpenFold chains.


In [6]:
# @title 3. Identify Overlap
# Find intersection: User (Author Chain ID) ∩ OpenFold (Chain ID)

user_keys = set(zip(df_user['pdb_id'], df_user['author_chain_id']))
overlap_keys = user_keys.intersection(of_chains)
target_chains_set = set(overlap_keys)

print(f"Found {len(overlap_keys)} overlapping chains.")

Found 58971 overlapping chains.


In [7]:
# @title 4. Filter Mean Invariants
# Filter the user's mean invariants file to only include the overlapping chains.

mean_invariants_path = os.path.join(DATA_DIR, 'PDB727K_mean_invariants_with_batch.csv')
print(f"Loading Mean Invariants from {mean_invariants_path}...")

if os.path.exists(mean_invariants_path):
    df_mean = pd.read_csv(mean_invariants_path)
    # Check column names for PDB/Chain
    # Assuming 'pdb_id' and 'chain_id' (Label) are present
    df_mean['pdb_id'] = df_mean['pdb_id'].astype(str).str.upper()
    df_mean['chain_id'] = df_mean['chain_id'].astype(str)
    df_mean['chain_id'] = df_mean['chain_id'].fillna('NA')

    df_mean = df_mean[df_mean['model_id']==1].copy()

    # Map to Author ID using user_map
    df_user_map = df_user[['pdb_id', 'chain_id', 'author_chain_id']].drop_duplicates()

    print("Merging with Author Chain IDs...")
    df_mean_mapped = df_mean.merge(df_user_map, on=['pdb_id', 'chain_id'], how='inner')

    # Filter by target_chains_set (pdb_id, author_chain_id)
    df_mean_mapped['match_key'] = list(zip(df_mean_mapped['pdb_id'], df_mean_mapped['author_chain_id']))
    df_target = df_mean_mapped[df_mean_mapped['match_key'].isin(target_chains_set)].copy()

# --- NEW: Load Metadata and Filter ---
# Added by automated script to match 07_finding_differences logic
meta_data_path = os.path.join(DATA_DIR, 'PDB727K_webscrape_meta_data.csv')
if os.path.exists(meta_data_path):
    print(f"Loading Metadata from {meta_data_path}...")
    meta_data = pd.read_csv(meta_data_path)

    # Clean Resolution
    # '[2.55]' -> '2.55'
    clean_step_1 = meta_data['Resolution'].astype(str).str.strip('[]')
    clean_step_2 = clean_step_1.str.split(',').str[0]
    meta_data['resolution'] = pd.to_numeric(clean_step_2, errors='coerce')

    # Clean Release Date
    meta_data['release_date'] = pd.to_datetime(meta_data['release_date'])
    meta_data['release_date'] = meta_data['release_date'].dt.tz_localize(None).dt.normalize()

    # Merge metadata into df_target
    # Note: df_target is created from df_mean_mapped further down in the original cell.
    # We should inject this Logic AFTER df_target is created.

    # Join metadata
    # We left join on pdb_id
    df_target = df_target.merge(meta_data[['pdb_id', 'resolution', 'release_date']], on='pdb_id', how='left')

    print(f"Pre-filter count: {len(df_target)}")
    # Apply Filters: Chain Length >= 16, Resolution <= 9, Release Date <= 2021-10-10
    df_target = df_target[
        (df_target['chain_length'] >= 16) &
        (df_target['resolution'] <= 9) &
        (df_target['release_date'] <= '2021-10-10')
    ]
    print(f"Post-filter count: {len(df_target)}")

    print(f"Original Mean Invariants: {len(df_mean)}")
    print(f"Filtered Target Mean Invariants: {len(df_target)}")

    # Clean up to save memory
    #del df_mean, df_mean_mapped
    #gc.collect()
else:
    raise FileNotFoundError(f"Mean Invariants file not found: {mean_invariants_path}")


Loading Mean Invariants from ./data/PDB727K_mean_invariants_with_batch.csv...
Merging with Author Chain IDs...
Loading Metadata from ./data/PDB727K_webscrape_meta_data.csv...
Pre-filter count: 58929
Post-filter count: 44405
Original Mean Invariants: 477859
Filtered Target Mean Invariants: 44405


In [8]:
# @title 5. Phase 1: Mean Invariant Comparison
# Perform coarse nearest neighbour search using Mean Invariants with Chebyshev distance.

mean_pairs_output = os.path.join(DATA_DIR, 'PDB727K_openfold_mean_pairs.csv')
RADIUS = 0.01

# Identify Feature Columns (assuming numeric columns excluding metadata)
meta_cols = {'pdb_id', 'chain_id', 'author_chain_id', 'model_id', 'start_residue', 'chain_length', 'batch_number', 'match_key'}
feature_cols = [c for c in df_target.columns if c not in meta_cols and np.issubdtype(df_target[c].dtype, np.number)]

# Sanity check for feature columns
if len(feature_cols) < 5:
    print(f"Warning: Only found {len(feature_cols)} features. Checking for 'inv_' prefix.")
    feature_cols = [c for c in df_target.columns if c.startswith('inv_')]

print(f"Using {len(feature_cols)} features for comparison.")

# Prepare Output
id_cols = ['pdb_id', 'chain_id', 'author_chain_id', 'chain_length', 'batch_number']
header = [f"{c}_1" for c in id_cols] + [f"{c}_2" for c in id_cols] + ['chebyshev_dist']

# Group by Chain Length
grouped = df_target.groupby('chain_length')
total_pairs = 0

print(f"Writing pairs to {mean_pairs_output}...")
with open(mean_pairs_output, 'w') as f_out:
    f_out.write(','.join(header) + '\n')

    for length, group in tqdm.tqdm(grouped, desc="Processing Lengths"):
        if len(group) < 2: continue

        points = group[feature_cols].values

        # Build cKDTree
        try:
            tree = cKDTree(points)
            # Query pairs within Radius (p=np.inf for Chebyshev)
            pairs = tree.query_pairs(r=RADIUS, p=np.inf)
        except Exception as e:
            print(f"Error processing length {length}: {e}")
            continue

        if not pairs: continue

        pairs_arr = np.array(list(pairs))

        # Calculate precise distances for retrieved pairs
        p1 = points[pairs_arr[:, 0]]
        p2 = points[pairs_arr[:, 1]]
        dists = np.max(np.abs(p1 - p2), axis=1)

        # Retrieve IDs
        res1 = group.iloc[pairs_arr[:, 0]][id_cols].reset_index(drop=True)
        res2 = group.iloc[pairs_arr[:, 1]][id_cols].reset_index(drop=True)

        res1.columns = [f"{c}_1" for c in id_cols]
        res2.columns = [f"{c}_2" for c in id_cols]

        res_df = pd.concat([res1, res2], axis=1)
        res_df['chebyshev_dist'] = dists

        # Append to CSV
        res_df.to_csv(f_out, header=False, index=False)
        total_pairs += len(res_df)

print(f"Phase 1 Complete. Found {total_pairs} pairs.")


Using 10 features for comparison.
Writing pairs to ./data/PDB727K_openfold_mean_pairs.csv...


Processing Lengths: 100%|██████████| 958/958 [00:01<00:00, 850.57it/s] 

Phase 1 Complete. Found 5062 pairs.





In [9]:
import pandas as pd
import numpy as np
import os
import tqdm
import gc

# @title 6. Full Comparison (Memory Optimized: Filter-on-Load)
# ==============================================================================
# Configuration
# ==============================================================================
pairs_file = f"./data/PDB727K_openfold_mean_pairs.csv"
parquet_dir = "./data/bri_computations"
output_full_diff_file = f"./data/PDB727K_full_comparison_results_001_seq_openfold.csv"

full_dist_threshold = 0.01

# Columns to load from Parquet
# We still load 'model_id' to filter by it, even if we assume it is 1
id_cols = ['pdb_id', 'model_id', 'chain_id']
bri_cols = ['x(N)', 'y(N)', 'z(N)', 'x(A)', 'y(A)', 'z(A)', 'x(C)', 'y(C)', 'z(C)']
seq_col = 'residue_label'

load_columns = list(set(id_cols + bri_cols + [seq_col]))

# ==============================================================================
# 1. Identify "Relevant Chains"
# ==============================================================================
if not os.path.exists(pairs_file):
    raise FileNotFoundError("Run Step 2 first.")

print("Loading pairs to identify relevant chains...")
pairs_df = pd.read_csv(pairs_file)

if len(pairs_df) == 0:
    print("No pairs found.")
    exit()

# Extract unique keys (Chain 1 and Chain 2) needed for analysis
# We use a set of tuples for O(1) lookup: (pdb_id, model_id, chain_id, start_residue, chain_length)
# CHANGE: We assume model_id is always 1, so we inject '1' into the key tuples manually.

print("Building set of required chains (Assuming Model ID = 1)...")

# Create vectors of 1s for the zip operation
ones_vector = [1] * len(pairs_df)

keys_1 = list(zip(
    pairs_df['pdb_id_1'],
    ones_vector,          # Hardcoded Model ID 1
    pairs_df['chain_id_1']
))

keys_2 = list(zip(
    pairs_df['pdb_id_2'],
    ones_vector,          # Hardcoded Model ID 1
    pairs_df['chain_id_2']
))

required_keys = set(keys_1) | set(keys_2)

print(f"Total unique chains to load: {len(required_keys)}")

# ==============================================================================
# 2. Load and Filter Data (One Pass over Files)
# ==============================================================================
# Store data as: chain_data_store[key] = {'mat': np.array, 'seq': str}
chain_data_store = {}

# Get list of batch files
batch_files = sorted([f for f in os.listdir(parquet_dir) if f.endswith('.parquet')])

print(f"Scanning {len(batch_files)} batch files...")

for f in tqdm.tqdm(batch_files, desc="Loading Data"):
    try:
        path = os.path.join(parquet_dir, f)

        # Load batch (only relevant columns)
        df = pd.read_parquet(path, columns=load_columns)

        # --- NEW FILTERING STEP ---
        # Strictly filter for model_id == 1
        df = df[df['model_id'] == 1]

        if df.empty:
            continue

        # Create a tuple key column for filtering
        # Note: Vectorized zip is faster than apply
        current_keys = list(zip(
            df['pdb_id'],
            df['model_id'],
            df['chain_id']
        ))

        # Filter: keep rows where the key is in our required set
        mask = [k in required_keys for k in current_keys]

        if not any(mask):
            continue # Nothing useful in this batch

        filtered_df = df[mask].copy()

        # Group by chain to extract Matrix and Sequence
        # We groupby the full key
        grouped = filtered_df.groupby(id_cols)

        for key, group in grouped:
            # key is the tuple (pdb, model, chain, start, length)

            # Extract Matrix
            mat = group[bri_cols].to_numpy()

            # Extract Sequence
            labels = group[seq_col]
            if len(labels) > 0 and isinstance(labels.iloc[0], str):
                # Standard case: sequence of characters
                seq = "".join(labels)
            else:
                seq = ""

            chain_data_store[key] = {'mat': mat, 'seq': seq}

        del df, filtered_df, mask, current_keys
        # gc.collect()

    except Exception as e:
        print(f"Error reading {f}: {e}")

print(f"Successfully loaded {len(chain_data_store)} chains into memory.")

# ==============================================================================
# 3. Compute Distances
# ==============================================================================
print("Computing pairwise comparisons...")

results_list = []

# Iterate through pairs and lookup data from memory
for idx, row in tqdm.tqdm(pairs_df.iterrows(), total=len(pairs_df), desc="Comparing"):

    # CHANGE: Hardcoded '1' for model_id in the lookup key
    key1 = (row['pdb_id_1'], 1, row['chain_id_1'])
    key2 = (row['pdb_id_2'], 1, row['chain_id_2'])

    # Retrieve data
    if key1 not in chain_data_store or key2 not in chain_data_store:
        # Should not happen if logic is correct, but safe to skip
        continue

    data1 = chain_data_store[key1]
    data2 = chain_data_store[key2]

    mat1 = data1['mat']
    mat2 = data2['mat']

    # Check length compatibility
    min_len = min(len(mat1), len(mat2))

    # Compute Distance
    dist = np.max(np.abs(mat1[:min_len] - mat2[:min_len]))

    # Check Threshold
    if dist <= full_dist_threshold:
        seq1 = data1['seq']
        seq2 = data2['seq']

        res_row = row.to_dict()
        res_row['full_chebyshev_dist'] = dist
        res_row['sequence_1'] = seq1
        res_row['sequence_2'] = seq2
        res_row['sequences_identical'] = 1 if seq1 == seq2 else 0

        results_list.append(res_row)

# ==============================================================================
# 4. Save Results
# ==============================================================================
if results_list:
    final_df = pd.DataFrame(results_list)
    final_df.to_csv(output_full_diff_file, index=False)
    print(f"Saved {len(final_df)} passing pairs to {output_full_diff_file}")
else:
    print("No pairs passed the full distance threshold.")

Loading pairs to identify relevant chains...
Building set of required chains (Assuming Model ID = 1)...
Total unique chains to load: 1165
Scanning 146 batch files...


Loading Data: 100%|██████████| 146/146 [01:09<00:00,  2.11it/s]


Successfully loaded 1165 chains into memory.
Computing pairwise comparisons...


Comparing: 100%|██████████| 5062/5062 [00:00<00:00, 10985.24it/s]


Saved 7 passing pairs to ./data/PDB727K_full_comparison_results_001_seq_openfold.csv


In [10]:
final_df

Unnamed: 0,pdb_id_1,chain_id_1,author_chain_id_1,chain_length_1,batch_number_1,pdb_id_2,chain_id_2,author_chain_id_2,chain_length_2,batch_number_2,chebyshev_dist,full_chebyshev_dist,sequence_1,sequence_2,sequences_identical
0,3IYL,A,A,41,57,5ZVT,K,A,41,84,0.0,0.0,GNVQTSVNTYNITGDGNSFTPTSDMTSTAAPAIDLKPGVLN,GNVQTSVNTYNITGDGNSFTPTSDMTSTAAPAIDLKPGVLN,1
1,4G6D,A,A,62,64,4G94,A,A,62,64,0.0,0.0,MKEQLEDVLDTLTDREENVLRLRFGLDDGRTRTLEEVGKVFGVTRE...,MKEQLEDVLDTLTDREENVLRLRFGLDDGRTRTLEEVGKVFGVTRE...,1
2,2BSQ,E,E,68,23,2H1O,G,E,68,29,0.0,0.0,ASVVIRNLSEATHNAIKFRARAAGRSTEAEIRLILDNIAKAQQTVR...,ASVVIRNLSEATHNAIKFRARAAGRSTEAEIRLILDNIAKAQQTVR...,1
3,2BSQ,A,A,143,23,2H1O,C,A,143,29,0.0,0.0,MILLDTNVISEPLRPQPNERVVAWLDSLILEDVYLSAITVAEMRLG...,MILLDTNVISEPLRPQPNERVVAWLDSLILEDVYLSAITVAEMRLG...,1
4,1CE7,A,A,241,2,2MLL,A,A,241,46,0.0,0.0,YERGDLDVTAQTTGAGYFSFITLLRDYVSSGSFSNAIPLLSQSGGG...,YERGDLDVTAQTTGAGYFSFITLLRDYVSSGSFSNAIPLLSQSGGG...,0
5,5MXN,B,A,473,78,5OJQ,B,A,473,79,6.3e-05,0.003,GSLLDEIMAQTRIAPSEEGYDIAKKGVAAFIENLMGSQHSAEPVNK...,GSLLDEIMAQTRCAPSEEGYDIAKKGVAAFIENLMGSQHSAEPVNK...,0
6,7ADJ,A,A,613,105,7ADK,A,A,613,105,0.0,0.0,RKQTITIAGIEVEAEIEGPPGFVTHQRDKDRKISNPTKPYQNHTVN...,RKQTITIAGIEVEAEIEGPPGFVTHQRDKDRKISNPTKPYQNHTVN...,1


In [11]:
# @title 7. Visualization
# Generate histogram of pairwise L-inf distances.

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Configuration
restrict_suffix = ""
input_file = f"./data/PDB727K_full_comparison_results_001_seq_openfold.csv"
output_dir = './plotting/nearest_neighbours_001A_openfold'

# 1. Create Output Directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Check if file exists
if os.path.exists(input_file):
    print("Loading data for plotting...")
    df = pd.read_csv(input_file)

    # Filter data
    diff_seq_data = df[df['sequences_identical'] == 0]
    same_seq_data = df[df['sequences_identical'] == 1]

    # Common Plot Settings
    x_label = r'$L_{\infty}$ distance on pairs of BRI, Angstroms'
    y_label = 'Pairs of close chains'
    bins_range = (0, 0.01)
    bin_width = 0.001

    # Helper function to generate plots efficiently
    def generate_histogram(data, color, filename_suffix, log_scale=False):
        plt.figure(figsize=(10, 6))
        sns.set_style("whitegrid")
        sns.set(font_scale=1.2)

        # Plot (Note: edgecolor removed to drop black border)
        sns.histplot(
            data=data,
            x='full_chebyshev_dist',
            binwidth=bin_width,
            binrange=bins_range,
            color=color,
            element="bars",
            linewidth=0  # Explicitly ensure no border
        )

        if log_scale:
            plt.yscale('log')
            filename_suffix += "_log"

        plt.xlabel(x_label)
        plt.ylabel(y_label)
        # No Title

        plt.tight_layout()

        # Construct filename
        filename = f'PDB727K_pairwise_BRI_comparisons_{filename_suffix}.png'
        save_path = os.path.join(output_dir, filename)

        plt.savefig(save_path)
        plt.close()

        print(f"Saved: {save_path}")

    # --- Generate the 4 Plots ---

    # 1. Linear Scale
    generate_histogram(diff_seq_data, 'orange', 'different_seq', log_scale=False)
    generate_histogram(same_seq_data, 'cornflowerblue', 'identical_seq', log_scale=False)

    # 2. Log Scale
    generate_histogram(diff_seq_data, 'orange', 'different_seq', log_scale=True)
    generate_histogram(same_seq_data, 'cornflowerblue', 'identical_seq', log_scale=True)

else:
    print(f"Input file not found: {input_file}")
    print("Please ensure you have run the 'Full Comparison' step to generate the results CSV.")


Loading data for plotting...
Saved: ./plotting/nearest_neighbours_001A_openfold/PDB727K_pairwise_BRI_comparisons_different_seq.png
Saved: ./plotting/nearest_neighbours_001A_openfold/PDB727K_pairwise_BRI_comparisons_identical_seq.png
Saved: ./plotting/nearest_neighbours_001A_openfold/PDB727K_pairwise_BRI_comparisons_different_seq_log.png
Saved: ./plotting/nearest_neighbours_001A_openfold/PDB727K_pairwise_BRI_comparisons_identical_seq_log.png
