In [137]:
from itertools import combinations
import os

In [138]:
def parse_fasta(file_path):
    organisms = set()
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith(">"):
                organism = line.split("[")[-1].rstrip("]\n")
                organisms.add(organism)
    return organisms

In [139]:
def get_data(dir_):
    file_organisms = {}
    for file_name in os.listdir(dir_):
        if file_name.endswith(".txt"):
            organisms = parse_fasta(os.path.join(dir_, file_name))
            file_organisms[file_name] = organisms
    return file_organisms

In [154]:
def find_common_organisms_subset(files, min_common=6, files_wanted=8):
    best_subset_ = []
    best_common_organisms = set()

    for sub_length in range(files_wanted, 1, -1):
        for subset in combinations(list(files.keys()), sub_length):
            common_organisms_ = set.intersection(*(files[file] for file in subset))

            if len(common_organisms_) >= min_common:
                if len(common_organisms_) > len(best_common_organisms):
                    best_subset_ = subset
                    best_common_organisms = common_organisms_
                    break
        if best_subset_:
            break

    return len(best_subset_), len(best_common_organisms), best_subset_, best_common_organisms

In [155]:
data = get_data("aa")

In [156]:
subset_length, common_organisms_length, subset_files, common_organisms = find_common_organisms_subset(data)

In [157]:
print("Subset length:", subset_length)
print("Number of common organisms:", common_organisms_length)

Subset length: 8
Number of common organisms: 6


In [159]:
print("Subset files:", subset_files)

Subset files: ('cathepsin.txt', 'prothrombin.txt', 'thyroxine.txt', 'galectin.txt', 'lactase.txt', 'albumin.txt', 'collagen.txt', 'amylase.txt')


In [164]:
import numpy as np
print("Common organisms:", np.sort(list(common_organisms)))

Common organisms: ['Chlorocebus sabaeus' 'Macaca fascicularis' 'Macaca mulatta'
 'Papio anubis' 'Rhinopithecus roxellana' 'Trachypithecus francoisi']


### Updating fasta file to include headers

In [221]:
input_fasta = "data/db/combined_sequences.fasta"
output_fasta = "combined_sequences.fasta"

with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
        for line in infile:
            if line.startswith(">"):
                # Parse and clean up header
                parts = line.strip().split(" ", 1)
                seq_id = parts[0]  # e.g., >XP_007997053.1
                description = parts[1] if len(parts) > 1 else "Unknown"
                if "[" in description and "]" in description:
                    organism = description.split("[")[1].split("]")[0].replace(" ", "_")
                    protein = description.split("[")[0].strip().replace(" ", "_")
                    corrected_header = f"{seq_id}{protein}_{organism}"
                else:
                    corrected_header = seq_id
                outfile.write(corrected_header + "\n")
            else:
                # Write sequence lines as-is
                outfile.write(line)
