In [1]:
import os
import re
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [2]:
def hash_gene_to_prot_id(cds_path: str, gene_re=r'\[gene=(.*?)\]', prot_id_re=r'\[protein_id=(.*?)\]') -> dict:
    """Map every gene name found in a CDS FASTA file to its protein id.

    Args:
        cds_path: Path to a FASTA file whose record descriptions carry
            bracketed fields such as ``[gene=...]`` and ``[protein_id=...]``.
        gene_re: Regex whose first group captures the gene name.
        prot_id_re: Regex whose first group captures the protein id.

    Returns:
        Dict of gene name -> protein id. Records without a gene field are
        skipped; records with a gene but no protein id map to ``''``. If a
        gene name occurs more than once, the last record wins.
    """
    gene_to_prot_id = dict()

    # The context manager closes the handle; the bare open() used before leaked it.
    with open(cds_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            gene_match = re.search(gene_re, record.description)
            if not gene_match:
                continue

            prot_id = re.search(prot_id_re, record.description)
            gene_to_prot_id[gene_match.group(1)] = prot_id.group(1) if prot_id else ''

    return gene_to_prot_id

In [3]:
def hash_locus_tag_to_prot_id(cds_path: str, tag_re=r'\[locus_tag=(.*?)\]', prot_id_re=r'\[protein_id=(.*?)\]') -> dict:
    """Map every locus tag found in a CDS FASTA file to its protein id.

    Args:
        cds_path: Path to a FASTA file whose record descriptions carry
            bracketed fields such as ``[locus_tag=...]`` and ``[protein_id=...]``.
        tag_re: Regex whose first group captures the locus tag.
        prot_id_re: Regex whose first group captures the protein id.

    Returns:
        Dict of locus tag -> protein id. Records without a locus tag are
        skipped; records with a tag but no protein id map to ``''``. If a
        tag occurs more than once, the last record wins.
    """
    # The defaults are raw strings now: '\[' in a plain literal is an invalid
    # escape sequence that newer Python versions warn about. The pattern text
    # itself is unchanged.
    tag_to_prot_id = dict()

    # The context manager closes the handle; the bare open() used before leaked it.
    with open(cds_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            tag_match = re.search(tag_re, record.description)
            if not tag_match:
                continue

            prot_id = re.search(prot_id_re, record.description)
            tag_to_prot_id[tag_match.group(1)] = prot_id.group(1) if prot_id else ''

    return tag_to_prot_id

In [25]:
def get_gene_name_from_locus_tag(cds_path: str, tag: str, gene_re=r'\[gene=(.*?)\]') -> str:
    """Return the gene name of the first CDS record carrying the given locus tag.

    Args:
        cds_path: Path to a FASTA file whose record descriptions carry
            bracketed fields such as ``[locus_tag=...]`` and ``[gene=...]``.
        tag: Locus tag to look for; it is matched literally.
        gene_re: Regex whose first group captures the gene name.

    Returns:
        The gene name, or ``''`` (after printing a diagnostic) when either the
        tag is not present or the matching record has no gene field.
    """
    # re.escape keeps regex metacharacters in the tag (e.g. '.') from matching
    # other tags; raw literals avoid the invalid '\[' escape sequence.
    tag_re = r'\[locus_tag=(' + re.escape(tag) + r')\]'

    # Stream the records and stop at the first hit; the handle is closed on exit.
    with open(cds_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            if not re.search(tag_re, record.description):
                continue

            gene_id = re.search(gene_re, record.description)
            if gene_id:
                return gene_id.group(1)

            print('Tag', tag, 'found in', record.name, 'but no gene name matching', gene_re, 'was found.')
            return ''

    print('Could not find gene name from locus tag', tag)
    return ''

In [6]:
def get_prot_id_from_gene_name(cds_path: str, gene_name: str, prot_id_re=r'\[protein_id=(.*?)\]') -> str:
    """Return the protein id of the first CDS record carrying the given gene name.

    Args:
        cds_path: Path to a FASTA file whose record descriptions carry
            bracketed fields such as ``[gene=...]`` and ``[protein_id=...]``.
        gene_name: Gene name to look for; it is matched literally.
        prot_id_re: Regex whose first group captures the protein id.

    Returns:
        The protein id, or ``''`` (after printing a diagnostic) when either the
        gene is not present or the matching record has no protein id field.
    """
    # re.escape keeps regex metacharacters in the gene name from matching other
    # genes; raw literals avoid the invalid '\[' escape sequence.
    gene_re = r'\[gene=(' + re.escape(gene_name) + r')\]'

    # Stream the records and stop at the first hit; the handle is closed on exit.
    with open(cds_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            if not re.search(gene_re, record.description):
                continue

            prot_id = re.search(prot_id_re, record.description)
            if prot_id:
                return prot_id.group(1)

            print('Gene', gene_name, 'found in', record.name, 'but no protein id matching', prot_id_re, 'was found.')
            return ''

    print('Could not find protein id from gene name', gene_name)
    return ''

In [24]:
def get_prot_id_from_locus_tag(cds_path: str, tag: str, prot_id_re=r'\[protein_id=(.*?)\]') -> str:
    """Return the protein id of the first CDS record carrying the given locus tag.

    Args:
        cds_path: Path to a FASTA file whose record descriptions carry
            bracketed fields such as ``[locus_tag=...]`` and ``[protein_id=...]``.
        tag: Locus tag to look for; it is matched literally.
        prot_id_re: Regex whose first group captures the protein id.

    Returns:
        The protein id, or ``''`` (after printing a diagnostic) when either the
        tag is not present or the matching record has no protein id field.
    """
    # Anchor the match to the [locus_tag=...] field. Previously the pattern was
    # built against 'gene=' and then ignored, and the bare tag was searched for
    # anywhere in the description, so 'KLMA_1' also matched 'KLMA_10'.
    tag_re = r'\[locus_tag=(' + re.escape(tag) + r')\]'

    # Stream the records and stop at the first hit; the handle is closed on exit.
    with open(cds_path) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            if not re.search(tag_re, record.description):
                continue

            prot_id = re.search(prot_id_re, record.description)
            if prot_id:
                return prot_id.group(1)

            print('Tag', tag, 'found in', record.name, 'but no protein id matching', prot_id_re, 'was found.')
            return ''

    print('Locus tag', tag, 'not found.')
    return ''

In [4]:
# Reference species whose protein ids are used to look up orthogroups.
species = 'kmarxianus'
cds_directory = '../../data/input/cds/'
output_directory = os.path.join(cds_directory, 'orthogroups', '')
ortho_path = '../../data/input/orthogroups/Orthogroups.tsv'

df = pd.read_csv(ortho_path, sep='\t')
# Keep only orthogroups that have an entry for every species (no NaN cell).
shared = df.dropna()
# The first column holds the orthogroup id; the remaining columns are species.
species_names = shared.columns[1:]
n_species = len(species_names)

# CDS files are expected to be named '<species>_...'; the prefix selects them.
# sorted() makes the result independent of the arbitrary os.listdir order.
file_paths = [
    os.path.join(cds_directory, name)
    for name in sorted(os.listdir(cds_directory))
    if name.split('_')[0] in species_names
]

# Match on the exact file-name prefix rather than a substring of the whole
# path, so a species name that also occurs in a directory or in another file
# name cannot be assigned the wrong file. Keys follow the species column order;
# if several files share a prefix, the last one in file_paths wins.
path_dict = dict()
for name in species_names:
    for path in file_paths:
        if os.path.basename(path).split('_')[0] == name:
            path_dict[name] = path

In [5]:
# Display the species -> CDS FASTA path mapping.
path_dict

{'hpolymorpha': '../../data/input/cds/hpolymorpha_cds.fna',
 'iorientalis': '../../data/input/cds/iorientalis_cds.fna',
 'klactis': '../../data/input/cds/klactis_cds.fna',
 'kmarxianus': '../../data/input/cds/kmarxianus_cds.fna',
 'kphaffii': '../../data/input/cds/kphaffii_cds.fna',
 'rtoruloides': '../../data/input/cds/rtoruloides_cds.fna',
 'scerevisiae': '../../data/input/cds/scerevisiae_cds.fna',
 'ylipolytica': '../../data/input/cds/ylipolytica_cds.fna'}

In [6]:
# Auxotrophy marker genes identified by gene name.
auxotroph_names = [
    'ILV1', 'ILV3',
    'HIS2', 'HIS4', 'HIS7',
    'ARG1', 'ARG3', 'ARG4',
    'LYS1', 'LYS2', 'LYS4', 'LYS9',
    'ADE1', 'ADE2', 'ADE6', 'ADE8',
    'TRP5',
    'LEU1',
    'THR4',
    'ADE13',
    'MET2',
]

# Auxotrophy marker genes identified by locus tag instead of gene name.
auxotroph_tags = ['KLMA_50610']

In [7]:
# Two-way mapping between auxotrophy markers (gene name or locus tag) and the
# protein id of the reference species, restricted to markers whose protein
# sits in an orthogroup shared by every species.
name_to_prot_id = dict()
prot_id_to_name = dict()

gene_to_prot_id_hash = hash_gene_to_prot_id(path_dict[species])
tag_to_prot_id_hash = hash_locus_tag_to_prot_id(path_dict[species])

# One entry per marker kind:
# (markers, marker -> protein id lookup, wording for "no orthogroup", wording for "not found").
auxotroph_sources = [
    (auxotroph_names, gene_to_prot_id_hash, 'Auxo gene', 'Auxo gene name'),
    (auxotroph_tags, tag_to_prot_id_hash, 'Auxo tag', 'Auxo tag'),
]

for markers, marker_to_prot_id, kind, missing_kind in auxotroph_sources:
    for auxo in markers:
        if auxo not in marker_to_prot_id:
            print(missing_kind, auxo, 'not found in', path_dict[species])
            continue

        # Named prot_id rather than id so the builtin id() is not shadowed.
        prot_id = marker_to_prot_id[auxo]

        if (shared[species] == prot_id).any():
            name_to_prot_id[auxo] = prot_id
            prot_id_to_name[prot_id] = auxo
        else:
            print(kind, auxo, 'has no orthogroup across all', n_species, 'species.')
            # Show the partial orthogroup (if any) from the unfiltered table.
            print(df[df[species] == prot_id])


print(len(name_to_prot_id), 'out of', len(auxotroph_names) + len(auxotroph_tags), species, 'have orthogroups across all species.')

name_to_prot_id

Auxo gene ARG1 has no orthogroup across all 8 species.
     Orthogroup     hpolymorpha     iorientalis      klactis      kmarxianus  \
3196  OG0003196  XP_018210803.1  XP_029321458.1  XP_452377.1  XP_022673716.1   

            kphaffii     rtoruloides  scerevisiae ylipolytica  
3196  XP_002492426.1  XP_016270319.1  NP_014583.2         NaN  
21 out of 22 kmarxianus have orthogroups across all species.


{'ILV1': 'XP_022676761.1',
 'ILV3': 'XP_022676106.1',
 'HIS2': 'XP_022673956.1',
 'HIS4': 'XP_022673604.1',
 'HIS7': 'XP_022674739.1',
 'ARG3': 'XP_022675310.1',
 'ARG4': 'XP_022673953.1',
 'LYS1': 'XP_022676213.1',
 'LYS2': 'XP_022678348.1',
 'LYS4': 'XP_022675655.1',
 'LYS9': 'XP_022675869.1',
 'ADE1': 'XP_022673725.1',
 'ADE2': 'XP_022674914.1',
 'ADE6': 'XP_022676747.1',
 'ADE8': 'XP_022675776.1',
 'TRP5': 'XP_022674386.1',
 'LEU1': 'XP_022676623.1',
 'THR4': 'XP_022675622.1',
 'ADE13': 'XP_022675223.1',
 'MET2': 'XP_022673628.1',
 'KLMA_50610': 'XP_022677059.1'}

In [8]:
# Inspect the shared-orthogroup row whose reference-species protein id is
# XP_022676623.1 (the id listed for LEU1 in name_to_prot_id).
shared[shared[species] == 'XP_022676623.1']

Unnamed: 0,Orthogroup,hpolymorpha,iorientalis,klactis,kmarxianus,kphaffii,rtoruloides,scerevisiae,ylipolytica
1284,OG0001284,XP_018213552.1,XP_029320833.1,XP_451218.1,XP_022676623.1,XP_002491269.1,XP_016275100.1,NP_011506.1,AOW01073.1


In [9]:
# Species -> list of protein ids for the orthogroup containing XP_022676623.1;
# the leading orthogroup-id column is dropped.
species_name_to_prot_id = (
    shared.loc[shared[species] == 'XP_022676623.1']
    .iloc[:, 1:]
    .to_dict(orient='list')
)

species_name_to_prot_id

{'hpolymorpha': ['XP_018213552.1'],
 'iorientalis': ['XP_029320833.1'],
 'klactis': ['XP_451218.1'],
 'kmarxianus': ['XP_022676623.1'],
 'kphaffii': ['XP_002491269.1'],
 'rtoruloides': ['XP_016275100.1'],
 'scerevisiae': ['NP_011506.1'],
 'ylipolytica': ['AOW01073.1']}

In [10]:
# Two-way mapping between the reference-species protein ids of the retained
# auxotrophy markers and their orthogroup ids.
prot_id_to_orthogroup = dict()
orthogroup_to_prot_id = dict()

# The loop variable is prot_id rather than id so the builtin id() is not
# shadowed for the rest of the kernel session.
for prot_id in name_to_prot_id.values():
    # Every id in name_to_prot_id was already found in `shared`, so the
    # selection has at least one row; take the first.
    group = shared.loc[shared[species] == prot_id, 'Orthogroup'].iloc[0]
    prot_id_to_orthogroup[prot_id] = group
    orthogroup_to_prot_id[group] = prot_id

orthogroup_to_prot_id

{'OG0000749': 'XP_022676761.1',
 'OG0000711': 'XP_022676106.1',
 'OG0002298': 'XP_022673956.1',
 'OG0001311': 'XP_022673604.1',
 'OG0002783': 'XP_022674739.1',
 'OG0002020': 'XP_022675310.1',
 'OG0002525': 'XP_022673953.1',
 'OG0001014': 'XP_022676213.1',
 'OG0000627': 'XP_022678348.1',
 'OG0002625': 'XP_022675655.1',
 'OG0000909': 'XP_022675869.1',
 'OG0001168': 'XP_022673725.1',
 'OG0001150': 'XP_022674914.1',
 'OG0001300': 'XP_022676747.1',
 'OG0001931': 'XP_022675776.1',
 'OG0000513': 'XP_022674386.1',
 'OG0001284': 'XP_022676623.1',
 'OG0001065': 'XP_022675622.1',
 'OG0001151': 'XP_022675223.1',
 'OG0001180': 'XP_022673628.1',
 'OG0001633': 'XP_022677059.1'}

In [None]:
# OG0001300 4 
# OG0000627 5

In [11]:
# Resolve orthogroup OG0001300 to its marker name via the protein id.
prot_id_to_name[orthogroup_to_prot_id['OG0001300']]

'ADE6'

In [12]:
# Reference-species protein id recorded for orthogroup OG0001300.
orthogroup_to_prot_id['OG0001300']

'XP_022676747.1'