In [16]:
import pandas as pd
from Bio import SeqIO
import seaborn as sns


# Pairs each BLAST-hits FASTA file with the proteome CSV used to annotate its headers.
# NOTE: the name `map` shadows the built-in map() for the rest of this cell.
map = {
    f'../results/PipCYP719A37/hits{sample}.pep': f'../data/proteomes/piper{sample}/proteome{sample}.csv'
    for sample in ('09', '23', '25', '26', '27')
}

# For every hits FASTA: append '_expr<mean>' to each header, where <mean> is the
# rounded mean of the matching proteome row's 'rep*' columns, then rewrite the
# FASTA in place. Headers that already contain '_expr' are left unchanged, so
# re-running the cell does not stack suffixes.
for hits_path, proteome_path in map.items():

    # Parse the hits once; the same records are re-headed and written back below.
    records = list(SeqIO.parse(hits_path, 'fasta'))
    if not records:
        continue  # empty hits file: nothing to annotate

    hits = pd.DataFrame({'full_id': [record.description for record in records]})
    # Protein id = header text before the first '_cov'.
    hits['short_id'] = [full_id.split('_cov')[0] for full_id in hits['full_id']]

    #import proteome
    proteome = pd.read_csv(proteome_path, index_col='protein_id')
    rep_columns = [col for col in proteome.columns if 'rep' in col]
    proteome['mean'] = proteome[rep_columns].mean(axis=1).round(decimals=0)  #calculate mean
    proteome['stdev'] = proteome[rep_columns].std(axis=1).round(decimals=0)  #calculate stdev (not used further in this cell)

    # Look up one mean per hit, in record order. Duplicate protein ids keep their
    # first row so the lookup can never add rows and shift headers relative to
    # `records`.
    mean_by_protein = proteome.loc[~proteome.index.duplicated(), 'mean']
    hit_means = hits['short_id'].map(mean_by_protein)

    # Hits without a proteome entry (or without a computable mean) get 'NA'
    # instead of failing on the NaN -> int conversion.
    expr_suffix = '_expr' + hit_means.map(lambda value: 'NA' if pd.isna(value) else str(int(value)))

    #add mean to the headers that do not carry it yet
    needs_suffix = ~hits['full_id'].str.contains('_expr', regex=False)
    hits.loc[needs_suffix, 'full_id'] = hits.loc[needs_suffix, 'full_id'] + expr_suffix[needs_suffix]

    # Update headers using SeqIO; pairing by iteration order avoids any
    # label-vs-position ambiguity when indexing the Series.
    for record, new_id in zip(records, hits['full_id']):
        record.id = new_id
        record.description = ''
        record.name = ''

    SeqIO.write(records, hits_path, 'fasta-2line')


#concatenate hits
# Build one frame per hits file and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
hit_frames = []

for hits_path in map.keys():
    full_ids = [record.id for record in SeqIO.parse(hits_path, 'fasta')]
    hit_frames.append(pd.DataFrame({
        'full_id': full_ids,
        'short_id': [full_id.split('_cov')[0] for full_id in full_ids],  #short id
    }))

if hit_frames:
    all_hits = pd.concat(hit_frames, axis=0, ignore_index=True)
else:
    # pd.concat rejects an empty list; keep the expected columns instead.
    all_hits = pd.DataFrame(columns=['full_id', 'short_id'])

#add orthogroup_id column; '' marks "no orthogroup assigned yet". Explicit object
#dtype so integer group ids can later be stored next to the '' placeholder.
all_hits['orthog_id'] = pd.Series('', index=all_hits.index, dtype=object)

#import sonicparanoid orthogroups
sonicparanoid_path = '../results/orthologs/runs/piperNET/ortholog_groups/ortholog_groups.tsv'
sonicparanoid = pd.read_csv(sonicparanoid_path, sep='\t')


#extract orthogroup id for each hit
# species tag found in the hit headers -> column of the orthogroup table to search
legend = {'Pfim': 'proteome09.pep',
          'Parb': 'proteome23.pep',
          'Pama': 'proteome25.pep',
          'Psar': 'proteome26.pep',
          'Pkad': 'proteome27.pep'}

orthogroup_of = {}  # short_id -> group_id of the first orthogroup row mentioning it

for species, species_column in legend.items():

    species_members = sonicparanoid[species_column]

    # Search each distinct header once, however many hits share it.
    for header in all_hits['short_id'].unique():

        if species in header:
            # regex=False: protein ids are literal text, not patterns.
            # na=False: empty cells count as "no match" instead of producing an
            # NA mask that cannot be used for boolean indexing.
            # NOTE(review): this is still a substring match, so an id that is a
            # prefix of another id in the same cell would also match - confirm
            # the id format rules that out.
            in_group = species_members.str.contains(header, regex=False, na=False)
            group_ids = sonicparanoid.loc[in_group, 'group_id']
            if not group_ids.empty:
                orthogroup_of[header] = int(group_ids.iloc[0])

# Hits without a match keep whatever value the column already holds.
all_hits['orthog_id'] = [orthogroup_of.get(short_id, current)
                         for short_id, current in zip(all_hits['short_id'], all_hits['orthog_id'])]


#Export all_hits dataframe
all_hits.to_csv('../results/PipCYP719A37/orthogroups.csv', index=False)


###Create itol mapping file
# Writes an iTOL TREE_COLORS annotation file: one tab-separated row per hit with
# its id, the literal 'range', a hex color per orthogroup, the orthogroup id and
# the literal 'orthogroup'.
itol = all_hits.copy()

#assign colors to orthogroups
# '' marks hits without an orthogroup; it is not a group itself, so it must not
# take a palette color or be counted in the log below.
has_orthogroup = itol['orthog_id'] != ''
unique_orthog = itol.loc[has_orthogroup, 'orthog_id'].unique()
palette = ['#2ECC71', '#DAF7A6', '#B7950B', '#FFACAC', '#FF0000', '#48C9B0', '#FBFF00', '#6C3483', '#797D7F', '#85C1E9', '#F7DC6F', '#AEB6BF', '#0E6251', '#E67E22', '#3333FF', '#FF2DF5']
colors = sns.color_palette(palette, n_colors=len(unique_orthog))
hex_colors = [f'#{int(x[0]*255):02x}{int(x[1]*255):02x}{int(x[2]*255):02x}' for x in colors] #convert to hex format
color_map = dict(zip(unique_orthog, hex_colors)) #create dictionary
itol['color'] = itol['orthog_id'].map(color_map).fillna('#ffffff') #white for hits with no orthog_id

#adjust to itol template
itol_header = 'TREE_COLORS\nSEPARATOR TAB\nDATA\n'
itol_text = pd.DataFrame({
    'full_id': itol['full_id'],
    'type': 'range',
    'color': itol['color'],
    'orthog_id': itol['orthog_id'],
    'label': 'orthogroup',
})
itol_file = itol_header + itol_text.to_csv(sep='\t', header=False, index=False)

#write file
with open('../results/PipCYP719A37/itol.txt', 'w') as file:
    file.write(itol_file)

#log
print('N° of unique orthogroups:', len(unique_orthog))
print('N° of colors', len(palette))
if len(unique_orthog) > len(palette):
    # seaborn cycles a list palette when asked for more colors than it holds.
    print('Warning: more orthogroups than colors, some orthogroups share a color')

N° of unique orthogroups: 9
N° of colors 16
