In [83]:
# The plan: get the GO classification of all 631 targeted orthogenes

In [84]:
from Bio import SeqIO
import json
import pandas as pd
import os
import re

guide_gene_names = dict()

with open('1k_ascomycota_library.txt', 'r') as f:
    for l in f.readlines():
        guide_gene_names[l.strip()] = list()

files = os.listdir('out')
# ortholog_re = r'\[orthologous_to_gene=(.*?)\]'

total_genes = 0
for f in files:
    guide_target_names = dict()
    target_to_loc = dict()

    with open(f'out/{f}', 'r') as inf:
        lines = inf.readlines()

        for l in lines:
            if l.startswith(','):
                continue
            
            splitted = l.split(',')

            guide_target_names.setdefault(splitted[1][:20], list()).append((splitted[3], splitted[4].strip(), splitted[2])) 
    
    try:
        records = SeqIO.parse(f'target_sequences/{f}', 'fasta')
    except FileNotFoundError:
        continue

    desc_to_id = {}
    for r in records:
        try:
            desc_to_id[r.id] = re.search(r'\[protein_id=(.*?)\]', r.description).group(1)
        except AttributeError:
            print('No protein ID found for', f, r.id)
    
    total_genes += len(desc_to_id)

    for k, vset in guide_target_names.items():
        for v in vset:
            try:
                guide_gene_names[k].append((f, v[0], desc_to_id[v[0]], v[1], v[2]))
            except KeyError:
                print(f'Key {v[0]} in file {f} doesnt exist.')
                continue

# guide_gene_names

No protein ID found for candida_subhashii_cds.fna lcl|NW_026057298.1_cds_J8A68_002940_4074
Key lcl|NW_026057298.1_cds_J8A68_002940_4074 in file candida_subhashii_cds.fna doesnt exist.
No protein ID found for candida_oxycetoniae_cds.fna lcl|NW_026055383.1_cds_KGF56_001880_4178
Key lcl|NW_026055383.1_cds_KGF56_001880_4178 in file candida_oxycetoniae_cds.fna doesnt exist.
No protein ID found for sphaceloma_murrayae_cds.fna lcl|NKHZ01000081.1_cds_CAC42_2365_1953
Key lcl|NKHZ01000081.1_cds_CAC42_2365_1953 in file sphaceloma_murrayae_cds.fna doesnt exist.
No protein ID found for sordaria_macrospora_cds.fna lcl|NW_020185353.1_cds_SMAC_04436_4942
Key lcl|NW_020185353.1_cds_SMAC_04436_4942 in file sordaria_macrospora_cds.fna doesnt exist.
No protein ID found for botrytis_paeoniae_cds.fna lcl|PQXI01000356.1_cds_BPAE_0358g00010_11459
Key lcl|PQXI01000356.1_cds_BPAE_0358g00010_11459 in file botrytis_paeoniae_cds.fna doesnt exist.
Key lcl|PQXI01000356.1_cds_BPAE_0358g00010_11459 in file botrytis_pa

In [85]:
guide_gene_names

{'AGAAGAGGAGGAGGAGGAGG': [('podospora_comata_cds.fna',
   'PODCO_203180_t1',
   'PODCO_203180_t1',
   '914',
   '+'),
  ('podospora_comata_cds.fna',
   'PODCO_119030_t1',
   'PODCO_119030_t1',
   '592',
   '+'),
  ('podospora_comata_cds.fna',
   'PODCO_307360_t1',
   'PODCO_307360_t1',
   '2612',
   '+'),
  ('podospora_comata_cds.fna',
   'PODCO_209610_t1',
   'PODCO_209610_t1',
   '1024',
   '-'),
  ('aspergillus_nanangensis_cds.fna',
   'lcl|VCAU01000043.1_cds_KAF9888792.1_8261',
   'KAF9888792.1',
   '620',
   '+'),
  ('aspergillus_saccharolyticus_cds.fna',
   'lcl|NW_020290641.1_cds_XP_025428793.1_7183',
   'XP_025428793.1',
   '326',
   '+'),
  ('aspergillus_saccharolyticus_cds.fna',
   'lcl|NW_020290677.1_cds_XP_025426466.1_9592',
   'XP_025426466.1',
   '28',
   '-'),
  ('aspergillus_uvarum_cds.fna',
   'lcl|NW_020291469.1_cds_XP_025485509.1_12015',
   'XP_025485509.1',
   '44',
   '+'),
  ('aspergillus_uvarum_cds.fna',
   'lcl|NW_020291317.1_cds_XP_025493037.1_4515',
   'XP_025

In [86]:
reverse_index = dict()

for k, vlist in guide_gene_names.items():
    for vtuple in vlist:
        reverse_index.setdefault(vtuple[2], list()).append((vtuple[1], vtuple[0], k, vtuple[3], vtuple[4]))

reverse_index

{'PODCO_203180_t1': [('PODCO_203180_t1',
   'podospora_comata_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '914',
   '+')],
 'PODCO_119030_t1': [('PODCO_119030_t1',
   'podospora_comata_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '592',
   '+')],
 'PODCO_307360_t1': [('PODCO_307360_t1',
   'podospora_comata_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '2612',
   '+')],
 'PODCO_209610_t1': [('PODCO_209610_t1',
   'podospora_comata_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '1024',
   '-')],
 'KAF9888792.1': [('lcl|VCAU01000043.1_cds_KAF9888792.1_8261',
   'aspergillus_nanangensis_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '620',
   '+')],
 'XP_025428793.1': [('lcl|NW_020290641.1_cds_XP_025428793.1_7183',
   'aspergillus_saccharolyticus_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '326',
   '+')],
 'XP_025426466.1': [('lcl|NW_020290677.1_cds_XP_025426466.1_9592',
   'aspergillus_saccharolyticus_cds.fna',
   'AGAAGAGGAGGAGGAGGAGG',
   '28',
   '-')],
 'XP_025485509.1': [('lcl|NW_020291469.1_cds_XP_025485509.1_120

In [87]:
# species_list = list()
# prot_ids_list = list()
# gene_ids_list = list()

# bio_proc_list = list()
# bio_proc_names_list = list()

# mol_func_list = list()
# mol_func_names_list = list()

# cell_comp_list = list()
# cell_comp_names_list = list()

# superfamily_list = list()
# superfamily_locs_list = list()
# superfamily_acc_list = list()

# family_list = list()
# family_locs_list = list()
# family_acc_list = list()

# domain_list = list()
# domain_locs_list = list()
# domain_acc_list = list()

# guides_list = list()
# target_locs_list = list()

# all_go_terms = dict()

# for f in os.listdir('interpro_results'):
#     with open(f'interpro_results/{f}', 'r') as json_file:
#         data = json.load(json_file)
#         for prot_id, v in data.items():
#             mol_func_set = set()
#             bio_proc_set = set()
#             cell_comp_set = set()

#             superfamily_dict = dict()
#             family_dict = dict()
#             domain_dict = dict()

#             superfamily_locs_str = ''
#             family_locs_str = ''
#             domain_locs_str = ''

#             for element in data[prot_id]['results'][0]['matches']:
#                 if element['signature']['entry'] != None:

#                     if element['signature']['entry']['type'] == 'DOMAIN':
#                         if element['signature']['entry']['name'] not in domain_dict:
#                             domain_dict[element['signature']['entry']['name']] = ''
#                             all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']

#                             for loc in element['locations']:
#                                 domain_dict[element['signature']['entry']['name']] += str(loc['start']) + '-' + str(loc['end'])
#                                 if len(element['locations']) > 1:
#                                     domain_dict[element['signature']['entry']['name']] += '|'
                            
#                             if domain_dict[element['signature']['entry']['name']][-1] == '|':
#                                 domain_dict[element['signature']['entry']['name']] = domain_dict[element['signature']['entry']['name']][:-1]

#                     elif element['signature']['entry']['type'] == 'FAMILY':
#                         if element['signature']['entry']['name'] not in family_dict:
#                             family_dict[element['signature']['entry']['name']] = ''
#                             all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                        
#                             for loc in element['locations']:
#                                 family_dict[element['signature']['entry']['name']] += str(loc['start']) + '-' + str(loc['end'])
#                                 if len(element['locations']) > 1:
#                                     family_dict[element['signature']['entry']['name']] += '|'
                            
#                             if family_dict[element['signature']['entry']['name']][-1] == '|':
#                                 family_dict[element['signature']['entry']['name']] = family_dict[element['signature']['entry']['name']][:-1]

#                     elif element['signature']['entry']['type'] == 'HOMOLOGOUS_SUPERFAMILY':
#                         if element['signature']['entry']['name'] not in superfamily_dict:
#                             superfamily_dict[element['signature']['entry']['name']] = ''
#                             all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']

#                             for loc in element['locations']:
#                                 superfamily_dict[element['signature']['entry']['name']] += str(loc['start']) + '-' + str(loc['end'])
#                                 if len(element['locations']) > 1:
#                                     superfamily_dict[element['signature']['entry']['name']] += '|'
                            
#                             if superfamily_dict[element['signature']['entry']['name']][-1] == '|':
#                                 superfamily_dict[element['signature']['entry']['name']] = superfamily_dict[element['signature']['entry']['name']][:-1]
                            
#                 if 'goXRefs' in element.keys():
#                     for go_category in element['goXRefs']:
#                         if go_category['category'] == 'MOLECULAR_FUNCTION':
#                             mol_func_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
#                         elif go_category['category'] == 'BIOLOGICAL_PROCESS':
#                             bio_proc_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
#                         elif go_category['category'] == 'CELLULAR_COMPONENT':
#                             cell_comp_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
                
#                 if 'entry' in element['signature'].keys() and isinstance(element['signature']['entry'], dict) and 'goXRefs' in element['signature']['entry'].keys():
#                     for go_category in element['signature']['entry']['goXRefs']:
#                         if go_category['category'] == 'MOLECULAR_FUNCTION':
#                             mol_func_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
#                         elif go_category['category'] == 'BIOLOGICAL_PROCESS':
#                             bio_proc_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
#                         elif go_category['category'] == 'CELLULAR_COMPONENT':
#                             cell_comp_set.add(go_category['id'])
#                             all_go_terms[go_category['id']] = go_category['name']
            
#             for tup in reverse_index[prot_id]:
#                 s = ''
#                 l = list()
#                 for m in mol_func_set:
#                     s += m + ';'
#                     l.append(m)
#                 s = s[:-1]
#                 mol_func_list.append(s)

#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 mol_func_names_list.append(s)
#                 # ---

#                 s = ''
#                 l = list()
#                 for m in bio_proc_set:
#                     s += m + ';'
#                     l.append(m)
#                 s = s[:-1]
#                 bio_proc_list.append(s)

#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 bio_proc_names_list.append(s)
#                 # ---

#                 s = ''
#                 l = list()
#                 for m in cell_comp_set:
#                     s += m + ';'
#                     l.append(m)
#                 s = s[:-1]
#                 cell_comp_list.append(s)
            
#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 cell_comp_names_list.append(s)
#                 # ---

#                 dom_str = ''
#                 dom_loc_str = ''
#                 l = list()
#                 for k, v in domain_dict.items():
#                     dom_str += k + ';'
#                     dom_loc_str += v + ';'
#                     l.append(k)
#                 dom_str = dom_str[:-1]
#                 dom_loc_str = dom_loc_str[:-1]

#                 domain_list.append(dom_str)
#                 domain_locs_list.append(dom_loc_str)

#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 domain_acc_list.append(s)
#                 # ---

#                 fam_str = ''
#                 fam_loc_str = ''
#                 l = list()
#                 for k, v in family_dict.items():
#                     fam_str += k + ';'
#                     fam_loc_str += v + ';'
#                     l.append(k)
#                 fam_str = fam_str[:-1]
#                 fam_loc_str = fam_loc_str[:-1]

#                 family_list.append(fam_str)
#                 family_locs_list.append(fam_loc_str)

#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 family_acc_list.append(s)
#                 # ---

#                 superf_str = ''
#                 superf_loc_str = ''
#                 l = list()
#                 for k, v in superfamily_dict.items():
#                     superf_str += k + ';'
#                     superf_loc_str += v + ';'
#                     l.append(k)
#                 superf_str = superf_str[:-1]
#                 superf_loc_str = superf_loc_str[:-1]

#                 superfamily_list.append(superf_str)
#                 superfamily_locs_list.append(superf_loc_str)

#                 s = ''
#                 for m in l:
#                     s += all_go_terms[m] + ';'
#                 s = s[:-1]
#                 superfamily_acc_list.append(s)        

#                 prot_ids_list.append(prot_id)
#                 gene_ids_list.append(tup[0])
#                 species_list.append(tup[1])
#                 guides_list.append(tup[2])
#                 target_locs_list.append(tup[3])


In [88]:
species_list = list()
prot_ids_list = list()
gene_ids_list = list()

bio_proc_list = list()
bio_proc_names_list = list()

mol_func_list = list()
mol_func_names_list = list()

cell_comp_list = list()
cell_comp_names_list = list()

superfamily_list = list()
superfamily_locs_list = list()
superfamily_acc_list = list()

family_list = list()
family_locs_list = list()
family_acc_list = list()

domain_list = list()
domain_locs_list = list()
domain_acc_list = list()

conserved_site_list = list()
conserved_site_locs_list = list()
conserved_site_acc_list = list()

binding_site_list = list()
binding_site_locs_list = list()
binding_site_acc_list = list()

active_site_list = list()
active_site_locs_list = list()
active_site_acc_list = list()

repeats_list = list()
repeat_locs_list = list()
repeat_acc_list = list()

other_list = list()
other_locs_list = list()
other_acc_list = list()

guides_list = list()
target_locs_list = list()
target_strand_list = list()

all_go_terms = dict()

for f in os.listdir('interpro_results'):
    with open(f'interpro_results/{f}', 'r') as json_file:
        data = json.load(json_file)
        for prot_id, v in data.items():
            mol_func_set = set()
            bio_proc_set = set()
            cell_comp_set = set()

            superfamilies_str = ''
            superfamily_acc_str = ''
            superfamily_locs_str = ''

            families_str = ''
            family_acc_str = ''
            family_locs_str = ''

            domains_str = ''
            domain_acc_str = ''
            domain_locs_str = ''

            conserved_site_str = ''
            conserved_site_acc_str = ''
            conserved_site_locs_str = ''

            binding_site_str = ''
            binding_site_acc_str = ''
            binding_site_locs_str = ''

            active_site_str = ''
            active_site_acc_str = ''
            active_site_locs_str = ''
            
            repeats_str = ''
            repeat_acc_str = ''
            repeat_locs_str = ''

            other_str = ''
            other_acc_str = ''
            other_locs_str = ''

            for element in data[prot_id]['results'][0]['matches']:
                if element['signature']['entry'] != None:

                    if element['signature']['entry']['type'] == 'DOMAIN':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        domains_str += element['signature']['entry']['name']
                        domain_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            domain_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                domain_locs_str += '|'
                        
                        domains_str += ';'
                        domain_locs_str += ';'
                        domain_acc_str += ';'

                    elif element['signature']['entry']['type'] == 'FAMILY':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        families_str += element['signature']['entry']['name']
                        family_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            family_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                family_locs_str += '|'
                        
                        families_str += ';'
                        family_locs_str += ';'
                        family_acc_str += ';'
                        
                    elif element['signature']['entry']['type'] == 'HOMOLOGOUS_SUPERFAMILY':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        superfamilies_str += element['signature']['entry']['name']
                        superfamily_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            superfamily_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                superfamily_locs_str += '|'
                        
                        superfamilies_str += ';'
                        superfamily_locs_str += ';'
                        superfamily_acc_str += ';'

                    elif element['signature']['entry']['type'] == 'CONSERVED_SITE':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        conserved_site_str += element['signature']['entry']['name']
                        conserved_site_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            conserved_site_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                conserved_site_locs_str += '|'
                        
                        conserved_site_str += ';'
                        conserved_site_locs_str += ';'
                        conserved_site_acc_str += ';'

                    elif element['signature']['entry']['type'] == 'BINDING_SITE':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        binding_site_str += element['signature']['entry']['name']
                        binding_site_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            binding_site_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                binding_site_locs_str += '|'
                        
                        binding_site_str += ';'
                        binding_site_locs_str += ';'
                        binding_site_acc_str += ';'

                    elif element['signature']['entry']['type'] == 'ACTIVE_SITE':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        active_site_str += element['signature']['entry']['name']
                        active_site_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            active_site_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                active_site_locs_str += '|'
                        
                        active_site_str += ';'
                        active_site_locs_str += ';'
                        active_site_acc_str += ';'

                    elif element['signature']['entry']['type'] == 'REPEAT':
                        all_go_terms[element['signature']['entry']['name']] = element['signature']['entry']['accession']
                
                        repeats_str += element['signature']['entry']['name']
                        repeat_acc_str += element['signature']['entry']['accession']

                        for idx, loc in enumerate(element['locations']):
                            repeat_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                repeat_locs_str += '|'
                        
                        repeats_str += ';'
                        repeat_locs_str += ';'
                        repeat_acc_str += ';'

                else:
                    if element['signature']['name'] != None and element['signature']['name'] != '' and element['signature']['accession'] != None and element['signature']['accession'] != '':
                        other_str += element['signature']['name']
                        other_acc_str += element['signature']['accession']

                        for idx, loc in enumerate(element['locations']):
                            other_locs_str += str(loc['start']) + '-' + str(loc['end'])
                            if len(element['locations']) > idx + 1:
                                other_locs_str += '|'
                        
                        other_str += ';'
                        other_locs_str += ';'
                        other_acc_str += ';'

                if 'goXRefs' in element.keys():
                    for go_category in element['goXRefs']:
                        if go_category['category'] == 'MOLECULAR_FUNCTION':
                            mol_func_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
                        elif go_category['category'] == 'BIOLOGICAL_PROCESS':
                            bio_proc_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
                        elif go_category['category'] == 'CELLULAR_COMPONENT':
                            cell_comp_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
                
                if 'entry' in element['signature'].keys() and isinstance(element['signature']['entry'], dict) and 'goXRefs' in element['signature']['entry'].keys():
                    for go_category in element['signature']['entry']['goXRefs']:
                        if go_category['category'] == 'MOLECULAR_FUNCTION':
                            mol_func_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
                        elif go_category['category'] == 'BIOLOGICAL_PROCESS':
                            bio_proc_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
                        elif go_category['category'] == 'CELLULAR_COMPONENT':
                            cell_comp_set.add(go_category['id'])
                            all_go_terms[go_category['id']] = go_category['name']
            
            for tup in reverse_index[prot_id]:
                s = ''
                l = list()
                for m in mol_func_set:
                    s += m + ';'
                    l.append(m)
                s = s[:-1]
                mol_func_list.append(s)

                s = ''
                for m in l:
                    s += all_go_terms[m] + ';'
                s = s[:-1]
                mol_func_names_list.append(s)
                # ---

                s = ''
                l = list()
                for m in bio_proc_set:
                    s += m + ';'
                    l.append(m)
                s = s[:-1]
                bio_proc_list.append(s)

                s = ''
                for m in l:
                    s += all_go_terms[m] + ';'
                s = s[:-1]
                bio_proc_names_list.append(s)
                # ---

                s = ''
                l = list()
                for m in cell_comp_set:
                    s += m + ';'
                    l.append(m)
                s = s[:-1]
                cell_comp_list.append(s)
            
                s = ''
                for m in l:
                    s += all_go_terms[m] + ';'
                s = s[:-1]
                cell_comp_names_list.append(s)
                # --- 

                domain_list.append(domains_str[:-1])
                domain_locs_list.append(domain_locs_str[:-1])
                domain_acc_list.append(domain_acc_str[:-1])

                family_list.append(families_str[:-1])
                family_locs_list.append(family_locs_str[:-1])
                family_acc_list.append(family_acc_str[:-1])

                superfamily_list.append(superfamilies_str[:-1])
                superfamily_locs_list.append(superfamily_locs_str[:-1])
                superfamily_acc_list.append(superfamily_acc_str[:-1])

                conserved_site_list.append(conserved_site_str[:-1])
                conserved_site_locs_list.append(conserved_site_locs_str[:-1])
                conserved_site_acc_list.append(conserved_site_acc_str[:-1])

                binding_site_list.append(binding_site_str[:-1])
                binding_site_locs_list.append(binding_site_locs_str[:-1])
                binding_site_acc_list.append(binding_site_acc_str[:-1])

                active_site_list.append(active_site_str[:-1])
                active_site_locs_list.append(active_site_locs_str[:-1])
                active_site_acc_list.append(active_site_acc_str[:-1])

                repeats_list.append(repeats_str[:-1])
                repeat_locs_list.append(repeat_locs_str[:-1])
                repeat_acc_list.append(repeat_acc_str[:-1])

                other_list.append(other_str[:-1])
                other_locs_list.append(other_locs_str[:-1])
                other_acc_list.append(other_acc_str[:-1])

                prot_ids_list.append(prot_id)
                gene_ids_list.append(tup[0])
                species_list.append(tup[1])
                guides_list.append(tup[2])
                target_locs_list.append(tup[3])
                target_strand_list.append(tup[4])

In [89]:
headers = ['species_name', 'prot_id', 'gene_id',
           'biological_process_go', 'biological_process_go_name',
           'molecular_function_go', 'molecular_function_go_name',
           'cellular_component_go', 'cellular_component_go_name',
           'domains', 'domain_accessions', 'domain_locations', 'families', 'family_accessions', 'family_locations',
           'superfamilies', 'superfamily_accessions', 'superfamily_locations',
           'conserved_sites', 'conserved_site_accessions', 'conserved_site_locations',
           'binding_sites', 'binding_site_accessions', 'binding_site_locations',
           'active_sites', 'active_site_accessions', 'active_site_locations',
           'repeats', 'repeat_accessions', 'repeat_locations',
           'other_features', 'other_feature_accessions', 'other_feature_locations',
           'targeting_guide', 'target_strand', 'guide_nuc_start_position']

df = pd.DataFrame(list(zip(species_list, prot_ids_list, gene_ids_list,
                      bio_proc_list, bio_proc_names_list,
                      mol_func_list, mol_func_names_list,
                      cell_comp_list, cell_comp_names_list,
                      domain_list, domain_acc_list, domain_locs_list,
                      family_list, family_acc_list, family_locs_list,
                      superfamily_list, superfamily_acc_list, superfamily_locs_list,
                      conserved_site_list, conserved_site_acc_list, conserved_site_locs_list,
                      binding_site_list, binding_site_acc_list, binding_site_locs_list,
                      active_site_list, active_site_acc_list, active_site_locs_list,
                      repeats_list, repeat_acc_list, repeat_locs_list,
                      other_list, other_acc_list, other_locs_list,
                      guides_list, target_strand_list, target_locs_list)), columns=headers)

def start(s):
    return (s + 2) // 3

def end(s):
    return (s + 22 + 2) // 3

df['guide_nuc_start_position'] = df['guide_nuc_start_position'].astype(int) + 1  # Interpro is 1-based. start_position is 0-based. This fixes it.
df['prot_start'] = df['guide_nuc_start_position'].apply(start)
df['prot_end'] = df['guide_nuc_start_position'].apply(end)
df['prot_ranges'] = df.apply(lambda row: f"{row['prot_start']}-{row['prot_end']}", axis=1)
df.drop(columns=['prot_start', 'prot_end'], inplace=True)
df.fillna('N/A', inplace=True)
df.replace('', 'N/A', inplace=True)

df.to_csv('interpro_results.csv', index=False)

In [90]:
df

Unnamed: 0,species_name,prot_id,gene_id,biological_process_go,biological_process_go_name,molecular_function_go,molecular_function_go_name,cellular_component_go,cellular_component_go_name,domains,...,repeats,repeat_accessions,repeat_locations,other_features,other_feature_accessions,other_feature_locations,targeting_guide,target_strand,guide_nuc_start_position,prot_ranges
0,kluyveromyces_lactis_cds.fna,XP_451741.1,lcl|NC_006038.1_cds_XP_451741.1_696,,,GO:0003755,peptidyl-prolyl cis-trans isomerase activity,GO:0000785;GO:0005730,chromatin;nucleolus,PPIase_FKBP_dom;NPL;PPIase_FKBP_dom,...,,,,Coil;Nucleoplasmin core domain;disorder_predic...,Coil;G3DSA:2.60.120.340;mobidb-lite;SSF54534;C...,103-123;9-181;172-273;264-418;250-288;11-417;1...,AGAGGAAGAGGAAGAGGAAG,+,321,107-115
1,kluyveromyces_lactis_cds.fna,XP_451741.1,lcl|NC_006038.1_cds_XP_451741.1_696,,,GO:0003755,peptidyl-prolyl cis-trans isomerase activity,GO:0000785;GO:0005730,chromatin;nucleolus,PPIase_FKBP_dom;NPL;PPIase_FKBP_dom,...,,,,Coil;Nucleoplasmin core domain;disorder_predic...,Coil;G3DSA:2.60.120.340;mobidb-lite;SSF54534;C...,103-123;9-181;172-273;264-418;250-288;11-417;1...,AGAGGAAGAGGAAGAGGAAG,+,768,256-264
2,kluyveromyces_lactis_cds.fna,XP_451741.1,lcl|NC_006038.1_cds_XP_451741.1_696,,,GO:0003755,peptidyl-prolyl cis-trans isomerase activity,GO:0000785;GO:0005730,chromatin;nucleolus,PPIase_FKBP_dom;NPL;PPIase_FKBP_dom,...,,,,Coil;Nucleoplasmin core domain;disorder_predic...,Coil;G3DSA:2.60.120.340;mobidb-lite;SSF54534;C...,103-123;9-181;172-273;264-418;250-288;11-417;1...,AGAGGAAGAGGAAGAGGAAG,+,762,254-262
3,kluyveromyces_lactis_cds.fna,XP_451741.1,lcl|NC_006038.1_cds_XP_451741.1_696,,,GO:0003755,peptidyl-prolyl cis-trans isomerase activity,GO:0000785;GO:0005730,chromatin;nucleolus,PPIase_FKBP_dom;NPL;PPIase_FKBP_dom,...,,,,Coil;Nucleoplasmin core domain;disorder_predic...,Coil;G3DSA:2.60.120.340;mobidb-lite;SSF54534;C...,103-123;9-181;172-273;264-418;250-288;11-417;1...,AGAGGAAGAGGAAGAGGAAG,+,756,252-260
4,kluyveromyces_lactis_cds.fna,XP_453196.1,lcl|NC_006040.1_cds_XP_453196.1_2079,,,GO:0061133;GO:0070628,endopeptidase activator activity;proteasome bi...,GO:0008541;GO:0005634;GO:0005737,"proteasome regulatory particle, lid subcomplex...",Rpn13/ADRM1_Pru;Rpn13/ADRM1_Pru,...,,,,Coil;disorder_prediction;disorder_prediction;d...,Coil;mobidb-lite;mobidb-lite;mobidb-lite,152-186;168-206;183-198;168-182,AGAGGAAGAGGAAGAGGAAG,+,507,169-177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,aplosporella_prunicola_cds.fna,XP_033397713.1,lcl|NW_022983540.1_cds_XP_033397713.1_5413,,,,,,,,...,,,,disorder_prediction;Coil;disorder_prediction;d...,mobidb-lite;Coil;mobidb-lite;mobidb-lite;mobid...,665-684;47-72;290-306;105-131;513-527;665-724;...,AGAAGAGGAGGAGGAGGAGG,+,186,62-70
8689,aplosporella_prunicola_cds.fna,XP_033401083.1,lcl|NW_022983532.1_cds_XP_033401083.1_1800,GO:0000278;GO:0000226;GO:0007017,mitotic cell cycle;microtubule cytoskeleton or...,GO:0005525;GO:0003924;GO:0005200,GTP binding;GTPase activity;structural constit...,GO:0005737;GO:0005874,cytoplasm;microtubule,Tubulin_FtsZ_GTPase;Tubulin/FtsZ_2-layer-sand-...,...,,,,disorder_prediction;beta_tubulin;Coil;disorder...,mobidb-lite;cd02187;Coil;mobidb-lite;Coil,421-446;2-426;408-428;431-446;323-343,GCTTTCTTGCATTGGTACAC,+,1177,393-400
8690,aplosporella_prunicola_cds.fna,XP_033401625.1,lcl|NW_022983531.1_cds_XP_033401625.1_1234,,,,,,,,...,,,,Coil;disorder_prediction;disorder_prediction;d...,Coil;mobidb-lite;mobidb-lite;mobidb-lite;mobid...,233-271;170-190;1-48;1-279;136-152;448-470;7-5...,AGAAGAGGAGGAGGAGGAGG,+,1356,452-460
8691,aplosporella_prunicola_cds.fna,XP_033402011.1,lcl|NW_022983531.1_cds_XP_033402011.1_1622,,,,,,,,...,,,,disorder_prediction;disorder_prediction;disord...,mobidb-lite;mobidb-lite;mobidb-lite;mobidb-lit...,136-150;99-264;169-208;216-232;1-28;150-170;45-85,AGAAGAGGAGGAGGAGGAGG,+,459,153-161
