In [28]:
import os
from Bio import Entrez
import json

Entrez.email = "VIMVer@univ-amu.fr"

Vazy1_dir:str = "./../../../../testymolo/media/tables_csv/"
Vazy2_dir:str = "./../../../../testymolo/media/data/transfert/"



# Tables

### Methods

In [30]:
### caution when not all rows same nbr of col
def custom_csv_parser_to_list(infilepath:str):
    with open(infilepath ,'r') as handle:

        rows:list = []

        for line in handle.readlines():
            line = line.strip(';') # removing terminal ';' 
            values:list = [] # re-initialize value list
            val:str = "" # current parsing value
            inquote:bool = False # re-initialize in quote : False
            quote_char:str = '' # ' or "
            for c in line:
                if(c == ','): # encounter comma
                    if(not inquote): ## and not in quote
                        val = val.strip("'")
                        val = val.strip('"')
                        if(len(val) > 0):
                            values.append(val)
                        else:
                            values.append(None)
                        val = ""
                        continue
                elif(c == '"' or c == "'"): # encounter quote
                    if(inquote): # already in quote 
                        if(quote_char == c):  # encounter ending quote mark
                            inquote = False
                        else: # encounter the other quote mark 
                            pass
                    else: # encounter starting quote mark
                        inquote = True
                        quote_char = c
                elif((c == ' ' or c == '\n') and not inquote):  # get rid of whitespaces
                    continue
                val += c
            if(len(val)>0):
                val = val.strip("'")
                val = val.strip('"')
                values.append(val)

            if(len(values) > 0):
                rows.append(values)
                #print(values)  
                
        #print("nbr of col", set([len(row) for row in rows]))
        #print("nbr of rows", len(rows))

        return rows
    
def csv_to_json(data:list, table_headers):
    return [ { table_headers[i]:item[i] for i in range(len(item)) } for item in data ]

In [32]:
def fetch_phylogeny(accession):
    
    def get_tax_data(taxid):
        try:
            search = Entrez.efetch(id=taxid, db='taxonomy', retmode='xml')
            return Entrez.read(search)
        except:
            return []

    def extract_TaxId(taxon):
        return taxon['TaxId']

    def extract_Name(taxon):
        return taxon['ScientificName']

    def extract_phylo(taxon):
        return taxon['Lineage']
    
    for result in get_tax_data(accession):
        taxon = {
            'id':extract_TaxId(result),
            'name':extract_Name(result),
            'phylogeny':extract_phylo(result)
            }
        #taxon_json = json.dumps(taxon)
        #print(taxon_json)
    
        return taxon

In [41]:
def parse_table(table_name:str, table_headers:tuple) -> list:
    return csv_to_json(custom_csv_parser_to_list(os.path.join(Vazy1_dir, table_name+'.csv')), table_headers)

### Variables

In [29]:
Vazy2_Organism_headers:tuple = ("id", "name", "abr", "group", "phylogeny", "complete")
Vazy2_Organism_json:list = []

VAZY_2:dict = {}
Vazy2_Organism_json:list = []

In [42]:
VAZY_1:dict = {
    'Organism': parse_table('Organism', ("Tax_id", "Name", "Categorie", "Classe", "Ordre", "Fam", "SsFam", "Genre", "Note_org")),
    'CAZy_DB': parse_table('CAZy_DB', ("DB_ac", "Protein", "DB_nom", "Organism", "abr", "Tax_id", "EC", "_3D_status", "Length", "Sequence", "DB_note", "Created", "Modified", "PP_status", "Lib_sort")),
    'CAZy_GB_GP': parse_table('CAZy_GB_GP', ("DB_ac", "GP_ac", "GB_ac", "GP_gi", "GP_gene", "GP_genomic", "GP_begin", "GP_end", "GP_note")),
    'CAZy_PDB': parse_table('CAZy_PDB', ("DB_ac", "PDB_id", "PDB_chain", "PDB_begin", "PDB_end", "PDB_note", "PDB_bornModo")),
    'CAZy_PP': parse_table('CAZy_PP', ("DB_ac", "PP_ac", "PP_gi", "PP_gene", "PP_begin", "PP_end", "PP_note", "PP_diff")),
    'CAZy_SP': parse_table('CAZy_SP', ("DB_ac", "SP_ac", "SP_id", "SP_gene", "SP_begin", "SP_end", "SP_note")),
    'CAZyModO': parse_table('CAZyModO', ("URL_family", "URL_title", "URL_body", "URL_relative", "URL_note", "Modified")),
    'Cz_EC' : parse_table('Cz_EC', ("DB_ac", "EC")),
    'EC_num': parse_table('EC_num', ("EC", "EC_name", "EC_othername", "EC_reaction", "EC_comment")),
    'ModO_Composition': parse_table('ModO_Composition', ("DB_ac", "Mod_num", "Family", "Subf")),
    'ModO_CrossRefs': parse_table('ModO_CrossRefs', ("Family", "Link", "URL_base", "URL_string", "Ref_note")),
    'ModO_Families': parse_table('ModO_Families', ("Family", "Family_Name", "Family_Activity", "Family_Taxo", "Clan", "Family_note", "Family_Private_note", "Fold", "ModoS_uniqfct", "ModoS_Activity", "ModoS_Description", "ModoS_Fold", "ModoS_note", "ModoS_Private_note", "web_descript", "web_status")),
    'ModO_Limits': parse_table('ModO_Limits', ("DB_ac", "Mod_num", "Mod_begin", "Mod_end") ),
    'Motifs':parse_table('Motifs', ("motifs_ac", "motifs_name", "motifs_activity", "motifs_classification", "motifs_form", "motifs_note")),
    'Prot_Infos':parse_table('Prot_Infos', ("DB_ac", "INFO_indice", "INFO_ac", "INFO_Nature", "INFO", "INFO_pmid")),
    'Prot_MOTIF':parse_table('Prot_MOTIF', ("DB_ac", "MOTIF_indice", "MOTIF_ac", "MOTIF_name", "MOTIF_begin", "MOTIF_end", "MOTIF", "MOTIF_activity", "MOTIF_class", "MOTIF_note", "MOTIF_ref", "MOTIF_pmid")),
    'Prot_MUT': parse_table('Prot_MUT', ("DB_ac", "MUT_indice", "MUT_ac", "MUT", "MUT_note", "MUT_ref", "MUT_pmid")),
    'Prot_REG': parse_table('Prot_REG', ("DB_ac", "REG_indice", "REG_ac", "REG_name", "REG_begin", "REG_end", "REG_note", "REG_ref", "REG_pmid")),
    'Prot_RI': parse_table('Prot_RI',("DB_ac", "RI_indice", "RI_ac", "RI_name", "RI_begin", "RI_end", "RI_no_limit", "RI_note", "RI_ref", "RI_pmid")),
}

print(VAZY_1['Organism'][0])


{'Tax_id': '11020', 'Name': 'Barmah Forest virus', 'Categorie': 'Viruses', 'Classe': 'ssRNA positive-strand viruses, no DNA stage', 'Ordre': 'nd', 'Fam': 'Togaviridae', 'SsFam': 'nd', 'Genre': 'Alphavirus', 'Note_org': 'BFV complex'}


### Resolve

In [35]:
for organism in Vazy1_Organism_json:
    taxon:dict = fetch_phylogeny(organism['Tax_id'])
    if taxon:
        if "Nidovirales" in taxon['phylogeny']:
            if not organism['Tax_id'] in [Vazy2_item['id'] for Vazy2_item in Vazy2_Organism_json]:
                Vazy2_Organism_json.append(taxon)

# Vazy1

### Variables

In [57]:
#import json
Nidovirales_taxid:list = []
Vazy2_json:list = []
with open("./../../../../testymolo/media/data/Organism.temp.json") as handle:
    Vazy2_json:list = json.load(handle)
Nidovirales_taxid = [ item['id'] for item in Vazy2_json]
Nidovirales_taxid.sort()
print(Nidovirales_taxid)

#TaxId:list = [ int(Org['Tax_id']) for Org in VAZY_1['Organism'] ]
#TaxId.sort()
#print(TaxId)

[11120, 11128, 11137, 11138, 11149, 28295, 31631, 227984, 233262, 694007, 694008, 1335626, 2697049]


## Methods

In [68]:
def filter__CAZy_DB__by__taxid(taxid:int) -> list:
    result:list=[]
    for item in VAZY_1['CAZy_DB']:
        if str(item['Tax_id']) == str(taxid):
            result.append(item)
    #print(result)
    return result

In [79]:
trace:dict = {}

for organism in VAZY_1['Organism']:

    taxid:str = organism['Tax_id']

    #print(taxid)
    #print(Nidovirales_taxid)

    for nido_taxid in Nidovirales_taxid:
        if taxid == str(nido_taxid):

            print(taxid)

            trace[str(taxid)] = { **organism, 'CAZy_DB':filter__CAZy_DB__by__taxid(taxid)}
            print(trace)

        break

    #break


11120
{'11120': {'Tax_id': '11120', 'Name': 'Avian infectious bronchitis virus', 'Categorie': 'Viruses', 'Classe': 'ssRNA positive-strand viruses, no DNA stage', 'Ordre': 'Nidovirales', 'Fam': 'Coronaviridae', 'SsFam': 'nd', 'Genre': 'Coronavirus', 'Note_org': 'NULL', 'CAZy_DB': [{'DB_ac': '473', 'Protein': 'ORF1ab polyprotein', 'DB_nom': 'None', 'Organism': 'Avian infectious bronchitis virus [Beaudette]', 'abr': 'AIBV[Beaudette]', 'Tax_id': '11120', 'EC': 'None', '_3D_status': 'None', 'Length': '6629', 'Sequence': 'MASSLKQGVSPKPRDVILVSKDIPEQLCDALFFYTSHNPKDYADAFAVRQKFDRSLQTGKQFKFETVCGLFLLKGVDKITPGVPAKVLKATSKLADLEDIFGVSPLARKYRELLKTACQWSLTVEALDVRAQTLDEIFDPTEILWLQVAAKIHVSSMAMRRLVGEVTAKVMDALGSNLSALFQIVKQQIARIFQKALAIFENVNELPQRIAALKMAFAKCARSITVVVVERTLVVKEFAGTCLASINGAVAKFFEELPNGFMGSKIFTTLAFFKEAAVRVVENIPNAPRGTKGFEVVGNAKGTQVVVRGMRNDLTLLDQKADIPVEPEGWSAILDGHLCYVFRSGDRFYAAPLSGNFALSDVHCCERVVCLSDGVTPEINDGLILAAIYSSFSVSELVTALKKGEPFKFLGHKFVYAKDAAVSFTLAKAATIADVLRLFQSARVIAEDVWSSFTEKSFEFWKLAYGKVRNLEEFVKTY