Make connection to `UniProt` using `bioservices`

In [1]:
from bioservices import UniProt

# Connect to UniProt service
u = UniProt(verbose=False)

Creating directory /Users/lpritc/Library/Caches/bioservices 


## Let's get the domains for the entire tree and plot them

In [32]:
from collections import defaultdict  # LP: required for parsing the domain data
from pathlib import Path

# We'll need ETE3 to parse and render the tree
from ete3 import Tree, TreeStyle, NodeStyle, TextFace, CircleFace, faces, SeqMotifFace
from tqdm.notebook import tqdm
import pandas as pd

In [6]:
# File locations and other constants
treepath = Path("EYROOTED.tree")  # transporter family tree, Newick format
annopath = Path("all_annotations.csv")     # annotations for some family members
# tr = Tree('EYROOTED.tree')  # LP: I commented this out as the path is defined above and is what is used below
stem = "eyrooted"  # LP: I changed the stem so that it's differentiated from others on my machine

In [7]:
# Load the tree
# We have these as functions so that we can annotate/render a clean tree each time
def load_tree():
    tree = Tree(str(treepath))
    return tree

# Load the annotation
def load_annotation():
    anno = pd.read_csv(annopath)
    return anno

# The function lets us return the tree and its corresponding annotation,
# where the leaves of this specific tree object are present in the
# annotation dataframe, and so can be manipulated easily.
def annotate_tree():
    tree = load_tree()
    anno = load_annotation()

    leaves = []  # Will hold leaves for the tree where they match the annotation row, or None if there is none
    
    for id in anno["0"]:               # iterate over annotations
        for leaf in tree.iter_leaves():  # iterate over all leaves in the tree
            assigned = False
            if id in str(leaf):
                leaves.append(leaf)
                assigned = True
                break
        if not assigned:
            leaves.append(None)
    
    anno["leaves"] = leaves

    return tree, anno

the error happens on trees that previousy rendered fine

In [8]:
tree, anno = annotate_tree()  # get clean tree

# Declare tree style
everything = TreeStyle()
everything.show_leaf_name = False
everything.mode = "c"

# One text face for each kingdom
face_dict = {"Eukaryota": TextFace("eukaryota"),
             "Bacteria": TextFace("bacteria"),
             "Archaea": TextFace("archaea")}

# Set colours for kingdoms
colour_dict = {"Eukaryota": "#FFFACD", "Bacteria": "#F0F8FF", "Archaea": "#FFE4E1"}

# Set face colours for kingdoms
colour_dict = {"Eukaryota": "#FFFACD", "Bacteria": "#F0F8FF", "Archaea": "#FFE4E1"}

# Set colours for residue types
rescolours = {"Y": "#FFCC66", "E": "#009933", "M": "#9966FF"}

# Iterate over leaves
for idx, row in anno.iterrows():
    # Get leaf information
    leaf = row["leaves"]
    restype = row["first"]
    fourres = row["four"]    
    kingdomname = row["kingdom"].strip()    

    # Style the leaf node
    leaf.img_style["bgcolor"] = colour_dict[kingdomname]    
    leaf.add_face(face_dict[kingdomname], 1, "aligned")

    # Add gateway residue bubble
    if restype in ("Y", "E", "M"):
        face = CircleFace(radius=20, color=rescolours[restype], style="sphere", label=restype)
        face.opacity = 0.3
        leaf.add_face(face, 1, position="float")

    # Add four residues label
    if len(fourres):
        leaf.add_face(TextFace(fourres), 2, position="float")

tree.render(f"{stem}_everything_full.pdf", tree_style=everything, w=24, h=24, units="in");

**LP:** everything works fine for me to this point

got the info by using code I found on interpro. it came as a dictionary and i pulled the relevant data out of it

In [9]:
my_file = open("domain_info.txt", "r") 
  
# reading the file 
dat = my_file.read() 
  
# replacing end splitting the text  
# when newline ('\n') is seen. 
domain_info = dat.split("\n") 
print(domain_info) 
my_file.close() 

["A0A377PLP5: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "A0A097R891: [{'start': 34, 'end': 427, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  429", "A0A377PBY1: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "A0A1C6Z081: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "G9Y5B0: [{'start': 34, 'end': 427, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  429", "A0A2A2M8K7: [{'start': 34, 'end': 427, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  429", "E0SD83: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "D2BUJ8: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "C6CBB1: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "A0A1X3RP39: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  4

**LP:** Here I see data that looks like this:

```text
["A0A377PLP5: [{'start': 35, 'end': 428, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  430", "A0A097R891: [{'start': 34, 'end': 427, 'dc-status': 'CONTINUOUS'}]Ammonium Transporter Family  429", [...]
```

This is a list of strings. It needs to be parsed into the appropriate format, as you note below. We're aiming ultimately for a format that is a Python `list`:

```python
[start, end, shape, width, height, fgcolour, bgcolour, textlabel]
```

In my code, we need to parse a slightly recalcitrant UniProt output format that includes a header string and plain text. Here you've got a better-behaved file so we can handle this differently, keying a data dictionary by accession number, and referring to this directly when we draw the tree.

There's another tweak - you have potentially multiple motifs/lines in the file per accession, so we need to use a `defaultdict` with `list` type

In [46]:
# Parse the data from the domain info file into a dictionary keyed by accession number
domain_dict = defaultdict(list)

with Path("domain_info.txt").open() as ifh:    # Using a file context so that everything is closed when we're done
    for line in [_ for _ in ifh.readlines()]:  # Iterating over each line
        accn, data = line.split(": ", 1)       # Get the accession to use as a key, and the data for processing
        loc, name = data.split("]")            # Get the domain name for the text label
        namedata = name.strip().split()        # Split domain name and sequence length
        label, length = " ".join(namedata[:-1]), int(namedata[-1])
        locdata = loc.split(": ")
        start, end = int(locdata[1].split(",")[0]), int(locdata[2].split(",")[0])
        domain_dict[accn].append((start, end, label, length))  # Adds the current motif to a list of motifs

# Visualise the first few lines of content
print(f"{list(domain_dict.items())[:3]=}\n")

# Visualise a multidomain entry
print(f'{domain_dict["Q1Q357"]=}\n')

list(domain_dict.items())[:3]=[('A0A377PLP5', [(35, 428, 'Ammonium Transporter Family', 430)]), ('A0A097R891', [(34, 427, 'Ammonium Transporter Family', 429)]), ('A0A377PBY1', [(35, 428, 'Ammonium Transporter Family', 430)])]

domain_dict["Q1Q357"]=[(451, 518, 'His Kinase A (phospho-acceptor) domain', 679), (11, 408, 'Ammonium Transporter Family', 679), (565, 674, 'Histidine kinase-, DNA gyrase B-, and HSP90-like ATPase', 679)]



I wanted to create a dictionary that looks like yours so I can just comepletly reuse your code

**LP** That's not a bad idea, but the way I wrote the code in the previous example was tailored to the data I was collecting.

In that notebook, I had to iterate over each leaf in order to make a UniProt query to grab the data, and it made sense to key the dictionary by the leaf itself, jsts for convenience.

In this case, you have all the domain annotations at once and don't need to key the dictionary in the same way. To do that would require a look-up of the accession number in the tree, and that would be an unnecessary set of calculations. We can proceed in a way that's more appropriate for the data you have.

In [None]:
# dom_dict = {}
# dom_list = []


# for j in domain_info:
#     id = j.split(':')[0]
#     st = j.split("rt': ")[1]
#     sta = int(st.split(", 'e")[0])
#     e = st.split("d': ")[1]
#     en = int(e.split(", 'd")[0])
#     na = j.split("}]")[1]
#     nam = na.split("  ")[0]
#     len = j.split('  ')[1]
#     star_en = (f'\nDOMAIN {sta}..{en}; note="{nam}"')
#     dom_dict[str(id)] = (id, star_en, len)
#     dom_list.append(f'{id}: {id}, DOMAIN {sta}..{en}; note="{nam}", {len}')

the above needs to be merged and the key replaced by the leaf name

In [None]:
# merged_dict = {}
# merged_leaf ={}

# for entry in dom_list:
#     parts = entry.split(": ", 1)  # Split only on the first ": "
#     uniprot_id = parts[0]  # Extract ID
#     info_length = parts[1].rsplit(" ", 1)  # Split into information and length
#     information = info_length[0]
#     length = info_length[1]  


#     if uniprot_id in merged_dict:
#         merged_dict[uniprot_id] = (
#             uniprot_id, 
#             merged_dict[uniprot_id][1] + "; " + information,  # Properly concatenated info
#             length  # Keep the same length
#         )
#     else:
#         merged_dict[uniprot_id] = (uniprot_id, information, length)
#         for leaf in tr:
#             if leaf.name.split(' ')[-1] in uniprot_id:
#                 merged_leaf[str(leaf)] = (uniprot_id, information, length)
# # Print the final dictionary
# print(merged_leaf)


now i have a ditionary that looks superficially like yours

Having reproduced the tree from the previous notebook, let's get as many bits of domain information as we can get from UniProt.

In [49]:
def result_to_face(domains, fontsize=2):
    """Return a SeqMotifFace, based on the passed UniProt result
    
    LP: We need to make a change here because the result received by the function
        is not the same as in the previous notebook. It's a list of
        (start, end, label, legnth) tuples
    """
    # LP: We can lose most of the prep work here as we did it on parsing
    motifs = []
    for start, end, label, length in domains:
        # seq.start, seq.end, shape, width, height, fgcolor, bgcolor
        if "Ammonium Transporter" in label:   # LP: change here for how we check for a transporter domain
            motifs.append([start, end, "[]", None, 10, "black", "blue", f"arial|{fontsize}|white|{label}"])
        else:
            motifs.append([start, end, "[]", None, 10, "black", "red", f"arial|{fontsize}|white|{label}"])
    if len(motifs):
        if length is None:
            return SeqMotifFace(seq=None, motifs=motifs, seq_format="-")
        else:
            return SeqMotifFace(seq="-" * length, motifs=motifs, seq_format="-")
    return None

but when running the tree it gives the str not callable error even on something that has not been changed

**LP:** There were a few different reasons this was happening. The code comments above might help explain the reasons.

In [50]:
tree, anno = annotate_tree()  # get clean tree

# Declare tree style
everything = TreeStyle()
everything.show_leaf_name = False
everything.mode = "c"

# One text face for each kingdom
face_dict = {"Eukaryota": TextFace("eukaryota"),
             "Bacteria": TextFace("bacteria"),
             "Archaea": TextFace("archaea")}

# Set colours for kingdoms
colour_dict = {"Eukaryota": "#FFFACD", "Bacteria": "#F0F8FF", "Archaea": "#FFE4E1"}

# Set face colours for kingdoms
colour_dict = {"Eukaryota": "#FFFACD", "Bacteria": "#F0F8FF", "Archaea": "#FFE4E1"}

# Set colours for residue types
rescolours = {"Y": "#FFCC66", "E": "#009933", "M": "#9966FF"}

# Iterate over leaves
for idx, row in tqdm(anno.iterrows()):
    # Get leaf information
    leaf = row["leaves"]
    restype = row["first"]
    fourres = row["four"]    
    kingdomname = row["kingdom"].strip()    
    accn = str(leaf).split()[-1]     # LP: Use the accession to key/query the domain dictionary

    # Style the leaf node
    leaf.img_style["bgcolor"] = colour_dict[kingdomname]    
    leaf.add_face(face_dict[kingdomname], 1, "aligned")

    # Add gateway residue bubble
    if restype in ("Y", "E", "M"):
        face = CircleFace(radius=20, color=rescolours[restype], style="sphere", label=restype)
        face.opacity = 0.3
        leaf.add_face(face, 1, position="float")

    # Add four residues label
    if len(fourres):
        leaf.add_face(TextFace(fourres), 2, position="float")

    # Add motifs/domains
    # LP: Our first change is here - we want to use the leaf's accession number to
    #     query domain_dict
    motifs = result_to_face(domain_dict[accn])
    if motifs is not None:
        leaf.add_face(motifs, 0, "aligned")

tree.render(f"{stem}_domains.pdf", tree_style=everything, w=24, h=24, units="in");

0it [00:00, ?it/s]