## Summary of MLO component proteins in human and mouse from DrLLPS and PhaSepDB

In [1]:
import pandas as pd
import numpy as np

import bs4 as bs
import urllib
import requests

In [2]:
def humanToMouse_id(uniprot_id_human):
    """Find the mouse UniProt ID for a protein given its human UniProt ID.

    The human entry page is fetched and the text of the first ``<span>`` inside
    the page-header ``<h2>`` is read, with surrounding parentheses stripped
    (presumably the entry name, e.g. ``XXX_HUMAN`` -- confirm against the live
    page layout). ``HUMAN`` is replaced by ``MOUSE`` in that text and the
    result is submitted as a UniProt query; the last path segment of the final
    response URL is taken as the mouse ID.

    Parameters
    ----------
    uniprot_id_human : str
        UniProt ID of the human protein.

    Returns
    -------
    str or None
        The last path segment of the final URL of the mouse query, or ``None``
        when that segment still contains ``?`` (the query did not resolve to a
        single entry URL) or when any request/parsing step fails.
    """
    try:
        # One session serves both requests so the connection can be reused.
        with requests.Session() as session:
            url_human = "https://www.uniprot.org/uniprot/{0}".format(uniprot_id_human)
            response = session.get(url_human, timeout=10)
            soup = bs.BeautifulSoup(response.content, "html.parser")
            header = soup.find("section", id="page-header")
            heading = header.find_all("div")[1].find("h2")
            uniprot_genename_human = heading.find("span").get_text().lstrip("(").rstrip(")")

            # Progress output: one line per looked-up protein.
            print(uniprot_genename_human)

            url_mouse = "https://www.uniprot.org/uniprot/?query={0}&sort=score".format(
                uniprot_genename_human.replace("HUMAN", "MOUSE"))
            response = session.get(url_mouse, timeout=10)
            uniprot_id_mouse = str(response.url).split("/")[-1]
    except Exception:
        # Best-effort lookup: network errors and unexpected page layouts
        # (missing tags -> AttributeError/IndexError) yield None.  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate
        # so a long batch of lookups can still be interrupted.
        return None

    # A remaining "?" means the final URL is still a query URL, not an entry.
    if "?" in uniprot_id_mouse:
        return None
    return uniprot_id_mouse

### DrLLPS

In [3]:
def _drllps_species_rows(df, species):
    """Return the rows of *df* for *species* whose LLPS Type is Scaffold, Client or Regulator."""
    is_species = df["Species"] == species
    is_component = df["LLPS Type"].isin(["Scaffold", "Client", "Regulator"])
    return df.loc[is_species & is_component, :]


def _drllps_condensate_proteins(df_species):
    """Group UniProt IDs by condensate name.

    The ``Condensate`` column holds ``", "``-separated condensate names.  A row
    is assigned to a condensate only when the name is one of its split tokens.
    (A ``str.contains`` test would treat the name as a regex and also match
    longer names that merely contain it.)  Rows with a missing ``Condensate``
    are ignored.

    Returns
    -------
    (numpy.ndarray, list)
        The sorted unique condensate names (including "Droplet"/"Others"), and
        a list of ``[condensate, unique UniProt IDs]`` pairs for every name
        except "Droplet" and "Others".
    """
    annotated = df_species.dropna(subset=["Condensate"])
    memberships = [entry.split(", ") for entry in annotated["Condensate"].values]
    if not memberships:
        return np.array([], dtype=str), []

    condensates = np.unique([name for names in memberships for name in names])
    pairs = []
    for condensate in np.setdiff1d(condensates, ["Droplet", "Others"]):
        is_member = np.array([condensate in names for names in memberships], dtype=bool)
        proteins = np.unique(annotated.loc[is_member, "UniProt ID"].dropna().values)
        pairs.append([condensate, proteins])
    return condensates, pairs


df_DrLLPS = pd.read_csv("~/projects/Factor.Harbor/data/public/LLPS/DrLLPS/LLPS.txt", header = 0, sep = "\t")

# human
df_DrLLPS_human = _drllps_species_rows(df_DrLLPS, "Homo sapiens")
DrLLPS_condensate_human, DrLLPS_condensate_protein_human = _drllps_condensate_proteins(df_DrLLPS_human)

# mouse
df_DrLLPS_mouse = _drllps_species_rows(df_DrLLPS, "Mus musculus")
DrLLPS_condensate_mouse, DrLLPS_condensate_protein_mouse = _drllps_condensate_proteins(df_DrLLPS_mouse)


### PhaSepDB (v1)

In [4]:
def _phasepdb_rows_for_organism(df, organism):
    """Return the rows of *df* whose ``Organism`` equals *organism*.

    Surrounding whitespace in the column is ignored, so values such as
    "Homo sapiens " are matched for every organism (not only for human).
    """
    return df.loc[df["Organism"].astype(str).str.strip() == organism, :]


def _phasepdb_mlo_rows(df):
    """Keep rows with a non-null ``Body`` and only the four columns used downstream."""
    return df.loc[~df["Body"].isnull(), ["UniprotEntry", "GeneSymbol", "Organism", "Body"]]


def _phasepdb_proteins_by_body(df_mlo):
    """Return (sorted unique Body names, list of [body, unique UniprotEntry values])."""
    bodies = np.unique(df_mlo.loc[:, "Body"].values)
    pairs = [
        [body, np.unique(df_mlo.loc[df_mlo["Body"] == body, "UniprotEntry"].dropna().values)]
        for body in bodies
    ]
    return bodies, pairs


# Reviewed data
df_PhaSepDB_reviewed = pd.read_csv("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB/Reviewed_Data_V1.3.txt", header = 1, sep = "\t")
df_PhaSepDB_reviewed_human = _phasepdb_rows_for_organism(df_PhaSepDB_reviewed, "Homo sapiens")
df_PhaSepDB_reviewed_human_MLO = _phasepdb_mlo_rows(df_PhaSepDB_reviewed_human)
df_PhaSepDB_reviewed_mouse = _phasepdb_rows_for_organism(df_PhaSepDB_reviewed, "Mus musculus")
df_PhaSepDB_reviewed_mouse_MLO = _phasepdb_mlo_rows(df_PhaSepDB_reviewed_mouse)

# UniProt-reviewed data
df_PhaSepDB_UniProtReviewed = pd.read_csv("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB/UniProtReviewed_Data_V1.3.txt", header = 1, sep = "\t")
df_PhaSepDB_UniProtReviewed_human = _phasepdb_rows_for_organism(df_PhaSepDB_UniProtReviewed, "Homo sapiens")
df_PhaSepDB_UniProtReviewed_human_MLO = _phasepdb_mlo_rows(df_PhaSepDB_UniProtReviewed_human)
df_PhaSepDB_UniProtReviewed_mouse = _phasepdb_rows_for_organism(df_PhaSepDB_UniProtReviewed, "Mus musculus")
df_PhaSepDB_UniProtReviewed_mouse_MLO = _phasepdb_mlo_rows(df_PhaSepDB_UniProtReviewed_mouse)

# High-throughput data
df_PhaSepDB_high_throughput = pd.read_csv("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB/High_throughput_Data_V1.3.txt", header = 1, sep = "\t")
df_PhaSepDB_high_throughput_human = _phasepdb_rows_for_organism(df_PhaSepDB_high_throughput, "Homo sapiens")
df_PhaSepDB_high_throughput_human_MLO = _phasepdb_mlo_rows(df_PhaSepDB_high_throughput_human)
df_PhaSepDB_high_throughput_mouse = _phasepdb_rows_for_organism(df_PhaSepDB_high_throughput, "Mus musculus")
df_PhaSepDB_high_throughput_mouse_MLO = _phasepdb_mlo_rows(df_PhaSepDB_high_throughput_mouse)

# Stack the three sources per species.
df_PhaSepDB_human_MLO = pd.concat([df_PhaSepDB_reviewed_human_MLO, df_PhaSepDB_UniProtReviewed_human_MLO, df_PhaSepDB_high_throughput_human_MLO], axis = 0)
df_PhaSepDB_mouse_MLO = pd.concat([df_PhaSepDB_reviewed_mouse_MLO, df_PhaSepDB_UniProtReviewed_mouse_MLO, df_PhaSepDB_high_throughput_mouse_MLO], axis = 0)

# Group the UniProt entries by MLO ("Body") name.
PhaSepDB_condensate_human, PhaSepDB_condensate_protein_human = _phasepdb_proteins_by_body(df_PhaSepDB_human_MLO)
PhaSepDB_condensate_mouse, PhaSepDB_condensate_protein_mouse = _phasepdb_proteins_by_body(df_PhaSepDB_mouse_MLO)

### PhaSepDB (v2)

In [5]:
# Load the PhaSepDB v2 MLO table and give the selected columns the same
# names as the v1 tables (UniprotEntry / GeneSymbol / Organism / Body).
df_PhaSepDB_v2_MLO = pd.read_excel("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB_v2/phasepdbv2_mlo.xlsx")

_v2_column_map = {
    "entry": "UniprotEntry",
    "name": "GeneSymbol",
    "organism": "Organism",
    "MLO": "Body",
}
_v2_selected = ["entry", "name", "organism", "MLO"]

_v2_is_human = df_PhaSepDB_v2_MLO["organism"] == "Homo sapiens"
df_PhaSepDB_v2_human_MLO = df_PhaSepDB_v2_MLO.loc[_v2_is_human, _v2_selected].rename(columns=_v2_column_map)

_v2_is_mouse = df_PhaSepDB_v2_MLO["organism"] == "Mus musculus"
df_PhaSepDB_v2_mouse_MLO = df_PhaSepDB_v2_MLO.loc[_v2_is_mouse, _v2_selected].rename(columns=_v2_column_map)

# For each MLO name, collect the sorted unique UniProt entries (human).
PhaSepDB_v2_condensate_human = np.unique(df_PhaSepDB_v2_human_MLO["Body"].values)
PhaSepDB_v2_condensate_protein_human = [
    [
        body,
        np.unique(
            df_PhaSepDB_v2_human_MLO.loc[df_PhaSepDB_v2_human_MLO["Body"] == body, "UniprotEntry"].dropna().values
        ),
    ]
    for body in PhaSepDB_v2_condensate_human
]

# Same grouping for mouse.
PhaSepDB_v2_condensate_mouse = np.unique(df_PhaSepDB_v2_mouse_MLO["Body"].values)
PhaSepDB_v2_condensate_protein_mouse = [
    [
        body,
        np.unique(
            df_PhaSepDB_v2_mouse_MLO.loc[df_PhaSepDB_v2_mouse_MLO["Body"] == body, "UniprotEntry"].dropna().values
        ),
    ]
    for body in PhaSepDB_v2_condensate_mouse
]
    

### Merge all MLOs

In [6]:
# List the condensate (MLO) names each database reports, per species.
# Every source entry is a [name, protein-array] pair; only the name is kept.
DrLLPS_MLOs_human = [entry[0] for entry in DrLLPS_condensate_protein_human]
PhaSepDB_MLOs_human = [entry[0] for entry in PhaSepDB_condensate_protein_human]
PhaSepDB_v2_MLOs_human = [entry[0] for entry in PhaSepDB_v2_condensate_protein_human]

DrLLPS_MLOs_mouse = [entry[0] for entry in DrLLPS_condensate_protein_mouse]
PhaSepDB_MLOs_mouse = [entry[0] for entry in PhaSepDB_condensate_protein_mouse]
PhaSepDB_v2_MLOs_mouse = [entry[0] for entry in PhaSepDB_v2_condensate_protein_mouse]

In [9]:
# Canonical MLO name -> list of names under which that MLO is looked up in the
# per-database condensate lists.  The keys are written to the output files as
# the MLO names.  Some aliases are misspelled on purpose (e.g. "Nuclues
# speckles", "Sress granule"): they are match targets -- presumably the
# spellings used in the source databases -- so do not "correct" them.
dict_MLOs = {
    "Balbiani body": ["Balbiani body"],
    "Cajal body": ["Cajal body"],
    "Centrosome": ["Centrosome/Spindle pole body", "Centrosome"],
    "Chromatoid body": ["Chromatoid body"],
    "Cleavage body": ["Cleavage body"],
    "Cytoplasmic granule": ["Cytoplasmic granule"],
    "Cytoplasmic ribonucleoprotein granule": ["Cytoplasmic ribonucleoprotein granule"],
    "DDX1 body": ["DDX1 body"],
    "DNA damage foci": ["DNA damage foci"],
    "Gem": ["Gem"],
    "Gemini of cajal body": ["Gemini of cajal body"],
    "Heterochromatin": ["Heterochromatin"],
    "Histone locus body": ["Histone locus body", "Histone Locus body"],
    "Inflammasome": ["Inflammasome"],
    "Membrane clusters": ["Membrane clusters"],
    "miRISC": ["miRISC"],
    "Microtubule": ["Microtubule"],
    "Mitochondrial RNA granule": ["Mitochondrial RNA granule"],
    "Neuronal granule": ["Neuronal granule"],
    "Neuronal inclusions": ["Neuronal inclusions"],
    "Nuage": ["Nuage"],
    "Nuclear Compartments": ["Nuclear Compartments"],
    "Nuclear body": ["Nuclear body"],
    "Nuclear pore complex": ["Nuclear pore complex"],
    "Nuclear speckle": ["Nuclear speckle", "Nuclear speckles", "Nuclues speckles"],
    "Nuclear stress body": ["Nuclear stress body"],
    "Nucleolus": ["Nucleolus"],
    "OPT domain": ["OPT domain"],
    "P granule": ["P granule"],
    "P-body": ["P-body"],
    "PML nuclear body": ["PML nuclear body", "PML body"],
    "Paraspeckle": ["Paraspeckle"],
    "PcG body": ["PcG body", "Polycomb bodies", "Polycomb body"],
    "Pericentriolar matrix": ["Pericentriolar matrix"],
    "Perinucleolar compartment": ["Perinucleolar compartment"],
    "Postsynaptic density": ["Postsynaptic density", "Pre and postsynaptic densities", "Pre and postsynaptic density"],
    "Receptor cluster": ["Receptor cluster"],
    "RNP granules": ["RNP granules", "IMP1 ribonucleoprotein granule"],
    "Rosenthal fiber": ["Rosenthal fiber"],
    # Key was previously misspelled "SMN graule"; it is emitted verbatim as
    # the MLO name in the merged output files.
    "SMN granule": ["SMN granule", "SMN complex"],
    "Sam68 nuclear body": ["Sam68 nuclear body", "Sam68 nuclear bodies"],
    "Signaling puncta": ["Signaling puncta"],
    "Spindle apparatus": ["Spindle apparatus", "Spindle pole"],
    # NOTE(review): "Splicesome" is left as-is because the alias must match
    # the database spelling; confirm whether the key should be "Spliceosome".
    "Splicesome": ["Splicesome"],
    "Stress granule": ["Stress granule", "Sress granule"],
    "TAU protein": ["TAU protein"],
    "Transcription Factories": ["Transcription Factories"],
    "U body": ["U body"],
    "Z granule": ["Z granule"],
}

In [10]:
# Number of canonical MLO names (keys) in the alias table.
print(len(dict_MLOs))

49


#### stat

In [19]:
def _merge_condensate_proteins(alias_table, sources):
    """Merge per-database condensate protein lists under canonical MLO names.

    Parameters
    ----------
    alias_table : dict
        Canonical MLO name -> list of alias names to look for.
    sources : list
        One list per database; each is a list of ``[condensate, proteins]``
        pairs where ``proteins`` is an array of UniProt IDs.

    Returns
    -------
    list
        ``[MLO, sorted unique UniProt IDs]`` pairs, in the order of
        *alias_table*, for every MLO that matched at least one condensate.

    Every condensate whose name is among an MLO's aliases contributes.  A
    database that lists the same MLO under several spellings is therefore
    merged completely, rather than only its first matching spelling.
    """
    merged = []
    for mlo, aliases in alias_table.items():
        matched = [
            proteins
            for source in sources
            for condensate, proteins in source
            if condensate in aliases
        ]
        if matched:
            merged.append([mlo, np.unique(np.concatenate(matched))])
    return merged


# human MLOs
merged_condensate_protein_human = _merge_condensate_proteins(
    dict_MLOs,
    [
        DrLLPS_condensate_protein_human,
        PhaSepDB_condensate_protein_human,
        PhaSepDB_v2_condensate_protein_human,
    ],
)

# mouse MLOs
merged_condensate_protein_mouse = _merge_condensate_proteins(
    dict_MLOs,
    [
        DrLLPS_condensate_protein_mouse,
        PhaSepDB_condensate_protein_mouse,
        PhaSepDB_v2_condensate_protein_mouse,
    ],
)

In [22]:
# Number of MLOs that have at least one mouse protein after merging.
len(merged_condensate_protein_mouse)

18

In [11]:
# Write the merged human table: one tab-separated line per MLO, with the
# UniProt IDs comma-joined.  The ``with`` block closes the file on exit, so
# no explicit close() is needed.
with open("MLO_human_merged.txt", "w") as outf:
    outf.write("MLO\tUniprot_IDs\n")
    outf.writelines(
        "{0}\t{1}\n".format(condensate, ",".join(condensate_protein))
        for condensate, condensate_protein in merged_condensate_protein_human
    )

In [12]:
# Write the merged mouse table: one tab-separated line per MLO, with the
# UniProt IDs comma-joined.  The ``with`` block closes the file on exit, so
# no explicit close() is needed.
with open("MLO_mouse_merged.txt", "w") as outf:
    outf.write("MLO\tUniprot_IDs\n")
    outf.writelines(
        "{0}\t{1}\n".format(condensate, ",".join(condensate_protein))
        for condensate, condensate_protein in merged_condensate_protein_mouse
    )

### Mouse extended (mouse MLO proteins plus human MLO proteins mapped to mouse UniProt IDs)

In [3]:
# Map every human MLO protein to a mouse UniProt ID before opening the output
# file, so a failure during the slow web lookups does not leave a truncated
# file behind.
_human_to_mouse_cache = {}  # successful lookups only; failures are retried
humanToMouse_rows = []
for condensate, condensate_protein in merged_condensate_protein_human:
    condensate_protein_humanToMouse = set()
    for uniprot_id_human in condensate_protein:
        uniprot_id_mouse = _human_to_mouse_cache.get(uniprot_id_human)
        if uniprot_id_mouse is None:
            # A protein can belong to several MLOs; look it up only once.
            uniprot_id_mouse = humanToMouse_id(uniprot_id_human)
            if uniprot_id_mouse is None:
                continue
            _human_to_mouse_cache[uniprot_id_human] = uniprot_id_mouse
        condensate_protein_humanToMouse.add(uniprot_id_mouse)
    # Human-derived rows carry a "_HUMAN" suffix on the MLO name.
    humanToMouse_rows.append((condensate + "_HUMAN", sorted(condensate_protein_humanToMouse)))

# Human-derived rows first, then the mouse rows.  The ``with`` block closes
# the file on exit.
with open("MLO_mouseExtended_merged.txt", "w") as outf:
    outf.write("MLO\tUniprot_IDs\n")
    for condensate, condensate_protein in humanToMouse_rows:
        outf.write("{0}\t{1}\n".format(condensate, ",".join(condensate_protein)))

    for condensate, condensate_protein in merged_condensate_protein_mouse:
        outf.write("{0}\t{1}\n".format(condensate, ",".join(condensate_protein)))