In [9]:
import pandas as pd
import numpy as np
import os,sys
import bs4 as bs
import urllib
import requests
import xlrd

## Summary all known phase-separated proteins in human

### DrLLPS (scaffold protein in human)

In [10]:
df_DrLLPS = pd.read_csv("~/projects/Factor.Harbor/data/public/LLPS/DrLLPS/LLPS.txt", header = 0, sep = "\t")
df_DrLLPS_human = df_DrLLPS.loc[((df_DrLLPS["Species"] == "Homo sapiens") & (df_DrLLPS["LLPS Type"] == "Scaffold")), 
                                ["UniProt ID", "Gene name", "Species"]]
df_DrLLPS_human.loc[:, "Source"] = "DrLLPS"
print(df_DrLLPS_human.shape)

(86, 4)


### LLPSDB (human)

In [12]:
# all raw natural protein from LLPSDB
llpsdb_dir = os.path.join(os.path.expanduser("~"), "projects/Factor.Harbor/data/public/LLPS/LLPSDB/protein.xls")
workbook = xlrd.open_workbook(llpsdb_dir)
table = workbook.sheets()[0]
Gene_names = table.col_values(1, start_rowx = 1, end_rowx = None)
Uniprot_IDs = table.col_values(5, start_rowx = 1, end_rowx = None)
Species = table.col_values(7, start_rowx = 1, end_rowx = None)
df_LLPSDB = pd.DataFrame([Uniprot_IDs, Gene_names, Species]).T
df_LLPSDB.columns = ["UniProt ID", "Gene name", "Species"]
df_LLPSDB_human = df_LLPSDB.loc[df_LLPSDB["Species"] == "Homo sapiens", :]
df_LLPSDB_human.loc[:, "Source"] = "LLPSDB"
print(df_LLPSDB_human.shape)

(92, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_LLPSDB_human.loc[:, "Source"] = "LLPSDB"


### PhaSepDB (v1, human)

In [13]:
df_PhaSepDB = pd.read_csv("/mnt/Storage/home/yuzhaowei/projects/Factor.Harbor/data/public/LLPS/PhaSepDB/Reviewed_Data_V1.3.txt", header = 1, sep = "\t")
df_PhaSepDB_human = df_PhaSepDB.loc[((df_PhaSepDB["Organism"] == "Homo sapiens ") | (df_PhaSepDB["Organism"] == "Homo sapiens")), ["UniprotEntry", "GeneSymbol", "Organism"]]
df_PhaSepDB_human.loc[:, "Source"] = "PhaSepDB_v1"
df_PhaSepDB_human.loc[:, "Organism"] = "Homo sapiens"
df_PhaSepDB_human.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_human.shape)

(392, 4)


### PhaSepDB (v2, human)

In [15]:
df_PhaSepDB_v2 = pd.read_excel("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB_v2/phasepdbv2_llps.xlsx")
df_PhaSepDB_v2_human = df_PhaSepDB_v2.loc[((df_PhaSepDB_v2["organism"] == "Homo sapiens") & (df_PhaSepDB_v2["Status"] == "reviewed")), ["uniprot_entry", "Gene_names", "organism"]].drop_duplicates(subset = ["uniprot_entry"], keep = "first")
df_PhaSepDB_v2_human.loc[:, "Source"] = "PhaSepDB_v2"
df_PhaSepDB_v2_human.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_v2_human.shape)

(271, 4)


### PhaSepPro (human)

In [16]:
import urllib
import json
with urllib.request.urlopen("https://phasepro.elte.hu/download_full.json") as url:
    # Variable 'data' will contain the full database as a nested dictionary
    data = json.loads(url.read().decode())    

In [17]:
PhaSepPro_list = []
for UniprotID in data.keys():
    PhaSepPro_list.append([UniprotID, data[UniprotID]["gene"], data[UniprotID]["organism"]])
df_PhaSepPro = pd.DataFrame(PhaSepPro_list)
df_PhaSepPro.columns = ["UniProt ID", "Gene name", "Species"]
df_PhaSepPro.loc[:, "Source"] = "PhaSepPro"
df_PhaSepPro_human = df_PhaSepPro.loc[df_PhaSepPro["Species"] == "Homo sapiens", :]
print(df_PhaSepPro_human.shape)

(59, 4)


### Merged (human)

In [18]:
df_LLPS_human_merged = pd.concat([df_DrLLPS_human, df_LLPSDB_human, df_PhaSepDB_human, df_PhaSepDB_v2_human, df_PhaSepPro_human], axis = 0)
df_LLPS_human_merged.loc[:, "UniProt ID"] = df_LLPS_human_merged.loc[:, "UniProt ID"].str.upper()
df_LLPS_human_merged.loc[:, "Gene name"] = df_LLPS_human_merged.loc[:, "Gene name"].str.upper()
df_LLPS_human_merged.loc[:, "UniProt ID"] = df_LLPS_human_merged.loc[:, "UniProt ID"].str.replace(u'\xa0', u'')

df_LLPS_human_merged.to_csv("LLPS_human_merged.txt", header = True, sep = "\t", index = False)

In [19]:
# merged phase-seprated proteins
len(np.unique(df_LLPS_human_merged.loc[:, "UniProt ID"])) 

437

## Summary all known phase-separated proteins in mouse

### DrLLPS (scaffold protein in mouse)

In [21]:
df_DrLLPS_mouse = df_DrLLPS.loc[((df_DrLLPS["Species"] == "Mus musculus") & (df_DrLLPS["LLPS Type"] == "Scaffold")), 
                                ["UniProt ID", "Gene name", "Species"]]
df_DrLLPS_mouse.loc[:, "Source"] = "DrLLPS"
print(df_DrLLPS_mouse.shape)

(12, 4)


### LLPSDB (mouse)

In [22]:
df_LLPSDB_mouse = df_LLPSDB.loc[df_LLPSDB["Species"] == "Mus musculus", :]
df_LLPSDB_mouse.loc[:, "Source"] = "LLPSDB"
print(df_LLPSDB_mouse.shape)

(9, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_LLPSDB_mouse.loc[:, "Source"] = "LLPSDB"


### PhaSepDB (v1, mouse)

In [23]:
df_PhaSepDB_mouse = df_PhaSepDB.loc[(df_PhaSepDB["Organism"] == "Mus musculus"), ["UniprotEntry", "GeneSymbol", "Organism"]]
df_PhaSepDB_mouse.loc[:, "Source"] = "PhaSepDB_v1"
df_PhaSepDB_mouse.loc[:, "Organism"] = "Mus musculus"
df_PhaSepDB_mouse.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_mouse.shape)

(15, 4)


### PhaSepDB (v2, mouse)

In [24]:
df_PhaSepDB_v2_mouse = df_PhaSepDB_v2.loc[((df_PhaSepDB_v2["organism"] == "Mus musculus") & (df_PhaSepDB_v2["Status"] == "reviewed")), ["uniprot_entry", "Gene_names", "organism"]].drop_duplicates(subset = ["uniprot_entry"], keep = "first")
df_PhaSepDB_v2_mouse.loc[:, "Source"] = "PhaSepDB_v2"
df_PhaSepDB_v2_mouse.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_v2_mouse.shape)

(48, 4)


### PhaSepPro (mouse)

In [25]:
df_PhaSepPro_mouse = df_PhaSepPro.loc[df_PhaSepPro["Species"] == "Mus musculus", :]
print(df_PhaSepPro_mouse.shape)

(2, 4)


### Merged (mouse)

In [26]:
df_LLPS_mouse_merged = pd.concat([df_DrLLPS_mouse, df_LLPSDB_mouse, df_PhaSepDB_mouse, df_PhaSepDB_v2_mouse, df_PhaSepPro_mouse], axis = 0)
df_LLPS_mouse_merged.loc[:, "UniProt ID"] = df_LLPS_mouse_merged.loc[:, "UniProt ID"].str.upper()
df_LLPS_mouse_merged.loc[:, "Gene name"] = df_LLPS_mouse_merged.loc[:, "Gene name"].str.upper()
df_LLPS_mouse_merged.loc[:, "UniProt ID"] = df_LLPS_mouse_merged.loc[:, "UniProt ID"].str.replace(u'\xa0', u'')

df_LLPS_mouse_merged.to_csv("LLPS_mouse_merged.txt", header = True, sep = "\t", index = False)

In [27]:
len(np.unique(df_LLPS_mouse_merged.loc[:, "UniProt ID"]))

61

## Mouse extended (plus human phase-separated proteins)

### As known phase-separated protein in mouse is far less than in human, so we extended phase-separated proteins in mouse by adding human phase-separated proteins 

In [28]:
def humanToMouse_id(uniprot_id_human):    
    """find mouse uniprot id for each protein according to its human uniprot id"""
    try:
        url = "https://www.uniprot.org/uniprot/{0}" . format(uniprot_id_human)
        with requests.Session() as s:
            sauce = s.get(url, timeout = 10)
            soup = bs.BeautifulSoup(sauce.content,'html.parser')
            section = soup.find("section", id="page-header")
            divs = section.find_all("div")
            h2 = divs[1].find("h2")
            title = h2.find("span")
            uniprot_genename_human = title.get_text().lstrip("(").rstrip(")")
        
        # print(uniprot_genename_human)
        
        url_mouse = ""
        url_mouse = "https://www.uniprot.org/uniprot/?query={0}&sort=score" . format(uniprot_genename_human.replace("HUMAN", "MOUSE"))

        with requests.Session() as s:
            sauce = s.get(url_mouse, timeout = 10)
            uniprot_id_mouse = str(sauce.url).split("/")[-1]
            
        if "?" not in uniprot_id_mouse:
             return uniprot_id_mouse
        else:
            return None
    except:
        return None

In [29]:
df_LLPS_human_merged.head()

Unnamed: 0,UniProt ID,Gene name,Species,Source
0,P35637,FUS,Homo sapiens,DrLLPS
1,O43670,ZNF207,Homo sapiens,DrLLPS
2,P09651,HNRNPA1,Homo sapiens,DrLLPS
3,P10636,MAPT,Homo sapiens,DrLLPS
4,Q13148,TARDBP,Homo sapiens,DrLLPS


In [30]:
list_LLPS_humanToMouse = []
for index, row in df_LLPS_human_merged.iterrows():
    
    uniprot_id_human = row["UniProt ID"]
    gene_name = row["Gene name"]
    species = row["Species"]
    source = row["Source"]
    uniprot_id_mouse = humanToMouse_id(uniprot_id_human)
    if uniprot_id_mouse:
        list_LLPS_humanToMouse.append([uniprot_id_mouse, gene_name, species, source])

In [32]:
df_LLPS_humanToMouse = pd.DataFrame(list_LLPS_humanToMouse)
df_LLPS_humanToMouse.columns = df_LLPS_mouse_merged.columns.values
df_LLPS_mouseExtended = pd.concat([df_LLPS_humanToMouse, df_LLPS_mouse_merged], axis = 0)
df_LLPS_mouseExtended.to_csv("LLPS_mouseExtended.txt", header = True, sep = "\t", index = False)