In [1]:
import pandas as pd
import numpy as np

import bs4 as bs
import urllib
import requests

## 1. Summary all known phase-separated proteins in human

### (1) DrLLPS (scaffold protein in human)

In [2]:
df_DrLLPS = pd.read_csv("/mnt/Storage/home/yuzhaowei/projects/Factor.Harbor/data/public/LLPS/DrLLPS/LLPS.txt", header = 0, sep = "\t")
df_DrLLPS_human = df_DrLLPS.loc[((df_DrLLPS["Species"] == "Homo sapiens") & (df_DrLLPS["LLPS Type"] == "Scaffold")), 
                                ["UniProt ID", "Gene name", "Species"]]
df_DrLLPS_human.loc[:, "Source"] = "DrLLPS"
print(df_DrLLPS_human.shape)
print(df_DrLLPS_human.head())

(86, 4)
  UniProt ID Gene name       Species  Source
0     P35637       FUS  Homo sapiens  DrLLPS
1     O43670    ZNF207  Homo sapiens  DrLLPS
2     P09651   HNRNPA1  Homo sapiens  DrLLPS
3     P10636      MAPT  Homo sapiens  DrLLPS
4     Q13148    TARDBP  Homo sapiens  DrLLPS


### (2) LLPSDB (human)

In [3]:
# all raw natural protein from LLPSDB
import xlrd
workbook = xlrd.open_workbook("/mnt/Storage/home/yuzhaowei/projects/Factor.Harbor/data/public/LLPS/LLPSDB/protein.xls")
table = workbook.sheets()[0]
Gene_names = table.col_values(1, start_rowx = 1, end_rowx = None)
Uniprot_IDs = table.col_values(5, start_rowx = 1, end_rowx = None)
Species = table.col_values(7, start_rowx = 1, end_rowx = None)
df_LLPSDB = pd.DataFrame([Uniprot_IDs, Gene_names, Species]).T
df_LLPSDB.columns = ["UniProt ID", "Gene name", "Species"]
df_LLPSDB_human = df_LLPSDB.loc[df_LLPSDB["Species"] == "Homo sapiens", :]
df_LLPSDB_human.loc[:, "Source"] = "LLPSDB"
print(df_LLPSDB_human.shape)
print(df_LLPSDB_human.head())

(92, 4)
   UniProt ID Gene name       Species  Source
1      P35637       FUS  Homo sapiens  LLPSDB
6      P31483     hTIA1  Homo sapiens  LLPSDB
7    P09651-2   hnRNPA1  Homo sapiens  LLPSDB
9      P26599       PTB  Homo sapiens  LLPSDB
10     Q9NQI0      Ddx4  Homo sapiens  LLPSDB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


### (3) PhaSepDB (v1, human)

In [4]:
df_PhaSepDB = pd.read_csv("/mnt/Storage/home/yuzhaowei/projects/Factor.Harbor/data/public/LLPS/PhaSepDB/Reviewed_Data_V1.3.txt", header = 1, sep = "\t")
df_PhaSepDB_human = df_PhaSepDB.loc[((df_PhaSepDB["Organism"] == "Homo sapiens ") | (df_PhaSepDB["Organism"] == "Homo sapiens")), ["UniprotEntry", "GeneSymbol", "Organism"]]
df_PhaSepDB_human.loc[:, "Source"] = "PhaSepDB_v1"
df_PhaSepDB_human.loc[:, "Organism"] = "Homo sapiens"
df_PhaSepDB_human.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_human.shape)
print(df_PhaSepDB_human.head())

(392, 4)
  UniProt ID          Gene name       Species       Source
0     Q96LT7            C9orf72  Homo sapiens  PhaSepDB_v1
1     Q01844          EWSR1 EWS  Homo sapiens  PhaSepDB_v1
2     Q92804  TAF15 RBP56 TAF2N  Homo sapiens  PhaSepDB_v1
3     P08047          SP1 TSFP1  Homo sapiens  PhaSepDB_v1
4     Q16198           EWS-Fli1  Homo sapiens  PhaSepDB_v1


### (4) PhaSepDB (v2, human)

In [5]:
df_PhaSepDB_v2 = pd.read_excel("~/projects/Factor.Harbor/data/public/LLPS/PhaSepDB_v2/phasepdbv2_llps.xlsx")
df_PhaSepDB_v2_human = df_PhaSepDB_v2.loc[((df_PhaSepDB_v2["organism"] == "Homo sapiens") & (df_PhaSepDB_v2["Status"] == "reviewed")), ["uniprot_entry", "Gene_names", "organism"]].drop_duplicates(subset = ["uniprot_entry"], keep = "first")
df_PhaSepDB_v2_human.loc[:, "Source"] = "PhaSepDB_v2"
df_PhaSepDB_v2_human.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_v2_human.shape)
print(df_PhaSepDB_v2_human.head())

(271, 4)
   UniProt ID     Gene name       Species       Source
2      P06748      NPM1 NPM  Homo sapiens  PhaSepDB_v2
4      P04150     NR3C1 GRL  Homo sapiens  PhaSepDB_v2
7      Q13283    G3BP1 G3BP  Homo sapiens  PhaSepDB_v2
12     Q14781          CBX2  Homo sapiens  PhaSepDB_v2
14     Q13148  TARDBP TDP43  Homo sapiens  PhaSepDB_v2


### (5) PhaSepPro (human)

In [6]:
import urllib
import json
with urllib.request.urlopen("https://phasepro.elte.hu/download_full.json") as url:
    # Variable 'data' will contain the full database as a nested dictionary
    data = json.loads(url.read().decode())    

In [7]:
PhaSepPro_list = []
for UniprotID in data.keys():
    PhaSepPro_list.append([UniprotID, data[UniprotID]["gene"], data[UniprotID]["organism"]])
df_PhaSepPro = pd.DataFrame(PhaSepPro_list)
df_PhaSepPro.columns = ["UniProt ID", "Gene name", "Species"]
df_PhaSepPro.loc[:, "Source"] = "PhaSepPro"
df_PhaSepPro_human = df_PhaSepPro.loc[df_PhaSepPro["Species"] == "Homo sapiens", :]
print(df_PhaSepPro_human.shape)
print(df_PhaSepPro_human.head())

(59, 4)
  UniProt ID Gene name       Species     Source
0     P31483      TIA1  Homo sapiens  PhaSepPro
2     Q9UER7      DAXX  Homo sapiens  PhaSepPro
3     Q9UPQ9    TNRC6B  Homo sapiens  PhaSepPro
4     Q12888   TP53BP1  Homo sapiens  PhaSepPro
5     Q07889      SOS1  Homo sapiens  PhaSepPro


### (6) Merged (human)

In [8]:
df_LLPS_human_merged = pd.concat([df_DrLLPS_human, df_LLPSDB_human, df_PhaSepDB_human, df_PhaSepDB_v2_human, df_PhaSepPro_human], axis = 0)
df_LLPS_human_merged.loc[:, "UniProt ID"] = df_LLPS_human_merged.loc[:, "UniProt ID"].str.upper()
df_LLPS_human_merged.loc[:, "Gene name"] = df_LLPS_human_merged.loc[:, "Gene name"].str.upper()
df_LLPS_human_merged.loc[:, "UniProt ID"] = df_LLPS_human_merged.loc[:, "UniProt ID"].str.replace(u'\xa0', u'')

df_LLPS_human_merged.to_csv("LLPS_human_merged.txt", header = True, sep = "\t", index = False)

In [9]:
# merged phase-seprated proteins
len(np.unique(df_LLPS_human_merged.loc[:, "UniProt ID"])) 

437

## 2. Summary all known phase-separated proteins in mouse

### (1) DrLLPS (scaffold protein in mouse)

In [10]:
df_DrLLPS_mouse = df_DrLLPS.loc[((df_DrLLPS["Species"] == "Mus musculus") & (df_DrLLPS["LLPS Type"] == "Scaffold")), 
                                ["UniProt ID", "Gene name", "Species"]]
df_DrLLPS_mouse.loc[:, "Source"] = "DrLLPS"
print(df_DrLLPS_mouse.shape)
print(df_DrLLPS_mouse.head())

(12, 4)
   UniProt ID Gene name       Species  Source
86     Q61686      Cbx5  Mus musculus  DrLLPS
87     Q64337    Sqstm1  Mus musculus  DrLLPS
88     F6SEU4   Syngap1  Mus musculus  DrLLPS
89     P70279     Surf6  Mus musculus  DrLLPS
90     P52912      Tia1  Mus musculus  DrLLPS


### (2) LLPSDB (mouse)

In [11]:
df_LLPSDB_mouse = df_LLPSDB.loc[df_LLPSDB["Species"] == "Mus musculus", :]
df_LLPSDB_mouse.loc[:, "Source"] = "LLPSDB"
print(df_LLPSDB_mouse.shape)
print(df_LLPSDB_mouse.head())

(9, 4)
    UniProt ID  Gene name       Species  Source
19      J3QQ18     SynGAP  Mus musculus  LLPSDB
41      Q91Z69     srGAP1  Mus musculus  LLPSDB
53      P52912      mTIA1  Mus musculus  LLPSDB
79      Q8C6L5       mGAS  Mus musculus  LLPSDB
167     Q60598  cortactin  Mus musculus  LLPSDB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


### (3) PhaSepDB (v1, mouse)

In [12]:
df_PhaSepDB_mouse = df_PhaSepDB.loc[(df_PhaSepDB["Organism"] == "Mus musculus"), ["UniprotEntry", "GeneSymbol", "Organism"]]
df_PhaSepDB_mouse.loc[:, "Source"] = "PhaSepDB_v1"
df_PhaSepDB_mouse.loc[:, "Organism"] = "Mus musculus"
df_PhaSepDB_mouse.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_mouse.shape)
print(df_PhaSepDB_mouse.head())

(15, 4)
    UniProt ID                Gene name       Species       Source
30      J3QQ18                  Syngap1  Mus musculus  PhaSepDB_v1
32      Q4ACU6  Shank3 Kiaa1650 Prosap2  Mus musculus  PhaSepDB_v1
34      Q9D415     Dlgap1 Gkap Kiaa4162  Mus musculus  PhaSepDB_v1
237     P83741        Wnk1 Hsn2 Prkwnk1  Mus musculus  PhaSepDB_v1
238     Q80UE6             Wnk4 Prkwnk4  Mus musculus  PhaSepDB_v1


### (4) PhaSepDB (v2, mouse)

In [13]:
df_PhaSepDB_v2_mouse = df_PhaSepDB_v2.loc[((df_PhaSepDB_v2["organism"] == "Mus musculus") & (df_PhaSepDB_v2["Status"] == "reviewed")), ["uniprot_entry", "Gene_names", "organism"]].drop_duplicates(subset = ["uniprot_entry"], keep = "first")
df_PhaSepDB_v2_mouse.loc[:, "Source"] = "PhaSepDB_v2"
df_PhaSepDB_v2_mouse.columns = ["UniProt ID", "Gene name", "Species", "Source"]
print(df_PhaSepDB_v2_mouse.shape)
print(df_PhaSepDB_v2_mouse.head())

(48, 4)
   UniProt ID                                      Gene name       Species  \
25     Q9ES28                        Arhgef7 Kiaa0142 Pak3bp  Mus musculus   
34     P35922                                     Fmr1 Fmr-1  Mus musculus   
50     P08775                          Polr2a Rpii215 Rpo2-1  Mus musculus   
51     Q925J9  Med1 Crsp210 Drip205 Pbp Pparbp Trap220 Trip2  Mus musculus   
72     Q60838                                           Dvl2  Mus musculus   

         Source  
25  PhaSepDB_v2  
34  PhaSepDB_v2  
50  PhaSepDB_v2  
51  PhaSepDB_v2  
72  PhaSepDB_v2  


### (5) PhaSepPro (mouse)

In [14]:
df_PhaSepPro_mouse = df_PhaSepPro.loc[df_PhaSepPro["Species"] == "Mus musculus", :]
print(df_PhaSepPro_mouse.shape)
print(df_PhaSepPro_mouse.head())

(2, 4)
   UniProt ID Gene name       Species     Source
11     Q60598      CTTN  Mus musculus  PhaSepPro
66     J3QQ18   SYNGAP1  Mus musculus  PhaSepPro


### (6) Merged (mouse)

In [15]:
df_LLPS_mouse_merged = pd.concat([df_DrLLPS_mouse, df_LLPSDB_mouse, df_PhaSepDB_mouse, df_PhaSepDB_v2_mouse, df_PhaSepPro_mouse], axis = 0)
df_LLPS_mouse_merged.loc[:, "UniProt ID"] = df_LLPS_mouse_merged.loc[:, "UniProt ID"].str.upper()
df_LLPS_mouse_merged.loc[:, "Gene name"] = df_LLPS_mouse_merged.loc[:, "Gene name"].str.upper()
df_LLPS_mouse_merged.loc[:, "UniProt ID"] = df_LLPS_mouse_merged.loc[:, "UniProt ID"].str.replace(u'\xa0', u'')

df_LLPS_mouse_merged.to_csv("LLPS_mouse_merged.txt", header = True, sep = "\t", index = False)

In [16]:
len(np.unique(df_LLPS_mouse_merged.loc[:, "UniProt ID"]))

61

## 3. Mouse extended (plus human phase-separated proteins)

### (1) As known phase-separated protein in mouse is far less than in human, so we extended phase-separated proteins in mouse by adding human phase-separated proteins 

In [17]:
def humanToMouse_id(uniprot_id_human):    
    """find mouse uniprot id for each protein according to its human uniprot id"""
    try:
        url = "https://www.uniprot.org/uniprot/{0}" . format(uniprot_id_human)
        with requests.Session() as s:
            sauce = s.get(url, timeout = 10)
            soup = bs.BeautifulSoup(sauce.content,'html.parser')
            section = soup.find("section", id="page-header")
            divs = section.find_all("div")
            h2 = divs[1].find("h2")
            title = h2.find("span")
            uniprot_genename_human = title.get_text().lstrip("(").rstrip(")")
        
        print(uniprot_genename_human)
        
        url_mouse = ""
        url_mouse = "https://www.uniprot.org/uniprot/?query={0}&sort=score" . format(uniprot_genename_human.replace("HUMAN", "MOUSE"))

        with requests.Session() as s:
            sauce = s.get(url_mouse, timeout = 10)
            uniprot_id_mouse = str(sauce.url).split("/")[-1]
            
        if "?" not in uniprot_id_mouse:
             return uniprot_id_mouse
        else:
            return None
    except:
        return None

In [18]:
df_LLPS_human_merged.head()

Unnamed: 0,UniProt ID,Gene name,Species,Source
0,P35637,FUS,Homo sapiens,DrLLPS
1,O43670,ZNF207,Homo sapiens,DrLLPS
2,P09651,HNRNPA1,Homo sapiens,DrLLPS
3,P10636,MAPT,Homo sapiens,DrLLPS
4,Q13148,TARDBP,Homo sapiens,DrLLPS


In [19]:
list_LLPS_humanToMouse = []
for index, row in df_LLPS_human_merged.iterrows():
        
    uniprot_id_human = row["UniProt ID"]
    gene_name = row["Gene name"]
    species = row["Species"]
    source = row["Source"]
    uniprot_id_mouse = humanToMouse_id(uniprot_id_human)
    if uniprot_id_mouse:
        list_LLPS_humanToMouse.append([uniprot_id_mouse, gene_name, species, source])

FUS_HUMAN
ZN207_HUMAN
ROA1_HUMAN
TAU_HUMAN
TADBP_HUMAN
LAT_HUMAN
MLP3B_HUMAN
SGO1_HUMAN
NPM_HUMAN
SURF6_HUMAN
A4_HUMAN
ROA0_HUMAN
CBX2_HUMAN
ROA2_HUMAN
RA1L2_HUMAN
ELAV1_HUMAN
NU153_HUMAN
DLG4_HUMAN
RL23A_HUMAN
IMB1_HUMAN
TIA1_HUMAN
RPB1_HUMAN
DAXX_HUMAN
CI072_HUMAN
FYN_HUMAN
RFOX1_HUMAN
CIRBP_HUMAN
SQSTM_HUMAN
TNR6B_HUMAN
DDX4_HUMAN
IMA1_HUMAN
UBQL2_HUMAN
EWS_HUMAN
G3BP1_HUMAN
XPO1_HUMAN
PTBP1_HUMAN
CBX5_HUMAN
GRB2_HUMAN
NOG2_HUMAN
ROA3_HUMAN
DYR1A_HUMAN
ELN_HUMAN
SYN2_HUMAN
SYN1_HUMAN
DDX3X_HUMAN
RARA_HUMAN
RBP56_HUMAN
NUP98_HUMAN
CSTF2_HUMAN
AGO2_HUMAN
RBM3_HUMAN
LEG3_HUMAN
TIAR_HUMAN
P53_HUMAN
RL5_HUMAN
DAZP1_HUMAN
NCK1_HUMAN
HNRPD_HUMAN
CSTFT_HUMAN
RBM14_HUMAN
HNRH3_HUMAN
ITSN1_HUMAN
DDX1_HUMAN
HOME3_HUMAN
CPEB2_HUMAN
CCNT1_HUMAN
COIL_HUMAN
SOS1_HUMAN
SYGP1_HUMAN
HNRDL_HUMAN
FMR1_HUMAN
PML_HUMAN
NPHN_HUMAN
WASL_HUMAN
PSPC1_HUMAN
HNRH2_HUMAN
SRSF2_HUMAN
HNRH1_HUMAN
ABL1_HUMAN
BRD4_HUMAN
SUMO3_HUMAN
PIAS2_HUMAN
AXIN1_HUMAN
MED1_HUMAN
CGAS_HUMAN
SPOP_HUMAN
FUS_HUMAN
TIA1_HUMAN
ROA1_H

DAZP1_HUMAN
DDX5_HUMAN
DDX6_HUMAN
MARE1_HUMAN
DYRK3_HUMAN
CSTF2_HUMAN
CSTFT_HUMAN
DHX9_HUMAN
DEUP1_HUMAN
CAPR1_HUMAN
CAV1_HUMAN
CAVN1_HUMAN
CBP_HUMAN
FXR1_HUMAN
FMR1_HUMAN
CDT1_HUMAN
CEP63_HUMAN
CE152_HUMAN
KEAP1_HUMAN
CRGD_HUMAN
SUV91_HUMAN
SUMO3_HUMAN
STIL_HUMAN
STAT3_HUMAN
SRSF2_HUMAN
SRRM2_HUMAN
RBP56_HUMAN
SYN2_HUMAN
SYN1_HUMAN
SRRM1_HUMAN
NCOA3_HUMAN
WWTR1_HUMAN
RUNX2_HUMAN
RL5_HUMAN
RL23A_HUMAN
RPAC2_HUMAN
RBX1_HUMAN
SAS6_HUMAN
SMAD3_HUMAN
SOX2_HUMAN
SOS1_HUMAN
BLNK_HUMAN
SHAN3_HUMAN
SGO1_HUMAN
SF3B1_HUMAN
RASK_HUMAN
UBP42_HUMAN
WASP_HUMAN
UBP10_HUMAN
USH1G_HUMAN
USH1C_HUMAN
UBQL4_HUMAN
UBQL1_HUMAN
UBC9_HUMAN
CTNB1_HUMAN
SYUA_HUMAN
ZO3_HUMAN
ZO2_HUMAN
UBE3A_HUMAN
TBK1_HUMAN
TBP_HUMAN
TEAD1_HUMAN
TNR6B_HUMAN
RU17_HUMAN
TIF1B_HUMAN
TERF1_HUMAN
TEAD4_HUMAN
TFEB_HUMAN
RBM3_HUMAN
RBM20_HUMAN
NBR1_HUMAN
NELFD_HUMAN
NCK1_HUMAN
BASP1_HUMAN
NPHN_HUMAN
NELFE_HUMAN
MYO7B_HUMAN
NF2L2_HUMAN
NUFP2_HUMAN
NUCL_HUMAN
NSD2_HUMAN
MYO7A_HUMAN
MLP3B_HUMAN
MDC1_HUMAN
LMNA_HUMAN
PHLB2_HUMAN
LIPA1_HUMA

In [20]:
df_LLPS_humanToMouse = pd.DataFrame(list_LLPS_humanToMouse)
df_LLPS_humanToMouse.columns = df_LLPS_mouse_merged.columns.values
df_LLPS_mouseExtended = pd.concat([df_LLPS_humanToMouse, df_LLPS_mouse_merged], axis = 0)
df_LLPS_mouseExtended.to_csv("LLPS_mouseExtended.txt", header = True, sep = "\t", index = False)