In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [None]:
import os

# Directory that receives the figures saved by this notebook.
plots_folder="./plots/"

# exist_ok=True keeps the cell safe to re-run and removes the check-then-create
# race of the isdir()/mkdir() pair; makedirs also creates missing parent folders.
os.makedirs(plots_folder, exist_ok=True)

In [None]:
# Test-set table; its columns are reused below to select and order the proteome features.
test_data_file = "../DATASETS/TestSet_data.csv"

In [None]:
# Serialized model object that is loaded with joblib further down.
model_file = "../src/TRAINED_MODELS/ALL_FEATURES/MLP/gridsearchCV_Object.pkl"

In [None]:
# Read the test set (first CSV column becomes the index) and remember its column
# order, which is applied to the proteome-wide feature table later on.
test_data = pd.read_csv(test_data_file, index_col=0)
test_data_columns = test_data.columns

In [None]:
# Load the serialized model object (its `best_estimator_` is used for prediction below).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading — only load model files that come from a trusted source.
model=joblib.load(model_file)

In [None]:
# First feature table (physico-chemical scales, per the file name): gzip-compressed
# CSV, first column used as index.
pc_scales = pd.read_csv(
    "../src/human_proteome_physchemscales.csv.gz",
    index_col=0,
    compression="gzip",
)

# Second feature table (AlphaFold2-derived, per the file name): zip-compressed CSV,
# first column used as index.
af_scales = pd.read_csv(
    "../human_proteome_alphafold2.csv.zip",
    index_col=0,
    compression="zip",
)

# Add versions of two columns divided by the `Length` column.
af_scales["n_contacts_norm"] = af_scales["n_contacts"] / af_scales["Length"]
af_scales["RG_protein_norm"] = af_scales["RG_protein"] / af_scales["Length"]

In [None]:
# Keep only the proteins present in both feature tables.
# Index.intersection(sort=False) replaces the former Python set: iteration order
# of a set is not guaranteed and, for string labels, changes between interpreter
# sessions, which made the row order of every downstream table non-reproducible.
# .unique() keeps the set-like "each label once" semantics.
inters = list(pc_scales.index.intersection(af_scales.index, sort=False).unique())
pc_scales = pc_scales.loc[inters]
af_scales = af_scales.loc[inters]

# Sanity check: both tables now list the same labels in the same order.
print((pc_scales.index==af_scales.index).all())

In [None]:
# Join the two feature tables on their index, then arrange the columns exactly
# as in the test set.
all_scales = pc_scales.merge(af_scales, left_index=True, right_index=True)
all_scales = all_scales.loc[:, test_data_columns]

# Sanity check: column names and order match the test set.
print((all_scales.columns == test_data.columns).all())

In [None]:
# Fraction of missing values in each feature column
nan_fraction = all_scales.isna().mean()

# Show only the columns that contain at least one missing value
has_missing = nan_fraction > 0
print(nan_fraction[has_missing])

In [None]:
# Replace every missing feature value with the constant 1.0.
# NOTE(review): presumably this mirrors the imputation used when the model was trained — confirm.
all_scales = all_scales.fillna(value=1.0)

In [None]:
# Predict class probabilities with the best estimator and keep the second column as the score.
# NOTE(review): column 1 is presumably the positive (LLPS) class — confirm against the estimator's classes_.
best_model = model.best_estimator_
catG2_scores = best_model.predict_proba(all_scales)[:, 1]

In [None]:
# One-column score table that shares its index with the feature table.
catG2_scores_df = pd.DataFrame({'LLPS_score': catG2_scores}, index=all_scales.index)
# A later cell reads "catG2_scores_human_proteome.csv"; uncomment the next line to (re)generate it.
# catG2_scores_df.to_csv("catG2_scores_human_proteome.csv")

In [None]:
# Reference predictions stored next to the model (presumably for the test set, per the file name).
# NOTE(review): the table is only loaded here; no comparison against the scores
# computed above is performed in the visible cells.
catG2_pred_test = pd.read_csv(
    "../src/TRAINED_MODELS/ALL_FEATURES/MLP/catGRANULE2_prediction_Test.csv",
    index_col=0,
)

In [None]:
# Read the tab-separated DrLLPS table and keep the rows whose Species is "Homo sapiens".
DrLLPS = pd.read_csv("./DrLLPS/LLPS.csv", delimiter="\t")
is_human = DrLLPS["Species"] == "Homo sapiens"
DrLLPS = DrLLPS[is_human]

In [None]:
# UniProt IDs that have a score and are listed in DrLLPS; the cell displays how many there are.
scored_ids = set(catG2_scores_df.index)
drllps_ids = set(DrLLPS["UniProt ID"])
inters_DrLLPS = list(scored_ids.intersection(drllps_ids))
len(inters_DrLLPS)

In [None]:
# Restrict DrLLPS to the shared proteins and index it by UniProt ID.
in_both = DrLLPS["UniProt ID"].isin(inters_DrLLPS)
DrLLPS = DrLLPS.loc[in_both].set_index("UniProt ID")

# Matching rows of the score table.
catG2_scores_df_DrLLPS = catG2_scores_df.loc[inters_DrLLPS]

In [None]:
# Put the score rows in the same order as the DrLLPS rows.
catG2_scores_df_DrLLPS = catG2_scores_df_DrLLPS.reindex(index=DrLLPS.index)

In [None]:
# Displays True when the score rows and the DrLLPS rows carry identical labels in identical order.
(catG2_scores_df_DrLLPS.index == DrLLPS.index).all()

In [None]:
# Attach the scores to DrLLPS; pandas aligns the assigned Series on the index (UniProt ID).
DrLLPS["LLPS_score"] = catG2_scores_df["LLPS_score"]

In [None]:
# Display the two columns used by the condensate analysis.
DrLLPS[["Condensate", "LLPS_score"]]

In [None]:
# Turn the comma-separated 'Condensate' strings into lists.
# Non-string values are passed through unchanged, so a missing annotation (NaN)
# no longer raises AttributeError, and re-running this cell after the column has
# already been converted to lists is a no-op instead of a crash.
DrLLPS['Condensate'] = DrLLPS['Condensate'].apply(
    lambda x: x.split(', ') if isinstance(x, str) else x
)
# One row per (protein, condensate) pair
df_expanded = DrLLPS.explode('Condensate')

# Median 'LLPS_score' of each condensate, sorted in descending order
condensate_median = df_expanded.groupby('Condensate')['LLPS_score'].median().sort_values(ascending=False)

# Drop condensates with fewer than `min_counts` proteins (counts are kept in median order)
min_counts = 5
condensate_counts = df_expanded['Condensate'].value_counts()[condensate_median.index]
condensate_counts = condensate_counts[condensate_counts >= min_counts]

# Make 'Condensate' an ordered categorical that follows the median ranking and
# keep only the rows that belong to a retained condensate
df_expanded['Condensate'] = pd.Categorical(df_expanded['Condensate'], categories=condensate_counts.index, ordered=True)
df_expanded = df_expanded[df_expanded['Condensate'].isin(condensate_counts.index)]

In [None]:
# Human proteome table with subcellular-location annotations, used below to pick
# a negative control. Rows without a location annotation are discarded.
human_proteome = pd.read_csv(
    "uniprotkb_Human_AND_model_organism_9606_2024_01_09.tsv.zip",
    delimiter="\t",
    compression="zip",
)
human_proteome = human_proteome.dropna(subset=["Subcellular location [CC]"])

In [None]:
# Proteome-wide scores. The CSV only exists if the commented-out
# `catG2_scores_df.to_csv(...)` export earlier in this notebook has been run at
# some point; when it is missing, fall back to the in-memory table so that
# Restart & Run All works on a fresh checkout. An existing file is read as before.
catG2_all_human_file = "catG2_scores_human_proteome.csv"
if os.path.isfile(catG2_all_human_file):
    catG2_all_human = pd.read_csv(catG2_all_human_file, index_col=0)
else:
    catG2_all_human = catG2_scores_df.copy()

In [None]:
# Entries whose location annotation contains "Membrane" (case-sensitive match).
is_membrane = human_proteome["Subcellular location [CC]"].str.contains("Membrane")
membrane_proteins = list(human_proteome.loc[is_membrane, "Entry"])

In [None]:
# Scores of the membrane proteins.
# A boolean mask replaces the former `.loc[set(...)]`: pandas deprecated sets as
# indexers in 1.5 and raises TypeError for them since 2.0. `.copy()` returns an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
catG2_membrane = catG2_all_human.loc[catG2_all_human.index.isin(membrane_proteins)].copy()

In [None]:
# Tag every membrane protein with the pseudo-condensate "Membrane" and keep the
# two columns used by the condensate table.
catG2_membrane = catG2_membrane.assign(Condensate="Membrane")[["Condensate", "LLPS_score"]]

In [None]:
# Stack the DrLLPS condensate rows and the membrane rows into one table.
new_df_expanded = pd.concat([df_expanded[["Condensate", "LLPS_score"]], catG2_membrane])

# Median 'LLPS_score' of each condensate, highest first
condensate_median = (
    new_df_expanded
    .groupby('Condensate')['LLPS_score']
    .median()
    .sort_values(ascending=False)
)

# Discard condensates with fewer than `min_counts` proteins
min_counts = 5
condensate_counts = new_df_expanded['Condensate'].value_counts()[condensate_median.index]
condensate_counts = condensate_counts.loc[condensate_counts >= min_counts]

# Ordered categorical following the median ranking; rows of discarded condensates are removed
new_df_expanded['Condensate'] = pd.Categorical(
    new_df_expanded['Condensate'],
    categories=condensate_counts.index,
    ordered=True,
)
new_df_expanded = new_df_expanded.loc[new_df_expanded['Condensate'].isin(condensate_counts.index)]

In [None]:
def _entries_in_location(pattern, case=True):
    """Return the entries whose location annotation contains `pattern`.

    Parameters
    ----------
    pattern : str
        Pattern handed to `Series.str.contains` on the
        "Subcellular location [CC]" column of `human_proteome`.
    case : bool, default True
        Whether the match is case-sensitive.

    Returns
    -------
    list
        Values of the "Entry" column for the matching rows.
    """
    matches = human_proteome["Subcellular location [CC]"].str.contains(pattern, case=case)
    return list(human_proteome.loc[matches, "Entry"])


def _scores_in_location(entries, label):
    """Return the rows of `catG2_all_human` indexed by `entries`, tagged with `label`.

    `assign` builds a new frame, so the label column is never written into a
    slice of `catG2_all_human` (the former in-place assignment on the slice
    triggered SettingWithCopyWarning).
    """
    subset = catG2_all_human.loc[catG2_all_human.index.isin(entries)]
    return subset.assign(Subcellular_Location=label)


# "Membrane" is matched case-sensitively; every other location ignores case.
membrane_proteins = _entries_in_location("Membrane")
df_memb = _scores_in_location(membrane_proteins, "Membrane")
print(len(membrane_proteins))

nucleus_proteins = _entries_in_location("Nucleus", case=False)
df_nuc = _scores_in_location(nucleus_proteins, "Nucleus")
print(len(nucleus_proteins))

# Nucleolus: proteins of the DrLLPS 'Nucleolus' condensate plus the annotation matches.
# The printed length counts proteins found by both sources twice.
nucleolus_proteins = (
    list(new_df_expanded[new_df_expanded.Condensate=='Nucleolus'].index)
    + _entries_in_location("Nucleolus", case=False)
)
df_nucleolus = _scores_in_location(list(set(nucleolus_proteins)), "Nucleolus")
print(len(nucleolus_proteins))

cytoplasm_proteins = _entries_in_location("Cytoplasm", case=False)
df_cytoplasm = _scores_in_location(cytoplasm_proteins, "Cytoplasm")
print(len(cytoplasm_proteins))

extracellular_proteins = _entries_in_location("Extracellular", case=False)
df_extracell = _scores_in_location(extracellular_proteins, "Extracellular")
print(len(extracellular_proteins))

mitochondrial_proteins = _entries_in_location("Mitochondr", case=False)
df_mito = _scores_in_location(mitochondrial_proteins, "Mitochondrial")
print(len(mitochondrial_proteins))

secreted_proteins = _entries_in_location("Secreted", case=False)
df_secr = _scores_in_location(secreted_proteins, "Secreted")
print(len(secreted_proteins))

In [None]:
# Stack the per-location score tables; a protein selected for several locations
# contributes one row per location.
location_frames = [df_nuc, df_nucleolus, df_cytoplasm, df_extracell, df_mito, df_secr, df_memb]
df_subloc = pd.concat(location_frames)

In [None]:
# Median 'LLPS_score' of each subcellular location, highest first
subloc_median = (
    df_subloc
    .groupby('Subcellular_Location')['LLPS_score']
    .median()
    .sort_values(ascending=False)
)

# Discard locations with fewer than `min_counts` proteins
min_counts = 5
subloc_counts = df_subloc['Subcellular_Location'].value_counts()[subloc_median.index]
subloc_counts = subloc_counts.loc[subloc_counts >= min_counts]

# Ordered categorical following the median ranking; rows of discarded locations are removed
df_subloc['Subcellular_Location'] = pd.Categorical(
    df_subloc['Subcellular_Location'],
    categories=subloc_counts.index,
    ordered=True,
)
df_subloc = df_subloc.loc[df_subloc['Subcellular_Location'].isin(subloc_counts.index)]

In [None]:
# Violin plot of the LLPS score per subcellular location; the x order follows
# the ordered categorical built from the median ranking.
plt.figure(figsize=(10, 6))

# Resets matplotlib rc parameters to their defaults (called after the figure is created).
sns.reset_defaults()

# Despite its name, `boxplot` holds the object returned by sns.violinplot; it is
# used below to place the text annotations.
boxplot = sns.violinplot(x='Subcellular_Location', y='LLPS_score', data=df_subloc, palette=sns.color_palette("pastel"))
# Dashed horizontal reference line at a score of 0.5
plt.axhline(0.5,linestyle="dashed",color="grey",lw=3)
plt.xlabel("Subcellular Location")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability

# Write the number of proteins above each violin; subloc_counts is in the same
# order as the plotted categories.
for i, count in enumerate(subloc_counts):
    boxplot.text(i, 1.15, f'n={count}', ha='center', va='bottom', color='black')
# Uncomment to save the figure into the plots folder:
# plt.savefig(plots_folder+"LLPS_score_sub_location.pdf",bbox_inches="tight")
plt.show()

In [None]:
# Pair each location category (in category order) with a colour of the "pastel"
# palette and display the mapping. The next cell replaces it with a hard-coded copy.
location_categories = df_subloc["Subcellular_Location"].cat.categories
location_color = dict(zip(location_categories, sns.color_palette("pastel")))
location_color

In [None]:
# Fixed RGB colour of every subcellular location, so the colours stay stable
# between figures. 'Others' is grey and is used for condensates without a
# specific location.
location_color = {
    'Nucleolus': (0.6313725490196078, 0.788235294117647, 0.9568627450980393),
    'Cytoplasm': (1.0, 0.7058823529411765, 0.5098039215686274),
    'Nucleus': (0.5529411764705883, 0.8980392156862745, 0.6313725490196078),
    'Mitochondrial': (1.0, 0.6235294117647059, 0.6078431372549019),
    'Extracellular': (0.8156862745098039, 0.7333333333333333, 1.0),
    'Secreted': (0.8705882352941177, 0.7333333333333333, 0.6078431372549019),
    'Membrane': (0.9803921568627451, 0.6901960784313725, 0.8941176470588236),
    'Others': (0.5, 0.5, 0.5),
}

In [None]:
# Subcellular location of every condensate category, given POSITIONALLY: entry i
# belongs to the i-th category of new_df_expanded.Condensate, whose order comes
# from the median ranking computed on the data.
locations=["Nucleus", "Nucleus", "Nucleolus","Nucleus",
           "Nucleus", "Others", "Others", 
           "Nucleus","Cytoplasm","Nucleus",
           "Others","Cytoplasm","Nucleus",
           "Nucleus","Nucleus","Nucleus",
           "Nucleus","Nucleus","Nucleus","Others",
           "Nucleus", "Mitochondrial", "Nucleus",
           "Membrane"]

condensate_categories = list(new_df_expanded.Condensate.cat.categories)
# zip() silently stops at the shorter input, which would leave condensates
# unmapped (or hide surplus labels) whenever the data yields a different number
# of categories than the hard-coded list. Fail loudly instead.
if len(condensate_categories) != len(locations):
    raise ValueError(
        f"`locations` has {len(locations)} entries but new_df_expanded.Condensate "
        f"has {len(condensate_categories)} categories: {condensate_categories}"
    )

condensate_location_dict=dict(zip(condensate_categories,locations))
new_df_expanded["Subcellular_Location"]=new_df_expanded["Condensate"].map(condensate_location_dict)
new_df_expanded["Location_color"]=new_df_expanded["Subcellular_Location"].map(location_color)

# Optional manual ordering of the condensate categories (kept for reference):
# new_df_expanded.Condensate=new_df_expanded.Condensate.cat.reorder_categories(['Nucleolus','Stress granule','P-body','Sam68 nuclear body', 'DNA damage foci', 'Nuclear speckle',
#        'Histone locus body', 
#        'Nuclear stress body',  'Cajal body',
#         'PcG body',
#        'Centrosome/Spindle pole body', 'Paraspeckle', 'Chromatoid body',
#        'PML nuclear body', 'Spindle apparatus', 'OPT domain', 
#        'Gemini of cajal body', 'Receptor cluster','Mitochondrial RNA granule','Droplet', 'Neuronal granule','Postsynaptic density','Others',
#        'Membrane'])

In [None]:
# Colour of each condensate = colour of the location it was assigned to.
condensate_colors_dict = {
    name: location_color[condensate_location_dict[name]]
    for name in condensate_location_dict
}

# reordered_condensate_colors_dict = {condensate: condensate_colors_dict[condensate] for condensate in new_df_expanded.Condensate.cat.categories}


In [None]:
# Arrange the counts in the category order of the 'Condensate' column.
condensate_counts = condensate_counts.reindex(index=new_df_expanded["Condensate"].cat.categories)

In [None]:
# Store the location labels as a categorical column.
new_df_expanded["Subcellular_Location"] = new_df_expanded["Subcellular_Location"].astype("category")

In [None]:
# Display the location column (values and categories).
new_df_expanded["Subcellular_Location"]

In [None]:
# Fix the category order of the locations; it determines the legend order of the figure below.
location_order = ['Nucleolus', 'Cytoplasm', 'Nucleus', 'Mitochondrial', 'Others', 'Membrane']
new_df_expanded['Subcellular_Location'] = (
    new_df_expanded['Subcellular_Location'].cat.reorder_categories(location_order)
)

In [None]:
sns.reset_defaults()

# Keep condensates with at least `min_counts` proteins; the "Membrane" group is
# not shown in this figure.
min_counts = 10
condensate_counts_filtered = condensate_counts.loc[condensate_counts >= min_counts]
# errors="ignore": do not fail when "Membrane" was already removed by the count filter
condensate_counts_filtered = condensate_counts_filtered.drop("Membrane", errors="ignore")

# Filter first and copy afterwards, so the column assignments below act on an
# independent frame (copying before the filter still left a slice behind and
# triggered SettingWithCopyWarning).
new_df_expanded_filtered = new_df_expanded.loc[new_df_expanded['Condensate'].isin(condensate_counts_filtered.index)].copy()
# Bracket assignment is used instead of attribute assignment, which only works
# for columns that already exist and have identifier-like names.
new_df_expanded_filtered['Subcellular_Location'] = new_df_expanded_filtered['Subcellular_Location'].cat.remove_unused_categories()
new_df_expanded_filtered['Condensate'] = new_df_expanded_filtered['Condensate'].cat.remove_unused_categories()

# Colours of the retained condensates, in the same order as the plotted categories
condensate_colors_dict_filtered = {key: condensate_colors_dict[key] for key in condensate_counts_filtered.index}

# Violin plot of the LLPS score per DrLLPS condensate category
plt.figure(figsize=(20, 6))

# The palette is passed as a plain list (one colour per x category, same order).
# `dodge=True` was dropped: there is no separate `hue` variable, so it had no
# effect in older seaborn, while in seaborn >= 0.13 (where `palette` without
# `hue` is mapped through an implicit hue) an explicit dodge can narrow and
# offset the violins.
# Despite its name, `boxplot` holds the object returned by sns.violinplot.
boxplot = sns.violinplot(x='Condensate', y='LLPS_score', data=new_df_expanded_filtered,
                         palette=list(condensate_colors_dict_filtered.values()))
plt.xlabel("DrLLPS condensate category")
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
# Dashed horizontal reference line at a score of 0.5
plt.axhline(0.5,linestyle="dashed",color="grey",lw=3)

# Write the number of proteins above each violin (the leftover debug print was removed)
for i, count in enumerate(condensate_counts_filtered):
    boxplot.text(i, 1.27, f'n={count}', ha='center', va='bottom', color='black')

# Legend: one marker per subcellular location still present after filtering
legend_labels = [loc for loc in new_df_expanded_filtered['Subcellular_Location'].cat.categories]
legend_colors = [location_color[loc] for loc in legend_labels]
legend_elements = [plt.Line2D([0], [0], marker='o', color='w', label=label, 
                              markerfacecolor=color, markersize=10) for label, color in zip(legend_labels, legend_colors)]
boxplot.legend(handles=legend_elements,title='Subcellular Location', bbox_to_anchor=(1,1))
# Uncomment to save the figure into the plots folder:
# plt.savefig(plots_folder+"LLPS_score_condensate_type_color_n_15.pdf",bbox_inches="tight")

plt.show()