In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.utils.validation import column_or_1d

In [None]:
# --- Input locations ---------------------------------------------------------
data_file = "../DATASETS/TrainSet_data.csv"
labels_file = "../DATASETS/TrainSet_IDs.csv"
test = "../DATASETS/TestSet_data.csv"
testlabels_file = "../DATASETS/TestSet_IDs.csv"

# --- Trained model and its feature-importance table --------------------------
# joblib.load unpickles the file, so only point it at artefacts you trust.
# Presumably a fitted GridSearchCV object (judging by the file name) -- confirm.
model = joblib.load("../src/TRAINED_MODELS/ALL_FEATURES/MLP/gridsearchCV_Object.pkl")
ft_imp_df = pd.read_csv(
    "../src/TRAINED_MODELS/ALL_FEATURES/MLP/GridSearchSelectedFeatures_with_Importance.csv",
    index_col=0,
)

# --- Training set ------------------------------------------------------------
# Missing feature values are replaced by 1.0.
X = pd.read_csv(data_file, index_col=0).fillna(1.0)
data_columns = X.columns
X_v = X.values
# Labels as a flat 1-D array.
y = column_or_1d(pd.read_csv(labels_file, index_col=0)["labels"].values)

# --- Test set ----------------------------------------------------------------
# Keep the row labels around before reducing the frame to a bare array.
test_data = pd.read_csv(test, index_col=0).fillna(1.0)
test_data_index = test_data.index
test_data = test_data.values

ytest = column_or_1d(pd.read_csv(testlabels_file, index_col=0)["labels"].values)

In [None]:
# Two per-protein feature tables; the first CSV column becomes the row index.
pc_scales = pd.read_csv(
    "../src/human_proteome_physchemscales.csv.gz", index_col=0, compression="gzip"
)
af_scales = pd.read_csv(
    "../src/human_proteome_alphafold2.csv.zip", index_col=0, compression="zip"
)

# Add length-normalised versions of two columns as "<column>_norm".
for raw_col in ("RG_protein", "n_contacts"):
    af_scales[raw_col + "_norm"] = af_scales[raw_col] / af_scales["Length"]

In [None]:
# Proteins present in both feature tables.
inters = list(set(pc_scales.index).intersection(af_scales.index))
pc_scales, af_scales = pc_scales.loc[inters], af_scales.loc[inters]

# Join the two tables column-wise, then keep exactly the training columns
# in the order of the training matrix X.
all_scales = pd.concat([pc_scales, af_scales], axis=1).loc[:, X.columns]

In [None]:
# Feature annotation table, reduced to the four columns used in this notebook.
feature_family_df = pd.read_excel(
    "../../RECURSIVE_BEST_FEATURES_ELIMINATION_JAN2024/Supplementary_Table_S1.xlsx"
)[["Feature", "Feature_ID", "Family", "Type"]]

# Lookup: full feature name -> short feature identifier.
mapping_features = dict(
    zip(feature_family_df["Feature"], feature_family_df["Feature_ID"])
)

In [None]:
# Condensate membership table; "Condensate" is stored as a categorical so its
# categories can be enumerated and re-ordered later on.
condensate_df = pd.read_csv("../df_condensates.csv").astype({"Condensate": "category"})

In [None]:
# Unique proteins annotated to any condensate.
myprots = list(set(condensate_df.Uniprot_ID))

# For the training set and then the test set, print the number of condensate
# proteins followed by how many of them appear in that set.
for known_ids in (X.index, test_data_index):
    print(len(myprots), len(set(myprots).intersection(set(known_ids))))

In [None]:
# Feature rows for the condensate proteins (label-based lookup in all_scales).
myprot_data = all_scales.loc[myprots]
# Displayed as the cell output: number of rows retrieved.
myprot_data.shape[0]

In [None]:
# Load catRAPID signature results.
# The default is the original absolute path on the analysis machine; set the
# CATSIG_CSV environment variable to read the table from another location
# without editing the notebook.
import os

CATSIG_CSV = os.environ.get(
    "CATSIG_CSV",
    "/mnt/large/jfiorentino/catRAPID_signature/Gian_catsig_data_all_human_proteome.csv",
)
# index_col=0 consumes the first CSV column. "Uniprot_ID" is deliberately left
# as an ordinary column because it is selected by name further down.
cat_sig = pd.read_csv(CATSIG_CSV, index_col=0)

In [None]:
# One-column table of the Castello RBD feature, with the protein ID (the row
# index of all_scales) copied into a regular column so it can be merged on.
castello = pd.DataFrame(
    all_scales["NucleicAcidBinding-classicalRBD-Castelloetal--Cell2011-149-1393-1406"]
).assign(Uniprot_ID=lambda frame: frame.index)

# Protein ID plus overall score from the catRAPID signature table.
catsig = cat_sig.loc[:, ["Uniprot_ID", "TotalpredictionScore"]]

# Left join: every condensate row is kept, with NaN where the feature is missing.
newcond = condensate_df.merge(castello, on="Uniprot_ID", how="left")


In [None]:
# Attach the catRAPID signature score to every condensate row (left join).
# A score column left over from an earlier run of this cell is dropped first,
# so re-running does not create TotalpredictionScore_x / TotalpredictionScore_y
# duplicates that would break the cells that read "TotalpredictionScore".
newcond = newcond.drop(columns=["TotalpredictionScore"], errors="ignore").merge(
    catsig, on="Uniprot_ID", how="left"
)

In [None]:
# Rank condensates by the mean Castello RBD score, lowest first.
sorted_categories = (
    newcond.groupby('Condensate')['NucleicAcidBinding-classicalRBD-Castelloetal--Cell2011-149-1393-1406']
    .mean()
    .sort_values()
    .index.tolist()
)

# Re-encode "Condensate" as an ordered categorical that follows this ranking,
# then sort the rows accordingly so plots appear in ranked order.
newcond = newcond.assign(
    Condensate=pd.Categorical(newcond['Condensate'], categories=sorted_categories, ordered=True)
).sort_values('Condensate')

In [None]:
import seaborn as sns

# Castello RBD score per condensate: box plot with a point-plot overlay.
rbd_col = "NucleicAcidBinding-classicalRBD-Castelloetal--Cell2011-149-1393-1406"

fig, ax = plt.subplots(figsize=(15, 4))
sns.boxplot(data=newcond, x="Condensate", y=rbd_col, color="skyblue", ax=ax)
sns.pointplot(data=newcond, x="Condensate", y=rbd_col, color="indianred", ax=ax)
ax.set_ylabel("NucleicAcidBinding_2")
ax.tick_params(axis='x', rotation=90)

# Save and close as two statements on the explicit figure. The former
# "savefig(...),close()" tuple expression made the cell echo (None, None).
fig.savefig("castello_RBD_condensate.pdf", bbox_inches="tight")
plt.close(fig)

In [None]:
# Mean catRAPID signature score per condensate, ascending.
score_means = newcond.groupby('Condensate')['TotalpredictionScore'].mean()
sorted_categories = score_means.sort_values().index.tolist()

# Make "Condensate" an ordered categorical in that order and sort rows by it.
newcond = newcond.assign(
    Condensate=pd.Categorical(newcond['Condensate'], categories=sorted_categories, ordered=True)
).sort_values('Condensate')

In [None]:
import seaborn as sns

# catRAPID signature score per condensate: box plot with a point-plot overlay.
fig, ax = plt.subplots(figsize=(15, 4))
sns.boxplot(data=newcond, x="Condensate", y="TotalpredictionScore", color="skyblue", ax=ax)
sns.pointplot(data=newcond, x="Condensate", y="TotalpredictionScore", color="indianred", ax=ax)
ax.tick_params(axis='x', rotation=90)

# Save and close as two statements on the explicit figure. The former
# "savefig(...),close()" tuple expression made the cell echo (None, None).
fig.savefig("catrapid_signature_condensate.pdf", bbox_inches="tight")
plt.close(fig)

In [None]:
# One-column table of the DisProt feature, with the protein ID (row index of
# all_scales) exposed as a regular column for merging.
disprot_col = "DisProt-DunkerAK-ProteinPeptLett-2008-15-9--956"
disprot = pd.DataFrame(all_scales[disprot_col])
disprot["Uniprot_ID"] = disprot.index

# Left join onto the condensate table. A copy of the column left over from an
# earlier run of this cell is dropped first, so re-running does not create
# "_x"/"_y" suffixed duplicates that would break the cells reading this column.
newcond = newcond.drop(columns=[disprot_col], errors="ignore").merge(
    disprot, on="Uniprot_ID", how="left"
)

In [None]:
# Mean DisProt score per condensate, ascending.
disorder_means = newcond.groupby('Condensate')['DisProt-DunkerAK-ProteinPeptLett-2008-15-9--956'].mean()
sorted_categories = disorder_means.sort_values().index.tolist()

# Make "Condensate" an ordered categorical in that order and sort rows by it.
newcond = newcond.assign(
    Condensate=pd.Categorical(newcond['Condensate'], categories=sorted_categories, ordered=True)
).sort_values('Condensate')

In [None]:
import seaborn as sns

# DisProt score per condensate: box plot with a point-plot overlay.
disorder_col = "DisProt-DunkerAK-ProteinPeptLett-2008-15-9--956"

fig, ax = plt.subplots(figsize=(15, 4))
sns.boxplot(data=newcond, x="Condensate", y=disorder_col, color="skyblue", ax=ax)
sns.pointplot(data=newcond, x="Condensate", y=disorder_col, color="indianred", ax=ax)
ax.tick_params(axis='x', rotation=90)
ax.set_ylabel("Disorder_10")

# Save and close as two statements on the explicit figure. The former
# "savefig(...),close()" tuple expression made the cell echo (None, None).
fig.savefig("disorder_condensate.pdf", bbox_inches="tight")
plt.close(fig)

In [None]:
# One-column table of the aggregation feature, with the protein ID (row index
# of all_scales) exposed as a regular column for merging.
agg_col = "Aggregation-Tartaglia-J-Mol-Biol-2010-402-919"
agg = pd.DataFrame(all_scales[agg_col])
agg["Uniprot_ID"] = agg.index

# Left join onto the condensate table. A copy of the column left over from an
# earlier run of this cell is dropped first, so re-running does not create
# "_x"/"_y" suffixed duplicates that would break the cells reading this column.
newcond = newcond.drop(columns=[agg_col], errors="ignore").merge(
    agg, on="Uniprot_ID", how="left"
)

In [None]:
# Mean aggregation score per condensate, ascending.
aggregation_means = newcond.groupby('Condensate')['Aggregation-Tartaglia-J-Mol-Biol-2010-402-919'].mean()
sorted_categories = aggregation_means.sort_values().index.tolist()

# Make "Condensate" an ordered categorical in that order and sort rows by it.
newcond = newcond.assign(
    Condensate=pd.Categorical(newcond['Condensate'], categories=sorted_categories, ordered=True)
).sort_values('Condensate')

In [None]:
import seaborn as sns

# Aggregation score per condensate: box plot first, point plot drawn on top.
# NOTE(review): this figure is only displayed, never written to disk. The
# disabled savefig at the bottom reuses the catRAPID figure's file name and
# would overwrite that PDF if re-enabled unchanged.
fig, ax = plt.subplots(figsize=(15, 4))
for draw, colour in ((sns.boxplot, "skyblue"), (sns.pointplot, "indianred")):
    draw(
        data=newcond,
        x="Condensate",
        y="Aggregation-Tartaglia-J-Mol-Biol-2010-402-919",
        color=colour,
        ax=ax,
    )
ax.tick_params(axis='x', rotation=90)
# plt.savefig("catrapid_signature_condensate.pdf",bbox_inches="tight"),plt.close()

# Permutation importance by condensate

In [None]:
# Features that are labelled "AlphaFold" (instead of "Physico-chemical") when
# the importance tables are annotated further down.
str_ft = [
    "RBD_ext_min_2",
    "Percentage_BetaBridge_FullSeq",
    "Percentage_AlphaHelix_ExtSeq",
    "n_contacts",
    "Percentage_AlphaHelix_FullSeq",
    "Percentage_Turn_FullSeq",
    "Percentage_Bend_FullSeq",
    "asa_std",
    "Percentage_Bend_ExtSeq",
    "extCharge",
    "fullCharge",
    "Length",
    "stddev_plddt",
    "Percentage_Coil_ExtSeq",
    "RG_protein_norm",
    "average_plddt",
]

In [None]:
# Re-read the label tables as DataFrames so the protein IDs (row index) are
# available. Note: this rebinds y / ytest, which held flat label arrays before.
y = pd.read_csv(labels_file, index_col=0)
ytest = pd.read_csv(testlabels_file, index_col=0)

# Proteins labelled 0 in the training and in the test set.
train_negs = y[y.labels == 0].index
test_negs = ytest[ytest.labels == 0].index

# Unique negatives, training IDs first, in first-appearance order.
# dict.fromkeys de-duplicates while preserving order. list(set(...)) would
# give an order that changes between interpreter runs (string hashes are
# randomised), which would make seeded sampling from this list irreproducible.
all_negs = list(dict.fromkeys(list(train_negs) + list(test_negs)))

In [None]:
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import random
# Permutation importance of every feature, computed separately per condensate.
# For each condensate its proteins (label 1) are paired with an equally sized
# random draw of negatives (label 0), and the trained model is scored on that
# set. Result: all_dfs, one DataFrame per condensate (in category order) with
# one row per permutation repeat and one column per feature.
N_NEGATIVE_DRAWS = 50        # independent negative samples per condensate
N_PERMUTATION_REPEATS = 5    # shuffles per feature within one draw
N_JOBS = 24                  # parallel workers used by permutation_importance

random.seed(42)
all_dfs = []

for cond in list(condensate_df.Condensate.cat.categories):
    # Unique proteins of this condensate in first-appearance order.
    # Series.unique() is order-stable; list(set(...)) is not (string hashes are
    # randomised per interpreter run), and the row order of the evaluation set
    # influences which rows each seeded permutation swaps.
    tmp_prot = condensate_df.loc[condensate_df.Condensate == cond, "Uniprot_ID"].unique().tolist()
    # Select the data corresponding to the proteins of a given condensate
    data_cond = myprot_data.loc[tmp_prot]

    cond_df = []
    for i in range(N_NEGATIVE_DRAWS):
        print(cond, i)
        # As many negatives as positives, drawn with replacement.
        negs = random.choices(all_negs, k=len(data_cond))
        data_negs = all_scales.loc[negs]
        # Positives first, then negatives; missing values are replaced by 1.0.
        mydata = pd.concat([data_cond, data_negs], axis=0).fillna(1.0)
        myy = [1] * len(data_cond) + [0] * len(data_negs)

        # Permutation importance on this condensate-vs-negatives set.
        result_test = permutation_importance(
            model,
            mydata.values,
            myy,
            n_repeats=N_PERMUTATION_REPEATS,
            random_state=42,
            n_jobs=N_JOBS,
        )
        # importances is (n_features, n_repeats); transpose to one row per
        # repeat and label the columns with the frame that was actually scored.
        cond_df.append(pd.DataFrame(result_test.importances.T, columns=mydata.columns))
    cond_df = pd.concat(cond_df)
    print(cond_df)
    all_dfs.append(cond_df)

In [None]:
# Long-format version of each per-condensate importance table, restricted to
# the selected features and annotated with feature type and short feature ID.
all_dfs_2 = []
for cond_importances in all_dfs:
    # Selected features only, transposed so that each feature is a row.
    per_feature = cond_importances.loc[:, ft_imp_df.Sel_Ft].T

    # Default every feature to "Physico-chemical", then relabel those in str_ft.
    per_feature["Type"] = "Physico-chemical"
    per_feature.loc[str_ft, "Type"] = "AlphaFold"
    per_feature["Feature"] = per_feature.index

    # One row per (feature, importance value); the melted column label is not needed.
    long_form = pd.melt(
        per_feature, id_vars=['Type', 'Feature'], var_name='Index', value_name='Values'
    ).drop('Index', axis=1)
    long_form["Feature_ID"] = long_form.Feature.map(mapping_features)

    all_dfs_2.append(long_form)

In [None]:
# Mean importance of each selected feature per condensate, averaged over all
# rows of that condensate's importance table.
all_dfs_3 = []
for cond_importances in all_dfs:
    feature_means = cond_importances.loc[:, ft_imp_df.Sel_Ft].mean(axis=0)
    print(feature_means)
    all_dfs_3.append(feature_means)

# Stack the per-condensate mean vectors: one row per condensate, one column per feature.
mean_df = pd.DataFrame(all_dfs_3)

In [None]:
# Label the rows with the condensate names. This relies on mean_df having been
# built by iterating the condensates in this same category order.
mean_df.index = list(condensate_df.Condensate.cat.categories)

# Replace full feature names by their short IDs. rename() leaves labels without
# a mapping untouched, so (unlike columns.map) unmapped features do not become
# NaN, and re-running this cell does not wipe already-renamed column labels.
mean_df = mean_df.rename(columns=mapping_features)

In [None]:
# Left-to-right column order (short feature IDs) for the importance heat map.
myorder = [
    "average_plddt", "Burial_2", "AlphaHelix_9", "BetaSheet_1",
    "Hydrophobicity_6", "NucleicAcidBinding_2", "RG_protein_norm", "Aggregation_3",
    "Coil_ExtSeq", "BetaSheet_4", "Disorder_8", "stddev_plddt",
    "fg", "Length", "fullCharge", "extCharge",
    "Bend_ExtSeq", "asa_std", "Membrane_7", "Bend_FullSeq",
    "Turn_1", "Turn_FullSeq", "AlphaHelix_4", "AlphaHelix_FullSeq",
    "n_contacts", "AlphaHelix_ExtSeq", "BetaBridge_FullSeq", "RBD_ext_min_2",
]


In [None]:
# Clustered heat map of mean permutation importance per condensate.
# clustermap builds its own figure, so no explicit plt.subplots() is needed.
sns.set(font_scale=1.)

# Columns stay in the fixed myorder sequence; only the condensate rows are clustered.
ordered_means = mean_df.loc[:, myorder]
myfig = sns.clustermap(ordered_means, col_cluster=False, row_cluster=True)

myfig.savefig("Condensate_Permutation_Importance.pdf")