In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graph_tool.all as gt

import json
import random
import nmi
import glob
import string
import statsmodels

from pathlib import Path
from matplotlib.ticker import FormatStrFormatter
from time import localtime, strftime
from matplotlib.patches import Patch
from sbmtm import sbmtm
from nsbm import nsbm

from helps import *

import warnings
# Silence library warnings so they do not clutter the rendered cell outputs.
warnings.filterwarnings('ignore')

# matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# newer releases no longer ship the old names, so prefer the new name when the
# installed matplotlib provides it and fall back to the legacy one otherwise.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

# Figure defaults shared by every plot in the notebook.
plt.rcParams.update({
    'font.size': 15,
    "xtick.labelsize": 13,
    "ytick.labelsize": 13,
    "axes.titlesize": 15,
    "figure.dpi": 600,
    "savefig.format": "pdf",
    "savefig.bbox": "tight",
})

# triSBM: experiment run

In [None]:
# Read the mRNA and lncRNA tables (first CSV column becomes the index).
mRNA = pd.read_csv("Results/hSBM-mRNA/hSBM-mRNA.csv.gz", index_col=0)
lncRNA = pd.read_csv("Results/hSBM-lncRNA/hSBM-lncRNA.csv.gz", index_col=0)

# Select the lncRNA columns in exactly the order of the mRNA columns.
lncRNA = lncRNA.loc[:, mRNA.columns]

(mRNA.shape, lncRNA.shape)

In [None]:
# Building the graph takes about 2 minutes on an i5-8265U (4 cores, 1.60 GHz) laptop;
# the timestamps printed before and after the call make the elapsed time visible.
timestamp_format = "%Y-%m-%d %H:%M:%S"

model = nsbm()
print(strftime(timestamp_format, localtime()))
model.make_graph_multiple_df(mRNA, [lncRNA])
print(strftime(timestamp_format, localtime()))

In [None]:
# Fit the model with a single initialisation and no verbose output, printing the
# wall-clock time before and after so the duration of the fit can be read off.
fit_started = strftime("%Y-%m-%d %H:%M:%S", localtime())
print(fit_started)
model.fit(n_init=1, verbose=False)
fit_finished = strftime("%Y-%m-%d %H:%M:%S", localtime())
print(fit_finished)

In [None]:
# Store the fitted model's levels with the project helper `save_levels_nSBM`.
# NOTE(review): later cells read files named "<path_to_save>-level-<n>-clusters.csv",
# so this string presumably acts as a file-name prefix; the mkdir below also creates
# a directory with that very name - confirm that this is intended.
path_to_save = "Results/triSBM-mRNA-lncRNA/nSBM/triSBM-mRNA-lncRNA"
Path(path_to_save).mkdir(exist_ok=True, parents=True)
save_levels_nSBM(model, path_to_save)

# triSBM: analysis
Here you can find all the steps that we followed to analyse the outcome of the triSBM-mRNA-lncRNA experiment.

In [None]:
# Reload the two input tables for the analysis part (first CSV column is the index).
df_mRNA = pd.read_csv("Results/hSBM-mRNA/hSBM-mRNA.csv.gz", index_col=0)
df_lncRNA = pd.read_csv("Results/hSBM-lncRNA/hSBM-lncRNA.csv.gz", index_col=0)

# Put the lncRNA columns in the same order as the mRNA columns.
df_lncRNA = df_lncRNA.loc[:, df_mRNA.columns]

(df_mRNA.shape, df_lncRNA.shape)

In [None]:
# Sample annotations, restricted to (and ordered like) the columns of df_lncRNA.
labels = pd.read_csv("HelperFiles/All-datasets-labels.csv", index_col=0)
labels = labels.loc[df_lncRNA.columns]

# Alphabetically sorted list of the distinct values in the `typehisto` column.
subtypes = sorted(set(labels["typehisto"]))

# Tab-separated gene annotation table; when an index value appears more than once
# only its first row is kept.
info = pd.read_csv("HelperFiles/ENS-Info.txt", sep="\t", index_col=0)
is_repeated = info.index.duplicated(keep='first')
info = info[~is_repeated]

In [None]:
# For every hierarchy level: NMI between the annotated subtype (`typehisto`) and the
# cluster assignment, and its ratio to the mean NMI obtained after randomly shuffling
# the assignment (NMI*). The results are written to a JSON file.
N_PERMUTATIONS = 1000            # random shufflings averaged to estimate NMI*
rng = np.random.default_rng(42)  # fixed seed so the random baseline is reproducible

# NOTE(review): this cell reads from and writes to ".../triSBM/", while the run section
# saves the model under ".../nSBM/" and the later cells read from there - confirm which
# directory is intended (the next cell reads the JSON back from ".../triSBM/").
performances = {}
for level in range(0, 3):
    df_clu = pd.read_csv(f"Results/triSBM-mRNA-lncRNA/triSBM/triSBM-mRNA-lncRNA-level-{level}-clusters.csv",
                         index_col=0)
    labels = labels.loc[df_clu.columns]
    # Each column of df_clu is assigned the position of its largest entry.
    labels["nSBM"] = [str(np.array(df_clu[col]).argmax()) for col in df_clu.columns]

    NMI = np.around(nmi.compute_normalised_mutual_information(labels.typehisto, labels["nSBM"]), decimals=3)

    # The list is built once and reshuffled in place: reshuffling an already shuffled
    # list is still a uniformly random permutation.
    shuffled = labels["nSBM"].to_list()
    nmi_rand = 0
    for _ in range(N_PERMUTATIONS):
        rng.shuffle(shuffled)
        nmi_rand += nmi.compute_normalised_mutual_information(labels["typehisto"], shuffled) / N_PERMUTATIONS

    performances[f"Level {level}"] = [NMI, NMI / nmi_rand]

with open("Results/triSBM-mRNA-lncRNA/triSBM/triSBM-mRNA-lncRNA_NMI.json", 'w') as fp:
    json.dump(performances, fp)

In [None]:
# Read the stored scores back and print, per level, the NMI and the integer part of
# the NMI/NMI* ratio.
with open("Results/triSBM-mRNA-lncRNA/triSBM/triSBM-mRNA-lncRNA_NMI.json") as f:
    performances = json.load(f)

for key, scores in performances.items():
    print(f"{key} NMI: {scores[0]}, NMI/NMI*: {int(scores[1])}")

## NMI

In [None]:
# One stacked-bar panel per hierarchy level showing how many samples of each annotated
# subtype fall into each cluster, plus the NMI scores of every level in `performances`.
N_PERMUTATIONS = 1000            # random shufflings averaged to estimate NMI*
rng = np.random.default_rng(42)  # fixed seed so the random baseline is reproducible
legend_properties = {'weight':'bold', "size":"x-large"}

fig, axs = plt.subplots(1,3,figsize=(28,8))
axs = axs.flatten()
performances = {}
for level in range(0,3):
    panel = axs[level]
    df_clu = pd.read_csv(f"Results/triSBM-mRNA-lncRNA/nSBM/triSBM-mRNA-lncRNA-level-{level}-clusters.csv",
                         index_col=0)
    labels = labels.loc[df_clu.columns]
    # Each column of df_clu is assigned the position of its largest entry.
    labels["nSBM"] = [str(np.array(df_clu[col]).argmax()) for col in df_clu.columns]

    # Integer category code of every subtype; codes follow the sorted category order.
    labels["typehisto_1"] = pd.Series(list(labels["typehisto"])).astype('category').cat.codes.values
    # Count table: one row per cluster, one column per subtype code (descending codes).
    fraction_sites = pd.DataFrame(index=labels["nSBM"].unique(), columns=sorted(labels["typehisto_1"].unique())[::-1]).fillna(0)
    for sample in labels[["nSBM","typehisto_1"]].values:
        fraction_sites.at[sample[0],sample[1]] += 1

    fraction_sites = fraction_sites.sort_values(by=list(fraction_sites.columns), ascending=True)
    # Replace the codes with the subtype names; this assumes every entry of `subtypes`
    # occurs in `labels`, so that the reversed sorted names line up with the codes.
    fraction_sites.columns = subtypes[::-1]
    fraction_sites.plot.bar(stacked=True, color=dict(zip(subtypes, nmi.set_colors(subtypes))),
                            width=1, alpha=0.75, ax=panel)

    NMI = np.around(nmi.compute_normalised_mutual_information(labels.typehisto,labels["nSBM"]),decimals=3)
    # Built once and reshuffled in place; a reshuffled shuffle is still a uniform permutation.
    shuffled = labels["nSBM"].to_list()
    nmi_rand = 0
    for _ in range(N_PERMUTATIONS):
        rng.shuffle(shuffled)
        nmi_rand += nmi.compute_normalised_mutual_information(labels["typehisto"],shuffled)/N_PERMUTATIONS
    performances[f"Level {level}"] = [NMI,NMI/nmi_rand]

    panel.set_xlabel("cluster", size=25, weight='bold')
    panel.set_ylabel("number of cells", size=25, weight='bold')
    panel.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
    panel.tick_params(axis='both', which='major', labelsize=25, rotation=0)

    # Only the middle panel keeps its legend.
    if level==1:
        panel.legend(loc=(-0.025,0.7), prop=legend_properties)
    else:
        panel.get_legend().remove()

    # Panel letter (A, B, C) placed in the panel's own axes coordinates.
    panel.text(-0.055, 1.1, string.ascii_uppercase[level],
               transform=panel.transAxes, size=35, weight='bold',rotation=0)
    panel.xaxis.set_major_locator(plt.MaxNLocator(min(10, len(set(labels.nSBM))+1)))

title = "triSBM-mRNA-lncRNA cluster visualisation"
# savefig does not create missing parent directories.
Path("Results/Figures").mkdir(parents=True, exist_ok=True)
plt.savefig(f"Results/Figures/{title}.pdf")
plt.show()

In [None]:
# Per level: the NMI and the integer part of the NMI/NMI* ratio.
for key, scores in performances.items():
    print(f"{key} NMI: {scores[0]}, NMI/NMI*: {int(scores[1])}")

## Data for analysis

In [None]:
# Hierarchy level and dataset name used by the cells that follow.
level = 1
dataset = "triSBM-mRNA-lncRNA"

df_clu = pd.read_csv(f"Results/{dataset}/nSBM/{dataset}-level-{level}-clusters.csv", index_col=0)
labels = labels.loc[df_clu.columns]
# Each column of df_clu is assigned the position of its largest entry.
labels["nSBM"] = [str(np.array(df_clu[sample]).argmax()) for sample in df_clu.columns]

Path(f"Results/{dataset}/nSBM/Data").mkdir(parents=True, exist_ok=True)
labels.to_csv(f"Results/{dataset}/nSBM/{dataset}-level-{level}-sample-cluster.csv")

# NOTE(review): the cluster ids are strings, so they are ordered lexicographically
# ("10" would come before "2") - confirm this is what the downstream helpers expect.
clusters = sorted(set(labels.nSBM))
subtypes = sorted(set(labels.typehisto))

path_to_save = f"Results/{dataset}/nSBM/Outcome analysis/Level {level}"
Path(path_to_save).mkdir(parents=True, exist_ok=True)
clusters

In [None]:
# Alphabetically sorted paths of every file of this dataset/level; the following
# cells pick entries of this list by position.
level_file_pattern = f"Results/{dataset}/nSBM/{dataset}-level-{level}-*"
files = sorted(glob.glob(level_file_pattern))
files

In [None]:
# Four mRNA topic tables produced by the project helper `topic_gene_nSBM`.
# NOTE(review): the input file is chosen by its position in the sorted `files` list,
# and the literals 1 / "triSBM-mRNA-lncRNA" presumably repeat `level` / `dataset` -
# confirm against the helper's signature.
(mRNAtopic_gene,
 mRNAtopic_gene_prob,
 mRNAtopic_gene_genename,
 mRNAtopic_gene_raw) = topic_gene_nSBM(files[1], "mRNA", info, 1, "triSBM-mRNA-lncRNA")
mRNAtopic_gene_genename

In [None]:
# Four lncRNA topic tables produced by the project helper `topic_gene_nSBM`.
# NOTE(review): the input file is chosen by its position in the sorted `files` list,
# and the literals 1 / "triSBM-mRNA-lncRNA" presumably repeat `level` / `dataset` -
# confirm against the helper's signature.
(lncRNAtopic_gene,
 lncRNAtopic_gene_prob,
 lncRNAtopic_gene_genename,
 lncRNAtopic_gene_raw) = topic_gene_nSBM(files[3], "lncRNA", info, 1, "triSBM-mRNA-lncRNA")
lncRNAtopic_gene_genename

In [None]:
# Three mRNA topic-probability tables produced by the project helper `p_c_t_c_nSBM`.
# NOTE(review): the input file is chosen by its position in the sorted `files` list -
# confirm that files[2] is the intended mRNA file.
(p_c_mRNAtopic_class,
 p_c_mRNAtopic_cell,
 p_mRNAtopic_cell) = p_c_t_c_nSBM(files[2], "mRNA", labels, 1, "triSBM-mRNA-lncRNA")
p_c_mRNAtopic_class

In [None]:
# Three lncRNA topic-probability tables produced by the project helper `p_c_t_c_nSBM`.
# NOTE(review): the input file is chosen by its position in the sorted `files` list -
# confirm that files[4] is the intended lncRNA file.
(p_c_lncRNAtopic_class,
 p_c_lncRNAtopic_cell,
 p_lncRNAtopic_cell) = p_c_t_c_nSBM(files[4], "lncRNA", labels, 1, "triSBM-mRNA-lncRNA")
p_c_lncRNAtopic_class

In [None]:
# Run the project helper `loop_topics` in the "up" direction on the mRNA table and
# store its first result as JSON; the rounded threshold it returns is part of the
# file name.
topic_arr_mRNA, threshold = loop_topics(clusters, p_c_mRNAtopic_class, direction="up")

mRNA_up_file = f"{path_to_save}/{dataset} level {level} mRNA-topic up clusters threshold {np.around(threshold, decimals=2)}.json"
with open(mRNA_up_file, 'w') as fp:
    json.dump(topic_arr_mRNA, fp)

topic_arr_mRNA

In [None]:
# Run the project helper `loop_topics` in the "up" direction on the lncRNA table and
# store its first result as JSON; the rounded threshold it returns is part of the
# file name.
topic_arr_lncRNA, threshold = loop_topics(clusters, p_c_lncRNAtopic_class, direction="up")

lncRNA_up_file = f"{path_to_save}/{dataset} level {level} lncRNA-topic up clusters threshold {np.around(threshold, decimals=2)}.json"
with open(lncRNA_up_file, 'w') as fp:
    json.dump(topic_arr_lncRNA, fp)

topic_arr_lncRNA

## Enriched topics

### mRNA

In [None]:
# Folder that receives the enrichment-test outputs of this level.
path = f"Results/{dataset}/nSBM/Outcome analysis/Level {level}"
enrichment_dir = Path(f"{path}/Enrichment test topics")
enrichment_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the JSON dictionary used as the reference database for the mRNA enrichment
# test and show how many keys it holds.
with open("HelperFiles/GSEA.json") as f:
    all_lists_mRNA = json.load(f)

len(all_lists_mRNA)

In [None]:
# Keep only the columns of the topics listed in topic_arr_mRNA (flattened by the
# project helper `flat_list`) in both mRNA topic tables.
selected_mRNA_topics = flat_list(topic_arr_mRNA.values())
mRNAtopic_gene_genename_ok = mRNAtopic_gene_genename[selected_mRNA_topics]
mRNAtopic_gene_raw_ok = mRNAtopic_gene_raw[selected_mRNA_topics]

In [None]:
# Each topic takes about 20-30 seconds on an i5-8265U (4 cores, 1.60 GHz) laptop.
enrichment_test(
    all_lists_mRNA,
    mRNAtopic_gene_genename_ok,
    mRNAtopic_gene_raw_ok,
    info,
    "mRNA",
    f"{path}/Enrichment test topics",
    dataset,
    1,
)

In [None]:
# Read the enrichment-test CSV of every selected mRNA topic, remove the rows that have
# fdr < 3 and no missing value, and order the remaining rows by decreasing fdr.
path = f"Results/{dataset}/nSBM/Outcome analysis/Level {level}"
dfs_hgt_mRNA = {}
for topic in mRNAtopic_gene_genename_ok.columns:
    enrichment = pd.read_csv(f"{path}/Enrichment test topics/{dataset} level {level} Enrichment Test mRNA-topic {topic}.csv",index_col=0)
    rows_to_drop = enrichment[enrichment.fdr < 3].dropna().index
    dfs_hgt_mRNA[topic] = enrichment.drop(rows_to_drop).sort_values(by="fdr", ascending=False)

# Number of retained rows/columns per topic.
for key, table in dfs_hgt_mRNA.items():
    print(key, table.shape)

In [None]:
# Build the topic-name table with the project helper `topics_names` and keep an
# unedited copy on disk before any manual adjustment.
topic_name_mRNA = topics_names(
    topics=flat_list(topic_arr_mRNA.values()),
    enr_test_outcome=dfs_hgt_mRNA,
    database=all_lists_mRNA,
)
topic_name_mRNA.to_csv(f"{path_to_save}/{dataset} level {level} mRNA-topic-name_raw.csv")
topic_name_mRNA

In [None]:
# Hand-picked replacements: for each listed topic, the row of the name table is
# rebuilt from the chosen entry of that topic's enrichment results.
manual_choices = [
    ("mRNA-Topic 10", "c2_KOBAYASHI_EGFR_SIGNALING_24HR_DN"),
    ("mRNA-Topic 9", "c2_SMID_BREAST_CANCER_BASAL_DN"),
]
for to_change, new in manual_choices:
    chosen = dfs_hgt_mRNA[to_change].loc[new]
    topic_name_mRNA.loc[to_change] = [
        chosen.fdr,
        new,
        chosen.inter,
        chosen.len_inter,
        len(all_lists_mRNA[new]),
        chosen.p_gene_topic,
    ]

topic_name_mRNA.to_csv(f"{path_to_save}/{dataset} level {level} mRNA-topic-name.csv")
topic_name_mRNA.sort_index()

In [None]:
# Reload the saved mRNA topic names and render four of its columns as a styled table
# (12px header cells, 16px data cells).
topic_name_mRNA = pd.read_csv(f"{path_to_save}/{dataset} level {level} mRNA-topic-name.csv", index_col=0)

heading_properties = [('font-size', '12px')]
cell_properties = [('font-size', '16px')]
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

topic_name_mRNA[["fdr","name","len_inter", "len_gene_set"]].style.set_table_styles(dfstyle)

### lncRNA

In [None]:
# Folder that receives the enrichment-test outputs of this level.
path = f"Results/{dataset}/nSBM/Outcome analysis/Level {level}"
enrichment_dir = Path(f"{path}/Enrichment test topics")
enrichment_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the JSON dictionary used as the reference database for the lncRNA enrichment
# test and show how many keys it holds.
with open("HelperFiles/lncSEA_red.json") as f:
    all_lists_lncRNA = json.load(f)

len(all_lists_lncRNA)

In [None]:
# Keep only the columns of the topics listed in topic_arr_lncRNA (flattened by the
# project helper `flat_list`) in both lncRNA topic tables.
selected_lncRNA_topics = flat_list(topic_arr_lncRNA.values())
lncRNAtopic_gene_genename_ok = lncRNAtopic_gene_genename[selected_lncRNA_topics]
lncRNAtopic_gene_raw_ok = lncRNAtopic_gene_raw[selected_lncRNA_topics]

In [None]:
# Each topic takes about 20-30 seconds on an i5-8265U (4 cores, 1.60 GHz) laptop.
enrichment_test(
    all_lists_lncRNA,
    lncRNAtopic_gene_genename_ok,
    lncRNAtopic_gene_raw_ok,
    info,
    "lncRNA",
    f"{path}/Enrichment test topics",
    f"{dataset}",
    1,
)

In [None]:
# Read the enrichment-test CSV of every selected lncRNA topic, remove the rows that
# have fdr < 3 and no missing value, and order the remaining rows by decreasing fdr.
path = f"Results/{dataset}/nSBM/Outcome analysis/Level {level}"
dfs_hgt_lncRNA = {}
for topic in sorted(flat_list(topic_arr_lncRNA.values())):
    enrichment = pd.read_csv(f"{path}/Enrichment test topics/{dataset} level {level} Enrichment Test lncRNA-topic {topic}.csv",index_col=0)
    rows_to_drop = enrichment[enrichment.fdr < 3].dropna().index
    dfs_hgt_lncRNA[topic] = enrichment.drop(rows_to_drop).sort_values(by="fdr", ascending=False)

# Number of retained rows/columns per topic.
for key, table in dfs_hgt_lncRNA.items():
    print(key, table.shape)

In [None]:
# Build the lncRNA topic-name table with the project helper `topics_names`, save it,
# and immediately read it back so the following cells work on the CSV version.
topic_name_lncRNA = topics_names(
    topics=flat_list(topic_arr_lncRNA.values()),
    enr_test_outcome=dfs_hgt_lncRNA,
    database=all_lists_lncRNA,
)
raw_names_file = f"{path_to_save}/{dataset} level {level} lncRNA-topic-name_raw.csv"
topic_name_lncRNA.to_csv(raw_names_file)
topic_name_lncRNA = pd.read_csv(raw_names_file, index_col=0)

# Styled view: 12px header cells, 16px data cells.
heading_properties = [('font-size', '12px')]
cell_properties = [('font-size', '16px')]
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

topic_name_lncRNA[["fdr","name","len_inter", "len_gene_set"]].style.set_table_styles(dfstyle)

In [None]:
# Exploratory check: is "CTA-392C11" one of the (string-converted) gene names in `info`?
gene_names_as_text = info["Gene name"].astype(str).values
"CTA-392C11" in gene_names_as_text

In [None]:
# Exploratory lookup: show the row(s) of `info` whose "Gene name" equals OSER1-AS1.
info[info["Gene name"]=="OSER1-AS1"]

In [None]:
# Genes of one topic together with their probabilities, highest probability first.
top = "lncRNA-Topic 19"

# Both fields are text in the table that was read back from CSV, so the list items
# are recovered by stripping the surrounding characters and splitting on ", ".
d = pd.DataFrame(topic_name_lncRNA.loc[top]["inter"][3:-3].replace("'", "").split(", "), columns=["genes"])
d["pgt"] = topic_name_lncRNA.loc[top]["p_gene_topic"][1:-1].replace("'", "").split(", ")

# The split yields strings; sorting strings is lexicographic (e.g. "9e-05" would rank
# above "0.5"), so convert to numbers first. Entries that cannot be parsed become NaN
# and are placed last by sort_values.
d["pgt"] = pd.to_numeric(d["pgt"], errors="coerce")
d.sort_values(by="pgt", ascending=False)

In [None]:
# Exploratory check: is PART1 among the values of the "genes" column of `d`?
"PART1" in d["genes"].values

In [None]:
# Display the first of the tables returned by topic_gene_nSBM for the lncRNA partition.
lncRNAtopic_gene

In [None]:
# Save the lncRNA topic names under their final file name, read them back, and render
# four columns as a styled table (12px header cells, 16px data cells).
final_names_file = f"{path_to_save}/{dataset} level {level} lncRNA-topic-name.csv"
topic_name_lncRNA.to_csv(final_names_file)
topic_name_lncRNA = pd.read_csv(final_names_file, index_col=0)

heading_properties = [('font-size', '12px')]
cell_properties = [('font-size', '16px')]
dfstyle = [
    {"selector": "th", "props": heading_properties},
    {"selector": "td", "props": cell_properties},
]

topic_name_lncRNA[["fdr","name","len_inter", "len_gene_set"]].style.set_table_styles(dfstyle)

### Topic cluster association

In [None]:
# mRNA topic/cluster association table: per-sample topic values are centred on the
# column mean, averaged within each cluster, and then every topic that is not listed
# for a cluster in topic_arr_mRNA is set to 0 for that cluster.
p_mRNAtopic_cell = pd.read_csv(f"Results/{dataset}/nSBM/Data/{dataset}-level-{level}-mRNA-p_topic_sample.csv", index_col=0)
p_c_mRNAtopic_cell = p_mRNAtopic_cell - p_mRNAtopic_cell.mean()
p_c_mRNAtopic_class = pd.DataFrame(index=clusters, columns=p_c_mRNAtopic_cell.columns)

for cla in clusters:
    insample = labels[labels["nSBM"]==cla].index
    p_c_mRNAtopic_class.loc[cla] = p_c_mRNAtopic_cell.loc[insample].mean()
# NOTE(review): rows are renamed by position, not by cluster id; the two agree only
# when `clusters` is exactly "0".."K-1" in this order - confirm for other levels.
p_c_mRNAtopic_class.index = [f"cluster {i}" for i in range(len(p_c_mRNAtopic_class.index))]

topic_name_mRNA = pd.read_csv(f"{path_to_save}/{dataset} level {level} mRNA-topic-name.csv", index_col=0)

# Several threshold files may exist from earlier runs: take the first in sorted order
# (deterministic) and fail with a clear message when there is none.
threshold_files = sorted(glob.glob(f"{path_to_save}/{dataset} level {level} mRNA-topic up clusters threshold *.json"))
if not threshold_files:
    raise FileNotFoundError(f"no mRNA-topic threshold JSON found in {path_to_save}")
file = threshold_files[0]
with open(file) as f:
    topic_arr_mRNA = json.load(f)

# Work on an independent copy so the per-cluster means themselves stay untouched.
topic_cluster_association_mRNA = p_c_mRNAtopic_class.copy()

for key, value in topic_arr_mRNA.items():
    not_in = list(set(topic_cluster_association_mRNA.columns) - set(value))
    # Single .loc call: the chained form `.loc[row][cols] = 0` assigns to a temporary
    # object and is a silent no-op when pandas Copy-on-Write is active.
    topic_cluster_association_mRNA.loc[f"cluster {key}", not_in] = 0

# Append the assigned name to every topic that has one.
column = "name"
new_cols = [f"{col} - {topic_name_mRNA.loc[col, column]}" if col in topic_name_mRNA.index else col
            for col in topic_cluster_association_mRNA.columns]
topic_cluster_association_mRNA.columns = new_cols
topic_cluster_association_mRNA.head(3)

In [None]:
# lncRNA topic/cluster association table: per-sample topic values are centred on the
# column mean, averaged within each cluster, and then every topic that is not listed
# for a cluster in topic_arr_lncRNA is set to 0 for that cluster.
p_lncRNAtopic_cell = pd.read_csv(f"Results/{dataset}/nSBM/Data/{dataset}-level-{level}-lncRNA-p_topic_sample.csv", index_col=0)
p_c_lncRNAtopic_cell = p_lncRNAtopic_cell - p_lncRNAtopic_cell.mean()
p_c_lncRNAtopic_class = pd.DataFrame(index=clusters, columns=p_c_lncRNAtopic_cell.columns)

for cla in clusters:
    insample = labels[labels["nSBM"]==cla].index
    p_c_lncRNAtopic_class.loc[cla] = p_c_lncRNAtopic_cell.loc[insample].mean()
# NOTE(review): rows are renamed by position, not by cluster id; the two agree only
# when `clusters` is exactly "0".."K-1" in this order - confirm for other levels.
p_c_lncRNAtopic_class.index = [f"cluster {i}" for i in range(len(p_c_lncRNAtopic_class.index))]

topic_name_lncRNA = pd.read_csv(f"{path_to_save}/{dataset} level {level} lncRNA-topic-name.csv", index_col=0)

# Several threshold files may exist from earlier runs: take the first in sorted order
# (deterministic) and fail with a clear message when there is none.
threshold_files = sorted(glob.glob(f"{path_to_save}/{dataset} level {level} lncRNA-topic up clusters threshold *.json"))
if not threshold_files:
    raise FileNotFoundError(f"no lncRNA-topic threshold JSON found in {path_to_save}")
file = threshold_files[0]
with open(file) as f:
    topic_arr_lncRNA = json.load(f)

# Work on an independent copy so the per-cluster means themselves stay untouched.
topic_cluster_association_lncRNA = p_c_lncRNAtopic_class.copy()

for key, value in topic_arr_lncRNA.items():
    not_in = list(set(topic_cluster_association_lncRNA.columns) - set(value))
    # Single .loc call: the chained form `.loc[row][cols] = 0` assigns to a temporary
    # object and is a silent no-op when pandas Copy-on-Write is active.
    topic_cluster_association_lncRNA.loc[f"cluster {key}", not_in] = 0

# Append the assigned name to every topic that has one.
column = "name"
new_cols = [f"{col} - {topic_name_lncRNA.loc[col, column]}" if col in topic_name_lncRNA.index else col
            for col in topic_cluster_association_lncRNA.columns]
topic_cluster_association_lncRNA.columns = new_cols
topic_cluster_association_lncRNA.head(3)

In [None]:
# Inputs of the heatmap: the joined mRNA + lncRNA association table, the colour
# annotations for clusters and topics, and the colour-scale limits.

# One table with the mRNA topics followed by the lncRNA topics, as floats.
topic_cluster_association = pd.concat([topic_cluster_association_mRNA, topic_cluster_association_lncRNA], axis=1).astype(float)

# One colour per cluster from the project helper `col_clusters`, sorted by colour value.
lut = dict(zip(topic_cluster_association.index, col_clusters(labels, "nSBM","typehisto")))
col_colors = pd.DataFrame.from_dict(lut, orient="index", columns=["typehisto"]).sort_values(by="typehisto")

# Hand-written second annotation, assigned by position to the sorted rows above.
# The list has exactly 7 entries, so this line fails for any other number of clusters.
col_colors["Second subpopulation"] = ["blue","blue","blue","darkturquoise","red","orange","orange"]
col_colors.columns = ["First subpopulation","Second subpopulation"]

# Topics as rows, clusters (in col_colors order) as columns.
association_by_topic = topic_cluster_association.loc[col_colors.index].T

# Topic annotations. A single .loc[rows, column] call is used because the chained form
# `frame[column].loc[rows] = value` assigns to a temporary object and is a silent no-op
# when pandas Copy-on-Write is active.
row_colors = pd.DataFrame(index=association_by_topic.index, columns=["Gene type", "RNA family"])
row_colors.loc[topic_cluster_association_mRNA.columns, "Gene type"] = "mRNA-topic"
row_colors.loc[topic_cluster_association_mRNA.columns, "RNA family"] = "green"
row_colors.loc[topic_cluster_association_lncRNA.columns, "Gene type"] = "lncRNA-topic"
row_colors.loc[topic_cluster_association_lncRNA.columns, "RNA family"] = "violet"

# Colour-scale limits taken over the entries that are non-zero after rounding.
nonzero_association = association_by_topic.round(decimals=3).replace({'0':np.nan, 0:np.nan})
vmin = nonzero_association.min().min()
vmax = nonzero_association.max().max()

In [None]:
# Annotated heatmap of the topic/cluster association (no row or column clustering),
# with a colour strip marking the RNA family of every topic; saved as a JPEG.
kws = dict(cbar_kws=dict(ticks=[vmin, vmax], orientation='horizontal'),
            annot_kws={"fontsize":20, "fontweight":"bold"}, figsize=(15,14))
# Left-to-right order of the clusters in the heatmap.
order=["cluster 4","cluster 6","cluster 3","cluster 2","cluster 1","cluster 0","cluster 5"]
# Zeros are turned into NaN, and topics that are NaN for every cluster are dropped.
cg=sns.clustermap(topic_cluster_association.loc[order].T.round(decimals=4).replace({'0':np.nan, 0:np.nan}).dropna(axis=0,  how='all'),
                  row_cluster=False, col_cluster=False,  annot=True, vmin=vmin, vmax=vmax, fmt=".3f",
                  row_colors=pd.DataFrame(row_colors["RNA family"]),
                  xticklabels=True, yticklabels=True, **kws)

cg.ax_cbar.set_title("Pc(topic|cluster)", fontsize="x-large")
cg.ax_cbar.set_position((0.6, 0.9, 0.2, 0.025))
plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=45, fontweight="bold", fontsize="16")
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), fontweight="bold", fontsize="20")
for spine in cg.ax_cbar.spines:
    cg.ax_cbar.spines[spine].set_color('crimson')
    cg.ax_cbar.spines[spine].set_linewidth(2)
# NOTE(review): the tick labels are overwritten with "cluster 0".."cluster 6" from left
# to right, so the displayed names no longer match the ids in `order` - presumably an
# intentional renumbering for the figure; confirm.
cg.ax_heatmap.axes.set_xticklabels([f"cluster {i}" for i in range(7)])

# Legend explaining the colours of the topic strip.
leg=dict(zip(["mRNA-topic","lncRNA-topic"],["green","violet"]))
handles = [Patch(facecolor=leg[name]) for name in leg]
leg_1=plt.legend(handles, leg, title="Partition (RNA Family)", title_fontsize="x-large",
                fontsize=18,
                bbox_transform=plt.gcf().transFigure, loc=(1.3,-1.75))
plt.gca().add_artist(leg_1)

title="triSBM_Heatmap_HGT"
# savefig does not create missing parent directories.
Path("Results/Figures").mkdir(parents=True, exist_ok=True)
plt.savefig(f"Results/Figures/{title}.jpeg", dpi=600)
plt.show()