In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import json
import glob
import random
import nmi
import string

from pathlib import Path
from matplotlib.ticker import FormatStrFormatter
from helps import *

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas/matplotlib deprecation notices; narrow the filter if needed.
warnings.filterwarnings('ignore')
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases dropped the old names, so fall back when "seaborn" is missing.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
# Global figure defaults: font sizes, resolution and how figures are saved.
plt.rcParams['font.size'] = 15
plt.rcParams["xtick.labelsize"]=13
plt.rcParams["ytick.labelsize"]=13
plt.rcParams["axes.titlesize"]=15
plt.rcParams["figure.dpi"]=600
plt.rcParams["savefig.format"]="pdf"
plt.rcParams["savefig.bbox"]="tight"

# Info-ENS

In [None]:
# Read the tab-separated annotation table (first column becomes the index) and
# keep only the first row for every repeated index value.
info = pd.read_csv("HelperFiles/mart_export.txt", sep="\t", index_col=0)
info = info.loc[~info.index.duplicated(keep='first')]
print(set(info["Gene type"]))
# Rows whose gene type is either protein_coding or lincRNA.
ok = info.loc[info["Gene type"].isin(["protein_coding", "lincRNA"])]
info.head()

In [None]:
# Names in `ok` whose first three characters are "RPL" or "RPS".
gene_names = ok["Gene name"].to_numpy()
ribs = [name for name in gene_names if name[:3] in ("RPL", "RPS")]
len(ribs)

In [None]:
# Set "Gene type" to "RIB-pt" for every gene name listed in `ribs`, then write
# the table to HelperFiles/ENS-Info.txt indexed by the original index ("ENS").
# A copy is used so the `info` frame shown by the earlier cell is not mutated.
temp = info.copy()
temp["ENS"] = temp.index              # keep the original index as a column
temp = temp.set_index("Gene name")    # index by gene name so rows can be picked by name
temp["Gene name"] = temp.index        # set_index dropped the column; restore it
# Single .loc call on the frame: the chained form
# `temp["Gene type"].loc[...] = ...` assigns through an intermediate Series,
# which pandas does not guarantee to write back (and never does under
# copy-on-write). A name missing from the index still raises KeyError.
temp.loc[ribs, "Gene type"] = "RIB-pt"
temp.index.names = ["Name"]
temp = temp.set_index("ENS")
temp.to_csv("HelperFiles/ENS-Info.txt", sep="\t")
temp

In [None]:
# Re-read the annotation table written by the previous step and keep only the
# first row for every repeated index value.
info = pd.read_csv("HelperFiles/ENS-Info.txt", sep="\t", index_col=0)
info = info.loc[~info.index.duplicated(keep='first')]
print(set(info["Gene type"]))
# Rows whose gene type is either protein_coding or lincRNA.
ok = info.loc[info["Gene type"].isin(["protein_coding", "lincRNA"])]
info.head()

In [None]:
# Names in `ok` whose first three characters are "MT-".
gene_names = ok["Gene name"].to_numpy()
mt = [name for name in gene_names if name[:3] == "MT-"]
len(mt)

In [None]:
# Set "Gene type" to "MT" for every gene name listed in `mt`, then write the
# table to HelperFiles/ENS-Info.txt indexed by the original index ("ENS").
# A copy is used so the `info` frame shown by the earlier cell is not mutated.
temp = info.copy()
temp["ENS"] = temp.index              # keep the original index as a column
temp = temp.set_index("Gene name")    # index by gene name so rows can be picked by name
temp["Gene name"] = temp.index        # set_index dropped the column; restore it
# Single .loc call on the frame: the chained form
# `temp["Gene type"].loc[...] = ...` assigns through an intermediate Series,
# which pandas does not guarantee to write back (and never does under
# copy-on-write). A name missing from the index still raises KeyError.
temp.loc[mt, "Gene type"] = "MT"
temp.index.names = ["Name"]
temp = temp.set_index("ENS")
temp.to_csv("HelperFiles/ENS-Info.txt", sep="\t")
temp

In [None]:
# Re-read the annotation table and keep only the first row for every repeated
# index value.
info = pd.read_csv("HelperFiles/ENS-Info.txt", sep="\t", index_col=0)
info = info.loc[~info.index.duplicated(keep='first')]
print(set(info["Gene type"]))
# Only the rows whose gene type is lincRNA.
ok = info.loc[info["Gene type"].eq("lincRNA")]
info.head()

In [None]:
# Names in `ok` whose first three characters are "MIR" or "hsa".
gene_names = ok["Gene name"].to_numpy()
mir = [name for name in gene_names if name[:3] in ("MIR", "hsa")]
len(mir)

In [None]:
# Display the full list of matched names for manual inspection.
mir

In [None]:
# Hand-picked gene names whose "Gene type" is re-labelled as "miRNA" below.
# "MIR194-2" was listed twice; the duplicate is removed (the order is unchanged).
to_change = [
    "MIR296", "MIR24-2", "MIR940", "MIR146A", "MIR219-2", "MIR1302-11", "MIR4313",
    "MIR145", "MIR194-2", "MIR3179-1", "MIR3180-1", "MIR4453", "MIR3180-4", "MIR3179-2",
    "MIR3180-2", "MIR3180-3", "MIR3179-3", "MIR2117", "MIR29A", "MIR451B", "MIR1302-2", "MIR1587",
    "MIR1302-10", "MIR22HG", "hsa-mir-1253", "hsa-mir-7515", "hsa-mir-8072", "MIR371B", "MIR378D2",
]

In [None]:
# Set "Gene type" to "miRNA" for every gene name listed in `to_change`, then
# write the table to HelperFiles/ENS-Info.txt indexed by the original index ("ENS").
# A copy is used so the `info` frame shown by the earlier cell is not mutated.
temp = info.copy()
temp["ENS"] = temp.index              # keep the original index as a column
temp = temp.set_index("Gene name")    # index by gene name so rows can be picked by name
temp["Gene name"] = temp.index        # set_index dropped the column; restore it
# Single .loc call on the frame: the chained form
# `temp["Gene type"].loc[...] = ...` assigns through an intermediate Series,
# which pandas does not guarantee to write back (and never does under
# copy-on-write). A name missing from the index still raises KeyError.
temp.loc[to_change, "Gene type"] = "miRNA"
temp.index.names = ["Name"]
temp = temp.set_index("ENS")
temp.to_csv("HelperFiles/ENS-Info.txt", sep="\t")
temp

# Enrichment test

## GSEA mRNA

In [None]:
# Parse the GMT file directly. Each line is tab-separated:
#   <set name> \t <description / card URL> \t <gene 1> \t <gene 2> ...
# The previous pd.read_csv call split on the card URL (interpreted as a regex)
# and treated the first line as a header, so the first gene set was silently
# dropped; it also relied on positional Series indexing.
# NOTE(review): keys are now the first field of each line, where before they
# were the text after ".../cards/" in the URL — presumably identical for this
# file; confirm if other GMT sources are used. The intermediate frame `d` is
# no longer created.
split_strings = []
with open("Datasets/DatabasesEnrichmentTest/Gsea/msigdb.v7.5.1.symbols.gmt") as f:
    for line in f:
        fields = line.rstrip("\r\n").split("\t")
        if len(fields) < 2:
            continue  # blank or malformed line
        # Same layout as before: [set name, gene 1, gene 2, ...]
        split_strings.append([fields[0]] + fields[2:])
diz = {lista[0]: lista[1:] for lista in split_strings}
len(diz.keys())

In [None]:
# Serialise the gene-set dictionary to HelperFiles/mSEA.json.
out_path = Path('HelperFiles/mSEA.json')
with out_path.open('w') as handle:
    handle.write(json.dumps(diz))

## lncSEA

In [None]:
# glob returns paths in arbitrary (filesystem) order; sort them so the
# concatenation order — and therefore which row wins when two rows produce the
# same dictionary key later on — is the same on every run and machine.
files = sorted(glob.glob("Datasets/DatabasesEnrichmentTest/lncSEA/*csv.gz"))
files

In [None]:
# One DataFrame per lncSEA file (tab-separated, first column as index),
# in the same order as `files`.
dfs = []
for path in files:
    dfs.append(pd.read_csv(path, sep="\t", index_col=0))

In [None]:
# Print each file name (directory part removed) next to its table's shape.
for file, df in zip(files, dfs):
    print(Path(file).name, df.shape)

In [None]:
# Scatter plot of the number of rows in every lncSEA table (log scale), one
# point per file, ordered by size.
# The previous `file[40:-4]` slice hard-coded the directory length and removed
# only four characters, which leaves ".cs" at the end of every "*.csv.gz"
# label; strip the full extension instead (falling back to the old four
# characters for names that do not end in ".csv.gz").
list_types = []
for file in files:
    base = Path(file).name
    list_types.append(base[:-len(".csv.gz")] if base.endswith(".csv.gz") else base[:-4])
shapes = pd.DataFrame({"shape": [df.shape[0] for df in dfs], "list_type": list_types})
shapes = shapes.sort_values(by="shape")
sns.scatterplot(data=shapes, y="shape", x="list_type")
plt.xticks(rotation=90)
plt.yscale("log")
plt.show()

In [None]:
# Stack all lncSEA tables row-wise under a fresh 0..n-1 index.
df_tot = pd.concat(dfs, axis=0, ignore_index=True)
print(df_tot.shape)
df_tot.head()

In [None]:
# Total number of ";"-separated LncRNA entries over all rows.
len(flat_list([entry.split(";") for entry in df_tot["LncRNA"]]))

In [None]:
# Map "<Class>_<Sub Class>_<Set>" to the list of lncRNAs in that row.
# Iterating the columns directly avoids one .loc row lookup per field.
diz = {}
for a, b, c, lncrnas in zip(df_tot["Class"], df_tot["Sub Class"],
                            df_tot["Set"], df_tot["LncRNA"]):
    key = f"{a}_{b}_{c}"
    diz[key] = lncrnas.split(";")
# A dict never holds duplicate keys, so the earlier
# len(set(keys)) == len(keys) comparison was always true. Compare the number of
# keys with the number of rows instead: a smaller first value means several
# rows produced the same key and only the last one was kept.
len(diz), len(df_tot)

In [None]:
# Serialise the lncRNA-set dictionary to HelperFiles/lncSEA.json.
out_path = Path('HelperFiles/lncSEA.json')
with out_path.open('w') as handle:
    handle.write(json.dumps(diz))

# hSBM vs nSBM with mRNA and lncRNA

In [None]:
# Result tables of the three hSBM runs (first column used as index).
dfm = pd.read_csv("Results/hSBM-mRNA/hSBM-mRNA.csv", index_col=0)
dflnc = pd.read_csv("Results/hSBM-lncRNA/hSBM-lncRNA.csv", index_col=0)
dfmlnc = pd.read_csv("Results/hSBM-mRNA-lncRNA/hSBM-mRNA-lncRNA.csv", index_col=0)
# Label table restricted to, and ordered like, the columns of the mRNA table.
labels = pd.read_csv("HelperFiles/All-datasets-labels.csv", index_col=0).loc[dfm.columns]
labels.shape

In [None]:
level = 1


def _hsbm_assignments(path, samples):
    """Read one hSBM cluster file and return (assignment, clusters).

    `clusters` is the parsed JSON: a dict keyed by "0", "1", ... whose values
    are lists of entries with the sample name in position 0.
    `assignment` is an integer Series indexed by `samples` that holds the
    cluster number of every sample. A sample named in the file but absent from
    `samples` raises KeyError; a sample left without a cluster raises
    ValueError when the "--" placeholder is converted to int.
    """
    with open(path) as f:
        clusters = json.load(f)
    assignment = pd.Series("--", index=samples, dtype=object)
    for i in range(len(clusters)):
        # Plain list of names: also works for an empty cluster, where the
        # former 2-D array slicing raised IndexError.
        members = [entry[0] for entry in clusters[str(i)]]
        assignment.loc[members] = i
    return assignment.astype(int), clusters


# The three hSBM runs share one file layout, so one loop replaces the three
# copy-pasted sections. Filling a standalone Series also avoids the chained
# `labels[col].loc[...] = i` assignment, which pandas does not guarantee to
# write back into `labels` (and never does under copy-on-write).
for column, run in [("hSBM_m", "hSBM-mRNA"),
                    ("hSBM_lnc", "hSBM-lncRNA"),
                    ("hSBM_m_lnc", "hSBM-mRNA-lncRNA")]:
    assignment, clusters = _hsbm_assignments(
        f"Results/{run}/hSBM/{run}-cluster-level-{level}.txt", labels.index)
    # Kept because the original cell defined it (last run wins, as before).
    cluster_df = pd.DataFrame.from_dict(clusters, orient="index")
    labels[column] = assignment.to_numpy()

# triSBM: every sample gets the position of the row holding its column maximum.
df_clu = pd.read_csv(f"Results/triSBM-mRNA-lncRNA/nSBM/triSBM-mRNA-lncRNA-level-{level}-clusters.csv",
                     index_col=0)
df_clu = df_clu[labels.index]
labels["nSBM"] = [int(df_clu[col].argmax()) for col in df_clu.columns]
subtypes = sorted(list(set(labels.typehisto)))
labels.head()

In [None]:
# Pairwise comparison of every column of `labels`:
#   NMI          - normalised mutual information of the two labelings
#   NMI_star     - mean NMI after shuffling the second labeling n_shuffles times
#   NMI-NMI_star - ratio NMI / NMI_star
n_shuffles = 1000
# Seeded generator so the shuffled baseline (and the saved CSVs) are reproducible.
shuffle_rng = np.random.default_rng(0)
columns = list(labels.columns)

nmis = {}
nmis["NMI"] = pd.DataFrame(index=labels.columns, columns=labels.columns)
nmis["NMI_star"] = pd.DataFrame(index=labels.columns, columns=labels.columns)
nmis["NMI-NMI_star"] = pd.DataFrame(index=labels.columns, columns=labels.columns)
for pos, col in enumerate(columns):
    print(col)
    # Each value is mirrored into both [col, co] and [co, col], so every
    # unordered pair needs to be evaluated only once. Pairing `col` with the
    # columns up to and including itself keeps the argument order whose result
    # previously survived in the tables (the later column passed first).
    for co in columns[:pos + 1]:
        NMI = np.around(nmi.compute_normalised_mutual_information(labels[col], labels[co]), decimals=4)
        nmis["NMI"].at[col, co] = NMI
        nmis["NMI"].at[co, col] = NMI
        # Build the list once and keep reshuffling it in place.
        shuffled = labels[co].to_list()
        nmi_rand = 0
        for k in range(n_shuffles):
            shuffle_rng.shuffle(shuffled)
            nmi_rand += nmi.compute_normalised_mutual_information(labels[col], shuffled) / n_shuffles
        nmis["NMI_star"].at[col, co] = np.around(nmi_rand, decimals=4)
        nmis["NMI_star"].at[co, col] = np.around(nmi_rand, decimals=4)
# Previously this table was allocated but never filled.
nmis["NMI-NMI_star"] = nmis["NMI"] / nmis["NMI_star"]
nmis["NMI"].to_csv(f"Results/mRNA-lncRNA comparison NMI level {level}.csv")
nmis["NMI_star"].to_csv(f"Results/mRNA-lncRNA comparison NMI_star level {level}.csv")
nmis["NMI-NMI_star"].to_csv(f"Results/mRNA-lncRNA comparison NMI-NMI_star level {level}.csv")

In [None]:
# File-name tags, the display names they are stored under, and the tick labels
# used for the four experiments in the figure.
tipi = ["NMI","NMI_star", "NMI-NMI_star"]
nomi = ["NMI","NMI*", "NMI/NMI*"]
names = ["hSBM-mRNA","hSBM-lncRNAs","hSBM-mRNA-lncRNA","triSBM-mRNA-lncRNA"]

# Read each saved comparison table back, keyed by its display name.
datas = {}
for idx in range(min(len(tipi), len(nomi))):
    tipo, nome = tipi[idx], nomi[idx]
    print(tipo, nome)
    datas[nome] = pd.read_csv(f"Results/mRNA-lncRNA comparison {tipo} level {level}.csv", index_col=0)

In [None]:
# Three-panel figure: (A) heatmap of NMI/NMI*, (B, C) stacked bars with the
# number of samples of each `typehisto` value inside every cluster of the
# "hSBM_m_lnc" and "nSBM" labelings.
fig = plt.figure(figsize=(29,8))

gs_left = gridspec.GridSpec(1, 1)
gs_right = gridspec.GridSpec(1, 2)

ax0 = fig.add_subplot(gs_left[0,0])
ax1 = fig.add_subplot(gs_right[0,0])
ax2 = fig.add_subplot(gs_right[0,1],sharey=ax1)
axs=[ax0,ax1,ax2]

# Loop-invariant set-up, hoisted out of the per-panel loop.
# Integer code per sample; codes follow the sorted category order, which is the
# order of `subtypes` as long as `typehisto` has no missing values.
labels["typehisto_1"]=pd.Series(list(labels["typehisto"])).astype('category').cat.codes.values
subtype_codes = sorted(labels["typehisto_1"].unique())[::-1]
legend_properties = {'weight':'bold', "size":"x-large"}

for col, panel in zip(["hSBM_m_lnc","nSBM"], [1,2]):
    # Integer zeros from the start: an empty (object) frame followed by
    # .fillna(0) depends on pandas' deprecated silent downcasting.
    fraction_sites = pd.DataFrame(0, index=labels[col].unique(), columns=subtype_codes)
    for sample in labels[[col,"typehisto_1"]].values:
        fraction_sites.at[sample[0],sample[1]] += 1

    fraction_sites = fraction_sites.sort_values(by=list(fraction_sites.columns), ascending=True)
    fraction_sites.columns=subtypes[::-1]
    fraction_sites.plot.bar(stacked=True, color=dict(zip(subtypes, nmi.set_colors(subtypes))),
                           width=1, alpha=0.75, ax=axs[panel])

    axs[panel].set_xlabel("cluster", size=25, weight='bold')
    axs[panel].set_ylabel("number of cells", size=25, weight='bold')
    axs[panel].yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
    axs[panel].tick_params(axis='both', which='major', labelsize=25, rotation=0)

    # Only the first bar panel keeps its legend.
    if panel==1:
        axs[panel].legend(loc=(0.67,0.7), prop=legend_properties)
    else:
        axs[panel].get_legend().remove()

    axs[panel].text(-0.045, 1.03, string.ascii_uppercase[panel],
                    transform=axs[panel].transAxes, size=35, weight='bold',rotation=0)
    # Bars are relabelled 0..n-1 in their sorted order.
    axs[panel].set_xticklabels([i for i in range(len(fraction_sites))])
    # (the former trailing `ax+=1` had no effect: zip rebinds the loop variable)

ratio = datas["NMI/NMI*"]
ratio_min = ratio.min().min()
# Colour scale tops out at the largest value below the overall maximum: every
# occurrence of the maximum is replaced by the minimum before taking the max.
ratio_vmax = ratio.replace(ratio.max().max(), ratio_min).max().max()
axx=sns.heatmap(ratio, annot=True, fmt=".0f",ax=ax0,
                vmin=ratio_min,
                vmax=ratio_vmax,
                annot_kws={"size":25}, cmap=sns.color_palette("rocket", as_cmap=True))
axx.tick_params(labelright=False, labelleft=True)
ax0.text(-0.045, 1.03, string.ascii_uppercase[0],
             transform=ax0.transAxes, size=35, weight='bold')
cbar = axx.collections[0].colorbar
cbar.ax.tick_params(labelsize=25)
cbar.set_label("NMI/NMI*", size=25, weight='bold')

axs[0].set_xticks([])
# NOTE(review): assumes the heatmap rows are exactly typehisto followed by the
# four experiments in `names` order — confirm against the columns of `labels`.
axs[0].set_yticklabels(["typehisto"]+names, rotation=0, size=25, weight='bold')

gs_left.update(right=0.35, wspace=0)
gs_right.update(left=0.44, wspace=0.1)

title="Heatmap 4 exps and multiomic results"
plt.savefig(f"Results/{title}.pdf")
plt.show()