In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import graph_tool.all as gt


import json
import random
import nmi
import string
import glob

from pathlib import Path
from matplotlib import gridspec
from matplotlib.patches import Patch
from time import localtime, strftime
from matplotlib.ticker import FormatStrFormatter
from sbmtm import sbmtm
from nsbm import nsbm

from helps import *

import warnings
# Blanket suppression of every warning category for the whole notebook session.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# 3.8 removed the old names, so prefer the new name and fall back to the legacy one.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Global figure defaults: font sizes, high-resolution rendering, and tightly cropped PDF output.
plt.rcParams['font.size'] = 15
plt.rcParams["xtick.labelsize"]=13
plt.rcParams["ytick.labelsize"]=13
plt.rcParams["axes.titlesize"]=15
plt.rcParams["figure.dpi"]=600
plt.rcParams["savefig.format"]="pdf"
plt.rcParams["savefig.bbox"]="tight"

# Batch effect: experiment run

In [None]:
# Gather every path matching the batch-effect pattern and sort it in place, so the
# positional indexing used by the loading cells below is deterministic.
files = glob.glob("Results/Batch effect/*csv")
files.sort()
files

In [None]:
# Load the second entry of the sorted `files` list, using the first CSV column as index.
# NOTE(review): the selection is purely positional -- check the `files` listing above
# to make sure index 1 really is the mRNA table.
mRNA = pd.read_csv(files[1], index_col=0)
print(mRNA.shape)
mRNA.head(3)

In [None]:
# Load the first entry of the sorted `files` list, using the first CSV column as index.
# NOTE(review): the selection is purely positional -- check the `files` listing above
# to make sure index 0 really is the lncRNA table.
lncRNA = pd.read_csv(files[0], index_col=0)
print(lncRNA.shape)
lncRNA.head(3)

In [None]:
# Print a wall-clock timestamp before and after the graph construction so the
# elapsed time can be read from the cell output (strftime uses the current local
# time when no time tuple is given).
print(strftime("%Y-%m-%d %H:%M:%S"))

# Build the model graph from the mRNA table plus a list holding the lncRNA table.
# Author's note: about 13-15 minutes on an i5-8265U (4 cores, 1.60 GHz) laptop.
model = nsbm()
model.make_graph_multiple_df(mRNA, [lncRNA])

print(strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
# Timestamp, fit, timestamp: the two printed times bracket the fitting step.
# Author's note: about 30 minutes with n_init=1 on an i5-8265U (4 cores, 1.60 GHz) laptop.
print(strftime("%Y-%m-%d %H:%M:%S"))
model.fit(n_init=1, verbose=False)
print(strftime("%Y-%m-%d %H:%M:%S"))

In [None]:
# Make sure the target path exists as a directory (creating missing parents, no
# error if it is already there), then hand the fitted model to the helper that
# writes the per-level results.
# NOTE(review): the analysis section reads
# "Results/Batch effect/nSBM/trisbm-batch-effect-level-{level}-clusters.csv";
# confirm that save_levels_nSBM, given this path, actually produces those files.
path_to_save = "Results/Batch effect/nSBM/triSBM batch effect"
Path(path_to_save).mkdir(exist_ok=True, parents=True)
save_levels_nSBM(model, path_to_save)

# Batch effect: analysis

In [None]:
# Reload both expression tables by explicit file name (first CSV column as index).
df_mRNA = pd.read_csv("Results/Batch effect/Health-sc-mRNA-test-batch.csv", index_col=0)
# Select the lncRNA columns in exactly the column order of the mRNA table.
df_lncRNA = pd.read_csv(
    "Results/Batch effect/Health-sc-lncRNA-test-batch.csv", index_col=0
)[df_mRNA.columns]
(df_mRNA.shape, df_lncRNA.shape)

In [None]:
# Label table restricted (and reordered) to the columns of the lncRNA table.
labels = pd.read_csv("HelperFiles/All-datasets-labels.csv", index_col=0).loc[df_lncRNA.columns]
# Distinct `typehisto` values in sorted order; used later for legend entries and colours.
subtypes = sorted(set(labels["typehisto"]))
# Tab-separated info table; when an index value repeats, only its first row is kept.
info = pd.read_csv("HelperFiles/ENS-Info.txt", sep="\t", index_col=0)
info = info.loc[~info.index.duplicated(keep='first')]

In [None]:
# Figure: one panel per hierarchy level. Each panel is a stacked bar chart of how
# many samples of each `typehisto` fall in each cluster, titled with the NMI between
# clusters and `typehisto`, the mean NMI after shuffling the cluster labels (NMI*),
# and the ratio of the two.
n_levels = 4        # levels 0-3 are read from disk, one panel each in the 2x2 grid
n_shuffles = 1000   # number of random permutations averaged to estimate NMI*
# Dedicated seeded generator: NMI* (and so the panel titles) is reproducible across
# runs, and the global NumPy random state is left untouched.
nmi_rng = np.random.default_rng(42)
legend_properties = {'weight':'bold', "size":25}

fig, axs = plt.subplots(2,2,figsize=(20,20))
axs=axs.flatten()

for level in range(n_levels):
    panel = axs[level]
    df_clu=pd.read_csv(f"Results/Batch effect/nSBM/trisbm-batch-effect-level-{level}-clusters.csv",
                       index_col=0)
    # Restrict/reorder the label table to the columns of the cluster table.
    labels=labels.loc[df_clu.columns]
    # Each column of df_clu gets the position of its largest entry as cluster id.
    labels["nSBM"]=[str(np.array(df_clu[col]).argmax()) for col in df_clu.columns]

    # Integer category codes for `typehisto`. The renaming with `subtypes[::-1]` below
    # assumes these codes follow the same order, over the same set of types, as the
    # sorted `subtypes` list -- TODO confirm if the sample set ever changes.
    labels["typehisto_1"]=pd.Series(list(labels["typehisto"])).astype('category').cat.codes.values
    fraction_sites = pd.DataFrame(index=labels["nSBM"].unique(), columns=sorted(labels["typehisto_1"].unique())[::-1]).fillna(0)
    # Count samples per (cluster, type code) pair.
    for cluster, type_code in labels[["nSBM","typehisto_1"]].values:
        fraction_sites.at[cluster, type_code] += 1

    fraction_sites = fraction_sites.sort_values(by=list(fraction_sites.columns), ascending=True)
    fraction_sites.columns=subtypes[::-1]
    fraction_sites.plot.bar(stacked=True, color=dict(zip(subtypes, nmi.set_colors(subtypes))),
                           width=1, alpha=0.75, ax=panel)

    NMI=np.around(nmi.compute_normalised_mutual_information(labels.typehisto,labels["nSBM"]),decimals=3)
    # Null reference: average NMI over repeated shuffles of the cluster labels.
    # The list is built once and re-shuffled in place on every iteration.
    shuffled=labels["nSBM"].to_list()
    nmi_rand=0
    for _ in range(n_shuffles):
        nmi_rng.shuffle(shuffled)
        nmi_rand+=nmi.compute_normalised_mutual_information(labels["typehisto"],shuffled)/n_shuffles
    panel.set_title(f"NMI {NMI} NMI* {np.around(nmi_rand,decimals=4)} NMI/NMI* {np.round(NMI/nmi_rand,decimals=2)}", size=25, weight='bold')

    panel.set_xlabel("clusters", size=25, weight='bold')
    panel.set_ylabel("number of cells", size=25, weight='bold')
    panel.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))
    panel.tick_params(axis='both', which='major', labelsize=25, rotation=0)
    panel.set_xticks([])

    # Only the first panel keeps a legend; the others drop the one pandas created.
    if level==0:
        panel.legend(loc=(0.05,0.55), prop=legend_properties)
    else:
        panel.get_legend().remove()

    # Panel letter (A, B, C, ...) positioned in the panel's own axes coordinates.
    panel.text(-0.055, 1.05, string.ascii_uppercase[level],
               transform=panel.transAxes, size=35, weight='bold',rotation=0)

fig.tight_layout(pad=1)
title="triSBM batch effect"
fig.savefig(f"Results/{title}.pdf")
plt.show()