# Data preprocessing

This notebook shows how to build the matrices to reproduce our experiments.

In [None]:
import pandas as pd
import scanpy as sc
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import glob

from pathlib import Path
from scipy import io
from helps import *

import warnings
# Silence library warnings so the rendered notebook stays readable.
warnings.filterwarnings('ignore')
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and
# 3.8 removed the old aliases, so use whichever name this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

# single-cell Grosselin2019
The four raw matrices can be found at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117309

They are GSM3290893, GSM3290894, GSM3290895 and GSM3290896

In [None]:
# Root folder of the Grosselin2019 dataset.
path="Datasets/Grosselin2019"
# List everything inside the dataset folder, order it alphabetically and keep
# the first four entries.
folders=glob.glob("Datasets/Grosselin2019/*")
folders.sort()
folders=folders[:4]
folders

In [None]:
# One subtype per sample folder. The pairing relies on the alphabetical order of
# `folders` matching this list -- TODO confirm against the GEO sample names.
subtypes=["Basal","Basal-R","Lum","Lum-R"]
if len(folders)!=len(subtypes):
    # zip() would silently drop the unmatched entries and produce a partial dataset
    raise ValueError(f"Expected {len(subtypes)} sample folders, found {len(folders)}")

dfs=[]           # one genes x cells count matrix per sample
label_frames=[]  # one single-column label frame per sample
for fold, sub in zip(folders, subtypes):
    print(fold, sub)
    genes=pd.read_csv(f"{fold}/genes.tsv",sep="\t", header=None)
    barcodes=pd.read_csv(f"{fold}/barcodes.tsv",sep="\t", header=None)
    mat=io.mmread(f"{fold}/matrix.mtx")
    # Rows: gene identifiers with their first five characters removed (presumably
    # a genome-build prefix -- TODO confirm). Columns: barcodes tagged with the subtype.
    d=pd.DataFrame(data=mat.toarray(),
                   index=[n[5:] for n in genes[0]],
                   columns=["hs-"+sub+"-"+col for col in barcodes[0]])

    label_frames.append(pd.DataFrame({"typehisto": sub}, index=d.columns))

    dfs.append(d)
    print(d.shape)

# Build the label table once instead of re-concatenating on every iteration.
labels=pd.concat(label_frames)

In [None]:
# Number of cells per subtype, drawn with one colour per class.
subtype_palette=["orange","red","blue","cyan"]
sns.countplot(y=labels["typehisto"], palette=subtype_palette)

In [None]:
# Put the samples side by side (genes as rows, cells as columns).
df=pd.concat(dfs, axis=1)
# Shuffle the cell order with a fixed seed so a re-run yields the same matrix.
# The permutation is applied by position, so the column count is preserved even
# if two barcodes happen to share a name.
rng=np.random.default_rng(42)
df=df.iloc[:, rng.permutation(df.shape[1])]
print(df.shape)
df.head()

In [None]:
# Keep one row per cell name; the first occurrence wins.
first_occurrence=~labels.index.duplicated(keep='first')
labels=labels[first_occurrence]
labels.to_csv("HelperFiles/All-datasets-labels.csv")
# Store the raw count matrix as a gzip-compressed CSV
df.to_csv("Datasets/Grosselin2019/Grosselin-sc-RNA-raw.csv.gz",compression="gzip")
labels.shape

In [None]:
# Gene annotation table indexed by gene identifier; keep one row per identifier.
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t", index_col=0)
info = info[~info.index.duplicated(keep='first')]
# Count the genes of the raw matrix per annotated gene type. The series is passed
# as `x=` because recent seaborn releases read the first positional argument as `data`.
sns.countplot(x=info.loc[df.index]["Gene type"])
plt.yscale("log")
plt.show()

## mRNA

In [None]:
# Reload the raw genes x cells matrix saved earlier in this notebook.
raw_counts_file="Datasets/Grosselin2019/Grosselin-sc-RNA-raw.csv.gz"
df=pd.read_csv(raw_counts_file,index_col=0)
print(df.shape)
df.head()

In [None]:
# Gene annotation table, one row per gene identifier (first occurrence kept).
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t", index_col=0)
repeated_ids=info.index.duplicated(keep='first')
info = info[~repeated_ids]

In [None]:
# Restrict the raw matrix to genes annotated as protein coding.
protein_coding_ids=info[info["Gene type"]=="protein_coding"].index
df_pt=df.loc[intersection([df.index, protein_coding_ids])]
df_pt.shape

In [None]:
# Cells become observations and protein-coding genes become variables.
adata=sc.AnnData(X=df_pt.T)
print(adata.X.shape)
# Scale each cell to 1e4 total counts, then log(1+x) transform in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag and plot the highly variable genes, then keep only those.
sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=50, min_disp=0.75)
sc.pl.highly_variable_genes(adata)
hvg_mask=adata.var.highly_variable
adata = adata[:, hvg_mask]
# Check which gene types survived the selection.
selected_types=info.loc[adata.var.index]["Gene type"].to_numpy()
plt.hist(selected_types)
print("Final dimension:", adata.X.shape)
plt.show()

In [None]:
# Back to the genes x cells layout before writing to disk.
temp=pd.DataFrame(data=adata.X.T, index=adata.var.index, columns=adata.obs.index)
print(temp.shape)
name="hSBM-mRNA"
out_dir=Path(f"Results/{name}")
out_dir.mkdir(parents=True, exist_ok=True)
temp.to_csv(f"Results/{name}/{name}.csv.gz",compression="gzip")

## lncRNA

In [None]:
# Reload the raw genes x cells matrix saved earlier in this notebook.
raw_counts_file="Datasets/Grosselin2019/Grosselin-sc-RNA-raw.csv.gz"
df=pd.read_csv(raw_counts_file,index_col=0)
print(df.shape)
df.head()

In [None]:
# Gene annotation table, one row per gene identifier (first occurrence kept).
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t", index_col=0)
repeated_ids=info.index.duplicated(keep='first')
info = info[~repeated_ids]

In [None]:
# Restrict the raw matrix to genes annotated as lincRNA.
lincrna_ids=info[info["Gene type"]=="lincRNA"].index
df_lnc=df.loc[intersection([df.index, lincrna_ids])]
df_lnc.shape

In [None]:
# Cells become observations and lincRNA genes become variables.
adata=sc.AnnData(X=df_lnc.T)
print(adata.X.shape)
# Scale each cell to 1e4 total counts, then log(1+x) transform in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag and plot the highly variable genes (lower dispersion cut-off than the
# mRNA section), then keep only those.
sc.pp.highly_variable_genes(adata, min_mean=0.0, max_mean=50, min_disp=0.25)
sc.pl.highly_variable_genes(adata)
hvg_mask=adata.var.highly_variable
adata = adata[:, hvg_mask]
# Check which gene types survived the selection.
selected_types=info.loc[adata.var.index]["Gene type"].to_numpy()
plt.hist(selected_types)
print("Final dimension:", adata.X.shape)
plt.show()

In [None]:
# Back to the genes x cells layout before writing to disk.
temp=pd.DataFrame(data=adata.X.T, index=adata.var.index, columns=adata.obs.index)
print(temp.shape)
name="hSBM-lncRNA"
out_dir=Path(f"Results/{name}")
out_dir.mkdir(parents=True, exist_ok=True)
temp.to_csv(f"Results/{name}/{name}.csv.gz",compression="gzip")

## mRNA-lncRNA

In [None]:
# Reload the raw genes x cells matrix saved earlier in this notebook.
raw_counts_file="Datasets/Grosselin2019/Grosselin-sc-RNA-raw.csv.gz"
df=pd.read_csv(raw_counts_file,index_col=0)
print(df.shape)
df.head()

In [None]:
# Only the first column (the gene identifiers) of the saved lncRNA matrix is read.
lnc_matrix_file="Results/hSBM-lncRNA/hSBM-lncRNA.csv.gz"
lncs=pd.read_csv(lnc_matrix_file,index_col=0, usecols=[0])
lncs.head()

In [None]:
# Only the first column (the gene identifiers) of the saved mRNA matrix is read.
mrna_matrix_file="Results/hSBM-mRNA/hSBM-mRNA.csv.gz"
ms=pd.read_csv(mrna_matrix_file,index_col=0, usecols=[0])
ms.head()

In [None]:
# Manually normalise the total count in each cell (1e4 counts per cell) and apply
# log(1+x), then keep only the mRNAs and lncRNAs previously selected.
# `df` is left untouched so the cell can be re-run without normalising twice.
# NOTE(review): the per-cell totals here include every gene in `df`, whereas the
# separate mRNA and lncRNA matrices were normalised on their own gene subsets.
selected_genes=flat_list([ms.index, lncs.index])
# The log is element-wise, so subsetting the rows before applying it gives the
# same values as transforming the whole matrix first.
df_tot=np.log(1+(df*1e4/df.sum()).loc[selected_genes])
df_tot.shape, df_tot.min().min(), df_tot.max().max()

In [None]:
# Gene annotation table; keep one row per identifier (as in the other sections)
# so that `.loc` cannot return repeated rows and inflate the counts.
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t", index_col=0)
info = info[~info.index.duplicated(keep='first')]
# The series is passed as `x=` because recent seaborn releases read the first
# positional argument as `data`.
sns.countplot(x=info.loc[intersection([df_tot.index, info.index])]["Gene type"])
plt.show()

In [None]:
# Write the combined mRNA + lncRNA matrix into its own results folder.
name="hSBM-mRNA-lncRNA"
out_dir=Path(f"Results/{name}")
out_dir.mkdir(parents=True, exist_ok=True)
df_tot.to_csv(f"Results/{name}/{name}.csv.gz",compression="gzip")

# Batch effect
Here we show the procedure to design the batch effect test.
All the data can be found at https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE161529

We decided to analyse the transcriptomes of seven donors:
GSM4909253
GSM4909254
GSM4909257
GSM4909263
GSM4909265
GSM4909266
GSM4909268

In [None]:
# Header-less, tab-separated feature table of GSE161529; name its three columns.
features_file="Datasets/HealthyDonors/GSE161529_features.tsv.gz"
feat=pd.read_csv(features_file,sep="\t",header=None)
feat.columns=["ENS","HUGO","info"]
feat.head()

In [None]:
# Barcode files, in alphabetical order.
barcodes=glob.glob("Datasets/HealthyDonors/Barcodes/*")
barcodes.sort()
barcodes

In [None]:
# Count-matrix files, in alphabetical order.
matrixs=glob.glob("Datasets/HealthyDonors/Matrixs/*")
matrixs.sort()
matrixs

In [None]:
# Label table written by the Grosselin2019 section, indexed by cell name.
labels_file="HelperFiles/All-datasets-labels.csv"
all_lab=pd.read_csv(labels_file,index_col=0)
all_lab.shape

In [None]:
# Split the gene annotation table into protein-coding and lincRNA entries and
# stack the two subsets into a single table.
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t", index_col=0)
gene_type=info["Gene type"]
info_pt=info[gene_type=="protein_coding"]
info_lnc=info[gene_type=="lincRNA"]
info_RNA=pd.concat([info_pt,info_lnc])
info_pt.shape, info_lnc.shape, info_RNA.shape

In [None]:
# Preview the six-character slice of the first matrix path that the next cell
# uses as the sample tag (presumably the donor id -- TODO confirm).
first_matrix_path=matrixs[0]
first_matrix_path[44:50]

In [None]:
# This is the slowest step: it takes about 10-12 minutes to read and process the 
# seven matrices with a i5-8265U laptop

# Label frames are collected here and concatenated once after the loop
# (DataFrame.append no longer exists in pandas 2.x).
label_frames=[all_lab]
for bar, mat in zip (barcodes, matrixs):
    print(bar,"\n",mat)
    # Characters 44-49 of the matrix path are used as the sample tag (presumably
    # the donor id inside the file name -- TODO confirm). The fixed offsets
    # depend on the folder layout used in this notebook.
    sample=mat[44:50]
    b=pd.read_csv(bar, header=None, index_col=0)
    # `io` is scipy.io as imported at the top of the notebook; the bare name
    # `scipy` is not imported there.
    m=io.mmread(mat)

    # Dense genes x cells frame: rows follow the feature table, columns are the
    # barcodes prefixed with the sample tag.
    df=pd.DataFrame(data=m.toarray(),
                    index=feat["ENS"],
                    columns=["Health-"+sample+s for s in b.index])

    label_frames.append(pd.DataFrame({"typehisto": "Health-"+sample}, index=df.columns))

    # Keep only the genes present in the protein-coding + lincRNA annotation.
    df=df.loc[intersection([df.index,info_RNA.index])]
    print(sample, df.shape, df.min().min(), df.max().max())
    df.to_csv(f"Datasets/HealthyDonors/Health-{sample}-sc-RNA-raw.csv.gz",compression="gzip")
    # Release the dense matrix before loading the next one.
    del df

all_lab=pd.concat(label_frames)

In [None]:
# Report the size before de-duplication, then keep the first row per cell name
# and write the label table back.
print(all_lab.shape)
first_occurrence=~all_lab.index.duplicated(keep='first')
all_lab = all_lab[first_occurrence]
all_lab.to_csv("HelperFiles/All-datasets-labels.csv")

## mRNA

In [None]:
# Per-donor raw matrices written by the conversion loop earlier in this section.
# They are matched by name rather than by dropping the first *.gz entry, so the
# features file (or any other archive in the folder) can never end up in the list.
files=sorted(glob.glob("Datasets/HealthyDonors/Health-*-sc-RNA-raw.csv.gz"))
files

In [None]:
# Gene annotation table (one row per identifier) and its protein-coding subset.
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t",index_col=0)
repeated_ids=info.index.duplicated(keep='first')
info = info[~repeated_ids]
is_protein_coding=info["Gene type"]=="protein_coding"
pt=info[is_protein_coding]
info.shape, pt.shape

In [None]:
# For every donor: load the raw matrix, keep the protein-coding genes and draw
# 1500 cells at random.
dfs=[]
for i, file in enumerate(files):
    counts=pd.read_csv(file, index_col=0)
    counts=counts.loc[intersection([counts.index,pt.index])]
    # A fixed, per-file random_state makes the selected cells reproducible.
    counts=counts.sample(n=1500,axis=1,random_state=42+i)
    dfs.append(counts)
    print(counts.shape)
    # The series is passed as `x=` because recent seaborn releases read the
    # first positional argument as `data`.
    sns.countplot(x=info.loc[counts.index]["Gene type"])
    plt.show()

In [None]:
# Join the per-donor samples column-wise, then release the list of frames.
df_tot=pd.concat(dfs,axis="columns")
print(df_tot.shape)
del dfs

In [None]:
# Cells become observations and genes become variables.
adata=sc.AnnData(X=df_tot.T)
# Scale each cell to 1e4 total counts, then log(1+x) transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag and plot the highly variable genes, then keep only those.
sc.pp.highly_variable_genes(adata, min_mean=0, max_mean=50, min_disp=0.75)
sc.pl.highly_variable_genes(adata)
hvg_mask=adata.var.highly_variable
adata = adata[:, hvg_mask]
adata.X.T.shape

In [None]:
# Back to the genes x cells layout before writing to disk.
temp=pd.DataFrame(data=adata.X.T, index=adata.var.index, columns=adata.obs.index)
print(temp.shape)
batch_dir=Path("Results/Batch effect")
batch_dir.mkdir(parents=True, exist_ok=True)
temp.to_csv("Results/Batch effect/Health-sc-mRNA-test-batch.csv.gz",compression="gzip")

## lncRNA

In [None]:
# Per-donor raw matrices written by the conversion loop earlier in this section.
# They are matched by name rather than by dropping the first *.gz entry, so the
# features file (or any other archive in the folder) can never end up in the list.
files=sorted(glob.glob("Datasets/HealthyDonors/Health-*-sc-RNA-raw.csv.gz"))
files

In [None]:
# Gene annotation table (one row per identifier), split into its protein-coding
# and lincRNA subsets.
info=pd.read_csv("HelperFiles/ENS-Info.txt",sep="\t",index_col=0)
repeated_ids=info.index.duplicated(keep='first')
info = info[~repeated_ids]
gene_type=info["Gene type"]
info_pt=info[gene_type=="protein_coding"]
info_lnc=info[gene_type=="lincRNA"]
info_pt.shape, info_lnc.shape

In [None]:
# Column labels (cell names) of the mRNA batch-effect matrix saved in the
# previous section; only the labels are kept.
mrna_batch_file="Results/Batch effect/Health-sc-mRNA-test-batch.csv.gz"
mRNA=pd.read_csv(mrna_batch_file, index_col=0).columns
mRNA

In [None]:
# For every donor: load the raw matrix, keep the cells that are columns of the
# mRNA batch matrix, then keep the lincRNA genes.
dfs=[]
for i in range(len(files)):
    print(files[i])
    df=pd.read_csv(files[i], index_col=0)
    print(df.shape)
    df=df[intersection([mRNA, df.columns])]
    print(df.shape)
    df=df.loc[intersection([df.index,info_lnc.index])]
    print(df.shape)
    # The series is passed as `x=` because recent seaborn releases read the
    # first positional argument as `data`.
    sns.countplot(x=info.loc[df.index]["Gene type"])
    dfs.append(df)
    plt.show()

In [None]:
# Join the per-donor lincRNA matrices column-wise, then release the list of frames.
df_tot=pd.concat(dfs,axis="columns")
print(df_tot.shape)
del dfs

In [None]:
# Cells become observations and genes become variables.
adata=sc.AnnData(X=df_tot.T)
# Scale each cell to 1e4 total counts, then log(1+x) transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Flag and plot the highly variable genes (lower dispersion cut-off than the
# mRNA section), then keep only those.
sc.pp.highly_variable_genes(adata, min_mean=0, max_mean=50, min_disp=0.25)
sc.pl.highly_variable_genes(adata)
hvg_mask=adata.var.highly_variable
adata = adata[:, hvg_mask]
adata.X.T.shape

In [None]:
# Back to the genes x cells layout before writing to disk.
temp=pd.DataFrame(data=adata.X.T, index=adata.var.index, columns=adata.obs.index)
print(temp.shape)
batch_dir=Path("Results/Batch effect")
batch_dir.mkdir(parents=True, exist_ok=True)
temp.to_csv("Results/Batch effect/Health-sc-lncRNA-test-batch.csv.gz",compression="gzip")

Once the mRNA and lncRNA matrices have been built, nSBM can be tested for batch effects by following the notebook "B3 - Batch effect".