In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import nbinom, poisson, binom
import pymc as pm
import arviz as az
from Bio import Phylo
from tqdm.notebook import tqdm
from Bio import SeqIO



## Merge all N samples into one file

In [7]:
PHYDIR="/data2/kantian/LineageTracing/SMALT/0.Results/2.final_phy_re/"


def merge_normal_phy(samples, n_sites, out_name, phy_dir=PHYDIR):
    """Merge per-sample ``<sample>_filtered_re.phy`` files into one .phy file.

    Each input is read with its first two rows skipped. Every row name is
    rewritten to ``<sample>_<second "_"-separated field of the old name>`` so
    the sample of origin stays visible after merging.

    The output starts with the same two leading rows that the downsampling
    cells of this notebook write (an all-zero "ref" sequence and a row holding
    ``n_rows + 1`` and ``n_sites``). The merged files are later read back with
    ``skiprows=[0, 1]``; without these two rows that read would silently drop
    the first two real cells of the merged pool.

    Parameters
    ----------
    samples : list of str
        Sample prefixes; ``<phy_dir><sample>_filtered_re.phy`` must exist.
    n_sites : int
        Sequence length used for the "ref" row and the size row.
    out_name : str
        File name of the merged output, written inside ``phy_dir``.
    phy_dir : str
        Directory (with trailing slash) holding inputs and output.

    Returns
    -------
    pandas.DataFrame
        The merged rows only (no leading rows), indexed by the new names.
    """
    frames = []
    for mySample in samples:
        phy = pd.read_csv(phy_dir+mySample+"_filtered_re.phy", sep=" ", header=None,
                          skiprows=[0,1], names=["nam","bi"], index_col=0)
        phy.index = [mySample+"_"+i.split("_")[1] for i in phy.index]
        frames.append(phy)
    # One concat at the end instead of growing a frame inside the loop.
    cells = pd.concat(frames)
    numR = cells.shape[0]
    header = pd.DataFrame({"bi":{"ref":"0"*n_sites, str(numR+1):str(n_sites)}})
    pd.concat([header, cells]).to_csv(phy_dir+out_name, sep=" ", index=True, header=None)
    return cells


###############################################################
##------------------------merge all IBD_CRC N (3K)------------------------
###############################################################
samples=["2_N","4_N","5_N","16_N","19_N","47_N1","47_N4","47_N5","47_N6","47_N8",
                    "49_N","50_N","65_N","66_N","17_N","151_N"] ## 18_N, 132_N
data = merge_normal_phy(samples, 3004, "IBD_N_merged_3k.phy")
###############################################################
##------------------------merge all IBD_CRC N (1.5K)-----------------------
###############################################################
samples=["142_N","148_N"] ## 18_N, 132_N
data = merge_normal_phy(samples, 1421, "IBD_N_merged_1.5k.phy")


## 1. Downsampling: if a sample has more than 1000 cells, downsample it to 1000 cells

In [10]:
PHYDIR="/data2/kantian/LineageTracing/SMALT/0.Results/2.final_phy_re/"
OUTDIR="/data2/kantian/LineageTracing/SMALT/0.Results/3.IBD_mergeN1000_T1000/"
# exist_ok=True makes the cell safe to re-run; missing parents are created too.
os.makedirs(OUTDIR, exist_ok=True)


def _read_phy_cells(path):
    """Read a space-separated .phy file, skipping its first two rows.

    Returns a DataFrame indexed by the first field with one column, "bi".
    """
    return pd.read_csv(path, sep=" ", header=None, skiprows=[0,1],
                       names=["nam","bi"], index_col=0)


def write_capped_replicates(samples, normal_name, n_sites, samplingSize=1000,
                            n_replicates=20, phy_dir=PHYDIR, out_dir=OUTDIR):
    """Write downsampled N+T .phy replicates for each tumor sample.

    For every sample and every replicate, the merged N file and the sample's
    own file are each reduced to ``samplingSize`` random rows when they hold
    more than that (smaller inputs are used whole), stacked N first, prefixed
    with the two leading rows ("ref" all-zero sequence and the row carrying
    ``n_rows + 1`` / ``n_sites``), and written to
    ``<out_dir><sample>_sampling<samplingSize>_<replicate>.phy``.

    Parameters
    ----------
    samples : list of str
        Sample prefixes; ``<phy_dir><sample>_filtered_re.phy`` must exist.
    normal_name : str
        File name of the merged N file inside ``phy_dir``.
    n_sites : int
        Sequence length used for the "ref" row and the size row.
    samplingSize : int
        Maximum number of rows kept from each of N and T.
    n_replicates : int
        Number of independent replicates, numbered from 1.
    phy_dir, out_dir : str
        Input and output directories (with trailing slash).

    Notes
    -----
    ``DataFrame.sample`` is called without ``random_state`` and so draws from
    NumPy's global random state; call ``np.random.seed(...)`` before this cell
    if the replicates must be reproducible.
    """
    # The input files do not change between replicates: read each one once.
    normal = _read_phy_cells(phy_dir+normal_name)
    for mySample in samples:
        tumor = _read_phy_cells(phy_dir+mySample+"_filtered_re.phy")
        for rep in range(1, n_replicates+1):
            # N is drawn before T, keeping the per-replicate draw order.
            N = normal.sample(n=samplingSize) if len(normal)>samplingSize else normal
            T = tumor.sample(n=samplingSize) if len(tumor)>samplingSize else tumor
            M = pd.concat([N,T])
            numR = M.shape[0]
            header = pd.DataFrame({"bi":{"ref":"0"*n_sites, str(numR+1):str(n_sites)}})
            data = pd.concat([header, M])
            data.to_csv(out_dir+mySample+"_sampling"+str(samplingSize)+"_"+str(rep)+".phy",
                        sep=" ", index=True, header=None)


samplingSize=1000
###############################################################
##------------------------CRC 10X (3K)------------------------
###############################################################
samples=["132_T1","151_T4","17_T4","18_T3","18_T4"]
write_capped_replicates(samples, "IBD_N_merged_3k.phy", 3004, samplingSize)
###############################################################
##------------------------CRC 10X (1.5K)-----------------------
###############################################################
samples=["142_T1","142_T2","142_T5-4","148_T1","148_T3"]
write_capped_replicates(samples, "IBD_N_merged_1.5k.phy", 1421, samplingSize)


## 2. Downsampling: sample N and T to the same size, min(len(N), len(T), 1000)

In [13]:
# Print each tumor sample's name followed by the number of rows in its
# filtered .phy file (first two rows skipped). Uses PHYDIR from an earlier cell.
samples=["132_T1","151_T4","17_T4","18_T3","18_T4","142_T1","142_T2","142_T5-4","148_T1","148_T3"]
for mySample in samples:
    phy_path = PHYDIR+mySample+"_filtered_re.phy"
    T = pd.read_csv(
        phy_path,
        sep=" ",
        header=None,
        skiprows=[0,1],
        names=["nam","bi"],
        index_col=0,
    )
    # Name and count go on separate lines, as two print calls would emit them.
    print(mySample, len(T), sep="\n")


132_T1
2629
151_T4
426
17_T4
290
18_T3
1007
18_T4
161
142_T1
5691
142_T2
5853
142_T5-4
749
148_T1
1626
148_T3
2324


In [16]:
PHYDIR="/data2/kantian/LineageTracing/SMALT/0.Results/2.final_phy_re/"
OUTDIR="/data2/kantian/LineageTracing/SMALT/0.Results/3.IBD_merge_NTsame/"
# exist_ok=True makes the cell safe to re-run; missing parents are created too.
os.makedirs(OUTDIR, exist_ok=True)

# One entry per panel: (merged N file, number of sites, tumor samples).
# The two panels previously lived in two copy-pasted loops that differed only
# in these three values.
panels = [
    ##------------------------CRC 10X (3K)------------------------
    ("IBD_N_merged_3k.phy", 3004, ["151_T4","17_T4","18_T4"]),
    ##------------------------CRC 10X (1.5K)-----------------------
    ("IBD_N_merged_1.5k.phy", 1421, ["142_T5-4"]),
]
# Every .phy file is space-separated and read with its first two rows skipped.
read_opts = dict(sep=" ", header=None, skiprows=[0,1], names=["nam","bi"], index_col=0)
n_replicates = 20

# NOTE(review): DataFrame.sample is called without random_state, so draws come
# from NumPy's global random state; seed it beforehand for reproducible output.
for normal_name, n_sites, samples in panels:
    # The input files do not change between replicates: read each one once.
    normal = pd.read_csv(PHYDIR+normal_name, **read_opts)
    for mySample in samples:
        tumor = pd.read_csv(PHYDIR+mySample+"_filtered_re.phy", **read_opts)
        # Same row count for N and T: the smaller of the two, capped at 1000.
        samplingSize = min(len(normal), len(tumor), 1000)
        for rep in range(1, n_replicates+1):
            # N is drawn before T, keeping the per-replicate draw order.
            N = normal.sample(n=samplingSize) if len(normal)>samplingSize else normal
            T = tumor.sample(n=samplingSize) if len(tumor)>samplingSize else tumor
            M = pd.concat([N,T])
            numR = M.shape[0]
            # Two leading rows: all-zero "ref" sequence and the row carrying
            # (rows + 1, including ref) and the number of sites.
            header = pd.DataFrame({"bi":{"ref":"0"*n_sites, str(numR+1):str(n_sites)}})
            data = pd.concat([header, M])
            data.to_csv(OUTDIR+mySample+"_sampling"+str(samplingSize)+"_"+str(rep)+".phy",
                        sep=" ", index=True, header=None)

7410