# Used Data

|Cells     |DNA-SLX   |RNA-IDX |
|----------|----------|--------|
|PEO1-MIX  |SLX-24532 |SITTF7  |
|PEO1-MIS  |SLX-24491 |SITTC7  |
|PEO1-STOP |SLX-24518 |SITTG8  |
|NA12878   |SLX-25395 |SITTB3  |


## Setup dataset

In [None]:
import pandas as pd
from pathlib import Path
import subprocess

In [None]:
# Directory that will contain the profile TSVs.
profiles_dir = Path("")

# Absolute path to the root of the cell matching workflow (cell_matching).
workflow_dir = Path("")

# NOTE(review): presumably the root of the scRNA-seq processing workflow, since
# the gene reference below is resolved relative to it - confirm before running.
scrna_cnv_dir = Path("")

# Gene reference file (should be present in the scRNA-seq processing workflow).
genes_ref = Path(scrna_cnv_dir, "resources/annotate_genes_hg19_update_20230126.txt")


In [None]:
# DNA libraries, keyed by SLX id. Each value is a pair
# (cell line name, absolute path to the scAbsolute rds object).
samples_dna = {
    "SLX-24532": ("PEO1", ""),
    "SLX-24491": ("PEO1-Mis", ""),
    "SLX-24518": ("PEO1-Stop", ""),
    "SLX-25395": ("NA12878", ""),
}

# RNA samples, keyed by sample index id; the value is the cell line name.
samples_rna = dict(
    SITTF7="PEO1",
    SITTC7="PEO1-Mis",
    SITTG8="PEO1-Stop",
    SITTB3="NA12878",
)
# RNA-based methods, keyed by method name. Each value is a pair:
#   1. list of profile modes to create (at least one from [cn, cat]),
#   2. path to the output dir for the method, which should contain one
#      directory for each SLX.
rna_methods = {
    # qc filtered gene expression profiles
    "rna": (["cn"], scrna_cnv_dir/"results/cellranger_post"),
    "copykat": (["cn"], scrna_cnv_dir/"results/copykat"),
    "copyvae": (["cn"], scrna_cnv_dir/"results/copyvae"),
    "numbat": (["cat"], scrna_cnv_dir/"results/numbat"),
}

# Number of cells per RNA sample to use. All DNA cells are used.
rna_cells_per_sample = 1400

# Seed passed to the column sampling so the RNA cell subset is repeatable.
random_seed = 42


## Extract all profiles (scAbsolute, gene expression and scRNA-seq based CNV)

In [None]:
# scAbsolute: run transform_data.py on every scAbsolute object listed in
# samples_dna, once per profile mode ("cn" first, then "cat"). All other
# arguments are identical between the two runs.
transform_script = str(workflow_dir/"cell_matching/scripts/transform_data.py")
for slx, (cell_line, scabs_path) in samples_dna.items():
    for mode in ("cn", "cat"):
        result = subprocess.run([
            transform_script,
            "-i", scabs_path,
            "-n", cell_line,
            "-o", str(profiles_dir.absolute()),
            "-t", "scabsolute",
            "-g", genes_ref,
            "-m", mode,
        ])
        # subprocess.run() does not raise on a failing command unless
        # check=True is given. Report a non-zero exit status here so a failed
        # extraction is visible now instead of surfacing later as a missing TSV.
        if result.returncode != 0:
            print(
                f"WARNING: transform_data.py exited with status {result.returncode} "
                f"for {slx} ({cell_line}), mode {mode}"
            )
    print(cell_line)

In [None]:
# RNA-based methods: run transform_data.py for every RNA sample, every method
# configured in rna_methods and every profile mode requested for that method.
# The input passed via -i is the per-sample directory inside the method's
# output dir.
transform_script = str(workflow_dir/"cell_matching/scripts/transform_data.py")
for sample_id, cell_line in samples_rna.items():
    for method, (modes, method_dir) in rna_methods.items():
        for mode in modes:
            result = subprocess.run([
                transform_script,
                "-i", method_dir/sample_id,
                "-n", cell_line,
                "-o", str(profiles_dir.absolute()),
                "-t", method,
                "-g", genes_ref,
                "-m", mode,
            ])
            # subprocess.run() does not raise on a failing command unless
            # check=True is given. Report a non-zero exit status here so a
            # failed extraction is not silently dropped.
            if result.returncode != 0:
                print(
                    f"WARNING: transform_data.py exited with status {result.returncode} "
                    f"for {sample_id} ({cell_line}), method {method}, mode {mode}"
                )
        print(method, sample_id)

## Combine per-sample profiles into one large dataset

In [None]:
# Combine the per-SLX scAbsolute "cn" profiles column-wise into one table.
# Every column is prefixed with the cell line name; samples_dna values are
# (cell line, rds path) tuples, so only the first element goes into the prefix.
dna_cn = pd.concat(
    [
        pd.read_csv(profiles_dir/f"{slx}_scabsolute_cn.tsv", index_col=0, sep="\t")
        .rename(columns=lambda x: f"{samples_dna[slx][0]}_{x}")
        for slx in samples_dna
    ],
    axis=1,
).astype(int)

# Keep only rows whose values sum to more than zero across all columns.
dna_cn = dna_cn.loc[dna_cn.sum(axis=1) > 0]

# The file name must be a relative segment: a segment starting with "/" would
# replace profiles_dir entirely and write to the filesystem root.
dna_cn.to_csv(profiles_dir/"dna_cn_allgenes.tsv", sep="\t")
dna_cn

In [None]:
# Combine the per-SLX scAbsolute "cat" profiles column-wise into one table.
# Every column is prefixed with the cell line name; samples_dna values are
# (cell line, rds path) tuples, so only the first element goes into the prefix.
dna_cat = pd.concat(
    [
        pd.read_csv(profiles_dir/f"{slx}_scabsolute_cat.tsv", index_col=0, sep="\t")
        .rename(columns=lambda x: f"{samples_dna[slx][0]}_{x}")
        for slx in samples_dna
    ],
    axis=1,
).astype(int)

# Keep only rows whose values sum to more than zero across all columns.
dna_cat = dna_cat.loc[dna_cat.sum(axis=1) > 0]
dna_cat.to_csv(profiles_dir/"dna_cat_allgenes.tsv", sep="\t")
dna_cat

In [None]:
# For every RNA-based method and profile mode: read the per-sample profile,
# prefix its columns with the cell line name, draw rna_cells_per_sample columns
# (seeded with random_seed), join all samples column-wise, drop rows containing
# NA and write the combined table to "<method>_<mode>.tsv".
for method, (modes, _) in rna_methods.items():
    for mode in modes:
        temp_df = pd.concat(
            [
                pd.read_csv(profiles_dir/f"{sam}_{method}_{mode}.tsv", index_col=0, sep="\t")
                .rename(columns=lambda x: f"{samples_rna[sam]}_{x}")
                .sample(n=rna_cells_per_sample, random_state=random_seed, axis=1)
                for sam in samples_rna
            ],
            axis=1,
        ).dropna().astype(float)
        # Write in a separate statement: DataFrame.to_csv() returns None when
        # given a path, so chaining it onto the assignment would discard the
        # table that is still needed below.
        temp_df.to_csv(profiles_dir/f"{method}_{mode}.tsv", sep="\t")
        print(method)
        if method == "rna" and mode == "cn":  # required later on
            rna_cn = temp_df.copy()


## Get DNA and RNA profiles on shared set of genes
* Necessary for MaCroDNA and clonealign
* Speeds up the similarity calculation for gene expression vs. DNA, since the shared profiles contain no NA cells

In [None]:
# Genes present in both the DNA and the RNA copy-number tables.
# The result follows the row order of dna_cn (each gene listed once), so the
# exported "shared" tables have the same row order on every run; a plain set
# intersection would make the order depend on string hashing.
rna_gene_set = set(rna_cn.index)
shared_genes = [gene for gene in dict.fromkeys(dna_cn.index) if gene in rna_gene_set]
len(shared_genes)

In [None]:
# Restrict both tables to the shared genes and write each to its own TSV.
dna_cn_shared = dna_cn.loc[shared_genes]
dna_cn_shared.to_csv(profiles_dir/"dna_cn_shared.tsv", sep="\t")

rna_cn_shared = rna_cn.loc[shared_genes]
rna_cn_shared.to_csv(profiles_dir/"rna_cn_shared.tsv", sep="\t")

## Create reduced-complexity set of clones for clonealign (DNA only)

In [None]:
# Independent copy of the DNA copy-number table restricted to the shared genes,
# so later changes to clonealign_data do not touch dna_cn.
clonealign_data = dna_cn.loc[shared_genes, :].copy(deep=True)


In [None]:
# Write the clonealign input table as a tab-separated file in profiles_dir.
clonealign_path = profiles_dir/"dna_cn_clonealign.tsv"
clonealign_data.to_csv(clonealign_path, sep="\t")