In [1]:
import pandas as pd

This notebook is used to:

- Prepare the downsampled barcodes
- De-anonymize the donors after deconvolution


Create barcodes for downsampling.


In [2]:
# Build one downsampled barcode list per hashtag: keep every row whose
# `vireo_1` value is NOT that hashtag, plus a reproducible 50% sample of the
# rows whose `vireo_1` value IS that hashtag.
#
# The input is read through the same relative path the comparison cell uses,
# so the notebook does not depend on one user's home directory.
donor_barcodes = pd.read_csv(
    "../data/pipeline_output/gx12/donor_match/all_assignment_after_match.csv"
)
for i in range(1, 7):
    # Named `hashtag` (not `hash`) so the builtin `hash` is not shadowed.
    hashtag = f"Hash45{i}_TotalSeqA"
    # Evaluate the membership mask once and reuse it for both halves.
    is_target = donor_barcodes["vireo_1"] == hashtag
    downsampled = pd.concat(
        [
            donor_barcodes[~is_target],
            # Fixed seed keeps the sampled half identical across reruns.
            donor_barcodes[is_target].sample(frac=0.5, random_state=42),
        ]
    )
    print(f"{i}: {len(downsampled)}")
    # Write a single-column, headerless, index-free TSV of the kept barcodes
    # into the current working directory.
    downsampled[["Barcode"]].to_csv(
        f"downsampled_{hashtag}_0.5.tsv", sep="\t", header=False, index=False
    )

1: 4286
2: 4534
3: 4486
4: 4506
5: 4766
6: 4567


Merge the genetic deconvolution results of each downsampled run with the hashing tool assignments.


In [3]:
# Hashing assignments, indexed by the first column of the file.
hashing = pd.read_csv(
    "../data/pipeline_output/gx12/hash_assignment_all.csv", index_col=0
)
for i in range(1, 7):
    # Directory holding the genetic summary of the run downsampled for Hash45<i>.
    summary_dir = f"../data/downsampled/genetic_summary_45{i}"
    downsampled_res = pd.read_csv(
        f"{summary_dir}/genetic_assignment_all.csv", index_col=0
    )
    # Outer join on the index keeps rows that appear in either table.
    downsampled_res_all = hashing.merge(
        downsampled_res, how="outer", left_index=True, right_index=True
    )
    downsampled_res_all.to_csv(f"{summary_dir}/downsampled_res_all.csv")

Compare the assignments from each downsampled run with the assignments from the full dataset.


In [4]:
# Full-dataset `vireo_1` assignment per barcode. The column is relabelled
# "Full Dataset" so it is distinguishable from the per-hashtag
# "... Downsampled Dataset" columns merged onto copies of this frame later.
res = pd.read_csv(
    "../data/pipeline_output/gx12/donor_match/all_assignment_after_match.csv",
    usecols=["Barcode", "vireo_1"],
    index_col=0,
).rename(columns={"vireo_1": "Full Dataset"})

# Expected row counts of the six downsampled runs (Hash451 .. Hash456, in
# order); the comparison loops check the loaded tables against these.
barcodes_remain = [4286, 4534, 4486, 4506, 4766, 4567]

In [5]:
# Start from the full-dataset column and append one vireo column per hashtag,
# each taken from the run that was downsampled for that hashtag.
compare_downsample_vireo = res.copy()
for i in range(1, 7):
    output_dir = f"../data/downsampled/genetic_summary_45{i}"
    vireo_demuxem_match = pd.read_csv(
        f"{output_dir}/donor_match/vireo_1_vs_demuxem_1/all_assignment_after_match.csv",
        usecols=["Barcode", "vireo_1"],
        index_col=0,
    )
    # Sanity check: the table must have the expected number of rows.
    expected_rows = barcodes_remain[i - 1]
    if len(vireo_demuxem_match) != expected_rows:
        raise ValueError(f"The number is not correct for hash 45{i}")
    vireo_demuxem_match = vireo_demuxem_match.rename(
        columns={"vireo_1": f"Hash45{i} Downsampled Dataset"}
    )
    # Outer join on the barcode index keeps barcodes missing from either side.
    compare_downsample_vireo = compare_downsample_vireo.merge(
        vireo_demuxem_match, how="outer", left_index=True, right_index=True
    )

In [6]:
# Persist the vireo comparison table (index written as the first column).
compare_downsample_vireo.to_csv(
    path_or_buf="../data/downsampled/downsampled_compare_vireo_all.csv"
)

In [7]:
# Start from the full-dataset column and append one souporcell column per
# hashtag, each taken from the run that was downsampled for that hashtag.
compare_downsample_souporcell = res.copy()
for i in range(1, 7):
    output_dir = f"../data/downsampled/genetic_summary_45{i}"
    souporcell_demuxem_match = pd.read_csv(
        f"{output_dir}/donor_match/souporcell_1_vs_demuxem_1/all_assignment_after_match.csv",
        usecols=["Barcode", "souporcell_1"],
        index_col=0,
    )
    # Sanity check: the table must have the expected number of rows.
    expected_rows = barcodes_remain[i - 1]
    if len(souporcell_demuxem_match) != expected_rows:
        raise ValueError(f"The number is not correct for hash 45{i}")
    souporcell_demuxem_match = souporcell_demuxem_match.rename(
        columns={"souporcell_1": f"Hash45{i} Downsampled Dataset"}
    )
    # Outer join on the barcode index keeps barcodes missing from either side.
    compare_downsample_souporcell = compare_downsample_souporcell.merge(
        souporcell_demuxem_match, how="outer", left_index=True, right_index=True
    )

In [8]:
# Persist the souporcell comparison table (index written as the first column).
compare_downsample_souporcell.to_csv(
    path_or_buf="../data/downsampled/downsampled_compare_souporcell_all.csv"
)

In [9]:
# Start from the full-dataset column and append one freemuxlet column per
# hashtag, each taken from the run that was downsampled for that hashtag.
compare_downsample_freemuxlet = res.copy()
for i in range(1, 7):
    output_dir = f"../data/downsampled/genetic_summary_45{i}"
    freemuxlet_demuxem_match = pd.read_csv(
        f"{output_dir}/donor_match/freemuxlet_1_vs_demuxem_1/all_assignment_after_match.csv",
        usecols=["Barcode", "freemuxlet_1"],
        index_col=0,
    )
    # Sanity check: the table must have the expected number of rows.
    expected_rows = barcodes_remain[i - 1]
    if len(freemuxlet_demuxem_match) != expected_rows:
        raise ValueError(f"The number is not correct for hash 45{i}")
    freemuxlet_demuxem_match = freemuxlet_demuxem_match.rename(
        columns={"freemuxlet_1": f"Hash45{i} Downsampled Dataset"}
    )
    # Outer join on the barcode index keeps barcodes missing from either side.
    compare_downsample_freemuxlet = compare_downsample_freemuxlet.merge(
        freemuxlet_demuxem_match, how="outer", left_index=True, right_index=True
    )

In [10]:
# Persist the freemuxlet comparison table (index written as the first column).
compare_downsample_freemuxlet.to_csv(
    path_or_buf="../data/downsampled/downsampled_compare_freemuxlet_all.csv"
)

In [11]:
# Start from the full-dataset column and append one scSplit column per
# hashtag, each taken from the run that was downsampled for that hashtag.
#
# Unlike the other tools' loops, no row-count check is made here. Instead,
# barcodes that appear as non-doublets in `scsplit_assignment.csv` but carry
# no value in the matched table are labelled "Not matched".
compare_downsample_scsplit = res.copy()
for i in range(1, 7):
    output_dir = f"../data/downsampled/genetic_summary_45{i}"
    # Column label for this hashtag's run; built once and reused below.
    column = f"Hash45{i} Downsampled Dataset"

    scsplit_demuxem_match = pd.read_csv(
        f"{output_dir}/donor_match/scsplit_1_vs_demuxem_1/all_assignment_after_match.csv",
        usecols=["Barcode", "scsplit_1"],
        index_col=0,
    ).rename(columns={"scsplit_1": column})

    # Raw scSplit assignment of the same run. Read relative to `output_dir`
    # instead of a user-specific absolute path so the notebook is portable.
    singlets = pd.read_csv(f"{output_dir}/scsplit_assignment.csv")
    # Keep only rows whose scSplit label does not contain "doublet".
    singlets = singlets[~singlets["scsplit_1"].str.contains("doublet")]

    # Left-merge with an indicator: `_merge == "both"` marks barcodes that are
    # present in the non-doublet list.
    merged_df = scsplit_demuxem_match.reset_index().merge(
        singlets[["Barcode"]], on="Barcode", how="left", indicator=True
    )
    is_singlet = merged_df["_merge"] == "both"
    # Only fill missing values for those barcodes; other rows keep their NaN.
    merged_df.loc[is_singlet, column] = merged_df.loc[is_singlet, column].fillna(
        "Not matched"
    )
    scsplit_demuxem_match = merged_df.drop(columns=["_merge"]).set_index("Barcode")

    # Outer join on the barcode index keeps barcodes missing from either side.
    compare_downsample_scsplit = compare_downsample_scsplit.merge(
        scsplit_demuxem_match, how="outer", left_index=True, right_index=True
    )

In [12]:
# Persist the scSplit comparison table (index written as the first column).
compare_downsample_scsplit.to_csv(
    path_or_buf="../data/downsampled/downsampled_compare_scsplit_all.csv"
)