convert to use umi_count #358

Merged · 6 commits · Feb 1, 2024
107 changes: 35 additions & 72 deletions dandelion/preprocessing/_preprocessing.py
@@ -2033,7 +2033,7 @@ def filter_contigs(

Depends on an `AnnData`.obs slot populated with a 'filter_rna' column. If the aligned sequence is an exact match
between contigs, the contigs will be merged into the one with the highest umi count, adding the summed
- umi count of the duplicated contigs to the duplicate_count column. After this check, if there are still multiple
+ umi count of the duplicated contigs to the duplicate_count/umi_count column. After this check, if there are still multiple
contigs, cells with multiple contigs are filtered unless `keep_highest_umi` is False, whereby the umi counts for
each contig will then be compared and only the highest is retained. The contig with the highest umi count that is
> umi_foldchange_cutoff (default is empirically set at 2) will be retained. For productive heavy/long chains,
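A rough sketch of the merge rule described above — not dandelion's actual implementation. It collapses contigs whose aligned sequences are identical into the highest-UMI contig and sums the duplicates' UMIs into it; the helper name `merge_exact_duplicates` and the toy columns are hypothetical:

```python
import pandas as pd

def merge_exact_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse contigs with identical aligned sequences, keeping the
    highest-UMI contig and crediting it with its duplicates' UMIs."""
    kept = []
    for _, group in df.groupby("sequence_alignment", sort=False):
        # keep the contig with the highest umi_count ...
        winner = group.sort_values("umi_count", ascending=False).iloc[0].copy()
        # ... and add the umi counts of all exact duplicates to it
        winner["umi_count"] = int(group["umi_count"].sum())
        kept.append(winner)
    return pd.DataFrame(kept)

contigs = pd.DataFrame(
    {
        "sequence_id": ["c1", "c2", "c3"],
        "sequence_alignment": ["ATGC", "ATGC", "GGTT"],
        "umi_count": [5, 3, 7],
    }
)
print(merge_exact_duplicates(contigs)[["sequence_id", "umi_count"]])
# c1 absorbs c2's UMIs (5 + 3 = 8); c3 is untouched
```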
@@ -2171,7 +2171,7 @@ def filter_contigs(
umi_adjustment = tofilter.umi_adjustment.copy()

if len(umi_adjustment) > 0:
- dat["duplicate_count"].update(umi_adjustment)
+ dat["umi_count"].update(umi_adjustment)

poorqual = {c: "False" for c in adata_.obs_names}
hdoublet = {c: "False" for c in adata_.obs_names}
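The `.update()` call above relies on standard `pandas.Series.update` semantics: it aligns on index and overwrites matching entries in place, leaving everything else untouched. A minimal standalone example (contig names hypothetical):

```python
import pandas as pd

# Series.update aligns on index and overwrites matching entries in place;
# entries absent from the adjustment are left alone.
umi = pd.Series({"contig_1": 5, "contig_2": 3, "contig_3": 7})
adjustment = pd.Series({"contig_2": 8})
umi.update(adjustment)
print(umi.to_dict())  # {'contig_1': 5, 'contig_2': 8, 'contig_3': 7}
```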
@@ -3021,9 +3021,7 @@ def __init__(
],
)
h_p = list(data1["sequence_id"])
- h_umi_p = [
-     int(x) for x in pd.to_numeric(data1["duplicate_count"])
- ]
+ h_umi_p = [int(x) for x in pd.to_numeric(data1["umi_count"])]
h_ccall_p = list(data1["c_call"])
h_locus_p = list(data1["locus"])
if len(h_p) > 1:
@@ -3060,12 +3058,12 @@ def __init__(
if all(x in ["IGHM", "IGHD"] for x in h_ccall_p):
h_ccall_p_igm_count = dict(
data1[data1["c_call"] == "IGHM"][
-     "duplicate_count"
+     "umi_count"
]
)
h_ccall_p_igd_count = dict(
data1[data1["c_call"] == "IGHD"][
-     "duplicate_count"
+     "umi_count"
]
)

@@ -3148,14 +3146,10 @@ def __init__(
elif all(x in ["TRB", "TRD"] for x in h_locus_p):
if len(list(set(h_locus_p))) == 2:
h_locus_p_trb_count = dict(
- data1[data1["locus"] == "TRB"][
-     "duplicate_count"
- ]
+ data1[data1["locus"] == "TRB"]["umi_count"]
)
h_locus_p_trd_count = dict(
- data1[data1["locus"] == "TRD"][
-     "duplicate_count"
- ]
+ data1[data1["locus"] == "TRD"]["umi_count"]
)

if len(h_locus_p_trb_count) > 1:
@@ -3283,9 +3277,7 @@ def __init__(
],
)
h_np = list(data2["sequence_id"])
- h_umi_np = [
-     int(x) for x in pd.to_numeric(data2["duplicate_count"])
- ]
+ h_umi_np = [int(x) for x in pd.to_numeric(data2["umi_count"])]
if len(h_np) > 1:
highest_umi_h = max(h_umi_np)
highest_umi_idx = [
@@ -3312,8 +3304,7 @@ def __init__(
data2 = pd.DataFrame([data2.loc[keep_hc_contig]])
h_np = list(data2["sequence_id"])
h_umi_np = [
-     int(x)
-     for x in pd.to_numeric(data2["duplicate_count"])
+     int(x) for x in pd.to_numeric(data2["umi_count"])
]
if len(self.Cell[cell]["VJ"]["P"]) > 0:
data3 = pd.DataFrame(
@@ -3329,9 +3320,7 @@ def __init__(
],
)
l_p = list(data3["sequence_id"])
- l_umi_p = [
-     int(x) for x in pd.to_numeric(data3["duplicate_count"])
- ]
+ l_umi_p = [int(x) for x in pd.to_numeric(data3["umi_count"])]
if len(l_p) > 1:
if "sequence_alignment" in data3:
(
@@ -3406,9 +3395,7 @@ def __init__(
],
)
l_np = list(data4["sequence_id"])
- l_umi_np = [
-     int(x) for x in pd.to_numeric(data4["duplicate_count"])
- ]
+ l_umi_np = [int(x) for x in pd.to_numeric(data4["umi_count"])]
if len(l_np) > 1:
highest_umi_l = max(l_umi_np)
highest_umi_l_idx = [
@@ -3812,9 +3799,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
],
)
h_p = list(data1["sequence_id"])
- h_umi_p = [
-     int(x) for x in pd.to_numeric(data1["duplicate_count"])
- ]
+ h_umi_p = [int(x) for x in pd.to_numeric(data1["umi_count"])]
h_ccall_p = list(data1["c_call"])
if len(h_p) > 1:
if "sequence_alignment" in data1:
@@ -3832,7 +3817,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
h_p[:keep_index_h] + h_p[keep_index_h:]
)
keep_hc_contig = h_p[keep_index_h]
- data1[keep_hc_contig, "duplicate_count"] = int(
+ data1[keep_hc_contig, "umi_count"] = int(
np.sum(
h_umi_p[:keep_index_h]
+ h_umi_p[keep_index_h:]
@@ -3855,9 +3840,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
h_p = list(data1["sequence_id"])
h_umi_p = [
int(x)
-     for x in pd.to_numeric(
-         data1["duplicate_count"]
-     )
+     for x in pd.to_numeric(data1["umi_count"])
]
if len(self.Cell[cell]["VDJ"]["NP"]) > 0:
data2 = pd.DataFrame(
@@ -3873,9 +3856,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
],
)
h_np = list(data2["sequence_id"])
- h_umi_np = [
-     int(x) for x in pd.to_numeric(data2["duplicate_count"])
- ]
+ h_umi_np = [int(x) for x in pd.to_numeric(data2["umi_count"])]
if len(self.Cell[cell]["VJ"]["P"]) > 0:
data3 = pd.DataFrame(
[
@@ -3890,9 +3871,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
],
)
l_p = list(data3["sequence_id"])
- l_umi_p = [
-     int(x) for x in pd.to_numeric(data3["duplicate_count"])
- ]
+ l_umi_p = [int(x) for x in pd.to_numeric(data3["umi_count"])]
if len(l_p) > 1:
if "sequence_alignment" in data3:
l_seq_p = list(data3["sequence_alignment"])
@@ -3908,7 +3887,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
l_p[:keep_index_l] + l_p[keep_index_l:]
)
keep_lc_contig = l_p[keep_index_l]
- data3.at[keep_lc_contig, "duplicate_count"] = int(
+ data3.at[keep_lc_contig, "umi_count"] = int(
np.sum(
l_umi_p[:keep_index_l]
+ l_umi_p[keep_index_l:]
@@ -3929,7 +3908,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
l_p = list(data3["sequence_id"])
l_umi_p = [
int(x)
-     for x in pd.to_numeric(data3["duplicate_count"])
+     for x in pd.to_numeric(data3["umi_count"])
]
if len(self.Cell[cell]["VJ"]["NP"]) > 0:
data4 = pd.DataFrame(
@@ -3945,9 +3924,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
],
)
l_np = list(data4["sequence_id"])
- l_umi_np = [
-     int(x) for x in pd.to_numeric(data4["duplicate_count"])
- ]
+ l_umi_np = [int(x) for x in pd.to_numeric(data4["umi_count"])]

if "h_p" not in locals():
h_p = []
@@ -5225,7 +5202,7 @@ def check_contigs(
extra = contig_status.extra_contigs.copy()
umi_adjustment = contig_status.umi_adjustment.copy()
if len(umi_adjustment) > 0:
- dat["duplicate_count"].update(umi_adjustment)
+ dat["umi_count"].update(umi_adjustment)

ambi = {c: "F" for c in dat_.sequence_id}
ambiguous_ = {x: "T" for x in ambigous}
@@ -5261,7 +5238,7 @@ def check_contigs(
)

if productive_only:
- dat_["duplicate_count"].update(dat["duplicate_count"])
+ dat_["umi_count"].update(dat["umi_count"])
for column in ["ambiguous", "extra"]:
dat_[column] = dat[column]
dat_[column].fillna("T", inplace=True)
@@ -5374,9 +5351,7 @@ def __init__(
],
)
vdj_p = list(data1["sequence_id"])
- vdj_umi_p = [
-     int(x) for x in pd.to_numeric(data1["duplicate_count"])
- ]
+ vdj_umi_p = [int(x) for x in pd.to_numeric(data1["umi_count"])]
vdj_ccall_p = list(data1["c_call"])
vdj_locus_p = list(data1["locus"])
if len(vdj_p) > 1:
@@ -5400,12 +5375,12 @@ def __init__(
if len(list(set(vdj_ccall_p))) == 2:
vdj_ccall_p_igm_count = dict(
data1[data1["c_call"] == "IGHM"][
-     "duplicate_count"
+     "umi_count"
]
)
vdj_ccall_p_igd_count = dict(
data1[data1["c_call"] == "IGHD"][
-     "duplicate_count"
+     "umi_count"
]
)

@@ -5445,9 +5420,7 @@ def __init__(
extra_vdj = extra_igm + extra_igd
ambiguous_vdj = ambiguous_igm + ambiguous_igd
else:
- vdj_ccall_p_count = dict(
-     data1["duplicate_count"]
- )
+ vdj_ccall_p_count = dict(data1["umi_count"])
if len(vdj_ccall_p_count) > 1:
(
vdj_p,
@@ -5461,14 +5434,10 @@ def __init__(
elif all(x in ["TRB", "TRD"] for x in vdj_locus_p):
if len(list(set(vdj_locus_p))) == 2:
vdj_locus_p_trb_count = dict(
- data1[data1["locus"] == "TRB"][
-     "duplicate_count"
- ]
+ data1[data1["locus"] == "TRB"]["umi_count"]
)
vdj_locus_p_trd_count = dict(
- data1[data1["locus"] == "TRD"][
-     "duplicate_count"
- ]
+ data1[data1["locus"] == "TRD"]["umi_count"]
)

if len(vdj_locus_p_trb_count) > 1:
@@ -5507,9 +5476,7 @@ def __init__(
extra_vdj = extra_trb + extra_trd
ambiguous_vdj = ambiguous_trb + ambiguous_trd
else:
- vdj_ccall_p_count = dict(
-     data1["duplicate_count"]
- )
+ vdj_ccall_p_count = dict(data1["umi_count"])
if len(vdj_ccall_p_count) > 1:
(
vdj_p,
@@ -5521,7 +5488,7 @@ def __init__(
else:
vdj_p, extra_vdj, ambiguous_vdj = [], [], []
else:
- vdj_ccall_p_count = dict(data1["duplicate_count"])
+ vdj_ccall_p_count = dict(data1["umi_count"])
if len(vdj_ccall_p_count) > 1:
(
vdj_p,
@@ -5581,9 +5548,7 @@ def __init__(
],
)
vj_p = list(data3["sequence_id"])
- vj_umi_p = [
-     int(x) for x in pd.to_numeric(data3["duplicate_count"])
- ]
+ vj_umi_p = [int(x) for x in pd.to_numeric(data3["umi_count"])]
if len(vj_p) > 1:
if "sequence_alignment" in data3:
(
@@ -5600,7 +5565,7 @@ def __init__(
for avj in ambi_cont_vj:
self.ambiguous_contigs.append(avj)
if len(vj_p) > 1:
- vj_ccall_p_count = dict(data3["duplicate_count"])
+ vj_ccall_p_count = dict(data3["umi_count"])
# maximum keep 2?
vj_p, extra_vj, ambiguous_vj = check_productive_vj(
vj_ccall_p_count
@@ -5972,7 +5937,7 @@ def check_update_same_seq(
k: r for k, r in dict(data[sequencecol]).items() if present(r)
}
_count = {
-     k: r for k, r in dict(data.duplicate_count).items() if k in _seq
+     k: r for k, r in dict(data.umi_count).items() if k in _seq
}
rep_seq = [
seq
@@ -6006,9 +5971,7 @@ def check_update_same_seq(
for dk in dup_keys[1:]:
ambi_cont.append(dk)
keep_seqs_ids.append(keep_index_vj)
- data.duplicate_count.update(
-     {keep_index_vj: keep_index_count}
- )
+ data.umi_count.update({keep_index_vj: keep_index_count})
# refresh
empty_seqs_ids = [
k
@@ -6019,11 +5982,11 @@ def check_update_same_seq(
keep_seqs_ids = keep_seqs_ids + empty_seqs_ids
data = data.loc[keep_seqs_ids]
keep_id = list(data.sequence_id)
- keep_umi = [int(x) for x in pd.to_numeric(data.duplicate_count)]
+ keep_umi = [int(x) for x in pd.to_numeric(data.umi_count)]
keep_ccall = list(data.c_call)
else:
keep_id = list(data.sequence_id)
- keep_umi = [int(x) for x in pd.to_numeric(data.duplicate_count)]
+ keep_umi = [int(x) for x in pd.to_numeric(data.umi_count)]
keep_ccall = list(data.c_call)

return (data, keep_id, keep_umi, keep_ccall, umi_adjust, ambi_cont)
2 changes: 1 addition & 1 deletion dandelion/tools/_tools.py
@@ -727,7 +727,7 @@ def _lightCluster(heavy_file, light_file, out_file, doublets, fileformat):
v_call = "v_call"
j_call = "j_call"
junction_length = "junction_length"
- umi_count = "duplicate_count"
+ umi_count = "umi_count"
else:
sys.exit("Invalid format %s" % fileformat)

14 changes: 7 additions & 7 deletions dandelion/utilities/_core.py
@@ -123,12 +123,12 @@ def __init__(
except:
pass
if (
- pd.Series(["cell_id", "duplicate_count", "productive"])
+ pd.Series(["cell_id", "umi_count", "productive"])
.isin(self.data.columns)
.all()
): # sort so that the productive contig with the largest umi is first
self.data.sort_values(
- by=["cell_id", "productive", "duplicate_count"],
+ by=["cell_id", "productive", "umi_count"],
inplace=True,
ascending=[True, False, False],
)
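A small self-contained example of what this sort achieves — within each cell, the productive contig with the largest UMI count ends up first. The toy table is hypothetical; only the `sort_values` call mirrors the updated code above:

```python
import pandas as pd

# Hypothetical contig table for one cell; productive is "T"/"F" as strings.
data = pd.DataFrame(
    {
        "cell_id": ["cellA"] * 3,
        "sequence_id": ["c1", "c2", "c3"],
        "productive": ["T", "F", "T"],
        "umi_count": [4, 9, 12],
    }
)
data.sort_values(
    by=["cell_id", "productive", "umi_count"],
    inplace=True,
    ascending=[True, False, False],
)
# productive contigs come first ("T" > "F"), ordered by descending UMI count
print(data["sequence_id"].tolist())  # ['c3', 'c1', 'c2']
```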
@@ -874,12 +874,12 @@ def update_metadata(
"d_call",
"j_call",
"c_call",
- "duplicate_count",
+ "umi_count",
"junction",
"junction_aa",
]

- if "duplicate_count" not in self.data:
+ if "umi_count" not in self.data:
raise ValueError(
"Unable to initialize metadata due to missing keys. "
"Please ensure either 'umi_count' or 'duplicate_count' is in the input data."
@@ -1841,7 +1841,7 @@ def initialize_metadata(
"productive",
]:
meta_[k + "_split"] = querier.retrieve_celltype(**v)
- if k in ["duplicate_count"]:
+ if k in ["umi_count"]:
v.update({"retrieve_mode": "split and sum"})
meta_[k] = querier.retrieve_celltype(**v)
if k in ["mu_count", "mu_freq"]:
@@ -2251,12 +2251,12 @@ def update_metadata(
"d_call",
"j_call",
"c_call",
- "duplicate_count",
+ "umi_count",
"junction",
"junction_aa",
]

- if "duplicate_count" not in vdj_data.data:
+ if "umi_count" not in vdj_data.data:
raise ValueError(
"Unable to initialize metadata due to missing keys. "
"Please ensure either 'umi_count' or 'duplicate_count' is in the input data."