Skip to content

Commit

Permalink
convert to use umi_count
Browse files Browse the repository at this point in the history
  • Loading branch information
zktuong committed Jan 30, 2024
1 parent f967b82 commit 00f3173
Show file tree
Hide file tree
Showing 7 changed files with 60 additions and 63 deletions.
70 changes: 35 additions & 35 deletions dandelion/preprocessing/_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,7 +2033,7 @@ def filter_contigs(
Depends on a `AnnData`.obs slot populated with 'filter_rna' column. If the aligned sequence is an exact match
between contigs, the contigs will be merged into the one with the highest umi count, adding the sum of the
umi count of the duplicated contigs to duplicate_count column. After this check, if there are still multiple
umi count of the duplicated contigs to duplicate_count/umi_count column. After this check, if there are still multiple
contigs, cells with multiple contigs are filtered unless `keep_highest_umi` is False, whereby the umi counts for
each contig will then be compared and only the highest is retained. The contig with the highest umi that is
> umi_foldchange_cutoff (default is empirically set at 2) will be retained. For productive heavy/long chains,
Expand Down Expand Up @@ -2171,7 +2171,7 @@ def filter_contigs(
umi_adjustment = tofilter.umi_adjustment.copy()

if len(umi_adjustment) > 0:
dat["duplicate_count"].update(umi_adjustment)
dat["umi_count"].update(umi_adjustment)

poorqual = {c: "False" for c in adata_.obs_names}
hdoublet = {c: "False" for c in adata_.obs_names}
Expand Down Expand Up @@ -3022,7 +3022,7 @@ def __init__(
)
h_p = list(data1["sequence_id"])
h_umi_p = [
int(x) for x in pd.to_numeric(data1["duplicate_count"])
int(x) for x in pd.to_numeric(data1["umi_count"])
]
h_ccall_p = list(data1["c_call"])
h_locus_p = list(data1["locus"])
Expand Down Expand Up @@ -3060,12 +3060,12 @@ def __init__(
if all(x in ["IGHM", "IGHD"] for x in h_ccall_p):
h_ccall_p_igm_count = dict(
data1[data1["c_call"] == "IGHM"][
"duplicate_count"
"umi_count"
]
)
h_ccall_p_igd_count = dict(
data1[data1["c_call"] == "IGHD"][
"duplicate_count"
"umi_count"
]
)

Expand Down Expand Up @@ -3149,12 +3149,12 @@ def __init__(
if len(list(set(h_locus_p))) == 2:
h_locus_p_trb_count = dict(
data1[data1["locus"] == "TRB"][
"duplicate_count"
"umi_count"
]
)
h_locus_p_trd_count = dict(
data1[data1["locus"] == "TRD"][
"duplicate_count"
"umi_count"
]
)

Expand Down Expand Up @@ -3284,7 +3284,7 @@ def __init__(
)
h_np = list(data2["sequence_id"])
h_umi_np = [
int(x) for x in pd.to_numeric(data2["duplicate_count"])
int(x) for x in pd.to_numeric(data2["umi_count"])
]
if len(h_np) > 1:
highest_umi_h = max(h_umi_np)
Expand Down Expand Up @@ -3313,7 +3313,7 @@ def __init__(
h_np = list(data2["sequence_id"])
h_umi_np = [
int(x)
for x in pd.to_numeric(data2["duplicate_count"])
for x in pd.to_numeric(data2["umi_count"])
]
if len(self.Cell[cell]["VJ"]["P"]) > 0:
data3 = pd.DataFrame(
Expand All @@ -3330,7 +3330,7 @@ def __init__(
)
l_p = list(data3["sequence_id"])
l_umi_p = [
int(x) for x in pd.to_numeric(data3["duplicate_count"])
int(x) for x in pd.to_numeric(data3["umi_count"])
]
if len(l_p) > 1:
if "sequence_alignment" in data3:
Expand Down Expand Up @@ -3407,7 +3407,7 @@ def __init__(
)
l_np = list(data4["sequence_id"])
l_umi_np = [
int(x) for x in pd.to_numeric(data4["duplicate_count"])
int(x) for x in pd.to_numeric(data4["umi_count"])
]
if len(l_np) > 1:
highest_umi_l = max(l_umi_np)
Expand Down Expand Up @@ -3813,7 +3813,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
)
h_p = list(data1["sequence_id"])
h_umi_p = [
int(x) for x in pd.to_numeric(data1["duplicate_count"])
int(x) for x in pd.to_numeric(data1["umi_count"])
]
h_ccall_p = list(data1["c_call"])
if len(h_p) > 1:
Expand All @@ -3832,7 +3832,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
h_p[:keep_index_h] + h_p[keep_index_h:]
)
keep_hc_contig = h_p[keep_index_h]
data1[keep_hc_contig, "duplicate_count"] = int(
data1[keep_hc_contig, "umi_count"] = int(
np.sum(
h_umi_p[:keep_index_h]
+ h_umi_p[keep_index_h:]
Expand All @@ -3856,7 +3856,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
h_umi_p = [
int(x)
for x in pd.to_numeric(
data1["duplicate_count"]
data1["umi_count"]
)
]
if len(self.Cell[cell]["VDJ"]["NP"]) > 0:
Expand All @@ -3874,7 +3874,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
)
h_np = list(data2["sequence_id"])
h_umi_np = [
int(x) for x in pd.to_numeric(data2["duplicate_count"])
int(x) for x in pd.to_numeric(data2["umi_count"])
]
if len(self.Cell[cell]["VJ"]["P"]) > 0:
data3 = pd.DataFrame(
Expand All @@ -3891,7 +3891,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
)
l_p = list(data3["sequence_id"])
l_umi_p = [
int(x) for x in pd.to_numeric(data3["duplicate_count"])
int(x) for x in pd.to_numeric(data3["umi_count"])
]
if len(l_p) > 1:
if "sequence_alignment" in data3:
Expand All @@ -3908,7 +3908,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
l_p[:keep_index_l] + l_p[keep_index_l:]
)
keep_lc_contig = l_p[keep_index_l]
data3.at[keep_lc_contig, "duplicate_count"] = int(
data3.at[keep_lc_contig, "umi_count"] = int(
np.sum(
l_umi_p[:keep_index_l]
+ l_umi_p[keep_index_l:]
Expand All @@ -3929,7 +3929,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
l_p = list(data3["sequence_id"])
l_umi_p = [
int(x)
for x in pd.to_numeric(data3["duplicate_count"])
for x in pd.to_numeric(data3["umi_count"])
]
if len(self.Cell[cell]["VJ"]["NP"]) > 0:
data4 = pd.DataFrame(
Expand All @@ -3946,7 +3946,7 @@ def __init__(self, data: pd.DataFrame, verbose: bool):
)
l_np = list(data4["sequence_id"])
l_umi_np = [
int(x) for x in pd.to_numeric(data4["duplicate_count"])
int(x) for x in pd.to_numeric(data4["umi_count"])
]

if "h_p" not in locals():
Expand Down Expand Up @@ -5225,7 +5225,7 @@ def check_contigs(
extra = contig_status.extra_contigs.copy()
umi_adjustment = contig_status.umi_adjustment.copy()
if len(umi_adjustment) > 0:
dat["duplicate_count"].update(umi_adjustment)
dat["umi_count"].update(umi_adjustment)

ambi = {c: "F" for c in dat_.sequence_id}
ambiguous_ = {x: "T" for x in ambigous}
Expand Down Expand Up @@ -5261,7 +5261,7 @@ def check_contigs(
)

if productive_only:
dat_["duplicate_count"].update(dat["duplicate_count"])
dat_["umi_count"].update(dat["umi_count"])
for column in ["ambiguous", "extra"]:
dat_[column] = dat[column]
dat_[column].fillna("T", inplace=True)
Expand Down Expand Up @@ -5375,7 +5375,7 @@ def __init__(
)
vdj_p = list(data1["sequence_id"])
vdj_umi_p = [
int(x) for x in pd.to_numeric(data1["duplicate_count"])
int(x) for x in pd.to_numeric(data1["umi_count"])
]
vdj_ccall_p = list(data1["c_call"])
vdj_locus_p = list(data1["locus"])
Expand All @@ -5400,12 +5400,12 @@ def __init__(
if len(list(set(vdj_ccall_p))) == 2:
vdj_ccall_p_igm_count = dict(
data1[data1["c_call"] == "IGHM"][
"duplicate_count"
"umi_count"
]
)
vdj_ccall_p_igd_count = dict(
data1[data1["c_call"] == "IGHD"][
"duplicate_count"
"umi_count"
]
)

Expand Down Expand Up @@ -5446,7 +5446,7 @@ def __init__(
ambiguous_vdj = ambiguous_igm + ambiguous_igd
else:
vdj_ccall_p_count = dict(
data1["duplicate_count"]
data1["umi_count"]
)
if len(vdj_ccall_p_count) > 1:
(
Expand All @@ -5462,12 +5462,12 @@ def __init__(
if len(list(set(vdj_locus_p))) == 2:
vdj_locus_p_trb_count = dict(
data1[data1["locus"] == "TRB"][
"duplicate_count"
"umi_count"
]
)
vdj_locus_p_trd_count = dict(
data1[data1["locus"] == "TRD"][
"duplicate_count"
"umi_count"
]
)

Expand Down Expand Up @@ -5508,7 +5508,7 @@ def __init__(
ambiguous_vdj = ambiguous_trb + ambiguous_trd
else:
vdj_ccall_p_count = dict(
data1["duplicate_count"]
data1["umi_count"]
)
if len(vdj_ccall_p_count) > 1:
(
Expand All @@ -5521,7 +5521,7 @@ def __init__(
else:
vdj_p, extra_vdj, ambiguous_vdj = [], [], []
else:
vdj_ccall_p_count = dict(data1["duplicate_count"])
vdj_ccall_p_count = dict(data1["umi_count"])
if len(vdj_ccall_p_count) > 1:
(
vdj_p,
Expand Down Expand Up @@ -5582,7 +5582,7 @@ def __init__(
)
vj_p = list(data3["sequence_id"])
vj_umi_p = [
int(x) for x in pd.to_numeric(data3["duplicate_count"])
int(x) for x in pd.to_numeric(data3["umi_count"])
]
if len(vj_p) > 1:
if "sequence_alignment" in data3:
Expand All @@ -5600,7 +5600,7 @@ def __init__(
for avj in ambi_cont_vj:
self.ambiguous_contigs.append(avj)
if len(vj_p) > 1:
vj_ccall_p_count = dict(data3["duplicate_count"])
vj_ccall_p_count = dict(data3["umi_count"])
# maximum keep 2?
vj_p, extra_vj, ambiguous_vj = check_productive_vj(
vj_ccall_p_count
Expand Down Expand Up @@ -5972,7 +5972,7 @@ def check_update_same_seq(
k: r for k, r in dict(data[sequencecol]).items() if present(r)
}
_count = {
k: r for k, r in dict(data.duplicate_count).items() if k in _seq
k: r for k, r in dict(data.umi_count).items() if k in _seq
}
rep_seq = [
seq
Expand Down Expand Up @@ -6006,7 +6006,7 @@ def check_update_same_seq(
for dk in dup_keys[1:]:
ambi_cont.append(dk)
keep_seqs_ids.append(keep_index_vj)
data.duplicate_count.update(
data.umi_count.update(
{keep_index_vj: keep_index_count}
)
# refresh
Expand All @@ -6019,11 +6019,11 @@ def check_update_same_seq(
keep_seqs_ids = keep_seqs_ids + empty_seqs_ids
data = data.loc[keep_seqs_ids]
keep_id = list(data.sequence_id)
keep_umi = [int(x) for x in pd.to_numeric(data.duplicate_count)]
keep_umi = [int(x) for x in pd.to_numeric(data.umi_count)]
keep_ccall = list(data.c_call)
else:
keep_id = list(data.sequence_id)
keep_umi = [int(x) for x in pd.to_numeric(data.duplicate_count)]
keep_umi = [int(x) for x in pd.to_numeric(data.umi_count)]
keep_ccall = list(data.c_call)

return (data, keep_id, keep_umi, keep_ccall, umi_adjust, ambi_cont)
Expand Down
2 changes: 1 addition & 1 deletion dandelion/tools/_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -727,7 +727,7 @@ def _lightCluster(heavy_file, light_file, out_file, doublets, fileformat):
v_call = "v_call"
j_call = "j_call"
junction_length = "junction_length"
umi_count = "duplicate_count"
umi_count = "umi_count"
else:
sys.exit("Invalid format %s" % fileformat)

Expand Down
14 changes: 7 additions & 7 deletions dandelion/utilities/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,12 @@ def __init__(
except:
pass
if (
pd.Series(["cell_id", "duplicate_count", "productive"])
pd.Series(["cell_id", "umi_count", "productive"])
.isin(self.data.columns)
.all()
): # sort so that the productive contig with the largest umi is first
self.data.sort_values(
by=["cell_id", "productive", "duplicate_count"],
by=["cell_id", "productive", "umi_count"],
inplace=True,
ascending=[True, False, False],
)
Expand Down Expand Up @@ -874,12 +874,12 @@ def update_metadata(
"d_call",
"j_call",
"c_call",
"duplicate_count",
"umi_count",
"junction",
"junction_aa",
]

if "duplicate_count" not in self.data:
if "umi_count" not in self.data:
raise ValueError(
"Unable to initialize metadata due to missing keys. "
"Please ensure either 'umi_count' or 'duplicate_count' is in the input data."
Expand Down Expand Up @@ -1841,7 +1841,7 @@ def initialize_metadata(
"productive",
]:
meta_[k + "_split"] = querier.retrieve_celltype(**v)
if k in ["duplicate_count"]:
if k in ["umi_count"]:
v.update({"retrieve_mode": "split and sum"})
meta_[k] = querier.retrieve_celltype(**v)
if k in ["mu_count", "mu_freq"]:
Expand Down Expand Up @@ -2251,12 +2251,12 @@ def update_metadata(
"d_call",
"j_call",
"c_call",
"duplicate_count",
"umi_count",
"junction",
"junction_aa",
]

if "duplicate_count" not in vdj_data.data:
if "umi_count" not in vdj_data.data:
raise ValueError(
"Unable to initialize metadata due to missing keys. "
"Please ensure either 'umi_count' or 'duplicate_count' is in the input data."
Expand Down
9 changes: 5 additions & 4 deletions dandelion/utilities/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
"junction",
"junction_aa",
"consensus_count",
"duplicate_count",
"umi_count",
"cdr3_start",
"cdr3_end",
"sequence_length_10x",
Expand Down Expand Up @@ -598,7 +598,8 @@ def parse_json(data: list) -> defaultdict:
}
main_dict2 = {
"read_count": "consensus_count",
"umi_count": "duplicate_count",
"umi_count": "umi_count",
"duplicate_count": "umi_count",
"cdr3_start": "cdr3_start",
"cdr3_stop": "cdr3_end",
}
Expand Down Expand Up @@ -1145,8 +1146,8 @@ def to_scirpy(
The converted data in either AnnData or MuData format.
"""

if "duplicate_count" not in data.data and "umi_count" in data.data:
data.data["duplicate_count"] = data.data["umi_count"]
if "umi_count" not in data.data and "duplicate_count" in data.data:
data.data["umi_count"] = data.data["duplicate_count"]
for h in [
"sequence",
"rev_comp",
Expand Down
Loading

0 comments on commit 00f3173

Please sign in to comment.