fix future warning for multimappers (#316)

* Update _preprocessing.py * what about this? this should make it faster * fix docstring
zktuong · Sep 28, 2023 · 4e2aaca · 4e2aaca
1 parent 05c9fff
commit 4e2aaca
Showing 1 changed file with 42 additions and 23 deletions.
diff --git a/dandelion/preprocessing/_preprocessing.py b/dandelion/preprocessing/_preprocessing.py
@@ -5860,7 +5860,8 @@ def multimapper(filename: str) -> pd.DataFrame:
     df_new = df.loc[
         df["j_support"] < 1e-3, :
     ]  # maybe not needing to filter if j_support has already been filtered
-    mapped = pd.DataFrame(
+
+    tmp = pd.DataFrame(
         index=list(set(df_new["sequence_id"])),
         columns=[
             "multimappers",
@@ -5871,32 +5872,50 @@ def multimapper(filename: str) -> pd.DataFrame:
         ],
     )
 
-    for j in range(mapped.shape[0]):
-        id = mapped.index[j]
-        tmp = df_new.loc[
-            df_new["sequence_id"] == id,
-            ["j_sequence_start", "j_sequence_end", "j_support", "j_call"],
-        ]
+    # Define a function to apply to each group
+    def process_group(group: pd.DataFrame) -> pd.Series:
+        """
+        Create a dictionary for the multimappers results.
 
-        starts = tmp["j_sequence_start"]
-        ends = tmp["j_sequence_end"]
-        scores = -tmp["j_support"]
-        chosen_ind = choose_segments(starts, ends, scores)
-        tmp = tmp.iloc[chosen_ind, :]
-        tmp = tmp.sort_values(by=["j_sequence_start"], ascending=True)
+        Parameters
+        ----------
+        group : pd.DataFrame
+            input dataframe for a given sequence_id.
 
-        mapped["multimappers"][j] = ";".join(tmp["j_call"])
-        mapped["multiplicity"][j] = tmp.shape[0]
-        mapped["sequence_start_multimappers"][j] = ";".join(
-            tmp["j_sequence_start"].astype(str)
-        )
-        mapped["sequence_end_multimappers"][j] = ";".join(
-            tmp["j_sequence_end"].astype(str)
-        )
-        mapped["support_multimappers"][j] = ";".join(
-            tmp["j_support"].astype(str)
+        Returns
+        -------
+        pd.Series
+            A pandas series with the multimappers results.
+        """
+        starts = group["j_sequence_start"]
+        ends = group["j_sequence_end"]
+        scores = -group["j_support"]
+        chosen_ind = choose_segments(starts, ends, scores)
+        group = group.iloc[chosen_ind, :]
+        group = group.sort_values(by=["j_sequence_start"], ascending=True)
+
+        return pd.Series(
+            {
+                "multimappers": ";".join(group["j_call"]),
+                "multiplicity": group.shape[0],
+                "sequence_start_multimappers": ";".join(
+                    group["j_sequence_start"].astype(str)
+                ),
+                "sequence_end_multimappers": ";".join(
+                    group["j_sequence_end"].astype(str)
+                ),
+                "support_multimappers": ";".join(
+                    group["j_support"].astype(str)
+                ),
+            }
         )
 
+    # Group by "sequence_id" and apply the processing function, then reset the index
+    mapped = df_new.groupby("sequence_id").apply(process_group).reset_index()
+    # Set the index explicitly
+    mapped.set_index("sequence_id", drop=True, inplace=True)
+    mapped = mapped.reindex(tmp.index)
+
     return mapped