Skip to content

Commit

Permalink
v0.1.6 (#90)
Browse files (browse the repository at this point in the history)
* Update _utilities.py

* Update _utilities.py

* Update _utilities.py

* Update _utilities.py

* Update _utilities.py

try again

* Revert "Update _utilities.py"

This reverts commit 1b4a8ef.

* Update _utilities.py

* Update _utilities.py

speed up validate_airr
  • Loading branch information
zktuong committed Aug 5, 2021
1 parent 017dcf4 commit 0ae47a7
Showing 1 changed file with 21 additions and 9 deletions.
30 changes: 21 additions & 9 deletions dandelion/utilities/_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# @Author: kt16
# @Date: 2020-05-12 14:01:32
# @Last Modified by: Kelvin
# @Last Modified time: 2021-08-01 12:28:39
# @Last Modified time: 2021-08-05 12:02:47

import os
from collections import defaultdict, Iterable
Expand Down Expand Up @@ -321,7 +321,7 @@ def sanitize_data(data, ignore='clone_id'):
if data[d].dtype == "float64":
try:
data[d].replace(np.nan, pd.NA, inplace=True)
data[d] = data[d].astype("Int64")
data[d] = data[d].astype("int64")
except:
pass
if data[d].dtype == 'object':
Expand All @@ -331,9 +331,10 @@ def sanitize_data(data, ignore='clone_id'):
data[d] = pd.to_numeric(data[d])
try:
data[d].replace(np.nan, pd.NA, inplace=True)
data[d] = data[d].astype("Int64")
data[d] = data[d].astype("int64")
except:
data[d] = data[d].astype("Float64")
data[d].replace(pd.NA, np.nan, inplace=True)
data[d] = data[d].astype("float64")
except:
data[d].replace(to_replace=[None, np.nan, pd.NA],
value='',
Expand All @@ -347,15 +348,26 @@ def sanitize_data(data, ignore='clone_id'):

def validate_airr(data):
"""Validate dtypes in airr table."""
int_columns = []
for d in data:
try:
data[d].replace(np.nan, pd.NA).astype("Int64")
int_columns.append(d)
except:
pass
for _, row in data.iterrows():
contig = dict(row)
for k, v in contig.items():
if data[k].dtype == 'Int64':
if (data[k].dtype == np.int64) or (k in int_columns):
if pd.isnull(v):
contig.update({k: str('')})
if data[k].dtype == 'Float64':
if pd.isnull(v):
contig.update({k: np.nan})
if data[k].dtype == np.float64:
if k in int_columns:
if pd.isnull(v):
contig.update({k: str('')})
else:
if pd.isnull(v):
contig.update({k: np.nan})
for required in [
'sequence', 'rev_comp', 'sequence_alignment',
'germline_alignment', 'v_cigar', 'd_cigar', 'j_cigar'
Expand Down Expand Up @@ -441,5 +453,5 @@ def best_guess_locus(data):

def sanitize_dtype(data):
for col in data:
if data[col].dtype == 'Int64' or data[col].dtype == 'Float64':
if data[col].dtype == np.int64 or data[col].dtype == np.float64:
data[col] = data[col].astype(float)

0 comments on commit 0ae47a7

Please sign in to comment.