diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index a82e2dd3e..9110f992e 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -50,17 +50,16 @@ class TextVars(BaseModel): words: bool = True characters: bool = True redact: bool = False - # if text has more than threshold categories, its not category - categorical_threshold: int = 50 - # if text has more than threshold % distinct values, its not category - percentage_cat_threshold: float = 0.5 class CatVars(BaseModel): length: bool = True characters: bool = True words: bool = True + # if var has more than threshold categories, it's a text var cardinality_threshold: int = 50 + # if var has more than threshold % distinct values, it's a text var + percentage_cat_threshold: float = 0.5 imbalance_threshold: float = 0.5 n_obs: int = 5 # Set to zero to disable diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py index 509dacf79..831954c65 100644 --- a/src/ydata_profiling/model/typeset_relations.py +++ b/src/ydata_profiling/model/typeset_relations.py @@ -73,8 +73,8 @@ def string_is_category(series: pd.Series, state: dict, k: Settings) -> bool: - (distinct values / count of all values) is less than threshold - is not bool""" n_unique = series.nunique() - unique_threshold = k.vars.text.percentage_cat_threshold - threshold = k.vars.text.categorical_threshold + unique_threshold = k.vars.cat.percentage_cat_threshold + threshold = k.vars.cat.cardinality_threshold return ( 1 <= n_unique <= threshold and (