Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Nf floats #1011

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion src/pandas_profiling/config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
"""Configuration for the package."""
from enum import Enum
from typing import Any, Dict, List, Optional

import warnings
from pydantic import BaseModel, BaseSettings, Field

def warn(*args, **kwargs):
    """Discard a warning; a no-op accepting any ``warnings.warn`` arguments.

    Kept so existing references to ``warn`` keep working. It is no longer
    installed over ``warnings.warn`` at import time: replacing that function
    hid every warning in the process (including other libraries') and could
    only be reverted by editing this file. Warnings are now visible by
    default.
    """


def silence_warnings() -> None:
    """Opt in to hiding warnings instead of having them hidden by default.

    Uses the standard warnings filter rather than monkey-patching
    ``warnings.warn``, so callers can scope the effect with
    ``warnings.catch_warnings()`` or undo it with ``warnings.resetwarnings()``.
    """
    warnings.simplefilter("ignore")
Comment on lines +7 to +10
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A feature to disable warnings is desirable, but it should not be toggled by commenting out code. Also, warnings should be enabled by default.


def _merge_dictionaries(dict1: dict, dict2: dict) -> dict:
"""
Expand Down Expand Up @@ -185,6 +189,22 @@ class Html(BaseModel):

full_width: bool = False

class JsonNonFiniteEncoding(Enum):
    """How non-finite floats (NaN, +/-Infinity) are written to JSON.

    Members use public names. Double-underscore names inside a class body are
    name-mangled, and from Python 3.11 on such private names in an ``Enum``
    body are plain attributes rather than members, so ``.value`` on them
    fails. The mangled spellings used by existing callers are re-attached
    below the class as references to the real members.
    """

    # Leave the float untouched. Python's ``json`` then writes NaN/Infinity
    # tokens (its ``allow_nan=True`` default), which is not valid JSON.
    DEFAULT = 0
    # Replace the non-finite float with ``None`` so it is written as ``null``.
    NULL = 1
    # Replace the non-finite float with its ``str()`` form, e.g. ``"nan"``.
    # (Legacy name ``__float_null`` is misleading: this option stringifies.)
    STRING = 2

    def fetch_python(self):
        """Return the ``DEFAULT`` member (legacy accessor, ignores ``self``)."""
        return type(self).DEFAULT

    def fetch_null_values(self):
        """Return the ``NULL`` member (legacy accessor, ignores ``self``)."""
        return type(self).NULL

    def fetch_float_values(self):
        """Return the ``STRING`` member (legacy accessor, ignores ``self``)."""
        return type(self).STRING


# Backward compatibility: keep the old name-mangled attributes resolving to
# real members on every Python version. They must be attached after class
# creation; inside the class body they would not become members on 3.11+.
for _suffix, _member in (
    ("default", JsonNonFiniteEncoding.DEFAULT),
    ("num_null", JsonNonFiniteEncoding.NULL),
    ("float_null", JsonNonFiniteEncoding.STRING),
):
    setattr(JsonNonFiniteEncoding, f"_JsonNonFiniteEncoding__{_suffix}", _member)
del _suffix, _member
Comment on lines +192 to +207
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe Lukas' version is a bit more clear in this part. Can you help me understand why we would want private fields instead of Lukas' version?


class Duplicates(BaseModel):
head: int = 10
Expand Down Expand Up @@ -299,6 +319,11 @@ class Config:
n_freq_table_max: int = 10
n_extreme_obs: int = 10

#JSON for non finite values

Jsnf_instance = JsonNonFiniteEncoding
json_non_finite_encoding: Jsnf_instance = Jsnf_instance._JsonNonFiniteEncoding__num_null.value

# Report rendering
report: Report = Report()
html: Html = Html()
Expand All @@ -308,6 +333,43 @@ def update(self, updates: dict) -> "Settings":
update = _merge_dictionaries(self.dict(), updates)
return self.parse_obj(self.copy(update=update))

class PandasSettings(Settings):
    """``Settings`` subclass that declares no overrides of its own."""

class SparkSettings(Settings):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this would make more sense in the spark-branch

# TO-DO write description
vars: Univariate = Univariate()

vars.num.low_categorical_threshold = 0

infer_dtypes = False

correlations: Dict[str, Correlation] = {
"spearman": Correlation(key="spearman"),
"pearson": Correlation(key="pearson"),
"kendall": Correlation(key="kendall"),
"cramers": Correlation(key="cramers"),
"phi_k": Correlation(key="phi_k"),
}
correlations["pearson"].calculate = True
correlations["spearman"].calculate = True
correlations["kendall"].calculate = False
correlations["cramers"].calculate = False
correlations["phi_k"].calculate = False

interactions: Interactions = Interactions()
interactions.continuous = False

missing_diagrams: Dict[str, bool] = {
"bar": False,
"matrix": False,
"dendrogram": False,
"heatmap": False,
}

samples: Samples = Samples()
samples.tail = 0
samples.random = 0

class Config:
arg_groups: Dict[str, Any] = {
Expand Down
36 changes: 32 additions & 4 deletions src/pandas_profiling/profile_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,19 @@
import warnings
from pathlib import Path
from typing import Any, Dict, Optional, Union

import math
import numpy as np
import pandas as pd
import yaml
from tqdm.auto import tqdm
from visions import VisionsTypeset

from pandas_profiling.config import Config, Settings
from pandas_profiling.config import (
Config,
PandasSettings,
Settings,
SparkSettings,
JsonNonFiniteEncoding,
)
from pandas_profiling.expectations_report import ExpectationsReport
from pandas_profiling.model.alerts import AlertType
from pandas_profiling.model.describe import describe as describe_df
Expand Down Expand Up @@ -329,8 +334,24 @@ def encode_it(o: Any) -> Any:
if isinstance(o, dict):
return {encode_it(k): encode_it(v) for k, v in o.items()}
else:
if isinstance(o, (bool, int, float, str)):
if isinstance(o, (bool, int, str)):
return o
elif isinstance(o, float):
if not math.isfinite(o):
# Special handling for non-finite floats.
# This is necessary because JSON does not support NaN/Infinity values.
# The default in Python is to generate invalid JSON.
# Depending on the configuration, we can encode them as null values,
# stringify the non-finite value, or output it as is to keep the default ,Python behaviour.
Jsnf_instance = JsonNonFiniteEncoding
if self.config.json_non_finite_encoding.value == Jsnf_instance._JsonNonFiniteEncoding__num_null.value:
return None
elif self.config.json_non_finite_encoding.value == Jsnf_instance._JsonNonFiniteEncoding__float_null.value:
return str(o)
else:
return o
else:
return o
elif isinstance(o, list):
return [encode_it(v) for v in o]
elif isinstance(o, set):
Expand Down Expand Up @@ -420,3 +441,10 @@ def _repr_html_(self) -> None:
def __repr__(self) -> str:
"""Override so that Jupyter Notebook does not print the object."""
return ""

def get_default_settings(self, df) -> Settings:
    """Return a fresh default settings object chosen by the type of ``df``.

    A pandas ``DataFrame`` or ``Series`` yields ``PandasSettings``; any other
    object yields ``SparkSettings``.
    """
    is_pandas = isinstance(df, (pd.DataFrame, pd.Series))
    settings_cls = PandasSettings if is_pandas else SparkSettings
    return settings_cls()

8 changes: 8 additions & 0 deletions tests/issues/test_issue983.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np
def test_issue983():
    """A frame containing NaN must export to strictly valid JSON.

    Previously this file ran at import time and only printed the export, so
    it could never fail on bad output. ``json.loads`` accepts the non-standard
    NaN/Infinity tokens unless ``parse_constant`` rejects them, so a raising
    hook is used to enforce strict JSON.
    """
    import json

    def reject_non_finite(token):
        raise ValueError(f"non-finite token {token!r} is not valid JSON")

    df = pd.DataFrame([1, 1, np.nan], columns=["a"])
    profile = ProfileReport(df, title="Pandas Profiling Report", minimal=True)

    json.loads(profile.to_json(), parse_constant=reject_non_finite)
Comment on lines +4 to +8
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should validate the expected behavior for each encoding.