Skip to content

Commit

Permalink
Update LuxSQLTable __len__() and metadata computation
Browse files Browse the repository at this point in the history
Rather than referencing the `_length` attribute directly throughout the code, override `__len__()` on LuxSQLTable and have callers use the built-in `len()` instead.

Added a `_setup_done` attribute to LuxSQLTable. It records whether the initial setup of the table (retrieving and populating its attributes) has completed. This determines which `__len__()` implementation is used, since the parent class's `__len__()` is required while the columns of the LuxSQLTable are being populated.
  • Loading branch information
thyneb19 committed Mar 26, 2021
1 parent d296c9b commit 7c7dcd3
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 15 deletions.
2 changes: 1 addition & 1 deletion lux/action/correlation.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
}
ignore_rec_flag = False
# Doesn't make sense to compute correlation if less than 4 data values
if ldf._length < 5:
if len(ldf) < 5:
ignore_rec_flag = True
# Then use the data populated in the vis list to compute score
for vis in vlist:
Expand Down
4 changes: 2 additions & 2 deletions lux/action/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def univariate(ldf, *args):
"long_description": f"Distribution displays univariate histogram distributions of all quantitative attributes{examples}. Visualizations are ranked from most to least skewed.",
}
# Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
if ldf._length < 5:
if len(ldf) < 5:
ignore_rec_flag = True
elif data_type_constraint == "nominal":
possible_attributes = [
Expand Down Expand Up @@ -98,7 +98,7 @@ def univariate(ldf, *args):
"long_description": "Temporal displays line charts for all attributes related to datetimes in the dataframe.",
}
# Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
if ldf._length < 3:
if len(ldf) < 3:
ignore_rec_flag = True
if ignore_rec_flag:
recommendation["collection"] = []
Expand Down
1 change: 0 additions & 1 deletion lux/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ def __init__(self, *args, **kw):
self._toggle_pandas_display = True
self._message = Message()
self._pandas_only = False
self._length = len(self)
# Metadata
self._data_type = {}
self.unique_values = None
Expand Down
12 changes: 9 additions & 3 deletions lux/core/sqltable.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class LuxSQLTable(lux.LuxDataFrame):
"_pandas_only",
"pre_aggregated",
"_type_override",
"_length",
"_setup_done",
]

def __init__(self, *args, table_name="", **kw):
Expand All @@ -64,12 +66,16 @@ def __init__(self, *args, table_name="", **kw):
lux.config.executor = SQLExecutor()

self._length = 0
self._setup_done = False
if table_name != "":
self.set_SQL_table(table_name)
warnings.formatwarning = lux.warning_format

def len(self):
return self._length
def __len__(self):
if self._setup_done:
return self._length
else:
return super(LuxSQLTable, self).__len__()

def set_SQL_table(self, t_name):
# function that ties the Lux Dataframe to a SQL database table
Expand Down Expand Up @@ -126,7 +132,7 @@ def _repr_html_(self):
layout=widgets.Layout(width="200px", top="6px", bottom="6px"),
)
self.output = widgets.Output()
lux.config.executor.execute_preview(self)
self._sampled = lux.config.executor.execute_preview(self)
display(button, self.output)

def on_button_clicked(b):
Expand Down
13 changes: 7 additions & 6 deletions lux/executor/SQLExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ def __repr__(self):
return f"<SQLExecutor>"

@staticmethod
def execute_preview(tbl: LuxSQLTable):
tbl._sampled = pandas.read_sql(
"SELECT * from {} LIMIT 5".format(tbl.table_name), lux.config.SQLconnection
def execute_preview(tbl: LuxSQLTable, preview_size=5):
output = pandas.read_sql(
"SELECT * from {} LIMIT {}".format(tbl.table_name, preview_size), lux.config.SQLconnection
)
return output

@staticmethod
def execute_sampling(tbl: LuxSQLTable):
Expand Down Expand Up @@ -611,9 +612,8 @@ def compute_dataset_metadata(self, tbl: LuxSQLTable):
-------
None
"""
self.get_SQL_attributes(tbl)
for attr in list(tbl.columns):
tbl[attr] = None
if not tbl._setup_done:
self.get_SQL_attributes(tbl)
tbl._data_type = {}
#####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
##### in the initialization and do it just once
Expand Down Expand Up @@ -644,6 +644,7 @@ def get_SQL_attributes(self, tbl: LuxSQLTable):
attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"])
for attr in attributes:
tbl[attr] = None
tbl._setup_done = True

def compute_stats(self, tbl: LuxSQLTable):
"""
Expand Down
2 changes: 1 addition & 1 deletion lux/interestingness/interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def deviation_from_overall(
from lux.executor.SQLExecutor import SQLExecutor

v_filter_size = SQLExecutor.get_filtered_size(filter_specs, ldf)
v_size = ldf.len()
v_size = len(ldf)
vdata = vis.data
v_filter = vdata[msr_attribute]
total = v_filter.sum()
Expand Down
8 changes: 7 additions & 1 deletion lux/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
import pandas as pd
import matplotlib.pyplot as plt
import lux


def convert_to_list(x):
Expand Down Expand Up @@ -83,7 +84,12 @@ def check_if_id_like(df, attribute):
if is_string:
# For string IDs, usually serial numbers or codes with alphanumerics have a consistent length (eg., CG-39405) with little deviation. For a high cardinality string field but not ID field (like Name or Brand), there is less uniformity across the string lengths.
if len(df) > 50:
sampled = df[attribute].sample(50, random_state=99)
if lux.config.executor.name == "PandasExecutor":
sampled = df[attribute].sample(50, random_state=99)
else:
from lux.executor.SQLExecutor import SQLExecutor

sampled = SQLExecutor.execute_preview(df, preview_size=50)
else:
sampled = df[attribute]
str_length_uniformity = sampled.apply(lambda x: type(x) == str and len(x)).std() < 3
Expand Down

0 comments on commit 7c7dcd3

Please sign in to comment.