Update LuxSQLTable __len__() and metadata computation

Rather than referencing the `_length` attribute directly throughout the code, override `__len__()` on LuxSQLTable and have callers use `len()` instead. This commit also adds a `_setup_done` attribute to LuxSQLTable, which records whether the initial setup of the table (retrieving and populating its attributes) has completed. The flag determines which `__len__()` implementation is used: the parent's `__len__()` is required while the columns of the LuxSQLTable are still being populated, and the stored `_length` is returned once setup is done.
thyneb19 · Mar 26, 2021 · 7c7dcd3 · 7c7dcd3
1 parent d296c9b
commit 7c7dcd3
Show file tree

Hide file tree

Showing 7 changed files with 27 additions and 15 deletions.
diff --git a/lux/action/correlation.py b/lux/action/correlation.py
@@ -62,7 +62,7 @@ def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
     }
     ignore_rec_flag = False
     # Doesn't make sense to compute correlation if less than 4 data values
-    if ldf._length < 5:
+    if len(ldf) < 5:
         ignore_rec_flag = True
     # Then use the data populated in the vis list to compute score
     for vis in vlist:

diff --git a/lux/action/univariate.py b/lux/action/univariate.py
@@ -59,7 +59,7 @@ def univariate(ldf, *args):
             "long_description": f"Distribution displays univariate histogram distributions of all quantitative attributes{examples}. Visualizations are ranked from most to least skewed.",
         }
         # Doesn't make sense to generate a histogram if there is less than 5 datapoints (pre-aggregated)
-        if ldf._length < 5:
+        if len(ldf) < 5:
             ignore_rec_flag = True
     elif data_type_constraint == "nominal":
         possible_attributes = [
@@ -98,7 +98,7 @@ def univariate(ldf, *args):
             "long_description": "Temporal displays line charts for all attributes related to datetimes in the dataframe.",
         }
         # Doesn't make sense to generate a line chart if there is less than 3 datapoints (pre-aggregated)
-        if ldf._length < 3:
+        if len(ldf) < 3:
             ignore_rec_flag = True
     if ignore_rec_flag:
         recommendation["collection"] = []

diff --git a/lux/core/frame.py b/lux/core/frame.py
@@ -82,7 +82,6 @@ def __init__(self, *args, **kw):
         self._toggle_pandas_display = True
         self._message = Message()
         self._pandas_only = False
-        self._length = len(self)
         # Metadata
         self._data_type = {}
         self.unique_values = None

diff --git a/lux/core/sqltable.py b/lux/core/sqltable.py
@@ -55,6 +55,8 @@ class LuxSQLTable(lux.LuxDataFrame):
         "_pandas_only",
         "pre_aggregated",
         "_type_override",
+        "_length",
+        "_setup_done",
     ]
 
     def __init__(self, *args, table_name="", **kw):
@@ -64,12 +66,16 @@ def __init__(self, *args, table_name="", **kw):
         lux.config.executor = SQLExecutor()
 
         self._length = 0
+        self._setup_done = False
         if table_name != "":
             self.set_SQL_table(table_name)
         warnings.formatwarning = lux.warning_format
 
-    def len(self):
-        return self._length
+    def __len__(self):
+        if self._setup_done:
+            return self._length
+        else:
+            return super(LuxSQLTable, self).__len__()
 
     def set_SQL_table(self, t_name):
         # function that ties the Lux Dataframe to a SQL database table
@@ -126,7 +132,7 @@ def _repr_html_(self):
                 layout=widgets.Layout(width="200px", top="6px", bottom="6px"),
             )
             self.output = widgets.Output()
-            lux.config.executor.execute_preview(self)
+            self._sampled = lux.config.executor.execute_preview(self)
             display(button, self.output)
 
             def on_button_clicked(b):

diff --git a/lux/executor/SQLExecutor.py b/lux/executor/SQLExecutor.py
@@ -25,10 +25,11 @@ def __repr__(self):
         return f"<SQLExecutor>"
 
     @staticmethod
-    def execute_preview(tbl: LuxSQLTable):
-        tbl._sampled = pandas.read_sql(
-            "SELECT * from {} LIMIT 5".format(tbl.table_name), lux.config.SQLconnection
+    def execute_preview(tbl: LuxSQLTable, preview_size=5):
+        output = pandas.read_sql(
+            "SELECT * from {} LIMIT {}".format(tbl.table_name, preview_size), lux.config.SQLconnection
         )
+        return output
 
     @staticmethod
     def execute_sampling(tbl: LuxSQLTable):
@@ -611,9 +612,8 @@ def compute_dataset_metadata(self, tbl: LuxSQLTable):
         -------
         None
         """
-        self.get_SQL_attributes(tbl)
-        for attr in list(tbl.columns):
-            tbl[attr] = None
+        if not tbl._setup_done:
+            self.get_SQL_attributes(tbl)
         tbl._data_type = {}
         #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this
         #####      in the initialization and do it just once
@@ -644,6 +644,7 @@ def get_SQL_attributes(self, tbl: LuxSQLTable):
         attributes = list(pandas.read_sql(attr_query, lux.config.SQLconnection)["column_name"])
         for attr in attributes:
             tbl[attr] = None
+        tbl._setup_done = True
 
     def compute_stats(self, tbl: LuxSQLTable):
         """

diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py
@@ -235,7 +235,7 @@ def deviation_from_overall(
         from lux.executor.SQLExecutor import SQLExecutor
 
         v_filter_size = SQLExecutor.get_filtered_size(filter_specs, ldf)
-        v_size = ldf.len()
+        v_size = len(ldf)
         vdata = vis.data
     v_filter = vdata[msr_attribute]
     total = v_filter.sum()

diff --git a/lux/utils/utils.py b/lux/utils/utils.py
@@ -13,6 +13,7 @@
 #  limitations under the License.
 import pandas as pd
 import matplotlib.pyplot as plt
+import lux
 
 
 def convert_to_list(x):
@@ -83,7 +84,12 @@ def check_if_id_like(df, attribute):
     if is_string:
         # For string IDs, usually serial numbers or codes with alphanumerics have a consistent length (eg., CG-39405) with little deviation. For a high cardinality string field but not ID field (like Name or Brand), there is less uniformity across the string lengths.
         if len(df) > 50:
-            sampled = df[attribute].sample(50, random_state=99)
+            if lux.config.executor.name == "PandasExecutor":
+                sampled = df[attribute].sample(50, random_state=99)
+            else:
+                from lux.executor.SQLExecutor import SQLExecutor
+
+                sampled = SQLExecutor.execute_preview(df, preview_size=50)
         else:
             sampled = df[attribute]
         str_length_uniformity = sampled.apply(lambda x: type(x) == str and len(x)).std() < 3