In [1]:
import importlib
from tfx.dsl.component.experimental.decorators import component
from tfx.types import artifact, artifact_utils
from tfx.types.standard_artifacts import Examples
from tfx.v1.dsl.components import OutputArtifact, InputArtifact, Parameter

In [2]:
"""Custom Artifact type"""
class FeatureSelectionArtifact(artifact.Artifact):
  """Artifact type produced by the Feature Selection component.

  Declares four properties, each stored as a JSON value:
  `scores`, `p_values`, `selected_features` and `selected_data`.
  """
  TYPE_NAME = 'Feature Selection'
  # Every property shares the same JSON_VALUE type, so build the mapping from
  # the list of property names instead of repeating the declaration.
  PROPERTIES = {
      property_name: artifact.Property(type=artifact.PropertyType.JSON_VALUE)
      for property_name in (
          'scores',
          'p_values',
          'selected_features',
          'selected_data',
      )
  }

In [3]:

"""
Feature selection component
"""
@component
def FeatureSelection(module_file: Parameter[str],
    orig_examples: InputArtifact[Examples],
    feature_selection: OutputArtifact[FeatureSelectionArtifact]):
  """Score features with a univariate selector and record the results.

  The data and the selector configuration are read from the module named by
  `module_file` (imported with `importlib.import_module`), which must define:

    NUM_PARAM: Value passed to SelectorFunc as its `k` keyword argument
      example: value of 'k' in SelectKBest
    INPUT_DATA: Two dimensional array containing the data vectors
      shape: (number of data points, number of input features)
    TARGET_DATA: Array containing the target vector
      shape: (number of data points,)
    FEATURE_KEYS: List of feature names, paired in order with the per-feature
      scores produced by the selector
    SelectorFunc: Selector class for univariate feature selection
      example: SelectKBest from sklearn.feature_selection
    ScoreFunc: Scoring function handed to SelectorFunc; it is applied to
      INPUT_DATA and TARGET_DATA when the selector is fitted

  NOTE: the selector is always built as `SelectorFunc(ScoreFunc, k=NUM_PARAM)`,
  so SelectorFunc must accept a `k` keyword argument.

  Args:
    module_file: Importable module name providing the attributes listed above.
    orig_examples: Examples artifact; not read by this component at present.
    feature_selection: Output artifact that receives the `scores`, `p_values`,
      `selected_features` and `selected_data` properties. `p_values` is an
      empty dict when the score function does not produce p-values.

  Raises:
    AttributeError: If the module lacks any of the required attributes.
  """

  # Import the user module and fetch every required attribute, failing early
  # with one message that names everything that is missing.
  user_module = importlib.import_module(module_file)
  required_names = ("NUM_PARAM", "INPUT_DATA", "TARGET_DATA", "FEATURE_KEYS",
                    "SelectorFunc", "ScoreFunc")
  missing_names = [name for name in required_names if not hasattr(user_module, name)]
  if missing_names:
    raise AttributeError(
        f"Module {module_file!r} is missing required attribute(s): "
        f"{', '.join(missing_names)}")
  (NUM_PARAM, INPUT_DATA, TARGET_DATA, FEATURE_KEYS,
   SelectorFunc, ScoreFunc) = [getattr(user_module, name) for name in required_names]

  # Select features based on scores
  selector = SelectorFunc(ScoreFunc, k=NUM_PARAM)
  selected_data = selector.fit_transform(INPUT_DATA, TARGET_DATA).tolist()

  # Names of the kept features. The support indices are computed once and held
  # in a set instead of being recomputed for every feature key.
  selected_indices = set(selector.get_support(indices=True))
  selected_features = [key for idx, key in enumerate(FEATURE_KEYS)
                       if idx in selected_indices]

  # Pair each feature key with its score, converting to plain Python floats so
  # the values stored on the JSON-valued properties are not numpy scalars.
  selector_scores_dict = {
      key: float(score) for key, score in zip(FEATURE_KEYS, selector.scores_)}

  # `pvalues_` is None when the score function returns scores only; zipping
  # over None would raise TypeError, so record an empty mapping in that case.
  selector_p_values = selector.pvalues_
  if selector_p_values is None:
    selector_pvalues_dict = {}
  else:
    selector_pvalues_dict = {
        key: float(p_value)
        for key, p_value in zip(FEATURE_KEYS, selector_p_values)}

  # populate artifact with the required properties
  feature_selection.scores = selector_scores_dict
  feature_selection.p_values = selector_pvalues_dict
  feature_selection.selected_features = selected_features
  feature_selection.selected_data = selected_data


### Getting example data

In [4]:
import urllib.request
import tempfile
import os

# Download the processed penguin CSV into a fresh temporary directory.
# DATA_ROOT is the directory later passed to CsvExampleGen as its input base.
DATA_ROOT = tempfile.mkdtemp(prefix='tfx-data')
_data_filepath = os.path.join(DATA_ROOT, "data.csv")
_data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/labelled/penguins_processed.csv'
# Left as the last expression so the (local path, headers) result is displayed.
urllib.request.urlretrieve(url=_data_url, filename=_data_filepath)

('/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-datax2k90kfu/data.csv',
 <http.client.HTTPMessage at 0x7f86392689d0>)

In [6]:
from tfx.components import CsvExampleGen

In [7]:
# Build the ExampleGen component over the directory holding the downloaded CSV.
# The recorded run output further down shows the configuration in effect: one
# input split matching `*`, hashed 2:1 into `train` and `eval` output splits.
example_gen = CsvExampleGen(input_base=DATA_ROOT)

In [33]:
# Display the first artifact on the component's `examples` output channel.
# NOTE(review): this cell's execution count (33) is later than that of the
# `context.run(example_gen)` cell (11), and the recorded URI lies under the
# interactive context's directory, so it presumably needs that run to have
# happened first — confirm and move this cell below the run.
example_gen.outputs["examples"].get()[0]

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


In [10]:
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
# Create the interactive context used to run components from the notebook.
# It is built with no arguments; the artifact URIs in the recorded outputs show
# it writing under a temporary `tfx-interactive-*` directory.
context = InteractiveContext()



In [11]:
# Execute the ExampleGen component in the interactive context. The rendered
# result below lists the execution id and the produced `examples` artifact.
context.run(example_gen)





0,1
.execution_id,1
.component,"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } CsvExampleGen at 0x7f863a422e50.inputs{}.outputs['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f863a422fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0.exec_properties['input_base']/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-datax2k90kfu['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } 
}['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:25648,xor_checksum:1629238070,sum_checksum:1629238070"
.component.inputs,{}
.component.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f863a422fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.inputs,{}
.outputs,"['examples'] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f863a422fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"
.exec_properties,"['input_base']/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-datax2k90kfu['input_config']{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }['output_config']{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }['output_data_format']6['custom_config']None['range_config']None['span']0['version']None['input_fingerprint']split:single_split,num_files:1,total_bytes:25648,xor_checksum:1629238070,sum_checksum:1629238070"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f863a422fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0

0,1
['input_base'],/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-datax2k90kfu
['input_config'],"{  ""splits"": [  {  ""name"": ""single_split"",  ""pattern"": ""*""  }  ] }"
['output_config'],"{  ""split_config"": {  ""splits"": [  {  ""hash_buckets"": 2,  ""name"": ""train""  },  {  ""hash_buckets"": 1,  ""name"": ""eval""  }  ]  } }"
['output_data_format'],6
['custom_config'],
['range_config'],
['span'],0
['version'],
['input_fingerprint'],"split:single_split,num_files:1,total_bytes:25648,xor_checksum:1629238070,sum_checksum:1629238070"

0,1
['examples'],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Channel of type 'Examples' (1 artifact) at 0x7f863a422fd0.type_nameExamples._artifacts[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type_name,Examples
._artifacts,"[0] function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
[0],"function toggleTfxObject(element) {  var objElement = element.parentElement;  if (objElement.classList.contains('collapsed')) {  objElement.classList.remove('collapsed');  objElement.classList.add('expanded');  } else {  objElement.classList.add('collapsed');  objElement.classList.remove('expanded');  } } Artifact of type 'Examples' (uri: /var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1) at 0x7f86472d32d0.type<class 'tfx.types.standard_artifacts.Examples'>.uri/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1.span0.split_names[""train"", ""eval""].version0"

0,1
.type,<class 'tfx.types.standard_artifacts.Examples'>
.uri,/var/folders/55/gk05zk9j6596lk4gv9p8sk680000gn/T/tfx-interactive-2021-08-18T03_38_39.541130-sveqlwps/CsvExampleGen/examples/1
.span,0
.split_names,"[""train"", ""eval""]"
.version,0


In [24]:
import tensorflow as tf

In [39]:
# Path of the gzipped TFRecord shard of the `train` split, located under the
# URI of the first `examples` artifact produced by ExampleGen.
train_uri = (
    example_gen.outputs["examples"].get()[0].uri
    + "/Split-train/data_tfrecord-00000-of-00001.gz"
)

In [40]:
# Open the train shard as a dataset of raw records; the file name ends in
# `.gz`, hence the GZIP compression type.
raw_dataset = tf.data.TFRecordDataset(train_uri, compression_type='GZIP')

In [53]:
def extract_fn(data_record):
    """Parse one serialized example into a dict of scalar feature tensors.

    Args:
        data_record: A single serialized example, as yielded by `raw_dataset`.

    Returns:
        Dict mapping each feature name in the spec below to its parsed scalar
        tensor (`species` as int64, the four measurements as float32).
    """
    features = {
        'species': tf.io.FixedLenFeature([], tf.int64),
        'culmen_length_mm': tf.io.FixedLenFeature([], tf.float32),
        'culmen_depth_mm': tf.io.FixedLenFeature([], tf.float32),
        'flipper_length_mm': tf.io.FixedLenFeature([], tf.float32),
        'body_mass_g': tf.io.FixedLenFeature([], tf.float32)
    }
    return tf.io.parse_single_example(data_record, features)


# Parse the raw records first, then materialize them. `dataset` has to exist
# before `np_dataset` can be built from it, so the steps are kept in
# dependency order and the notebook survives Restart & Run All.
dataset = raw_dataset.map(extract_fn)
np_dataset = list(dataset.as_numpy_iterator())

In [60]:
# NOTE(review): this call fails with the ValueError recorded below. `dataset`
# is a MapDataset, which cannot be converted to the tensor argument that
# `tf.io.parse_tensor` expects. The parsed records are already materialized as
# numpy values in `np_dataset` via `dataset.as_numpy_iterator()`; consider
# deleting this cell.
tf.io.parse_tensor(dataset, out_type=tf.float64).numpy()

ValueError: Attempt to convert a value (<MapDataset shapes: {body_mass_g: (), culmen_depth_mm: (), culmen_length_mm: (), flipper_length_mm: (), species: ()}, types: {body_mass_g: tf.float32, culmen_depth_mm: tf.float32, culmen_length_mm: tf.float32, flipper_length_mm: tf.float32, species: tf.int64}>) with an unsupported type (<class 'tensorflow.python.data.ops.dataset_ops.MapDataset'>) to a Tensor.

In [56]:
# Print the first ten parsed records to spot-check the feature values.
for parsed_record in dataset.take(10):
    print(parsed_record)

{'body_mass_g': <tf.Tensor: shape=(), dtype=float32, numpy=0.29166666>, 'culmen_depth_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.6666667>, 'culmen_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.25454545>, 'flipper_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.15254237>, 'species': <tf.Tensor: shape=(), dtype=int64, numpy=0>}
{'body_mass_g': <tf.Tensor: shape=(), dtype=float32, numpy=0.30555555>, 'culmen_depth_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.5119048>, 'culmen_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.26909092>, 'flipper_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.23728813>, 'species': <tf.Tensor: shape=(), dtype=int64, numpy=0>}
{'body_mass_g': <tf.Tensor: shape=(), dtype=float32, numpy=0.15277778>, 'culmen_depth_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.5833333>, 'culmen_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.29818183>, 'flipper_length_mm': <tf.Tensor: shape=(), dtype=float32, numpy=0.38