-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ML system testing concepts #121
Comments
Testing ML Systems
Key testing principles for ML: pre-deployment
|
# Test data schema: expected value range and dtype for each iris feature
# column, consumed by the input-data tests below.  The min/max bounds were
# determined by inspecting the dataframe's .describe() output.
iris_schema = {
    'sepal length': {
        'range': {
            'min': 4.0,
            'max': 8.0
        },
        'dtype': float,
    },
    'sepal width': {
        'range': {
            'min': 1.0,
            'max': 5.0
        },
        'dtype': float,
    },
    'petal length': {
        'range': {
            'min': 1.0,
            'max': 7.0
        },
        'dtype': float,
    },
    'petal width': {
        'range': {
            'min': 0.1,
            'max': 3.0
        },
        'dtype': float,
    }
}

import unittest
import sys
class TestIrisInputData(unittest.TestCase):
    """Validate the raw pipeline dataframe against ``iris_schema``."""

    def setUp(self):
        # `setUp` runs before each test, ensuring every test gets a fresh
        # pipeline to work with.  See the unittest docs:
        # https://docs.python.org/3/library/unittest.html#unittest.TestCase.setUp
        self.pipeline = SimplePipeline()
        self.pipeline.run_pipeline()

    def test_input_data_ranges(self):
        """Each feature column must fall within its schema min/max bounds."""
        # Get df max and min values for each column.
        max_values = self.pipeline.frame.max()
        min_values = self.pipeline.frame.min()
        # Loop over each feature (i.e. all 4 column names).
        for feature in self.pipeline.feature_names:
            # subTest keeps checking the remaining features even if one
            # fails, and names the offending feature in the failure report.
            with self.subTest(feature=feature):
                bounds = iris_schema[feature]['range']
                # assertLessEqual/assertGreaterEqual produce informative
                # failure messages (showing both operands), unlike
                # assertTrue(a <= b).
                self.assertLessEqual(max_values[feature], bounds['max'])
                self.assertGreaterEqual(min_values[feature], bounds['min'])

    def test_input_data_types(self):
        """Each feature column must have the dtype declared in the schema."""
        data_types = self.pipeline.frame.dtypes  # pandas dtypes attribute
        for feature in self.pipeline.feature_names:
            with self.subTest(feature=feature):
                self.assertEqual(data_types[feature], iris_schema[feature]['dtype'])
|
# Testing data engineering
import unittest


class TestIrisDataEngineering(unittest.TestCase):
    """Check that the scaler step actually standardises X_train."""

    def setUp(self):
        # Fresh pipeline with the dataset loaded for every test.
        self.pipeline = PipelineWithDataEngineering()
        self.pipeline.load_dataset()

    def test_scaler_preprocessing_brings_x_train_mean_near_zero(self):
        # Given: flatten the dataframe to a single column with pandas
        # stack so we can take one overall mean.
        original_mean = self.pipeline.X_train.stack().mean()
        # When
        self.pipeline.apply_scaler()
        # Then: StandardScaler transforms the data to centre the
        # distribution at 0 with unit variance, so the mean should drop
        # below the original and sit near 0 (checked to 3 decimal places):
        # https://docs.python.org/3/library/unittest.html#unittest.TestCase.assertAlmostEqual
        # assertLess gives an informative failure message, unlike
        # assertTrue(a > b).  X_train is a numpy array at this point.
        self.assertLess(self.pipeline.X_train.mean(), original_mean)
        self.assertAlmostEqual(self.pipeline.X_train.mean(), 0.0, places=3)
        print(f'Original X train mean: {original_mean}')
        print(f'Transformed X train mean: {self.pipeline.X_train.mean()}')

    def test_scaler_preprocessing_brings_x_train_std_near_one(self):
        # When
        self.pipeline.apply_scaler()
        # Then: the standard deviation should be close to 1 after
        # standardisation.
        self.assertAlmostEqual(self.pipeline.X_train.std(), 1.0, places=3)
        print(f'Transformed X train standard deviation : {self.pipeline.X_train.std()}')
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
why test?
predicting reliability
functionality
The text was updated successfully, but these errors were encountered: