diff --git a/docs/tabpy-tools.md b/docs/tabpy-tools.md
index 5184f5e3..d26bc1a7 100755
--- a/docs/tabpy-tools.md
+++ b/docs/tabpy-tools.md
@@ -8,6 +8,7 @@ on TabPy server.
 - [Connecting to TabPy](#connecting-to-tabpy)
 - [Authentication](#authentication)
 - [Deploying a Function](#deploying-a-function)
+- [Predeployed Functions](#predeployed-functions)
 - [Providing Schema Metadata](#providing-schema-metadata)
 - [Querying an Endpoint](#querying-an-endpoint)
 - [Evaluating Arbitrary Python Scripts](#evaluating-arbitrary-python-scripts)
@@ -265,6 +266,67 @@ tabpy.query('Sentiment Analysis', _arg1, library='textblob')['response']
 ```
 
+### T-Test
+
+A [t-test](https://en.wikipedia.org/wiki/Student%27s_t-test) is a statistical
+hypothesis test that is used to compare two sample means or a sample's mean
+against a known population mean. A t-test should be used when the means of
+the samples follow a normal distribution but the variance may not be known.
+
+TabPy's pre-deployed t-test implementation can be called using the following
+syntax,
+
+```python
+tabpy.query('ttest', _arg1, _arg2)['response']
+```
+
+and is capable of performing two types of t-tests:
+
+1\. [A t-test for the means of two independent samples](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html)
+
+This is a two-sided t-test with the null hypothesis that the mean of sample1
+is equal to the mean of sample2. The implementation uses Welch's t-test,
+which does not assume equal population variances.
+
+- _arg1 (list of numeric values): a list of independent observations
+- _arg2 (list of numeric values): a list of independent observations, equal
+  in length to _arg1
+
+Alternatively, your data may not be split into separate measures. If that is
+the case, you can pass the following fields to ttest:
+
+- _arg1 (list of numeric values): a list of independent observations
+- _arg2 (list of categorical values with cardinality two): a binary factor
+  that maps each observation in _arg1 to either sample1 or sample2; this
+  list must be equal in length to _arg1
+
+2\. [A t-test for the mean of one group](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.ttest_1samp.html)
+
+This is a two-sided t-test with the null hypothesis that the mean of a sample
+of independent observations is equal to the given population mean.
+
+- _arg1 (list of numeric values): a list of independent observations
+- _arg2 (a numeric value): the known population mean
+
+The function returns a two-tailed [p-value](https://en.wikipedia.org/wiki/P-value)
+(between 0 and 1). Depending on your [significance level](https://en.wikipedia.org/wiki/Statistical_significance)
+you may reject or fail to reject the null hypothesis.
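+
+For example, a two-sample test on two measures, a test using a binary factor,
+and a one-sample test against a population mean of 0.5 might look like the
+following (a sketch; the literal values are illustrative only):
+
+```python
+# two independent samples
+tabpy.query('ttest', [6.2, 5.8, 6.1], [5.1, 5.3, 5.5])['response']
+
+# numeric observations in _arg1, a binary factor in _arg2
+tabpy.query('ttest', [6.2, 5.8, 5.1, 5.3], ['a', 'a', 'b', 'b'])['response']
+
+# one sample against a known population mean
+tabpy.query('ttest', [6.2, 5.8, 6.1], [0.5])['response']
+```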
+
 
 ## Providing Schema Metadata
 
 As soon as you share your deployed functions, you also need to share metadata
diff --git a/models/scripts/PCA.py b/models/scripts/PCA.py
index e54b2046..9c2d68b2 100644
--- a/models/scripts/PCA.py
+++ b/models/scripts/PCA.py
@@ -1,4 +1,3 @@
-from tabpy_tools.client import Client
 import pandas as pd
 from numpy import array
 from sklearn.decomposition import PCA as sklearnPCA
@@ -60,25 +59,6 @@ def PCA(component, _arg1, _arg2, *_argN):
 
 
 if __name__ == '__main__':
-    # running from setup.py
-    if len(sys.argv) > 1:
-        config_file_path = sys.argv[1]
-    else:
-        config_file_path = setup_utils.get_default_config_file_path()
-    port, auth_on, prefix = setup_utils.parse_config(config_file_path)
-
-    connection = Client(f'{prefix}://localhost:{port}/')
-
-    if auth_on:
-        # credentials are passed in from setup.py
-        if len(sys.argv) == 4:
-            user, passwd = sys.argv[2], sys.argv[3]
-        # running PCA independently
-        else:
-            user, passwd = setup_utils.get_creds()
-        connection.set_credentials(user, passwd)
-
-    connection.deploy('PCA', PCA,
-                      'Returns the specified principal component.',
-                      override=True)
-    print("Successfully deployed PCA")
+    setup_utils.main('PCA',
+                     PCA,
+                     'Returns the specified principal component')
diff --git a/models/scripts/SentimentAnalysis.py b/models/scripts/SentimentAnalysis.py
index 55c9416a..0b3c3ab6 100644
--- a/models/scripts/SentimentAnalysis.py
+++ b/models/scripts/SentimentAnalysis.py
@@ -1,4 +1,3 @@
-from tabpy_tools.client import Client
 from textblob import TextBlob
 import nltk
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
@@ -43,25 +42,7 @@ def SentimentAnalysis(_arg1, library='nltk'):
 
 
 if __name__ == '__main__':
-    # running from setup.py
-    if len(sys.argv) > 1:
-        config_file_path = sys.argv[1]
-    else:
-        config_file_path = setup_utils.get_default_config_file_path()
-    port, auth_on, prefix = setup_utils.parse_config(config_file_path)
-
-    connection = Client(f'{prefix}://localhost:{port}/')
-
-    if auth_on:
-        # credentials are passed in from setup.py
-        if len(sys.argv) == 4:
-            user, passwd = sys.argv[2], sys.argv[3]
-        # running Sentiment Analysis independently
-        else:
-            user, passwd = setup_utils.get_creds()
-        connection.set_credentials(user, passwd)
-
-    connection.deploy('Sentiment Analysis', SentimentAnalysis,
-                      'Returns a sentiment score between -1 and '
-                      '1 for a given string.', override=True)
-    print("Successfully deployed SentimentAnalysis")
+    setup_utils.main('Sentiment Analysis',
+                     SentimentAnalysis,
+                     'Returns a sentiment score between -1 and 1 for '
+                     'a given string')
diff --git a/models/scripts/tTest.py b/models/scripts/tTest.py
new file mode 100644
index 00000000..d7082698
--- /dev/null
+++ b/models/scripts/tTest.py
@@ -0,0 +1,51 @@
+from scipy import stats
+import sys
+from pathlib import Path
+# make the shared models/utils package importable when run as a script
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent / 'models'))
+from utils import setup_utils
+
+
+def ttest(_arg1, _arg2):
+    '''
+    T-Test is a statistical hypothesis test that is used to compare
+    two sample means or a sample's mean against a known population mean.
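+
+    For example (a sketch; the values here are illustrative only):
+        ttest([6.2, 5.8, 6.1], [5.1, 5.3, 5.5])  # two independent samples
+        ttest([6.2, 5.8, 5.1, 5.3], ['a', 'a', 'b', 'b'])  # binary factor
+        ttest([6.2, 5.8, 6.1], [0.5])  # one sample vs. population mean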
+
+    For more information on the function and how to use it please refer
+    to tabpy-tools.md
+    '''
+    # one sample t-test against a known population mean
+    if len(_arg2) == 1:
+        test_stat, p_value = stats.ttest_1samp(_arg1, _arg2[0])
+        return p_value
+    # two sample t-test where _arg1 is numeric and _arg2 is a binary factor
+    elif len(set(_arg2)) == 2:
+        # each observation in _arg1 needs a corresponding classification
+        # in _arg2
+        if len(_arg1) != len(_arg2):
+            raise ValueError('_arg1 and _arg2 must have equal length')
+        class1, class2 = set(_arg2)
+        sample1 = []
+        sample2 = []
+        for i in range(len(_arg1)):
+            if _arg2[i] == class1:
+                sample1.append(_arg1[i])
+            else:
+                sample2.append(_arg1[i])
+        test_stat, p_value = stats.ttest_ind(sample1, sample2, equal_var=False)
+        return p_value
+    # _arg1 and _arg2 are two independent samples
+    else:
+        test_stat, p_value = stats.ttest_ind(_arg1, _arg2, equal_var=False)
+        return p_value
+
+
+if __name__ == '__main__':
+    setup_utils.main('ttest',
+                     ttest,
+                     'Returns the p-value from a t-test')
diff --git a/models/utils/setup_utils.py b/models/utils/setup_utils.py
index de8416e7..e3da48de 100644
--- a/models/utils/setup_utils.py
+++ b/models/utils/setup_utils.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 import getpass
 import sys
+from tabpy_tools.client import Client
 
 
 def get_default_config_file_path():
@@ -31,3 +32,26 @@ def get_creds():
     user = sys.stdin.readline().rstrip()
     passwd = sys.stdin.readline().rstrip()
     return [user, passwd]
+
+
+def main(funcName, func, funcDescription):
+    # running from setup.py
+    if len(sys.argv) > 1:
+        config_file_path = sys.argv[1]
+    else:
+        config_file_path = get_default_config_file_path()
+    port, auth_on, prefix = parse_config(config_file_path)
+
+    connection = Client(f'{prefix}://localhost:{port}/')
+
+    if auth_on:
+        # credentials are passed in from setup.py
+        if len(sys.argv) == 4:
+            user, passwd = sys.argv[2], sys.argv[3]
+        # running the model script independently
+        else:
+            user, passwd = get_creds()
+        connection.set_credentials(user, passwd)
+
+    connection.deploy(funcName, func, funcDescription, override=True)
+    print(f'Successfully deployed {funcName}')
diff --git a/tests/integration/test_deploy_model_ssl_off_auth_off.py b/tests/integration/test_deploy_model_ssl_off_auth_off.py
index e9059569..e35d81d8 100644
--- a/tests/integration/test_deploy_model_ssl_off_auth_off.py
+++ b/tests/integration/test_deploy_model_ssl_off_auth_off.py
@@ -5,16 +5,13 @@
 class TestDeployModelSSLOffAuthOff(integ_test_base.IntegTestBase):
     def test_deploy_ssl_off_auth_off(self):
+        models = ['PCA', 'Sentiment%20Analysis', 'ttest']
         path = str(Path('models', 'setup.py'))
         subprocess.call([self.py, path, self._get_config_file_name()])
 
         conn = self._get_connection()
 
-        conn.request("GET", "/endpoints/PCA")
-        PCA_request = conn.getresponse()
-        self.assertEqual(200, PCA_request.status)
-        PCA_request.read()
-
-        conn.request("GET", "/endpoints/Sentiment%20Analysis")
-        SentimentAnalysis_request = conn.getresponse()
-        self.assertEqual(200, SentimentAnalysis_request.status)
-        SentimentAnalysis_request.read()
+        for m in models:
+            conn.request("GET", f'/endpoints/{m}')
+            m_request = conn.getresponse()
+            self.assertEqual(200, m_request.status)
+            m_request.read()
diff --git a/tests/integration/test_deploy_model_ssl_off_auth_on.py b/tests/integration/test_deploy_model_ssl_off_auth_on.py
index cbdcec6f..bb1268eb 100644
--- a/tests/integration/test_deploy_model_ssl_off_auth_on.py
+++ b/tests/integration/test_deploy_model_ssl_off_auth_on.py
@@ -9,6 +9,7 @@ def _get_pwd_file(self) -> str:
         return './tests/integration/resources/pwdfile.txt'
 
     def test_deploy_ssl_off_auth_on(self):
+        models = ['PCA', 'Sentiment%20Analysis', 'ttest']
         path = str(Path('models', 'setup.py'))
         p = subprocess.run([self.py, path, self._get_config_file_name()],
                            input=b'user1\nP@ssw0rd\n')
@@ -20,15 +21,11 @@ def test_deploy_ssl_off_auth_on(self):
                 'Basic ' +
                 base64.b64encode('user1:P@ssw0rd'.
                                  encode('utf-8')).decode('utf-8')
-            }
+        }
 
         conn = self._get_connection()
-        conn.request("GET", "/endpoints/PCA", headers=headers)
-        PCA_request = conn.getresponse()
-        self.assertEqual(200, PCA_request.status)
-        PCA_request.read()
-
-        conn.request("GET", "/endpoints/Sentiment%20Analysis", headers=headers)
-        SentimentAnalysis_request = conn.getresponse()
-        self.assertEqual(200, SentimentAnalysis_request.status)
-        SentimentAnalysis_request.read()
+        for m in models:
+            conn.request("GET", f'/endpoints/{m}', headers=headers)
+            m_request = conn.getresponse()
+            self.assertEqual(200, m_request.status)
+            m_request.read()
diff --git a/tests/integration/test_deploy_model_ssl_on_auth_off.py b/tests/integration/test_deploy_model_ssl_on_auth_off.py
index 65041b16..fe083849 100644
--- a/tests/integration/test_deploy_model_ssl_on_auth_off.py
+++ b/tests/integration/test_deploy_model_ssl_on_auth_off.py
@@ -15,6 +15,7 @@ def _get_key_file_name(self) -> str:
         return './tests/integration/resources/2019_04_24_to_3018_08_25.key'
 
     def test_deploy_ssl_on_auth_off(self):
+        models = ['PCA', 'Sentiment%20Analysis', 'ttest']
         path = str(Path('models', 'setup.py'))
         subprocess.call([self.py, path, self._get_config_file_name()])
 
@@ -24,12 +25,7 @@ def test_deploy_ssl_on_auth_off(self):
         # Do not warn about insecure request
         requests.packages.urllib3.disable_warnings()
 
-        PCA_response = session.get(url=f'{self._get_transfer_protocol()}://'
-                                       'localhost:9004/endpoints/PCA')
-        self.assertEqual(200, PCA_response.status_code)
-
-        SentimentAnalysis_response = session.get(
-            url=f'{self._get_transfer_protocol()}://'
-                'localhost:9004/endpoints/'
-                'Sentiment Analysis')
-        self.assertEqual(200, SentimentAnalysis_response.status_code)
+        for m in models:
+            m_response = session.get(url=f'{self._get_transfer_protocol()}://'
+                                         f'localhost:9004/endpoints/{m}')
+            self.assertEqual(200, m_response.status_code)
diff --git a/tests/integration/test_deploy_model_ssl_on_auth_on.py b/tests/integration/test_deploy_model_ssl_on_auth_on.py
index 081ace56..742abceb 100644
--- a/tests/integration/test_deploy_model_ssl_on_auth_on.py
+++ b/tests/integration/test_deploy_model_ssl_on_auth_on.py
@@ -19,6 +19,7 @@ def _get_pwd_file(self) -> str:
         return './tests/integration/resources/pwdfile.txt'
 
     def test_deploy_ssl_on_auth_on(self):
+        models = ['PCA', 'Sentiment%20Analysis', 'ttest']
         path = str(Path('models', 'setup.py'))
         p = subprocess.run([self.py, path, self._get_config_file_name()],
                            input=b'user1\nP@ssw0rd\n')
@@ -36,13 +37,8 @@ def test_deploy_ssl_on_auth_on(self):
         # Do not warn about insecure request
         requests.packages.urllib3.disable_warnings()
 
-        PCA_response = session.get(url=f'{self._get_transfer_protocol()}'
-                                       '://localhost:9004/endpoints/PCA',
-                                   headers=headers)
-        self.assertEqual(200, PCA_response.status_code)
-
-        SentimentAnalysis_response = session.get(
-            url=f'{self._get_transfer_protocol()}'
-                '://localhost:9004/endpoints/'
-                'Sentiment Analysis', headers=headers)
-        self.assertEqual(200, SentimentAnalysis_response.status_code)
+        for m in models:
+            m_response = session.get(url=f'{self._get_transfer_protocol()}://'
+                                         f'localhost:9004/endpoints/{m}',
+                                     headers=headers)
+            self.assertEqual(200, m_response.status_code)