From c031984c605aee51a6b2af62fec2b5edd5242386 Mon Sep 17 00:00:00 2001 From: helllllllder Date: Tue, 18 Jan 2022 08:42:49 -0300 Subject: [PATCH 1/6] add a new elasticsearch index and document for a simplified RepositoryExample to use in the requirements_to_train method --- bothub/common/documents/__init__.py | 2 + .../documents/repositorybasicexample.py | 58 ++++++++ bothub/common/models.py | 137 +++++++++++++++++- bothub/settings.py | 9 ++ 4 files changed, 199 insertions(+), 7 deletions(-) create mode 100644 bothub/common/documents/repositorybasicexample.py diff --git a/bothub/common/documents/__init__.py b/bothub/common/documents/__init__.py index ff97d6daa..e4302db05 100644 --- a/bothub/common/documents/__init__.py +++ b/bothub/common/documents/__init__.py @@ -1,7 +1,9 @@ from bothub.common.documents.repositorynlplog import RepositoryNLPLogDocument from bothub.common.documents.repositoryqanlplog import RepositoryQANLPLogDocument +from bothub.common.documents.repositorybasicexample import RepositoryExampleDocument __all__ = ( "RepositoryNLPLogDocument", "RepositoryQANLPLogDocument", + "RepositoryExampleDocument" ) diff --git a/bothub/common/documents/repositorybasicexample.py b/bothub/common/documents/repositorybasicexample.py new file mode 100644 index 000000000..f0800bf88 --- /dev/null +++ b/bothub/common/documents/repositorybasicexample.py @@ -0,0 +1,58 @@ +from django.conf import settings +from django_elasticsearch_dsl import Document, Index, fields + +from bothub.common.models import RepositoryExample, RepositoryExampleEntity, RepositoryIntent, RepositoryVersionLanguage + +REPOSITORYBASICEXAMPLE_INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__]) + + +@REPOSITORYBASICEXAMPLE_INDEX.doc_type +class RepositoryExampleDocument(Document): + repository_version_language = fields.ObjectField( + properties={ + "pk": fields.IntegerField(), + "language": fields.TextField(fields={"raw": fields.KeywordField()}) + } + ) + intent = fields.ObjectField( + properties={ 
+ "text": fields.TextField( + fields={"raw": fields.KeywordField()} + ) + } + ) + entities = fields.NestedField( + properties={ + "entity": fields.ObjectField( + properties={ + "value": fields.TextField(fields={"raw": fields.KeywordField()}), + } + ), + } + ) + pk = fields.IntegerField() + + class Django: + model = RepositoryExample + fields = [ + "id", + "text", + ] + related_models = [RepositoryVersionLanguage, RepositoryIntent, RepositoryExampleEntity] + + def get_queryset(self): + return super(RepositoryExampleDocument, self).get_queryset().select_related( + "repository_version_language", + "intent", + ).prefetch_related( + "entities", + "translations", + ) + + def get_instances_from_related(self, related_instance): + if isinstance(related_instance, RepositoryVersionLanguage): + return related_instance.added.all() + elif isinstance(related_instance, RepositoryIntent): + return related_instance.repositoryexample_set.all() + elif isinstance(related_instance, RepositoryExampleEntity): + return related_instance.repository_example diff --git a/bothub/common/models.py b/bothub/common/models.py index f6b1c5e9e..280befa2f 100644 --- a/bothub/common/models.py +++ b/bothub/common/models.py @@ -14,6 +14,7 @@ from django.utils import timezone from django.utils.translation import ugettext_lazy as _ from django_elasticsearch_dsl_drf.wrappers import dict_to_obj +from elasticsearch_dsl import A from rest_framework import status from rest_framework.exceptions import APIException @@ -1064,7 +1065,54 @@ def examples(self): return examples.distinct() @property - def requirements_to_train(self): + def _search_weak_intents_and_entities(self): + from bothub.common.documents import RepositoryExampleDocument + + search = RepositoryExampleDocument.search().query( + "match", repository_version_language__pk=self.pk + ) + search.update_from_dict({"size": 0}) + + duplicated_limit_bucket = A( + "bucket_selector", + buckets_path={"doc_count": "_count"}, + script=f"params.doc_count < 
{self.MIN_EXAMPLES_PER_INTENT}", + ) + + search.aggs.bucket("duplicated_intents", "terms", field="intent.text.raw") + search.aggs["duplicated_intents"].bucket( + "filter_duplicated_intent_limit", duplicated_limit_bucket + ) + search.aggs.bucket( + "duplicated_intents_stats", + "stats_bucket", + buckets_path="duplicated_intents._count", + ) + + search.aggs.bucket("nested_entities", "nested", path="entities") + search.aggs["nested_entities"].bucket( + "duplicated_entities", "terms", field="entities.entity.value.raw" + ) + search.aggs["nested_entities"]["duplicated_entities"].bucket( + "filter_duplicated_entity_limit", duplicated_limit_bucket + ) + search.aggs["nested_entities"].bucket( + "duplicated_entities_stats", + "stats_bucket", + buckets_path="duplicated_entities._count", + ) + + return search.execute() + + @property + def _does_all_examples_have_intents(self): + from bothub.common.documents import RepositoryExampleDocument + + search = RepositoryExampleDocument.search().query("match", intent_text="") + return False if search.execute().hits.total.value != 0 else True + + @property + def _elasticsearch_requirements_to_train(self): try: self.validate_init_train() except RepositoryUpdateAlreadyTrained: # pragma: no cover @@ -1072,12 +1120,64 @@ def requirements_to_train(self): except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover return [_("This bot version is being trained.")] - r = [] + warnings = [] + + if not self._does_all_examples_have_intents: + warnings.append(_("All examples need to have a intent.")) + + search_result = self._search_weak_intents_and_entities + + weak_intents_count = search_result.aggregations.duplicated_intents_stats.count + weak_intents = search_result.aggregations.duplicated_intents.buckets + + if weak_intents_count > 0: + for intent in weak_intents: + warnings.append( + _( + 'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})' + ).format( + intent["key"], + intent["doc_count"], + 
self.MIN_EXAMPLES_PER_INTENT, + ) + ) + + weak_entities_count = ( + search_result.aggregations.nested_entities.duplicated_entities_stats.count + ) + weak_entities = ( + search_result.aggregations.nested_entities.duplicated_entities.buckets + ) + + if weak_entities_count > 0: + for intent in weak_entities: + warnings.append( + _( + 'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})' + ).format( + intent["key"], + intent["doc_count"], + self.MIN_EXAMPLES_PER_INTENT, + ) + ) + + return warnings + + @property + def _relational_requirements_to_train(self): + try: + self.validate_init_train() + except RepositoryUpdateAlreadyTrained: # pragma: no cover + return [_("This bot version has already been trained.")] + except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover + return [_("This bot version is being trained.")] + + warnings = [] intents = self.examples.values_list("intent__text", flat=True) if "" in intents: - r.append(_("All examples need have a intent.")) + warnings.append(_("All examples need to have a intent.")) weak_intents = ( self.examples.values("intent__text") @@ -1088,7 +1188,7 @@ def requirements_to_train(self): if weak_intents.exists(): for i in weak_intents: - r.append( + warnings.append( _( 'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})' ).format( @@ -1108,7 +1208,7 @@ def requirements_to_train(self): if weak_entities.exists(): for e in weak_entities: - r.append( + warnings.append( _( 'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})' ).format( @@ -1118,7 +1218,14 @@ def requirements_to_train(self): ) ) - return r + return warnings + + @property + def requirements_to_train(self): + if settings.USE_ELASTICSEARCH: + return self._elasticsearch_requirements_to_train + else: + return self._relational_requirements_to_train @property def ready_for_train(self): @@ -1462,7 +1569,7 @@ def get_text(self, language=None): # 
pragma: no cover return self.text return self.get_translation(language).text - def get_entities(self, language): # pragma: no cover + def get_entities(self, language=None): # pragma: no cover if not language or language == self.repository_version_language.language: return self.entities.all() return self.get_translation(language).entities.all() @@ -1483,6 +1590,22 @@ def delete(self, using=None, keep_parents=False): return instance + @property + def entities_field_indexing(self): + entities = self.entities.all() + entity_reduced_list = [] + for entity in entities: + reduced_entity_obj = dict_to_obj( + { + "entity": { + "value": entity.entity.value, + }, + } + ) + entity_reduced_list.append(reduced_entity_obj) + + return entity_reduced_list + class RepositoryTranslatedExampleManager(models.Manager): def create( diff --git a/bothub/settings.py b/bothub/settings.py index d988f72d9..ef4a600e2 100644 --- a/bothub/settings.py +++ b/bothub/settings.py @@ -80,7 +80,9 @@ REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT=(int, 200), ELASTICSEARCH_DSL=(str, "localhost:9200"), ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=(str, "ai_repositorynlplog"), + USE_ELASTICSEARCH=(bool, True), ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=(str, "ai_repositoryqanlplog"), + ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=(str, "ai_repositorybasicexample"), ELASTICSEARCH_NUMBER_OF_SHARDS=(int, 1), ELASTICSEARCH_NUMBER_OF_REPLICAS=(int, 0), ELASTICSEARCH_SIGNAL_PROCESSOR=(str, "realtime"), @@ -522,6 +524,10 @@ "default": {"hosts": env.str("ELASTICSEARCH_DSL", default="es:9200")} } +USE_ELASTICSEARCH = env.bool( + "USE_ELASTICSEARCH", default=True + ) + ELASTICSEARCH_DSL_INDEX_SETTINGS = { "number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1), "number_of_replicas": env.int("ELASTICSEARCH_NUMBER_OF_REPLICAS", default=0), @@ -534,6 +540,9 @@ "bothub.common.documents.repositoryqanlplog": env.str( "ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX", default="ai_repositoryqanlplog" ), + 
"bothub.common.documents.repositorybasicexample": env.str( + "ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX", default="ai_repositorybasicexample" + ), } ELASTICSEARCH_SIGNAL_PROCESSOR_CLASSES = { From 7bd305affa02221f64040c517d7df15e7c54039f Mon Sep 17 00:00:00 2001 From: helllllllder Date: Tue, 18 Jan 2022 08:52:09 -0300 Subject: [PATCH 2/6] Update dockerfile with the new env vars --- bothub/settings.py | 4 ++-- docker-compose.yml | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bothub/settings.py b/bothub/settings.py index ef4a600e2..f1d741608 100644 --- a/bothub/settings.py +++ b/bothub/settings.py @@ -525,8 +525,8 @@ } USE_ELASTICSEARCH = env.bool( - "USE_ELASTICSEARCH", default=True - ) + "USE_ELASTICSEARCH", default=True +) ELASTICSEARCH_DSL_INDEX_SETTINGS = { "number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1), diff --git a/docker-compose.yml b/docker-compose.yml index 651511d7e..ab7cc1564 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -107,7 +107,9 @@ services: - ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0} - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog} - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog} + - ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample} - ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery} + - USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true} bothub-engine-celery-redis: image: redis From c8c4b00e5eac26901a120f3afeb8e91876fcec84 Mon Sep 17 00:00:00 2001 From: helllllllder Date: Tue, 18 Jan 2022 15:45:46 -0300 Subject: [PATCH 3/6] fix _does_all_examples_have_intents method at RepositoryVersionLanguage --- bothub/common/models.py | 9 ++++++++- bothub/common/tests.py | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/bothub/common/models.py 
b/bothub/common/models.py index 280befa2f..54540f592 100644 --- a/bothub/common/models.py +++ b/bothub/common/models.py @@ -15,6 +15,7 @@ from django.utils.translation import ugettext_lazy as _ from django_elasticsearch_dsl_drf.wrappers import dict_to_obj from elasticsearch_dsl import A +from elasticsearch_dsl import Q as elasticQ from rest_framework import status from rest_framework.exceptions import APIException @@ -1108,7 +1109,13 @@ def _search_weak_intents_and_entities(self): def _does_all_examples_have_intents(self): from bothub.common.documents import RepositoryExampleDocument - search = RepositoryExampleDocument.search().query("match", intent_text="") + search = RepositoryExampleDocument.search().query( + "bool", + must=[ + elasticQ("match", intent__text__raw=""), + elasticQ("match", repository_version_language__pk=self.pk), + ], + ) return False if search.execute().hits.total.value != 0 else True @property diff --git a/bothub/common/tests.py b/bothub/common/tests.py index 6aa1455bf..8b72ba049 100644 --- a/bothub/common/tests.py +++ b/bothub/common/tests.py @@ -959,6 +959,7 @@ def test_empty_intent(self): RepositoryExampleEntity.objects.create( repository_example=example, start=0, end=7, entity="name" ) + self.assertFalse(self.repository.current_version().ready_for_train) def test_intent_dont_have_min_examples(self): From c0dad94231497bdc11d1ea002d6f4540a50f3cd0 Mon Sep 17 00:00:00 2001 From: helllllllder Date: Wed, 19 Jan 2022 10:46:56 -0300 Subject: [PATCH 4/6] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7bc12e889..9261eb07a 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,7 @@ You can set environment variables in your OS, write on ```.env``` file or pass v | ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document. 
| ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data. | GUNICORN_WORKERS | ``` int ``` | ``` multiprocessing.cpu_count() * 2 + 1 ``` | Gunicorn number of workers +| USE_ELASTICSEARCH | ```boolean``` | ```true``` | Change the logic in requirements_to_train to use either elasticsearch or postgres. ## Roadmap From 69d21b1949bdadd62fab6c656d6a574767e516c2 Mon Sep 17 00:00:00 2001 From: helllllllder Date: Wed, 19 Jan 2022 10:53:07 -0300 Subject: [PATCH 5/6] update readme --- README.md | 4 +++- docker-compose.yml | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9261eb07a..8b9405f35 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,9 @@ You can set environment variables in your OS, write on ```.env``` file or pass v | ELASTICSEARCH_DSL | ```string``` | ```es:9200``` | URL Elasticsearch. | ELASTICSEARCH_NUMBER_OF_SHARDS | ```int``` | ```1``` | Specify the number of shards for the indexes. | ELASTICSEARCH_NUMBER_OF_REPLICAS | ```int``` | ```1``` | Specify the number of replicas for the indexes. -| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document. +| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```ai_repositorynlplog``` | Specify the index title for the RepositoryNLPLog document. +| ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX | ```string``` | ```ai_repositoryqanlplog``` | Specify the index title for the RepositoryQANLPLog document. +| ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX | ```string``` | ```ai_repositorybasicexample``` | Specify the index title for the RepositoryBasicExample document. | ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data. 
| GUNICORN_WORKERS | ``` int ``` | ``` multiprocessing.cpu_count() * 2 + 1 ``` | Gunicorn number of workers | USE_ELASTICSEARCH | ```boolean``` | ```true``` | Change the logic in requirements_to_train to use either elasticsearch or postgres. diff --git a/docker-compose.yml b/docker-compose.yml index ab7cc1564..4b1f1dc05 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -106,7 +106,7 @@ services: - ELASTICSEARCH_NUMBER_OF_SHARDS=${ELASTICSEARCH_NUMBER_OF_SHARDS:-1} - ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0} - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog} - - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog} + - ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog} - ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample} - ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery} - USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true} From 9c8d094e92857e725c8a097f59914684508604cf Mon Sep 17 00:00:00 2001 From: helllllllder Date: Wed, 19 Jan 2022 17:45:35 -0300 Subject: [PATCH 6/6] update readme and create quick_start guide --- README.md | 7 ++++ docs/quick-start.md | 82 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 docs/quick-start.md diff --git a/README.md b/README.md index 8b9405f35..9f39156b0 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,13 @@ [![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/) [![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE) +# Requirements + +* Python (3.6) +* Pipenv +* Docker +* Docker-compose + ## Development Use ```make``` commands to ```check_environment```, ```install_requirements```, 
```lint```, ```test```, ```migrate```, ```start```, ```migrations``` and ```collectstatic```. diff --git a/docs/quick-start.md b/docs/quick-start.md new file mode 100644 index 000000000..822cf2a1e --- /dev/null +++ b/docs/quick-start.md @@ -0,0 +1,82 @@ +

+ BLACK LIVES MATTER + +

BLACK LIVES MATTER

+ +

+ Black Lives Matter + · + Supporting the cause + · +

+

+ +
+

+ Bothub +

+ + +# Bothub + +[![Build Status](https://travis-ci.com/Ilhasoft/bothub-engine.svg?branch=master)](https://travis-ci.com/Ilhasoft/bothub-engine) +[![Coverage Status](https://coveralls.io/repos/github/bothub-it/bothub-engine/badge.svg?branch=master)](https://coveralls.io/github/bothub-it/bothub-engine?branch=master) +[![Code Climate](https://codeclimate.com/github/bothub-it/bothub-engine/badges/gpa.svg)](https://codeclimate.com/github/bothub-it/bothub-engine) +[![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/) +[![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE) + + + +# Setting up for development + +* Download the project code + + $ git clone https://github.com/Ilhasoft/bothub-engine.git + +* Prepare the virtual environment and install dependencies: + + $ make install_requirements + $ make check_environment + +* Start the containers for the other dependencies (PostgreSQL, Elasticsearch and Redis): + + $ docker-compose up -d database es bothub-engine-celery-redis + +* Create the .env file following the [Environment Variables section](https://github.com/Ilhasoft/bothub-engine#environment-variables) + +* Run the migrations and create and populate the indices in Elasticsearch + + $ make migrate + $ make search_index + +* Collect the static files into ```STATIC_ROOT``` + + $ make collectstatic + +* Run tests + + $ make test + +* Run Django in a separate terminal + + $ make start + +* Run Celery in a separate terminal + + $ make start_celery + +* Run lint after making updates in the code + + $ make lint + +* (optional) Add initial data into the database + + $ pipenv run python ./manage.py fill_db_using_fake_data + +* If you did not follow the previous step, create a superuser to access the Django admin interface + + $ pipenv run python ./manage.py createsuperuser + +The API will be running at ```http://localhost:8000``` and the admin 
interface can be accessed at ```http://localhost:8000/admin``` + +