From c031984c605aee51a6b2af62fec2b5edd5242386 Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Tue, 18 Jan 2022 08:42:49 -0300
Subject: [PATCH 1/6] add a new elasticsearch index and document for a
simplified RepositoryExample to use in the requirements_to_train method
---
bothub/common/documents/__init__.py | 2 +
.../documents/repositorybasicexample.py | 58 ++++++++
bothub/common/models.py | 137 +++++++++++++++++-
bothub/settings.py | 9 ++
4 files changed, 199 insertions(+), 7 deletions(-)
create mode 100644 bothub/common/documents/repositorybasicexample.py
diff --git a/bothub/common/documents/__init__.py b/bothub/common/documents/__init__.py
index ff97d6daa..e4302db05 100644
--- a/bothub/common/documents/__init__.py
+++ b/bothub/common/documents/__init__.py
@@ -1,7 +1,9 @@
from bothub.common.documents.repositorynlplog import RepositoryNLPLogDocument
from bothub.common.documents.repositoryqanlplog import RepositoryQANLPLogDocument
+from bothub.common.documents.repositorybasicexample import RepositoryExampleDocument
__all__ = (
"RepositoryNLPLogDocument",
"RepositoryQANLPLogDocument",
+ "RepositoryExampleDocument"
)
diff --git a/bothub/common/documents/repositorybasicexample.py b/bothub/common/documents/repositorybasicexample.py
new file mode 100644
index 000000000..f0800bf88
--- /dev/null
+++ b/bothub/common/documents/repositorybasicexample.py
@@ -0,0 +1,58 @@
+from django.conf import settings
+from django_elasticsearch_dsl import Document, Index, fields
+
+from bothub.common.models import RepositoryExample, RepositoryExampleEntity, RepositoryIntent, RepositoryVersionLanguage
+
+REPOSITORYBASICEXAMPLE_INDEX = Index(settings.ELASTICSEARCH_INDEX_NAMES[__name__])
+
+
+@REPOSITORYBASICEXAMPLE_INDEX.doc_type
+class RepositoryExampleDocument(Document):
+ repository_version_language = fields.ObjectField(
+ properties={
+ "pk": fields.IntegerField(),
+ "language": fields.TextField(fields={"raw": fields.KeywordField()})
+ }
+ )
+ intent = fields.ObjectField(
+ properties={
+ "text": fields.TextField(
+ fields={"raw": fields.KeywordField()}
+ )
+ }
+ )
+ entities = fields.NestedField(
+ properties={
+ "entity": fields.ObjectField(
+ properties={
+ "value": fields.TextField(fields={"raw": fields.KeywordField()}),
+ }
+ ),
+ }
+ )
+ pk = fields.IntegerField()
+
+ class Django:
+ model = RepositoryExample
+ fields = [
+ "id",
+ "text",
+ ]
+ related_models = [RepositoryVersionLanguage, RepositoryIntent, RepositoryExampleEntity]
+
+ def get_queryset(self):
+ return super(RepositoryExampleDocument, self).get_queryset().select_related(
+ "repository_version_language",
+ "intent",
+ ).prefetch_related(
+ "entities",
+ "translations",
+ )
+
+ def get_instances_from_related(self, related_instance):
+ if isinstance(related_instance, RepositoryVersionLanguage):
+ return related_instance.added.all()
+ elif isinstance(related_instance, RepositoryIntent):
+ return related_instance.repositoryexample_set.all()
+ elif isinstance(related_instance, RepositoryExampleEntity):
+ return related_instance.repository_example
diff --git a/bothub/common/models.py b/bothub/common/models.py
index f6b1c5e9e..280befa2f 100644
--- a/bothub/common/models.py
+++ b/bothub/common/models.py
@@ -14,6 +14,7 @@
from django.utils import timezone
from django.utils.translation import ugettext_lazy as _
from django_elasticsearch_dsl_drf.wrappers import dict_to_obj
+from elasticsearch_dsl import A
from rest_framework import status
from rest_framework.exceptions import APIException
@@ -1064,7 +1065,54 @@ def examples(self):
return examples.distinct()
@property
- def requirements_to_train(self):
+ def _search_weak_intents_and_entities(self):
+ from bothub.common.documents import RepositoryExampleDocument
+
+ search = RepositoryExampleDocument.search().query(
+ "match", repository_version_language__pk=self.pk
+ )
+ search.update_from_dict({"size": 0})
+
+ duplicated_limit_bucket = A(
+ "bucket_selector",
+ buckets_path={"doc_count": "_count"},
+ script=f"params.doc_count < {self.MIN_EXAMPLES_PER_INTENT}",
+ )
+
+ search.aggs.bucket("duplicated_intents", "terms", field="intent.text.raw")
+ search.aggs["duplicated_intents"].bucket(
+ "filter_duplicated_intent_limit", duplicated_limit_bucket
+ )
+ search.aggs.bucket(
+ "duplicated_intents_stats",
+ "stats_bucket",
+ buckets_path="duplicated_intents._count",
+ )
+
+ search.aggs.bucket("nested_entities", "nested", path="entities")
+ search.aggs["nested_entities"].bucket(
+ "duplicated_entities", "terms", field="entities.entity.value.raw"
+ )
+ search.aggs["nested_entities"]["duplicated_entities"].bucket(
+ "filter_duplicated_entity_limit", duplicated_limit_bucket
+ )
+ search.aggs["nested_entities"].bucket(
+ "duplicated_entities_stats",
+ "stats_bucket",
+ buckets_path="duplicated_entities._count",
+ )
+
+ return search.execute()
+
+ @property
+ def _does_all_examples_have_intents(self):
+ from bothub.common.documents import RepositoryExampleDocument
+
+ search = RepositoryExampleDocument.search().query("match", intent_text="")
+ return False if search.execute().hits.total.value != 0 else True
+
+ @property
+ def _elasticsearch_requirements_to_train(self):
try:
self.validate_init_train()
except RepositoryUpdateAlreadyTrained: # pragma: no cover
@@ -1072,12 +1120,64 @@ def requirements_to_train(self):
except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover
return [_("This bot version is being trained.")]
- r = []
+ warnings = []
+
+ if not self._does_all_examples_have_intents:
+ warnings.append(_("All examples need to have a intent."))
+
+ search_result = self._search_weak_intents_and_entities
+
+ weak_intents_count = search_result.aggregations.duplicated_intents_stats.count
+ weak_intents = search_result.aggregations.duplicated_intents.buckets
+
+ if weak_intents_count > 0:
+ for intent in weak_intents:
+ warnings.append(
+ _(
+ 'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
+ ).format(
+ intent["key"],
+ intent["doc_count"],
+ self.MIN_EXAMPLES_PER_INTENT,
+ )
+ )
+
+ weak_entities_count = (
+ search_result.aggregations.nested_entities.duplicated_entities_stats.count
+ )
+ weak_entities = (
+ search_result.aggregations.nested_entities.duplicated_entities.buckets
+ )
+
+ if weak_entities_count > 0:
+ for intent in weak_entities:
+ warnings.append(
+ _(
+ 'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
+ ).format(
+ intent["key"],
+ intent["doc_count"],
+ self.MIN_EXAMPLES_PER_INTENT,
+ )
+ )
+
+ return warnings
+
+ @property
+ def _relational_requirements_to_train(self):
+ try:
+ self.validate_init_train()
+ except RepositoryUpdateAlreadyTrained: # pragma: no cover
+ return [_("This bot version has already been trained.")]
+ except RepositoryUpdateAlreadyStartedTraining: # pragma: no cover
+ return [_("This bot version is being trained.")]
+
+ warnings = []
intents = self.examples.values_list("intent__text", flat=True)
if "" in intents:
- r.append(_("All examples need have a intent."))
+ warnings.append(_("All examples need to have a intent."))
weak_intents = (
self.examples.values("intent__text")
@@ -1088,7 +1188,7 @@ def requirements_to_train(self):
if weak_intents.exists():
for i in weak_intents:
- r.append(
+ warnings.append(
_(
'The "{}" intention has only {} sentence\nAdd 1 more sentence to that intention (minimum is {})'
).format(
@@ -1108,7 +1208,7 @@ def requirements_to_train(self):
if weak_entities.exists():
for e in weak_entities:
- r.append(
+ warnings.append(
_(
'The entity "{}" has only {} sentence\nAdd 1 more sentence to that entity (minimum is {})'
).format(
@@ -1118,7 +1218,14 @@ def requirements_to_train(self):
)
)
- return r
+ return warnings
+
+ @property
+ def requirements_to_train(self):
+ if settings.USE_ELASTICSEARCH:
+ return self._elasticsearch_requirements_to_train
+ else:
+ return self._relational_requirements_to_train
@property
def ready_for_train(self):
@@ -1462,7 +1569,7 @@ def get_text(self, language=None): # pragma: no cover
return self.text
return self.get_translation(language).text
- def get_entities(self, language): # pragma: no cover
+ def get_entities(self, language=None): # pragma: no cover
if not language or language == self.repository_version_language.language:
return self.entities.all()
return self.get_translation(language).entities.all()
@@ -1483,6 +1590,22 @@ def delete(self, using=None, keep_parents=False):
return instance
+ @property
+ def entities_field_indexing(self):
+ entities = self.entities.all()
+ entity_reduced_list = []
+ for entity in entities:
+ reduced_entity_obj = dict_to_obj(
+ {
+ "entity": {
+ "value": entity.entity.value,
+ },
+ }
+ )
+ entity_reduced_list.append(reduced_entity_obj)
+
+ return entity_reduced_list
+
class RepositoryTranslatedExampleManager(models.Manager):
def create(
diff --git a/bothub/settings.py b/bothub/settings.py
index d988f72d9..ef4a600e2 100644
--- a/bothub/settings.py
+++ b/bothub/settings.py
@@ -80,7 +80,9 @@
REPOSITORY_EXAMPLE_TEXT_WORDS_LIMIT=(int, 200),
ELASTICSEARCH_DSL=(str, "localhost:9200"),
ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=(str, "ai_repositorynlplog"),
+ USE_ELASTICSEARCH=(bool, True),
ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=(str, "ai_repositoryqanlplog"),
+ ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=(str, "ai_repositorybasicexample"),
ELASTICSEARCH_NUMBER_OF_SHARDS=(int, 1),
ELASTICSEARCH_NUMBER_OF_REPLICAS=(int, 0),
ELASTICSEARCH_SIGNAL_PROCESSOR=(str, "realtime"),
@@ -522,6 +524,10 @@
"default": {"hosts": env.str("ELASTICSEARCH_DSL", default="es:9200")}
}
+USE_ELASTICSEARCH = env.bool(
+ "USE_ELASTICSEARCH", default=True
+ )
+
ELASTICSEARCH_DSL_INDEX_SETTINGS = {
"number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1),
"number_of_replicas": env.int("ELASTICSEARCH_NUMBER_OF_REPLICAS", default=0),
@@ -534,6 +540,9 @@
"bothub.common.documents.repositoryqanlplog": env.str(
"ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX", default="ai_repositoryqanlplog"
),
+ "bothub.common.documents.repositorybasicexample": env.str(
+ "ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX", default="ai_repositorybasicexample"
+ ),
}
ELASTICSEARCH_SIGNAL_PROCESSOR_CLASSES = {
From 7bd305affa02221f64040c517d7df15e7c54039f Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Tue, 18 Jan 2022 08:52:09 -0300
Subject: [PATCH 2/6] Update dockerfile with the new env vars
---
bothub/settings.py | 4 ++--
docker-compose.yml | 2 ++
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/bothub/settings.py b/bothub/settings.py
index ef4a600e2..f1d741608 100644
--- a/bothub/settings.py
+++ b/bothub/settings.py
@@ -525,8 +525,8 @@
}
USE_ELASTICSEARCH = env.bool(
- "USE_ELASTICSEARCH", default=True
- )
+ "USE_ELASTICSEARCH", default=True
+)
ELASTICSEARCH_DSL_INDEX_SETTINGS = {
"number_of_shards": env.int("ELASTICSEARCH_NUMBER_OF_SHARDS", default=1),
diff --git a/docker-compose.yml b/docker-compose.yml
index 651511d7e..ab7cc1564 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -107,7 +107,9 @@ services:
- ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0}
- ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog}
- ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
+ - ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample}
- ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery}
+ - USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true}
bothub-engine-celery-redis:
image: redis
From c8c4b00e5eac26901a120f3afeb8e91876fcec84 Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Tue, 18 Jan 2022 15:45:46 -0300
Subject: [PATCH 3/6] fix _does_all_examples_have_intents method at
RepositoryVersionLanguage
---
bothub/common/models.py | 9 ++++++++-
bothub/common/tests.py | 1 +
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/bothub/common/models.py b/bothub/common/models.py
index 280befa2f..54540f592 100644
--- a/bothub/common/models.py
+++ b/bothub/common/models.py
@@ -15,6 +15,7 @@
from django.utils.translation import ugettext_lazy as _
from django_elasticsearch_dsl_drf.wrappers import dict_to_obj
from elasticsearch_dsl import A
+from elasticsearch_dsl import Q as elasticQ
from rest_framework import status
from rest_framework.exceptions import APIException
@@ -1108,7 +1109,13 @@ def _search_weak_intents_and_entities(self):
def _does_all_examples_have_intents(self):
from bothub.common.documents import RepositoryExampleDocument
- search = RepositoryExampleDocument.search().query("match", intent_text="")
+ search = RepositoryExampleDocument.search().query(
+ "bool",
+ must=[
+ elasticQ("match", intent__text__raw=""),
+ elasticQ("match", repository_version_language__pk=self.pk),
+ ],
+ )
return False if search.execute().hits.total.value != 0 else True
@property
diff --git a/bothub/common/tests.py b/bothub/common/tests.py
index 6aa1455bf..8b72ba049 100644
--- a/bothub/common/tests.py
+++ b/bothub/common/tests.py
@@ -959,6 +959,7 @@ def test_empty_intent(self):
RepositoryExampleEntity.objects.create(
repository_example=example, start=0, end=7, entity="name"
)
+
self.assertFalse(self.repository.current_version().ready_for_train)
def test_intent_dont_have_min_examples(self):
From c0dad94231497bdc11d1ea002d6f4540a50f3cd0 Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Wed, 19 Jan 2022 10:46:56 -0300
Subject: [PATCH 4/6] update readme
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index 7bc12e889..9261eb07a 100644
--- a/README.md
+++ b/README.md
@@ -166,6 +166,7 @@ You can set environment variables in your OS, write on ```.env``` file or pass v
| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document.
| ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data.
| GUNICORN_WORKERS | ``` int ``` | ``` multiprocessing.cpu_count() * 2 + 1 ``` | Gunicorn number of workers
+| USE_ELASTICSEARCH | ```boolean``` | ```true``` | Change the logic in requirements_to_train to use either elasticsearch or postgres.
## Roadmap
From 69d21b1949bdadd62fab6c656d6a574767e516c2 Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Wed, 19 Jan 2022 10:53:07 -0300
Subject: [PATCH 5/6] update readme
---
README.md | 4 +++-
docker-compose.yml | 2 +-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 9261eb07a..8b9405f35 100644
--- a/README.md
+++ b/README.md
@@ -163,7 +163,9 @@ You can set environment variables in your OS, write on ```.env``` file or pass v
| ELASTICSEARCH_DSL | ```string``` | ```es:9200``` | URL Elasticsearch.
| ELASTICSEARCH_NUMBER_OF_SHARDS | ```int``` | ```1``` | Specify the number of shards for the indexes.
| ELASTICSEARCH_NUMBER_OF_REPLICAS | ```int``` | ```1``` | Specify the number of replicas for the indexes.
-| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```repositorynlplog``` | Specify the index title for the RepositoryNlpLog document.
+| ELASTICSEARCH_REPOSITORYNLPLOG_INDEX | ```string``` | ```ai_repositorynlplog``` | Specify the index title for the RepositoryNLPLog document.
+| ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX | ```string``` | ```ai_repositoryqanlplog``` | Specify the index title for the RepositoryQANLPLog document.
+| ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX | ```string``` | ```ai_repositorybasicexample``` | Specify the index title for the RepositoryBasicExample document.
| ELASTICSEARCH_SIGNAL_PROCESSOR | ```string``` | ```celery``` | Specify the signal processor responsible for updating the Elasticsearch data.
| GUNICORN_WORKERS | ``` int ``` | ``` multiprocessing.cpu_count() * 2 + 1 ``` | Gunicorn number of workers
| USE_ELASTICSEARCH | ```boolean``` | ```true``` | Change the logic in requirements_to_train to use either elasticsearch or postgres.
diff --git a/docker-compose.yml b/docker-compose.yml
index ab7cc1564..4b1f1dc05 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -106,7 +106,7 @@ services:
- ELASTICSEARCH_NUMBER_OF_SHARDS=${ELASTICSEARCH_NUMBER_OF_SHARDS:-1}
- ELASTICSEARCH_NUMBER_OF_REPLICAS=${ELASTICSEARCH_NUMBER_OF_REPLICAS:-0}
- ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYNLPLOG_INDEX:-ai_repositorynlplog}
- - ELASTICSEARCH_REPOSITORYNLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
+ - ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX=${ELASTICSEARCH_REPOSITORYQANLPLOG_INDEX:-ai_repositoryqanlplog}
- ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX=${ELASTICSEARCH_REPOSITORYBASICEXAMPLE_INDEX:-ai_repositorybasicexample}
- ELASTICSEARCH_SIGNAL_PROCESSOR=${ELASTICSEARCH_SIGNAL_PROCESSOR:-celery}
- USE_ELASTICSEARCH=${USE_ELASTICSEARCH:-true}
From 9c8d094e92857e725c8a097f59914684508604cf Mon Sep 17 00:00:00 2001
From: helllllllder
Date: Wed, 19 Jan 2022 17:45:35 -0300
Subject: [PATCH 6/6] update readme and create quick_start guide
---
README.md | 7 ++++
docs/quick-start.md | 82 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 89 insertions(+)
create mode 100644 docs/quick-start.md
diff --git a/README.md b/README.md
index 8b9405f35..9f39156b0 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,13 @@
[![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/)
[![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE)
+# Requirements
+
+* Python (3.6)
+* Pipenv
+* Docker
+* Docker-compose
+
## Development
Use ```make``` commands to ```check_environment```, ```install_requirements```, ```lint```, ```test```, ```migrate```, ```start```, ```migrations``` and ```collectstatic```.
diff --git a/docs/quick-start.md b/docs/quick-start.md
new file mode 100644
index 000000000..822cf2a1e
--- /dev/null
+++ b/docs/quick-start.md
@@ -0,0 +1,82 @@
+
+
+
+
BLACK LIVES MATTER
+
+
+ Black Lives Matter
+ ·
+ Supporting the cause
+ ·
+
+
+
+
+
+
+
+
+
+# Bothub
+
+[![Build Status](https://travis-ci.com/Ilhasoft/bothub-engine.svg?branch=master)](https://travis-ci.com/Ilhasoft/bothub-engine)
+[![Coverage Status](https://coveralls.io/repos/github/bothub-it/bothub-engine/badge.svg?branch=master)](https://coveralls.io/github/bothub-it/bothub-engine?branch=master)
+[![Code Climate](https://codeclimate.com/github/bothub-it/bothub-engine/badges/gpa.svg)](https://codeclimate.com/github/bothub-it/bothub-engine)
+[![Python Version](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/)
+[![License GPL-3.0](https://img.shields.io/badge/license-%20GPL--3.0-yellow.svg)](https://github.com/bothub-it/bothub-engine/blob/master/LICENSE)
+
+
+
+# Setting up for development
+
+* Download the project code
+
+ $ git clone https://github.com/Ilhasoft/bothub-engine.git
+
+* Prepare the virtual environment and install dependencies:
+
+ $ make install_requirements
+ $ make check_environment
+
+* Start the containers for the other dependencies (PostgreSQL, Elasticsearch and Redis):
+
+ $ docker-compose up -d database es bothub-engine-celery-redis
+
+* Create the .env file following the [Environment Variables section](https://github.com/Ilhasoft/bothub-engine#environment-variables)
+
+* Run the migrations, then create and populate the Elasticsearch indices
+
+ $ make migrate
+ $ make search_index
+
+* Collect the static files into ```STATIC_ROOT```
+
+ $ make collectstatic
+
+* Run tests
+
+ $ make test
+
+* Run Django in a separate terminal
+
+ $ make start
+
+* Run Celery in a separate terminal
+
+ $ make start_celery
+
+* Run lint after making updates to the code
+
+ $ make lint
+
+* (optional) Add initial data into the database
+
+ $ pipenv run python ./manage.py fill_db_using_fake_data
+
+* If you did not follow the previous step, create a superuser to access the django admin interface
+
+ $ pipenv run python ./manage.py createsuperuser
+
+The API will be running at ```http://localhost:8000``` and the admin interface can be accessed at ```http://localhost:8000/admin```
+
+