diff --git a/Pipfile b/Pipfile
index 3c0f9de88..efb4aaf69 100644
--- a/Pipfile
+++ b/Pipfile
@@ -18,6 +18,7 @@ drf-yasg = "*"
 gunicorn = "*"
 gevent = "*"
 packaging = "*"
+boto3 = "*"
 
 [dev-packages]
 "flake8" = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index c4ab89ec1..612f26a17 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "5f230c8e911d5f0f91321351555b3277e4a6ce71b6fe1843271cefaf6f11559e"
+            "sha256": "ff59f9ce5bebac8c74b1789aacdad90b1a6e893773c29bf4e3cb840e29590cd5"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -23,6 +23,21 @@
             ],
             "version": "==19.1.0"
         },
+        "boto3": {
+            "hashes": [
+                "sha256:654c7ebd6d089d5af634a8121f3960e50e283643660abcba07e602ac237f4839",
+                "sha256:f114b586c307f73a46d6dfe9dfb1c37865354f48fc749794d96517527424d1f5"
+            ],
+            "index": "pypi",
+            "version": "==1.9.220"
+        },
+        "botocore": {
+            "hashes": [
+                "sha256:748fe4ee5cc8b10ef09e52c740b488402d6f6d4d1f0dde0c936da232b42b1bdd",
+                "sha256:9ffd9264e4ad999d2929cfe1c7e413d4cdf76a8bd92f011dce31874f056d2e18"
+            ],
+            "version": "==1.12.220"
+        },
         "certifi": {
             "hashes": [
                 "sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939",
@@ -92,6 +107,14 @@
             "index": "pypi",
             "version": "==3.9.0"
         },
+        "docutils": {
+            "hashes": [
+                "sha256:6c4f696463b79f1fb8ba0c594b63840ebd41f059e92b31957c46b74a4599b6d0",
+                "sha256:9e4d7ecfc600058e07ba661411a2b7de2fd0fafa17d1a7f7361cd47b1175c827",
+                "sha256:a2aeea129088da402665e92e0b25b04b073c04b2dce4ab65caaa38b7ce2e1a99"
+            ],
+            "version": "==0.15.2"
+        },
         "drf-yasg": {
             "hashes": [
                 "sha256:68fded2ffdf46e03f33e766184b7d8f1e1a5236f94acfd0c4ba932a57b812566",
@@ -188,6 +211,13 @@
             ],
             "version": "==2.10.1"
         },
+        "jmespath": {
+            "hashes": [
+                "sha256:3720a4b1bd659dd2eecad0666459b9788813e032b83e7ba58578e48254e0a0e6",
+                "sha256:bde2aef6f44302dfb30320115b17d030798de8c4110e28d5cf6cf91a7a31074c"
+            ],
+            "version": "==0.9.4"
+        },
         "markupsafe": {
             "hashes": [
                 "sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473",
@@ -236,6 +266,14 @@
             ],
             "version": "==2.4.2"
         },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+                "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+            ],
+            "markers": "python_version >= '2.7'",
+            "version": "==2.8.0"
+        },
         "python-decouple": {
             "hashes": [
                 "sha256:1317df14b43efee4337a4aa02914bf004f010cd56d6c4bd894e6474ec8c4fe2d"
@@ -261,10 +299,10 @@
         },
         "ruamel.yaml": {
             "hashes": [
-                "sha256:547aeab5c51c93bc750ed2a320c1559b605bde3aa569216aa75fd91d8a1c4623",
-                "sha256:c5e239b6a4f26baabb2e22b145582a7d99ae9d4ebb8902291365a61ed38faa7f"
+                "sha256:0db639b1b2742dae666c6fc009b8d1931ef15c9276ef31c0673cc6dcf766cf40",
+                "sha256:412a6f5cfdc0525dee6a27c08f5415c7fd832a7afcb7a0ed7319628aed23d408"
             ],
-            "version": "==0.16.1"
+            "version": "==0.16.5"
         },
         "ruamel.yaml.clib": {
             "hashes": [
@@ -290,6 +328,13 @@
             "markers": "platform_python_implementation == 'CPython' and python_version < '3.8'",
             "version": "==0.1.2"
         },
+        "s3transfer": {
+            "hashes": [
+                "sha256:6efc926738a3cd576c2a79725fed9afde92378aa5c6a957e3af010cb019fac9d",
+                "sha256:b780f2411b824cb541dbcd2c713d0cb61c7d1bcadae204cdddda2b35cef493ba"
+            ],
+            "version": "==0.2.1"
+        },
         "six": {
             "hashes": [
                 "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
@@ -310,6 +355,7 @@
                 "sha256:2393a695cd12afedd0dcb26fe5d50d0cf248e5a66f75dbd89a3d4eb333a61af4",
                 "sha256:a637e5fae88995b256e3409dc4d52c2e2e0ba32c42a6365fee8bbd2238de3cfb"
             ],
+            "markers": "python_version >= '3.4'",
             "version": "==1.24.3"
         },
         "whitenoise": {
@@ -406,11 +452,11 @@
         },
         "ipython": {
             "hashes": [
-                "sha256:1d3a1692921e932751bc1a1f7bb96dc38671eeefdc66ed33ee4cbc57e92a410e",
-                "sha256:537cd0176ff6abd06ef3e23f2d0c4c2c8a4d9277b7451544c6cbf56d1c79a83d"
+                "sha256:c4ab005921641e40a68e405e286e7a1fcc464497e14d81b6914b4fd95e5dee9b",
+                "sha256:dd76831f065f17bddd7eaa5c781f5ea32de5ef217592cf019e34043b56895aa1"
             ],
             "index": "pypi",
-            "version": "==7.7.0"
+            "version": "==7.8.0"
         },
         "ipython-genutils": {
             "hashes": [
diff --git a/README.md b/README.md
index f79eb9d50..007728c10 100644
--- a/README.md
+++ b/README.md
@@ -76,3 +76,7 @@ You can set environment variables in your OS, write on ```.env``` file or pass v
 | BOTHUB_NLP_BASE_URL | ```string``` | ```http://localhost:2657/``` | The bothub-blp production application URL. Used to proxy requests.
 | CHECK_ACCESSIBLE_API_URL | ```string``` | ```http://localhost/api/repositories/``` | URL used by ```bothub.health.check.check_accessible_api``` to make a HTTP request. The response status code must be 200.
 | SEND_EMAILS | ```boolean``` | ```True``` | Send emails flag.
+| BOTHUB_ENGINE_AWS_S3_BUCKET_NAME | ```string``` | ```None``` | Name of the AWS S3 bucket where trained bot data is uploaded.
+| BOTHUB_ENGINE_AWS_ACCESS_KEY_ID | ```string``` | ```None``` | AWS access key ID used to upload bot data to S3.
+| BOTHUB_ENGINE_AWS_SECRET_ACCESS_KEY | ```string``` | ```None``` | AWS secret access key used to upload bot data to S3.
+| BOTHUB_ENGINE_AWS_REGION_NAME | ```string``` | ```us-east-1``` | AWS region name used by the S3 client.
diff --git a/bothub/api/v2/nlp/views.py b/bothub/api/v2/nlp/views.py
index a3bd5f344..b72a73448 100644
--- a/bothub/api/v2/nlp/views.py
+++ b/bothub/api/v2/nlp/views.py
@@ -1,5 +1,6 @@
 import base64
 import json
+import requests
 
 from django.db import models
 from django.utils.translation import gettext_lazy as _
@@ -24,6 +25,7 @@ from bothub.common.models import RepositoryUpdate
 from bothub.common.models import Repository
 from bothub.common import languages
+from bothub.utils import send_bot_data_file_aws
 
 
 def check_auth(request):
@@ -499,18 +501,28 @@ class RepositoryUpdateInterpretersViewSet(
     def retrieve(self, request, *args, **kwargs):
         check_auth(request)
         update = self.get_object()
+        try:
+            download = requests.get(update.bot_data)
+            # Do not serve an HTTP error page as if it were bot data.
+            download.raise_for_status()
+            bot_data = base64.b64encode(download.content)
+        except requests.RequestException:
+            bot_data = b''
         return Response({
             'update_id': update.id,
             'repository_uuid': update.repository.uuid,
-            'bot_data': str(update.bot_data)
+            # Decode instead of str(): str() on bytes yields the "b'...'"
+            # repr, while this field previously carried plain base64 text.
+            'bot_data': bot_data.decode('utf8')
         })
 
     def create(self, request, *args, **kwargs):
         check_auth(request)
+        id = request.data.get('id')
         repository = get_object_or_404(
RepositoryUpdate, - pk=request.data.get('id') + pk=id ) bot_data = base64.b64decode(request.data.get('bot_data')) - repository.save_training(bot_data) + repository.save_training(send_bot_data_file_aws(id, bot_data)) return Response({}) diff --git a/bothub/common/migrations/0035_auto_20190902_1455.py b/bothub/common/migrations/0035_auto_20190902_1455.py new file mode 100644 index 000000000..f1cef3592 --- /dev/null +++ b/bothub/common/migrations/0035_auto_20190902_1455.py @@ -0,0 +1,33 @@ +# Generated by Django 2.1.5 on 2019-09-02 14:55 + +from django.db import migrations, models +from bothub.utils import send_bot_data_file_aws +from bothub.common.models import RepositoryUpdate + + +def update_repository(apps, schema_editor): + for update in RepositoryUpdate.objects.all().exclude(bot_data__exact=''): + url = send_bot_data_file_aws(update.pk, update.bot_data) + repository_update = RepositoryUpdate.objects.get(pk=update.pk) + repository_update.bot_data = url + repository_update.save( + update_fields=[ + 'bot_data', + ]) + print('Updating bot_data repository_update {}'.format(str(update.pk))) + + +class Migration(migrations.Migration): + + dependencies = [ + ('common', '0034_repository_nlp_server'), + ] + + operations = [ + migrations.RunPython(update_repository), + migrations.AlterField( + model_name='repositoryupdate', + name='bot_data', + field=models.URLField(blank=True, verbose_name='bot data'), + ), + ] diff --git a/bothub/common/models.py b/bothub/common/models.py index bb15aee90..5335ce737 100644 --- a/bothub/common/models.py +++ b/bothub/common/models.py @@ -1,5 +1,4 @@ import uuid -import base64 import requests from functools import reduce @@ -489,10 +488,9 @@ class Meta: created_at = models.DateTimeField( _('created at'), auto_now_add=True) - bot_data = models.TextField( + bot_data = models.URLField( _('bot data'), - blank=True, - editable=False) + blank=True) by = models.ForeignKey( User, models.CASCADE, @@ -660,7 +658,7 @@ def save_training(self, bot_data): 
raise RepositoryUpdateAlreadyTrained() self.trained_at = timezone.now() - self.bot_data = base64.b64encode(bot_data).decode('utf8') + self.bot_data = bot_data self.repository.total_updates += 1 self.repository.save() self.save( @@ -670,7 +668,7 @@ def save_training(self, bot_data): ]) def get_bot_data(self): - return base64.b64decode(self.bot_data) + return self.bot_data def train_fail(self): self.failed_at = timezone.now() diff --git a/bothub/common/tests.py b/bothub/common/tests.py index 02d9e2a48..aebcb6b62 100644 --- a/bothub/common/tests.py +++ b/bothub/common/tests.py @@ -552,7 +552,7 @@ def test_train(self): update = self.repository.current_update() update.start_training(self.owner) - bot_data = b'bot_data__()\\\\//?(*)' + bot_data = 'https://s3.amazonaws.com' update.save_training(bot_data) self.assertEqual( diff --git a/bothub/common/views.py b/bothub/common/views.py index 7a9d7cf2e..14b051efe 100644 --- a/bothub/common/views.py +++ b/bothub/common/views.py @@ -1,5 +1,5 @@ from django.shortcuts import get_object_or_404 -from django.http import HttpResponse +from django.http import HttpResponseRedirect from django.core.exceptions import ValidationError from django.contrib.admin.views.decorators import staff_member_required from .models import RepositoryUpdate @@ -10,9 +10,5 @@ def download_bot_data(self, update_id): # pragma: no cover update = get_object_or_404(RepositoryUpdate, id=update_id) if not update.trained_at: raise ValidationError('Update #{} not trained at.'.format(update.id)) - response = HttpResponse( - update.get_bot_data(), - content_type='application/gzip') - response['Content-Disposition'] = 'inline; filename={}.tar.gz'.format( - update.id) + response = HttpResponseRedirect(update.get_bot_data()) return response diff --git a/bothub/utils.py b/bothub/utils.py index 4c52b9500..c4ecf2c44 100644 --- a/bothub/utils.py +++ b/bothub/utils.py @@ -1,3 +1,8 @@ +import io +import uuid +import boto3 +from decouple import config +from botocore.exceptions 
import ClientError from collections import OrderedDict @@ -10,3 +15,42 @@ def cast_supported_languages(i): def cast_empty_str_to_none(value): return value or None + + +def send_bot_data_file_aws(id, bot_data): + aws_access_key_id = config('BOTHUB_ENGINE_AWS_ACCESS_KEY_ID', default='') + aws_secret_access_key = config( + 'BOTHUB_ENGINE_AWS_SECRET_ACCESS_KEY', default='') + aws_bucket_name = config('BOTHUB_ENGINE_AWS_S3_BUCKET_NAME', default='') + aws_region_name = config('BOTHUB_ENGINE_AWS_REGION_NAME', 'us-east-1') + + confmat_url = '' + + if all([aws_access_key_id, aws_secret_access_key, aws_bucket_name]): + confmat_filename = 'repository_{}/bot_data_{}.tar.gz'.format( + str(id), uuid.uuid4()) + + botdata = io.BytesIO(bot_data) + + s3_client = boto3.client( + 's3', + aws_access_key_id=aws_access_key_id, + aws_secret_access_key=aws_secret_access_key, + region_name=aws_region_name, + ) + try: + s3_client.upload_fileobj( + botdata, + aws_bucket_name, + confmat_filename, + ExtraArgs={'ContentType': 'application/gzip'} + ) + confmat_url = '{}/{}/{}'.format( + s3_client.meta.endpoint_url, + aws_bucket_name, + confmat_filename + ) + except ClientError as e: + print(e) + + return confmat_url diff --git a/docker-compose.yml b/docker-compose.yml index 18561728d..f67a3a0b5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,6 +50,10 @@ services: - CHECK_ACCESSIBLE_API_URL=${CHECK_ACCESSIBLE_API_URL} - SEND_EMAILS=${SEND_EMAILS:-true} - SUPPORTED_LANGUAGES=${SUPPORTED_LANGUAGES:-en|pt} + - BOTHUB_ENGINE_AWS_ACCESS_KEY_ID=${BOTHUB_ENGINE_AWS_ACCESS_KEY_ID} + - BOTHUB_ENGINE_AWS_SECRET_ACCESS_KEY=${BOTHUB_ENGINE_AWS_SECRET_ACCESS_KEY} + - BOTHUB_ENGINE_AWS_S3_BUCKET_NAME=${BOTHUB_ENGINE_AWS_S3_BUCKET_NAME} + - BOTHUB_ENGINE_AWS_REGION_NAME=${BOTHUB_ENGINE_AWS_REGION_NAME} networks: