From 9b13a4ff351a038cb99b6ae0e58adbf67ff29489 Mon Sep 17 00:00:00 2001 From: Georgios Lignos Date: Fri, 2 Oct 2020 17:18:55 +0200 Subject: [PATCH] spam: adds spam prediction when a record is published --- requirements.txt | 3 +- setup.py | 3 + zenodo/modules/deposit/api.py | 34 +++++++- zenodo/modules/spam/__init__.py | 9 +- zenodo/modules/spam/config.py | 76 +++++++++++++++++ zenodo/modules/spam/ext.py | 82 +++++++++++++++++++ zenodo/modules/spam/tasks.py | 44 ++++++++++ .../zenodo_spam/email/spam_admin_email.tpl | 25 ++++++ .../zenodo_spam/email/spam_user_email.tpl | 27 ++++++ zenodo/modules/spam/utils.py | 57 +++++++++++++ 10 files changed, 357 insertions(+), 3 deletions(-) create mode 100644 zenodo/modules/spam/config.py create mode 100644 zenodo/modules/spam/ext.py create mode 100644 zenodo/modules/spam/tasks.py create mode 100644 zenodo/modules/spam/templates/zenodo_spam/email/spam_admin_email.tpl create mode 100644 zenodo/modules/spam/templates/zenodo_spam/email/spam_user_email.tpl create mode 100644 zenodo/modules/spam/utils.py diff --git a/requirements.txt b/requirements.txt index dec5c200f..5a48909ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -143,7 +143,7 @@ invenio-previewer==1.0.0a11 invenio-queues==1.0.0a2 invenio-records==1.3.2 invenio-records-files==1.0.0a11 -invenio-records-rest==1.6.2 +invenio-records-rest==1.6.3 invenio-records-ui==1.0.1 invenio-rest==1.1.2 invenio-search==1.2.3 @@ -250,3 +250,4 @@ WTForms==2.2.1 WTForms-Alchemy==0.16.9 WTForms-Components==0.10.3 zenodo-accessrequests==1.0.0a5 +joblib==0.14.1 diff --git a/setup.py b/setup.py index 4b0120c5e..dcf224a66 100644 --- a/setup.py +++ b/setup.py @@ -197,6 +197,7 @@ 'zenodo_stats = zenodo.modules.stats.ext:ZenodoStats', 'zenodo_theme = zenodo.modules.theme.ext:ZenodoTheme', 'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens', + 'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam', ], 'invenio_base.api_apps': [ 'zenodo_communities = ' @@ -206,6 +207,7 @@ 
'zenodo_records = zenodo.modules.records.ext:ZenodoRecords', 'zenodo_exporter = zenodo.modules.exporter.ext:InvenioExporter', 'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens', + 'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam', ], 'invenio_base.blueprints': [ 'zenodo_communities = zenodo.modules.communities.views:blueprint', @@ -222,6 +224,7 @@ 'invenio_base.api_blueprints': [ 'zenodo_rest = zenodo.modules.rest.views:blueprint', 'zenodo_deposit = zenodo.modules.deposit.views_rest:blueprint', + 'zenodo_spam = zenodo.modules.spam.views:blueprint', ], 'invenio_base.api_converters': [ 'file_key = zenodo.modules.deposit.utils:FileKeyConverter', diff --git a/zenodo/modules/deposit/api.py b/zenodo/modules/deposit/api.py index 7195f9ebb..33d2de871 100644 --- a/zenodo/modules/deposit/api.py +++ b/zenodo/modules/deposit/api.py @@ -29,8 +29,11 @@ from contextlib import contextmanager from copy import copy +from elasticsearch_dsl import Q from flask import current_app +from flask_principal import ActionNeed from flask_security import current_user +from invenio_access import Permission from invenio_communities.models import Community, InclusionRequest from invenio_db import db from invenio_deposit.api import Deposit, index, preserve @@ -42,10 +45,12 @@ from invenio_pidstore.errors import PIDInvalidAction from invenio_pidstore.models import PersistentIdentifier, PIDStatus from invenio_records_files.models import RecordsBuckets +from invenio_search.api import RecordsSearch from invenio_sipstore.api import RecordSIP from invenio_sipstore.archivers import BagItArchiver from invenio_sipstore.models import SIP as SIPModel from invenio_sipstore.models import RecordSIP as RecordSIPModel +from werkzeug.exceptions import HTTPException from zenodo.modules.communities.api import ZenodoCommunity from zenodo.modules.records.api import ZenodoFileObject, ZenodoFilesIterator, \ @@ -54,6 +59,7 @@ zenodo_concept_doi_minter, zenodo_doi_updater from zenodo.modules.records.utils 
import is_doi_locally_managed, \ is_valid_openaire_type +from zenodo.modules.spam.tasks import check_metadata_for_spam from .errors import MissingCommunityError, MissingFilesError, \ OngoingMultipartUploadError, VersioningFilesError @@ -484,10 +490,36 @@ def validate_publish(self): raise MissingCommunityError(missing) @mark_as_action - def publish(self, pid=None, id_=None, user_id=None, sip_agent=None): + def publish(self, pid=None, id_=None, user_id=None, sip_agent=None, + check_for_spam=True): """Publish the Zenodo deposit.""" self['owners'] = self['_deposit']['owners'] self.validate_publish() + try: + if check_for_spam and current_app.config.get( + 'ZENODO_SPAM_MODEL_LOCATION'): + task = check_metadata_for_spam.delay( + self['recid'], str(self.id)) + spam_proba = task.get(timeout=8) + else: + spam_proba = 0 + if spam_proba > current_app.config['ZENODO_SPAM_THRESHOLD']: + if not Permission(ActionNeed('admin-access')).can(): + rs = RecordsSearch(index='records').query( + Q('query_string', query="owners:{}".format( + self['owners'][0]))) + has_other_records = rs.execute().hits.total['value'] + if not has_other_records: + current_app.config['ZENODO_SPAM_HANDLING_ACTIONS']( + self) + except HTTPException as e: + raise(e) + except Exception: + current_app.logger.error( + u'Could not check deposit for spam', + extra={'depid': self['recid']} + ) + is_first_publishing = not self.is_published() deposit = super(ZenodoDeposit, self).publish(pid, id_) diff --git a/zenodo/modules/spam/__init__.py b/zenodo/modules/spam/__init__.py index 4265ca276..8ae6d0071 100644 --- a/zenodo/modules/spam/__init__.py +++ b/zenodo/modules/spam/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # # This file is part of Zenodo. -# Copyright (C) 2017 CERN. +# Copyright (C) 2017-2020 CERN. 
# # Zenodo is free software; you can redistribute it # and/or modify it under the terms of the GNU General Public License as @@ -23,3 +23,10 @@ # as an Intergovernmental Organization or submit itself to any jurisdiction. """Spam module.""" + +from flask import current_app +from werkzeug.local import LocalProxy + +current_spam = LocalProxy( + lambda: current_app.extensions['zenodo-spam'] +) diff --git a/zenodo/modules/spam/config.py b/zenodo/modules/spam/config.py new file mode 100644 index 000000000..6dec9fd7d --- /dev/null +++ b/zenodo/modules/spam/config.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Zenodo. +# Copyright (C) 2020 CERN. +# +# Zenodo is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Zenodo is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Zenodo; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
+
+from flask import abort, current_app, flash
+from flask_login import logout_user
+from invenio_accounts.models import User
+from invenio_accounts.sessions import delete_user_sessions
+from invenio_db import db
+
+from zenodo.modules.spam.utils import send_spam_admin_email, \
+    send_spam_user_email
+
+
+def default_spam_handling(deposit):
+    """Deactivate the deposit owner, notify by email and abort the request.
+
+    The account change is committed before any notification is sent.
+
+    :param deposit: Deposit flagged as spam; its first owner is deactivated.
+    :raises werkzeug.exceptions.HTTPException: By the final ``abort(400)``.
+    """
+    notice = (
+        'Our spam protection system has classified your upload as a '
+        'potential spam attempt. As a preventive measure and due to '
+        'significant increase in spam, we have therefore deactivated your '
+        'user account and logged you out of Zenodo. Your upload has not been '
+        'published. If you think this is a mistake, please contact our '
+        'support.')
+    owner = User.query.get(deposit['_deposit']['owners'][0])
+    owner.active = False
+    delete_user_sessions(owner)
+    # NOTE(review): logs out the *current* session user; presumably that is
+    # the deposit owner -- confirm against callers.
+    logout_user()
+    db.session.add(owner)
+    db.session.commit()
+    send_spam_user_email(owner.email)
+    if current_app.config['ZENODO_SPAM_EMAIL_ADMINS']:
+        send_spam_admin_email(deposit, owner)
+    # The same notice is flashed and used as the 400 error description.
+    flash(notice, category='warning')
+    abort(400, notice)
+
+
+# Function handling metadata detected as spam when publishing
+ZENODO_SPAM_HANDLING_ACTIONS = default_spam_handling
+
+# Spam model for record predictions
+ZENODO_SPAM_MODEL_LOCATION = None
+
+# Float number defining the probability over which a record is considered spam
+ZENODO_SPAM_THRESHOLD = 0.5
+
+# Should send email to Admins for automatically blocked users
+ZENODO_SPAM_EMAIL_ADMINS = True
diff --git a/zenodo/modules/spam/ext.py b/zenodo/modules/spam/ext.py
new file mode 100644
index 000000000..e686db67f
--- /dev/null
+++ b/zenodo/modules/spam/ext.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of Zenodo.
+# Copyright (C) 2020 CERN.
+#
+# Zenodo is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Zenodo is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Zenodo; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307, USA.
+#
+# In applying this license, CERN does not
+# waive the privileges and immunities granted to it by virtue of its status
+# as an Intergovernmental Organization or submit itself to any jurisdiction.
+
+"""Spam detection extension for Zenodo."""
+
+from __future__ import absolute_import, print_function
+
+from threading import Lock
+
+import joblib
+from celery.signals import celeryd_init
+from flask import current_app
+from werkzeug.utils import cached_property
+
+from .
import config, current_spam
+
+lock = Lock()  # acquired by ``ZenodoSpam.model``
+
+
+class ZenodoSpam(object):
+    """Flask extension that exposes the spam detection model."""
+
+    @cached_property
+    def model(self):
+        """Spam detection model, or ``None`` when no location is configured."""
+        with lock:
+            self._is_cache_loading = True  # NOTE(review): never reset here
+            if not current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
+                return None
+            return joblib.load(
+                current_app.config['ZENODO_SPAM_MODEL_LOCATION'])
+
+    @property
+    def is_cache_loading(self):
+        """Whether loading of ``model`` has been started by any thread."""
+        return getattr(self, '_is_cache_loading', False)
+
+    def __init__(self, app=None):
+        """Extension initialization."""
+        if app:
+            self.init_app(app)
+
+    def init_app(self, app):
+        """Flask application initialization."""
+        self.app = app
+        self.init_config(app)
+        app.extensions['zenodo-spam'] = self  # read by ``current_spam``
+
+    @staticmethod
+    def init_config(app):
+        """Copy unset ``ZENODO_SPAM_*`` defaults from ``config`` to the app."""
+        for k in dir(config):
+            if k.startswith('ZENODO_SPAM_'):
+                app.config.setdefault(k, getattr(config, k))
+
+
+@celeryd_init.connect
+def warm_up_cache(instance, **kwargs):
+    """Preload the spam model in the celery application."""
+    with instance.app.flask_app.app_context():
+        current_spam.model  # attribute access alone triggers the load
diff --git a/zenodo/modules/spam/tasks.py b/zenodo/modules/spam/tasks.py
new file mode 100644
index 000000000..0d9df8d9d
--- /dev/null
+++ b/zenodo/modules/spam/tasks.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+#
+# This file is part of Zenodo.
+# Copyright (C) 2020 CERN.
+#
+# Zenodo is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Zenodo is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Zenodo; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307, USA.
+#
+# In applying this license, CERN does not
+# waive the privileges and immunities granted to it by virtue of its status
+# as an Intergovernmental Organization or submit itself to any jurisdiction.
+
+"""Celery tasks for spam detection."""
+
+from __future__ import absolute_import, print_function
+
+from celery import shared_task
+from flask import current_app
+from invenio_records.models import RecordMetadata
+
+from zenodo.modules.spam import current_spam
+
+
+@shared_task(ignore_result=False)
+def check_metadata_for_spam(depid_value, dep_id):
+    """Return the spam probability predicted for a deposit's title/description."""
+    if not current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
+        return 0  # no model configured, nothing to score
+    metadata = RecordMetadata.query.get(dep_id).json  # depid_value is unused
+    # A missing title/description must not fail the whole check (KeyError).
+    text = metadata.get('title', '') + ' ' + metadata.get('description', '')
+    return float(current_spam.model.predict_proba([text])[0][1])  # plain float
diff --git a/zenodo/modules/spam/templates/zenodo_spam/email/spam_admin_email.tpl b/zenodo/modules/spam/templates/zenodo_spam/email/spam_admin_email.tpl
new file mode 100644
index 000000000..7bd0f2b7d
--- /dev/null
+++ b/zenodo/modules/spam/templates/zenodo_spam/email/spam_admin_email.tpl
@@ -0,0 +1,25 @@
+{#
+# This file is part of Zenodo.
+# Copyright (C) 2020 CERN.
+#
+# Zenodo is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Zenodo is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Zenodo; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307, USA.
+#
+# In applying this license, CERN does not
+# waive the privileges and immunities granted to it by virtue of its status
+# as an Intergovernmental Organization or submit itself to any jurisdiction.
+-#}
+
+The deposit https://zenodo.org/deposit/{{ deposit['recid'] }} from the User https://zenodo.org/spam/{{ user.id }}/delete/ has been marked as spam.
diff --git a/zenodo/modules/spam/templates/zenodo_spam/email/spam_user_email.tpl b/zenodo/modules/spam/templates/zenodo_spam/email/spam_user_email.tpl
new file mode 100644
index 000000000..70a642c16
--- /dev/null
+++ b/zenodo/modules/spam/templates/zenodo_spam/email/spam_user_email.tpl
@@ -0,0 +1,27 @@
+{#
+# This file is part of Zenodo.
+# Copyright (C) 2020 CERN.
+#
+# Zenodo is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# Zenodo is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Zenodo; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+# MA 02111-1307, USA.
+#
+# In applying this license, CERN does not
+# waive the privileges and immunities granted to it by virtue of its status
+# as an Intergovernmental Organization or submit itself to any jurisdiction.
+-#}
+
+Our spam protection system has classified your upload as a potential spam attempt.
+As a preventive measure, we have therefore deactivated your user account.
+If you think this is wrong, please contact us on our support line. diff --git a/zenodo/modules/spam/utils.py b/zenodo/modules/spam/utils.py new file mode 100644 index 000000000..7bc1c0a7a --- /dev/null +++ b/zenodo/modules/spam/utils.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Zenodo. +# Copyright (C) 2020 CERN. +# +# Zenodo is free software; you can redistribute it +# and/or modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Zenodo is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Zenodo; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, +# MA 02111-1307, USA. +# +# In applying this license, CERN does not +# waive the privileges and immunities granted to it by virtue of its status +# as an Intergovernmental Organization or submit itself to any jurisdiction. 
+
+"""Email notifications for spam detection."""
+
+from __future__ import absolute_import, print_function
+
+from flask import current_app, render_template
+from flask_babelex import gettext as _
+from flask_mail import Message
+from invenio_mail.tasks import send_email
+
+
+def send_spam_user_email(recipient):
+    """Email ``recipient`` that their upload was classified as spam."""
+    subject = _("Your Zenodo Upload has been automatically marked as Spam.")
+    sender = current_app.config.get('SUPPORT_EMAIL')
+    mail = Message(subject, sender=sender, recipients=[recipient])
+    mail.body = render_template("zenodo_spam/email/spam_user_email.tpl")
+    # The message is handed over as a plain dict to the ``send_email`` task
+    # rather than being sent inline.
+    send_email.delay(mail.__dict__)
+
+
+def send_spam_admin_email(deposit, user):
+    """Email the configured admin address about a deposit marked as spam.
+
+    ``deposit`` and ``user`` are passed to the email template as context.
+    """
+    subject = _("Zenodo Deposit Marked as Spam.")
+    sender = current_app.config.get('SUPPORT_EMAIL')
+    admins = [current_app.config.get('ZENODO_ADMIN_EMAIL')]
+    mail = Message(subject, sender=sender, recipients=admins)
+    mail.body = render_template(
+        "zenodo_spam/email/spam_admin_email.tpl", user=user, deposit=deposit)
+    send_email.delay(mail.__dict__)