Skip to content

Commit

Permalink
spam: adds spam prediction when a record is published
Browse files Browse the repository at this point in the history
  • Loading branch information
Glignos committed Oct 7, 2020
1 parent 392a140 commit 9b13a4f
Show file tree
Hide file tree
Showing 10 changed files with 357 additions and 3 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Expand Up @@ -143,7 +143,7 @@ invenio-previewer==1.0.0a11
invenio-queues==1.0.0a2
invenio-records==1.3.2
invenio-records-files==1.0.0a11
invenio-records-rest==1.6.2
invenio-records-rest==1.6.3
invenio-records-ui==1.0.1
invenio-rest==1.1.2
invenio-search==1.2.3
Expand Down Expand Up @@ -250,3 +250,4 @@ WTForms==2.2.1
WTForms-Alchemy==0.16.9
WTForms-Components==0.10.3
zenodo-accessrequests==1.0.0a5
joblib==0.14.1
3 changes: 3 additions & 0 deletions setup.py
Expand Up @@ -197,6 +197,7 @@
'zenodo_stats = zenodo.modules.stats.ext:ZenodoStats',
'zenodo_theme = zenodo.modules.theme.ext:ZenodoTheme',
'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens',
'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam',
],
'invenio_base.api_apps': [
'zenodo_communities = '
Expand All @@ -206,6 +207,7 @@
'zenodo_records = zenodo.modules.records.ext:ZenodoRecords',
'zenodo_exporter = zenodo.modules.exporter.ext:InvenioExporter',
'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens',
'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam',
],
'invenio_base.blueprints': [
'zenodo_communities = zenodo.modules.communities.views:blueprint',
Expand All @@ -222,6 +224,7 @@
'invenio_base.api_blueprints': [
'zenodo_rest = zenodo.modules.rest.views:blueprint',
'zenodo_deposit = zenodo.modules.deposit.views_rest:blueprint',
'zenodo_spam = zenodo.modules.spam.views:blueprint',
],
'invenio_base.api_converters': [
'file_key = zenodo.modules.deposit.utils:FileKeyConverter',
Expand Down
34 changes: 33 additions & 1 deletion zenodo/modules/deposit/api.py
Expand Up @@ -29,8 +29,11 @@
from contextlib import contextmanager
from copy import copy

from elasticsearch_dsl import Q
from flask import current_app
from flask_principal import ActionNeed
from flask_security import current_user
from invenio_access import Permission
from invenio_communities.models import Community, InclusionRequest
from invenio_db import db
from invenio_deposit.api import Deposit, index, preserve
Expand All @@ -42,10 +45,12 @@
from invenio_pidstore.errors import PIDInvalidAction
from invenio_pidstore.models import PersistentIdentifier, PIDStatus
from invenio_records_files.models import RecordsBuckets
from invenio_search.api import RecordsSearch
from invenio_sipstore.api import RecordSIP
from invenio_sipstore.archivers import BagItArchiver
from invenio_sipstore.models import SIP as SIPModel
from invenio_sipstore.models import RecordSIP as RecordSIPModel
from werkzeug.exceptions import HTTPException

from zenodo.modules.communities.api import ZenodoCommunity
from zenodo.modules.records.api import ZenodoFileObject, ZenodoFilesIterator, \
Expand All @@ -54,6 +59,7 @@
zenodo_concept_doi_minter, zenodo_doi_updater
from zenodo.modules.records.utils import is_doi_locally_managed, \
is_valid_openaire_type
from zenodo.modules.spam.tasks import check_metadata_for_spam

from .errors import MissingCommunityError, MissingFilesError, \
OngoingMultipartUploadError, VersioningFilesError
Expand Down Expand Up @@ -484,10 +490,36 @@ def validate_publish(self):
raise MissingCommunityError(missing)

@mark_as_action
def publish(self, pid=None, id_=None, user_id=None, sip_agent=None):
def publish(self, pid=None, id_=None, user_id=None, sip_agent=None,
check_for_spam=True):
"""Publish the Zenodo deposit."""
self['owners'] = self['_deposit']['owners']
self.validate_publish()
try:
if check_for_spam and current_app.config.get(
'ZENODO_SPAM_MODEL_LOCATION'):
task = check_metadata_for_spam.delay(
self['recid'], str(self.id))
spam_proba = task.get(timeout=8)
else:
spam_proba = 0
if spam_proba > current_app.config['ZENODO_SPAM_THRESHOLD']:
if not Permission(ActionNeed('admin-access')).can():
rs = RecordsSearch(index='records').query(
Q('query_string', query="owners:{}".format(
self['owners'][0])))
has_other_records = rs.execute().hits.total['value']
if not has_other_records:
current_app.config['ZENODO_SPAM_HANDLING_ACTIONS'](
self)
except HTTPException as e:
raise(e)
except Exception:
current_app.logger.error(
u'Could not check deposit for spam',
extra={'depid': self['recid']}
)

is_first_publishing = not self.is_published()

deposit = super(ZenodoDeposit, self).publish(pid, id_)
Expand Down
9 changes: 8 additions & 1 deletion zenodo/modules/spam/__init__.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2017 CERN.
# Copyright (C) 2017-2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
Expand All @@ -23,3 +23,10 @@
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Spam module."""

from flask import current_app
from werkzeug.local import LocalProxy

# Proxy to the object registered under ``app.extensions['zenodo-spam']``.
# The lookup happens lazily, each time the proxy is used, so it needs an
# active application context at access time rather than at import time.
current_spam = LocalProxy(
lambda: current_app.extensions['zenodo-spam']
)
76 changes: 76 additions & 0 deletions zenodo/modules/spam/config.py
@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

from flask import abort, current_app, flash
from flask_login import logout_user
from invenio_accounts.models import User
from invenio_accounts.sessions import delete_user_sessions
from invenio_db import db

from zenodo.modules.spam.utils import send_spam_admin_email, \
send_spam_user_email


def default_spam_handling(deposit):
    """Default actions to counter a record detected as spam.

    Deactivates the first owner of the deposit, removes their sessions,
    logs the current user out, sends the notification emails and finally
    aborts the request with a 400 error.

    :param deposit: Deposit classified as spam. The first entry of
        ``deposit['_deposit']['owners']`` identifies the user to deactivate.
    :raises werkzeug.exceptions.HTTPException: Always, via ``abort(400)``.
    """
    message = (
        'Our spam protection system has classified your upload as a '
        'potential spam attempt. As a preventive measure and due to '
        'significant increase in spam, we have therefore deactivated your '
        'user account and logged you out of Zenodo. Your upload has not been '
        'published. If you think this is a mistake, please contact our '
        'support.'
    )

    user = User.query.get(deposit['_deposit']['owners'][0])
    user.active = False
    delete_user_sessions(user)
    logout_user()
    db.session.add(user)
    db.session.commit()

    # The account is already deactivated at this point. Notifications are
    # best-effort: a mail failure must not prevent the abort below, otherwise
    # the request would carry on as if nothing had been detected.
    try:
        send_spam_user_email(user.email)
    except Exception:
        current_app.logger.exception(
            u'Could not send spam notification email to user')
    if current_app.config['ZENODO_SPAM_EMAIL_ADMINS']:
        try:
            send_spam_admin_email(deposit, user)
        except Exception:
            current_app.logger.exception(
                u'Could not send spam notification email to admins')

    flash(message, category='warning')
    abort(400, message)


# Callable invoked with the deposit when its metadata is detected as spam
# while publishing.
ZENODO_SPAM_HANDLING_ACTIONS = default_spam_handling

# Location of the spam model used for record predictions; it is passed to
# ``joblib.load``. A falsy value (the default) disables the spam check.
ZENODO_SPAM_MODEL_LOCATION = None

# Float number defining the probability over which a record is considered
# spam (the comparison is strict: probability > threshold).
ZENODO_SPAM_THRESHOLD = 0.5

# Whether an email should also be sent to the admins when a user is
# automatically blocked.
ZENODO_SPAM_EMAIL_ADMINS = True
82 changes: 82 additions & 0 deletions zenodo/modules/spam/ext.py
@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Spam detection extension for Zenodo."""

from __future__ import absolute_import, print_function

from threading import Lock

import joblib
from celery.signals import celeryd_init
from flask import current_app
from werkzeug.utils import cached_property

from . import config, current_spam

# Guards writes to the ``_is_cache_loading`` flag of the extension.
lock = Lock()


class ZenodoSpam(object):
    """Zenodo spam extension.

    Registers the ``ZENODO_SPAM_*`` configuration defaults on the
    application and exposes the spam detection model through ``model``.
    """

    def __init__(self, app=None):
        """Extension initialization.

        :param app: Optional Flask application; when given, ``init_app`` is
            called with it right away.
        """
        if app:
            self.init_app(app)

    @cached_property
    def model(self):
        """Spam detection model.

        Loaded with ``joblib`` from ``ZENODO_SPAM_MODEL_LOCATION``. Requires
        an application context because the location is read from
        ``current_app.config``.

        :returns: The loaded model, or ``None`` when no model location is
            configured.
        """
        with lock:
            self._is_cache_loading = True
        try:
            location = current_app.config.get('ZENODO_SPAM_MODEL_LOCATION')
            if not location:
                return None
            return joblib.load(location)
        finally:
            # Clear the flag whether loading succeeded, failed, or was
            # skipped; otherwise ``is_cache_loading`` would report an
            # in-progress load forever after the first access.
            with lock:
                self._is_cache_loading = False

    @property
    def is_cache_loading(self):
        """Flag telling whether the model is currently being loaded."""
        return getattr(self, '_is_cache_loading', False)

    def init_app(self, app):
        """Flask application initialization.

        Stores the application, applies the configuration defaults and
        registers the extension under ``app.extensions['zenodo-spam']``.
        """
        self.app = app
        self.init_config(app)
        app.extensions['zenodo-spam'] = self

    @staticmethod
    def init_config(app):
        """Initialize configuration.

        Copies every ``ZENODO_SPAM_*`` name of the ``config`` module into
        ``app.config`` unless the application already defines it.
        """
        for k in dir(config):
            if k.startswith('ZENODO_SPAM_'):
                app.config.setdefault(k, getattr(config, k))


@celeryd_init.connect
def warm_up_cache(instance, **kwargs):
    """Load the spam model eagerly when the Celery worker initializes."""
    flask_app = instance.app.flask_app
    with flask_app.app_context():
        # Reading the attribute is what triggers (and caches) the load.
        getattr(current_spam, 'model')
44 changes: 44 additions & 0 deletions zenodo/modules/spam/tasks.py
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Celery tasks for the spam module."""

from __future__ import absolute_import, print_function

from celery import shared_task
from flask import current_app
from invenio_records.models import RecordMetadata

from zenodo.modules.spam import current_spam


@shared_task(ignore_result=False)
def check_metadata_for_spam(depid_value, dep_id):
    """Compute the spam probability of a deposit's metadata.

    :param depid_value: Deposit identifier value. Not used by the check
        itself; kept so the task signature stays stable for callers.
    :param dep_id: Primary key used to load the ``RecordMetadata`` row.
    :returns: ``0`` when no spam model is configured, otherwise the
        probability taken from the second column of ``predict_proba``
        (presumably the spam class -- confirm against the trained model).
    """
    if not current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
        return 0
    deposit = RecordMetadata.query.get(dep_id)
    metadata = deposit.json
    # Either field may be absent or empty; fall back to an empty string so
    # the check still runs on whatever text is available instead of failing.
    title = metadata.get('title') or ''
    description = metadata.get('description') or ''
    probabilities = current_spam.model.predict_proba(
        [title + ' ' + description])
    # Cast to a builtin float so the value passes through the task result
    # backend regardless of the numeric type the model returns.
    return float(probabilities[0][1])
@@ -0,0 +1,25 @@
{#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
-#}

The deposit https://zenodo.org/deposit/{{ deposit['recid'] }} from the User https://zenodo.org/spam/{{ user.id }}/delete/ has been marked as spam.
@@ -0,0 +1,27 @@
{#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
-#}

Our spam protection system has classified your upload as a potential spam attempt.
As a preventive measure, we have therefore deactivated your user account.
If you think this is wrong, please contact us on our support line.

0 comments on commit 9b13a4f

Please sign in to comment.