Skip to content

Commit

Permalink
Merge 2b7aaed into dff3433
Browse files Browse the repository at this point in the history
  • Loading branch information
Glignos committed Oct 9, 2020
2 parents dff3433 + 2b7aaed commit fa0d696
Show file tree
Hide file tree
Showing 11 changed files with 346 additions and 4 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ invenio-previewer==1.0.0a11
invenio-queues==1.0.0a2
invenio-records==1.3.2
invenio-records-files==1.0.0a11
invenio-records-rest==1.6.2
invenio-records-rest==1.6.3
invenio-records-ui==1.0.1
invenio-rest==1.1.2
invenio-search==1.2.3
Expand Down Expand Up @@ -250,3 +250,4 @@ WTForms==2.2.1
WTForms-Alchemy==0.16.9
WTForms-Components==0.10.3
zenodo-accessrequests==1.0.0a5
joblib==0.14.1
3 changes: 3 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
'zenodo_stats = zenodo.modules.stats.ext:ZenodoStats',
'zenodo_theme = zenodo.modules.theme.ext:ZenodoTheme',
'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens',
'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam',
],
'invenio_base.api_apps': [
'zenodo_communities = '
Expand All @@ -206,6 +207,7 @@
'zenodo_records = zenodo.modules.records.ext:ZenodoRecords',
'zenodo_exporter = zenodo.modules.exporter.ext:InvenioExporter',
'zenodo_tokens = zenodo.modules.tokens.ext:ResourceAccessTokens',
'zenodo_spam = zenodo.modules.spam.ext:ZenodoSpam',
],
'invenio_base.blueprints': [
'zenodo_communities = zenodo.modules.communities.views:blueprint',
Expand All @@ -222,6 +224,7 @@
'invenio_base.api_blueprints': [
'zenodo_rest = zenodo.modules.rest.views:blueprint',
'zenodo_deposit = zenodo.modules.deposit.views_rest:blueprint',
'zenodo_spam = zenodo.modules.spam.views:blueprint',
],
'invenio_base.api_converters': [
'file_key = zenodo.modules.deposit.utils:FileKeyConverter',
Expand Down
34 changes: 33 additions & 1 deletion zenodo/modules/deposit/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,11 @@
from contextlib import contextmanager
from copy import copy

from elasticsearch_dsl import Q
from flask import current_app
from flask_principal import ActionNeed
from flask_security import current_user
from invenio_access import Permission
from invenio_communities.models import Community, InclusionRequest
from invenio_db import db
from invenio_deposit.api import Deposit, index, preserve
Expand All @@ -42,10 +45,12 @@
from invenio_pidstore.errors import PIDInvalidAction
from invenio_pidstore.models import PersistentIdentifier, PIDStatus
from invenio_records_files.models import RecordsBuckets
from invenio_search.api import RecordsSearch
from invenio_sipstore.api import RecordSIP
from invenio_sipstore.archivers import BagItArchiver
from invenio_sipstore.models import SIP as SIPModel
from invenio_sipstore.models import RecordSIP as RecordSIPModel
from werkzeug.exceptions import HTTPException

from zenodo.modules.communities.api import ZenodoCommunity
from zenodo.modules.records.api import ZenodoFileObject, ZenodoFilesIterator, \
Expand All @@ -54,6 +59,7 @@
zenodo_concept_doi_minter, zenodo_doi_updater
from zenodo.modules.records.utils import is_doi_locally_managed, \
is_valid_openaire_type
from zenodo.modules.spam.tasks import check_metadata_for_spam

from .errors import MissingCommunityError, MissingFilesError, \
OngoingMultipartUploadError, VersioningFilesError
Expand Down Expand Up @@ -483,11 +489,37 @@ def validate_publish(self):
if missing:
raise MissingCommunityError(missing)

def spam_check(self):
    """Classify the deposit metadata and trigger the spam handling.

    The check is best-effort: HTTP errors are propagated to the caller,
    while any other failure is logged and otherwise ignored.
    """
    try:
        config = current_app.config
        spam_proba = 0
        if config.get('ZENODO_SPAM_MODEL_LOCATION'):
            result = check_metadata_for_spam.delay(str(self.id))
            spam_proba = result.get(
                timeout=config['ZENODO_SPAM_CHECK_TIMEOUT'])

        if not spam_proba > config['ZENODO_SPAM_THRESHOLD']:
            return
        # Users holding the "admin-access" action are never handled as spam.
        if Permission(ActionNeed('admin-access')).can():
            return

        # Only act when the search finds no record for the first owner.
        search = RecordsSearch(index='records')
        owner_query = Q(
            'query_string', query="owners:{}".format(self['owners'][0]))
        if not search.query(owner_query).count():
            config['ZENODO_SPAM_HANDLING_ACTIONS'](self)
    except HTTPException:
        raise
    except Exception:
        current_app.logger.exception(u'Could not check deposit for spam')

@mark_as_action
def publish(self, pid=None, id_=None, user_id=None, sip_agent=None):
def publish(self, pid=None, id_=None, user_id=None, sip_agent=None,
spam_check=True):
"""Publish the Zenodo deposit."""
self['owners'] = self['_deposit']['owners']
self.validate_publish()
if spam_check:
self.spam_check()

is_first_publishing = not self.is_published()

deposit = super(ZenodoDeposit, self).publish(pid, id_)
Expand Down
4 changes: 3 additions & 1 deletion zenodo/modules/github/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ def publish(self):
'github_id': self.release['author']['id'],
'email': self.gh.account.user.email,
}
deposit.publish(user_id=self.event.user_id, sip_agent=sip_agent)
deposit.publish(
user_id=self.event.user_id, sip_agent=sip_agent,
spam_check=False)
recid_pid, record = deposit.fetch_published()
self.model.recordmetadata = record.model
if versioning and stashed_draft_child:
Expand Down
9 changes: 8 additions & 1 deletion zenodo/modules/spam/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2017 CERN.
# Copyright (C) 2017-2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
Expand All @@ -23,3 +23,10 @@
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Spam module."""

from flask import current_app
from werkzeug.local import LocalProxy

# Proxy resolving, on each access, to whatever is stored under the
# 'zenodo-spam' key of the current application's extensions.
current_spam = LocalProxy(
    lambda: current_app.extensions['zenodo-spam']
)
70 changes: 70 additions & 0 deletions zenodo/modules/spam/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

from flask import abort, current_app, flash
from flask_login import logout_user
from invenio_accounts.models import User
from invenio_accounts.sessions import delete_user_sessions
from invenio_db import db

from zenodo.modules.spam.utils import send_spam_admin_email, \
send_spam_user_email


def default_spam_handling(deposit):
    """Default actions to counter a deposit detected as spam.

    Deactivates the first owner of the deposit, deletes that user's
    sessions, logs out the current user, sends the notification emails
    and finally aborts the request.

    :param deposit: Deposit whose ``deposit['_deposit']['owners'][0]`` is
        the id of the user account to deactivate.
    :raises werkzeug.exceptions.HTTPException: Always, through
        ``abort(400)``, once the account has been deactivated.
    """
    user = User.query.get(deposit['_deposit']['owners'][0])
    user.active = False
    delete_user_sessions(user)
    logout_user()
    db.session.add(user)
    db.session.commit()

    # Notifications are best-effort. A mail failure must not prevent the
    # abort below: a caller that swallows non-HTTP errors would otherwise
    # carry on and publish the deposit of the just-deactivated account.
    try:
        send_spam_user_email(user.email)
    except Exception:
        current_app.logger.exception(
            u'Could not send spam notification email to user')
    try:
        if current_app.config['ZENODO_SPAM_EMAIL_ADMINS']:
            send_spam_admin_email(deposit, user)
    except Exception:
        current_app.logger.exception(
            u'Could not send spam notification email to admins')

    error_message = \
        ('Our spam protection system has classified your upload as a '
         'potential spam attempt. As a preventive measure and due to '
         'significant increase in spam, we have therefore deactivated your '
         'user account and logged you out of Zenodo. Your upload has not been '
         'published. If you think this is a mistake, please contact our '
         'support.')
    flash(error_message, category='warning')
    abort(400, error_message)


# Callable invoked with the deposit when its metadata is classified as spam
# while publishing.
ZENODO_SPAM_HANDLING_ACTIONS = default_spam_handling

# Location of the serialized spam model, loaded with ``joblib.load``.
# When unset (``None``) no model is loaded and the spam probability is 0.
ZENODO_SPAM_MODEL_LOCATION = None

# Float number defining the probability strictly above which a record is
# considered spam.
ZENODO_SPAM_THRESHOLD = 0.5

# Whether the default spam handling also sends an email to the admins for
# automatically blocked users.
ZENODO_SPAM_EMAIL_ADMINS = True

# Timeout passed to the spam check task result ``get()``; when it expires the
# error is logged and the check is bypassed.
# NOTE(review): Celery interprets this value as seconds -- confirm.
ZENODO_SPAM_CHECK_TIMEOUT = 8
74 changes: 74 additions & 0 deletions zenodo/modules/spam/ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Spam extension for Zenodo."""

from __future__ import absolute_import, print_function

import joblib
from celery.signals import celeryd_init
from flask import current_app

from . import config, current_spam


class ZenodoSpam(object):
    """Zenodo spam extension, giving access to the spam detection model."""

    def __init__(self, app=None):
        """Create the extension, initializing ``app`` when one is given."""
        if app:
            self.init_app(app)

    def init_app(self, app):
        """Register the extension and its configuration on ``app``."""
        self.app = app
        self.init_config(app)
        app.extensions['zenodo-spam'] = self

    @staticmethod
    def init_config(app):
        """Set every ``ZENODO_SPAM_*`` default missing from the app config."""
        defaults = (
            name for name in dir(config) if name.startswith('ZENODO_SPAM_'))
        for name in defaults:
            app.config.setdefault(name, getattr(config, name))

    @property
    def model(self):
        """Spam detection model, or ``None`` when no location is set.

        A truthy model is kept on the instance and reused by later reads.
        """
        cached = getattr(self, '_model', None)
        if cached:
            return cached
        if current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
            loaded = joblib.load(
                current_app.config['ZENODO_SPAM_MODEL_LOCATION'])
        else:
            loaded = None
        self._model = loaded
        return self._model


@celeryd_init.connect
def warm_up_cache(instance, **kwargs):
    """Load the spam model when the celery worker is initialized."""
    flask_app = instance.app.flask_app
    with flask_app.app_context():
        # Reading the property is what triggers loading the model.
        current_spam.model
44 changes: 44 additions & 0 deletions zenodo/modules/spam/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.

"""Celery tasks for the spam module."""

from __future__ import absolute_import, print_function

from celery import shared_task
from flask import current_app
from invenio_records.models import RecordMetadata

from zenodo.modules.spam import current_spam


@shared_task(ignore_result=False)
def check_metadata_for_spam(dep_id):
    """Return the spam probability for the metadata of a deposit.

    :param dep_id: Identifier used to fetch the ``RecordMetadata`` row.
    :returns: ``0`` when no spam model location is configured, otherwise
        the ``[0][1]`` entry of the model's ``predict_proba`` output for
        the deposit title and description.
    """
    if not current_app.config.get('ZENODO_SPAM_MODEL_LOCATION'):
        return 0
    record = RecordMetadata.query.get(dep_id)
    # Resolve the model before reading the metadata, as a single call would.
    predict_proba = current_spam.model.predict_proba
    text = record.json['title'] + ' ' + record.json['description']
    probabilities = predict_proba([text])
    return probabilities[0][1]
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
-#}

The deposit https://zenodo.org/deposit/{{ deposit['recid'] }} from the User https://zenodo.org/spam/{{ user.id }}/delete/ has been marked as spam.
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{#
# This file is part of Zenodo.
# Copyright (C) 2020 CERN.
#
# Zenodo is free software; you can redistribute it
# and/or modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# Zenodo is distributed in the hope that it will be
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Zenodo; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
# MA 02111-1307, USA.
#
# In applying this license, CERN does not
# waive the privileges and immunities granted to it by virtue of its status
# as an Intergovernmental Organization or submit itself to any jurisdiction.
-#}

Our spam protection system has classified your upload as a potential spam attempt.
As a preventive measure, we have therefore deactivated your user account.
If you think this is wrong, please contact us on our support line.

0 comments on commit fa0d696

Please sign in to comment.