In [4]:
import requests
from datetime import datetime, timedelta

def harvest_zenodo_records(from_date, to_date=None):
    """Fetch all Zenodo records matching a ``created`` date-range query.

    Sends a ``created:[from_date TO to_date]`` search to
    ``https://zenodo.org/api/records`` and keeps following the ``links.next``
    URL of each response until a page no longer advertises one.

    Args:
        from_date: Start of the range, as a ``date`` or ``datetime``
            (a ``datetime`` is truncated to its date).
        to_date: End of the range, as a ``date`` or ``datetime``. When
            omitted, the day after ``from_date`` is used.

    Returns:
        list: The concatenated ``hits.hits`` entries of every fetched page.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Normalise from_date first so the default below also works for plain
    # ``date`` inputs, which have no ``.date()`` method.
    if isinstance(from_date, datetime):
        from_date = from_date.date()
    if not to_date:
        to_date = from_date + timedelta(days=1)
    elif isinstance(to_date, datetime):
        to_date = to_date.date()

    headers = {"Content-Type": "application/json"}
    params = {
        'size': 500,
        'page': 1,
        'q': 'created:[{from_date} TO {to_date}]'.format(from_date=from_date, to_date=to_date)
    }
    url = 'https://zenodo.org/api/records'

    def fetch_page(page_url, **kwargs):
        # Fail loudly on HTTP errors instead of hitting a KeyError on the
        # error payload further down.
        response = requests.get(page_url, headers=headers, **kwargs)
        response.raise_for_status()
        return response.json()

    page = fetch_page(url, params=params)
    records = list(page['hits']['hits'])
    # The 'next' link already carries the query string, so no params are
    # passed for follow-up pages.
    while 'next' in page.get('links', {}):
        page = fetch_page(page['links']['next'])
        records.extend(page['hits']['hits'])
    return records

# Harvest starting from today; with no to_date the function uses from_date + 1 day.
# NOTE(review): this issues live HTTP requests every time the cell runs.
resp = harvest_zenodo_records(datetime.now())

def prepare_features(rec):
    """Build the classifier input text for one record.

    Reads ``description`` and ``title`` from ``rec['metadata']`` and returns
    them concatenated in that order, with no separator in between.
    """
    metadata = rec['metadata']
    description = metadata['description']
    title = metadata['title']
    return description + title

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
text_clf = joblib.load('2017_06_14_record_spam.pkl')

In [5]:
# One feature string per harvested record, in the same order as resp.
test_X = list(map(prepare_features, resp))

In [6]:
# Run the loaded classifier over the feature strings. A later cell zips the
# result with resp and keeps the records whose entry is truthy.
y = text_clf.predict(test_X)

In [7]:
from collections import Counter

In [11]:
# Keep the records whose prediction is truthy (presumably 1 = spam; confirm
# against the model's label encoding).
spams = [record for record, flagged in zip(resp, y) if flagged]

In [21]:
# Inspect the first record that was kept in spams.
spams[0]

{'conceptdoi': '10.5281/zenodo.808143',
 'conceptrecid': '808143',
 'created': '2017-06-14T08:39:51.771905+00:00',
 'doi': '10.5281/zenodo.808144',
 'id': 808144,
 'links': {'badge': 'https://zenodo.org/badge/doi/10.5281/zenodo.808144.svg',
  'bucket': 'https://zenodo.org/api/files/8206b8ed-2837-432f-8f7a-a40ccf2739fa',
  'conceptbadge': 'https://zenodo.org/badge/doi/10.5281/zenodo.808143.svg',
  'conceptdoi': 'https://doi.org/10.5281/zenodo.808143',
  'doi': 'https://doi.org/10.5281/zenodo.808144',
  'html': 'https://zenodo.org/record/808144',
  'latest': 'https://zenodo.org/api/records/808144',
  'latest_html': 'https://zenodo.org/record/808144',
  'self': 'https://zenodo.org/api/records/808144'},
 'metadata': {'access_right': 'open',
  'access_right_category': 'success',
  'creators': [{'name': 'ADMIN'}],
  'description': '<p><strong>GBM 380 Global Business</strong></p>\n\n<p><strong>To purchase this material click below link</strong></p>\n\n<p>\xa0</p>\n\n<p><strong>http://www.assi

In [19]:
# Tally the flagged records by the first entry of their 'owners' list.
Counter(record['owners'][0] for record in spams)

Counter({30532: 10, 30894: 13, 32283: 62, 32735: 16})

In [None]:
def prepare_features(rec):
    """Build the classifier input text from a flat record.

    Reads ``description`` and ``title`` directly from ``rec`` (no
    ``metadata`` nesting) and returns them concatenated in that order, with
    no separator.

    NOTE(review): running this cell rebinds ``prepare_features`` and replaces
    the earlier definition that reads from ``rec['metadata']``.
    """
    description = rec['description']
    title = rec['title']
    return description + title

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# SECURITY: joblib.load unpickles the file and can execute arbitrary code;
# only load model files that come from a trusted source.
text_clf = joblib.load('2017_06_14_record_spam.pkl')

In [81]:
# Number of items in resp.
# NOTE(review): other cells bind resp to a requests.Response; this only works
# while resp holds the harvested list.
len(resp)

392

In [83]:
# Peek at the first item of resp (requires resp to be the harvested list).
resp[0]

{'conceptdoi': '10.5281/zenodo.807826',
 'conceptrecid': '807826',
 'created': '2017-06-14T07:13:55.748781+00:00',
 'doi': '10.5281/zenodo.807827',
 'id': 807827,
 'links': {'badge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807827.svg',
  'bucket': 'https://zenodo.org/api/files/c12e356e-9d94-46bc-929e-7d42521dbac9',
  'conceptbadge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807826.svg',
  'conceptdoi': 'https://doi.org/10.5281/zenodo.807826',
  'doi': 'https://doi.org/10.5281/zenodo.807827',
  'html': 'https://zenodo.org/record/807827',
  'latest': 'https://zenodo.org/api/records/807827',
  'latest_html': 'https://zenodo.org/record/807827',
  'self': 'https://zenodo.org/api/records/807827'},
 'metadata': {'access_right': 'open',
  'access_right_category': 'success',
  'communities': [{'id': 'biosyslit'}],
  'creators': [{'name': 'Doğan, Sibel'},
   {'name': 'Doğan, Salih'},
   {'name': 'Erman, Orhan'}],
  'description': 'FIGURE 4. Stigmaeus communis sp. nov. (Male) A. Dorsal vi

In [74]:
# List of hits from the search response.
# NOTE(review): requires resp to be the requests.Response from the
# requests.get cell, not the list returned by harvest_zenodo_records.
resp.json()['hits']['hits']

[{'conceptdoi': '10.5281/zenodo.807826',
  'conceptrecid': '807826',
  'created': '2017-06-14T07:13:55.748781+00:00',
  'doi': '10.5281/zenodo.807827',
  'id': 807827,
  'links': {'badge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807827.svg',
   'bucket': 'https://zenodo.org/api/files/c12e356e-9d94-46bc-929e-7d42521dbac9',
   'conceptbadge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807826.svg',
   'conceptdoi': 'https://doi.org/10.5281/zenodo.807826',
   'doi': 'https://doi.org/10.5281/zenodo.807827',
   'html': 'https://zenodo.org/record/807827',
   'latest': 'https://zenodo.org/api/records/807827',
   'latest_html': 'https://zenodo.org/record/807827',
   'self': 'https://zenodo.org/api/records/807827'},
  'metadata': {'access_right': 'open',
   'access_right_category': 'success',
   'communities': [{'id': 'biosyslit'}],
   'creators': [{'name': 'Doğan, Sibel'},
    {'name': 'Doğan, Salih'},
    {'name': 'Erman, Orhan'}],
   'description': 'FIGURE 4. Stigmaeus communis sp. nov

In [56]:
# NOTE(review): url is not assigned in any visible cell; this relies on
# leftover kernel state and will fail on a fresh Restart & Run All.
print(url)

https://zenodo.org/api/records?page=1&size=100&q=created:%5B2017-06-14%20TO%202017-06-15%5D


In [57]:
# Single raw GET against the prebuilt search URL.
# NOTE(review): this rebinds resp to a requests.Response, so cells that treat
# resp as a list of records must not run after it without re-harvesting.
headers = {"Content-Type": "application/json"}
resp = requests.get(url, headers=headers)

In [58]:
# HTTP status of the raw request (resp must be a requests.Response here).
resp.status_code

200

In [59]:
# Pagination links returned with the search response; harvest_zenodo_records
# follows the 'next' entry to fetch further pages.
resp.json()['links']

{'next': 'https://zenodo.org/api/records/?sort=bestmatch&q=created%3A%5B2017-06-14+TO+2017-06-15%5D&page=2&size=100',
 'self': 'https://zenodo.org/api/records/?sort=bestmatch&q=created%3A%5B2017-06-14+TO+2017-06-15%5D&page=1&size=100'}

In [47]:
# Full 'hits' object of the response; the record list sits under its own
# nested 'hits' key.
resp.json()['hits']

{'hits': [{'conceptdoi': '10.5281/zenodo.807826',
   'conceptrecid': '807826',
   'created': '2017-06-14T07:13:55.748781+00:00',
   'doi': '10.5281/zenodo.807827',
   'id': 807827,
   'links': {'badge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807827.svg',
    'bucket': 'https://zenodo.org/api/files/c12e356e-9d94-46bc-929e-7d42521dbac9',
    'conceptbadge': 'https://zenodo.org/badge/doi/10.5281/zenodo.807826.svg',
    'conceptdoi': 'https://doi.org/10.5281/zenodo.807826',
    'doi': 'https://doi.org/10.5281/zenodo.807827',
    'html': 'https://zenodo.org/record/807827',
    'latest': 'https://zenodo.org/api/records/807827',
    'latest_html': 'https://zenodo.org/record/807827',
    'self': 'https://zenodo.org/api/records/807827'},
   'metadata': {'access_right': 'open',
    'access_right_category': 'success',
    'communities': [{'id': 'biosyslit'}],
    'creators': [{'name': 'Doğan, Sibel'},
     {'name': 'Doğan, Salih'},
     {'name': 'Erman, Orhan'}],
    'description': 'FIGURE 