In [None]:
from zenodo.wsgi import application as app
from invenio_db import db
# Push a Flask application context and leave it active, so the cells below
# run inside it (presumably what the `db.session` queries rely on -- confirm).
# Nothing in the visible cells pops this context again.
app.app_context().push()
from pprint import pprint as pp

In [None]:
from invenio_pidstore.models import PersistentIdentifier
from invenio_records.models import RecordMetadata
from sqlalchemy.orm import join, mapper, query
from invenio_records.api import Record
from itertools import takewhile, groupby
from collections import namedtuple, Counter
import json

In [None]:
def _license_id(rec):
    """Return the license identifier referenced by ``rec``, or ``None``.

    The identifier is the text following ``'licenses/'`` in
    ``rec['license']['$ref']``. ``None`` is returned when the record has no
    ``'license'`` entry, or when that entry does not have the expected shape
    (not a mapping, no ``'$ref'``, or a reference without ``'licenses/'``).
    """
    try:
        parts = rec['license']['$ref'].split('licenses/')
    except (KeyError, TypeError, AttributeError):
        # Missing key, non-mapping license value, or non-string reference.
        return None
    return parts[1] if len(parts) > 1 else None


def parse_record(rec, spam=False):
    """Flatten a record mapping into a plain dict for the JSON export.

    Args:
        rec: dict-like record metadata. Must contain ``'recid'``; every other
            field is optional.
        spam: label stored unchanged under the ``'spam'`` key of the result.

    Returns:
        A new dict with the keys ``'files'``, ``'recid'``, ``'license'``, one
        entry per name in the fixed metadata key list (``None`` when the
        record lacks that field) and ``'spam'``. Field values are taken from
        ``rec`` as-is, not copied.

    Raises:
        KeyError: if ``rec`` has no ``'recid'``, or an entry of
            ``rec['_files']`` lacks ``'key'``, ``'type'`` or ``'size'``.
    """
    out = {}

    # Keep only the descriptive part of each file entry; a missing or empty
    # '_files' value yields an empty list.
    out['files'] = [
        {
            'filename': f['key'],
            'filetype': f['type'],
            'size': f['size'],
        }
        for f in (rec.get('_files') or [])
    ]

    out['recid'] = rec['recid']  # Recid has to be there
    out['license'] = _license_id(rec)

    # Optional metadata fields copied through under the same name.
    keys = (
        'access_right', 'creators', 'title', 'description', 'communities',
        'publication_date', 'keywords', 'subjects', 'notes', 'resource_type',
        'related_identifiers', 'contributors', 'doi', 'journal',
        'alternate_identifiers', 'imprint', 'references', 'thesis', 'meeting',
        'part_of',
    )
    for key in keys:
        out[key] = rec.get(key)

    # Insert the spam label
    out['spam'] = spam

    return out

In [None]:
# Metadata rows of records whose 'recid' persistent identifier has status 'R'
# (registered, per invenio-pidstore's PIDStatus).
rms_good = (
    db.session.query(RecordMetadata)
    .join(
        PersistentIdentifier,
        PersistentIdentifier.object_uuid == RecordMetadata.id,
    )
    .filter(PersistentIdentifier.pid_type == 'recid')
    .filter(PersistentIdentifier.status == 'R')
    .filter(PersistentIdentifier.object_type == 'rec')
)
print(rms_good.count())

In [None]:
# Metadata rows of records whose 'recid' persistent identifier has status 'D'
# (deleted, per invenio-pidstore's PIDStatus).
rms_bad = (
    db.session.query(RecordMetadata)
    .join(
        PersistentIdentifier,
        PersistentIdentifier.object_uuid == RecordMetadata.id,
    )
    .filter(PersistentIdentifier.pid_type == 'recid')
    .filter(PersistentIdentifier.status == 'D')
    .filter(PersistentIdentifier.object_type == 'rec')
)
print(rms_bad.count())

In [None]:
# Wrap every registered row that has metadata and is open access in a Record.
nonspam_records = [
    Record(row.json, model=row)
    for row in rms_good
    if row.json is not None
    if row.json['access_right'] == 'open'
]
print(len(nonspam_records))

In [None]:
# For each deleted row whose removal reason mentions spam, take the
# second-to-last revision, then keep only the open-access ones.
spam_records = [
    previous
    for previous in (
        Record(row.json, model=row).revisions[-2]
        for row in rms_bad
        if row.json is not None
        and 'removal_reason' in row.json
        and 'spam' in row.json['removal_reason'].lower()
    )
    if previous['access_right'] == 'open'
]
print(len(spam_records))

In [None]:
# Flatten every spam record and label it spam=True.
spam_records_clean = [
    parse_record(record, spam=True)
    for record in spam_records
]
print(len(spam_records_clean))

In [None]:
# Flatten every non-spam record and label it spam=False.
nonspam_records_clean = [
    parse_record(record, spam=False)
    for record in nonspam_records
]
print(len(nonspam_records_clean))

In [None]:
# Write the parsed spam records as indented JSON.
with open('spam_records.json', 'w') as out_file:
    json.dump(spam_records_clean, out_file, indent=2)

In [None]:
# Write the parsed non-spam records as indented JSON.
with open('nonspam_records.json', 'w') as out_file:
    json.dump(nonspam_records_clean, out_file, indent=2)

In [None]:
# Write the combined dataset: all non-spam records followed by all spam records.
with open('zenodo_open_metadata_06_04_2017.json', 'w') as out_file:
    json.dump(nonspam_records_clean + spam_records_clean, out_file, indent=2)

In [None]:
# Write a small sample: the first non-spam record followed by the first spam record.
with open('zenodo_open_metadata_06_04_2017_sample.json', 'w') as out_file:
    json.dump(nonspam_records_clean[:1] + spam_records_clean[:1], out_file, indent=2)