## Configuration

In [None]:

### RUN THIS CELL TO BEGIN CONFIGURATION ###
%run manage_data_nodes_config.ipynb

In [1]:
### LEGACY CONFIGURATION CELL -- SKIP THIS IF USING THE CONFIGURATION FORM ABOVE ###

# Data Node configuration: choose whether a RawData or a ProcessedData node is managed

datatype = 'RawData' # string: "RawData" or "ProcessedData"
namespace = 'we1sv1.1' # string: stored as the `namespace` of a newly created data node
collection = 'test' # string: The _id of the target collection

# Optional RawData Node Properties

relationships = [] # list
OCR = True # Boolean
rights = "" # string

# Optional ProcessedData Node Properties

processes = [] # list

# Paths to the JSON doc manifest files that will be inserted into the database

source_files = [] # list: one file path per doc manifest

## Basic Setup

In [2]:
# If the configuration form was used, its values override the legacy
# configuration cell. `config` only exists after the form notebook has been
# run, so a missing form is not an error: the legacy values stay in effect.
try:
    form_values = config.values
    if form_values['datatype']:
        datatype = form_values['datatype']
        collection = form_values['collection']
        source_files = form_values['source_files']
        if datatype == 'RawData':
            # RawData-only properties
            relationships = form_values['relationships']
            rights = form_values['rights']
            OCR = form_values['OCR']
        else:
            # ProcessedData-only properties
            processes = form_values['processes']
except NameError:
    # The configuration form was never run; keep the legacy configuration.
    pass
except (AttributeError, KeyError, TypeError) as err:
    # The form exists but is incomplete or malformed. Report it instead of
    # silently ignoring it; settings not yet read keep their legacy values.
    print('Could not read all values from the configuration form (' + repr(err) +
          '). Settings that were not read keep the values from the legacy configuration cell.')

# Import dependencies

import os, json, pymongo
from pymongo import MongoClient
from jsonschema import validate, FormatChecker

# Connect to the local MongoDB server and keep a handle on the `Corpus`
# collection of the `we1s` database
client = MongoClient('mongodb://localhost:27017')
db = client['we1s']
Corpus = db['Corpus']

# Build the schema used to create the data node manifest: every node has
# `path` and `collection`; the remaining fields depend on the node type
schema = [
    {"name": "path", "type": "string"},
    {"name": "collection", "type": "string"},
]
if datatype == 'RawData':
    schema += [
        {"name": "relationships", "type": "list"},
        {"name": "OCR", "type": "bool"},
        {"name": "rights", "type": "string"},
    ]
else:
    schema.append({"name": "processes", "type": "list"})

# Derive the data node's `_id` and `path` from the configuration
_id = datatype
path = ''.join([',Corpus,', collection, ','])

# Expose every configuration value through a single mapping
opts = globals()

## API Methods

In [12]:
def create_data_manifest(opts, schema):
    """
    Creates the RawData or ProcessedData node if no document is stored
    at the configured path yet.

    Parameters:
    - opts: mapping holding the configuration; must contain `path` and one
      entry for every `name` listed in `schema`. Its `namespace` entry is
      used when present, otherwise the notebook-level `namespace` is used.
    - schema: list of `{"name": ..., "type": ...}` dicts, where `type` is
      one of the names "string", "list" or "bool" (a Python type is
      accepted as well).

    Returns None. If any configured value has the wrong type, the offending
    keys are printed and nothing is inserted.
    """
    # `find()` returns a cursor, which is always truthy, so the existence
    # check has to fetch a document with `find_one()`.
    # NOTE(review): this matches any document stored at `path`, not only a
    # node of the configured datatype -- confirm that this is intended.
    if Corpus.find_one({'path': opts['path']}) is not None:
        return

    # The schema names its types as strings; map them to Python types.
    type_names = {'string': str, 'list': list, 'bool': bool}

    manifest = {'namespace': opts['namespace'] if 'namespace' in opts else namespace}
    invalid_keys = []
    for item in schema:
        key = item['name']
        val = opts[key]
        expected = item['type']
        if isinstance(expected, str):
            expected = type_names.get(expected)
        if expected is not None and isinstance(val, expected):
            manifest[key] = val
        else:
            invalid_keys.append(key)

    if invalid_keys:
        for key in invalid_keys:
            print(key + ' has an invalid data type. Please double check it against the schema.')
        # Do not store a manifest that is missing required properties.
        return

    Corpus.insert_one(manifest)


def detect_data_node(manifest, opts, schema):
    """
    Attempts to find the manifest's data node in the database. If it does
    not exist, creates the node, or prints an error if the database does not
    contain the collection the node would belong to.

    Parameters:
    - manifest: dict loaded from a doc manifest; its `path` is used for the
      lookups and its `_id` only for the error message.
    - opts: configuration mapping; `datatype` is read here and the whole
      mapping is passed on to `create_data_manifest`.
    - schema: schema list passed on to `create_data_manifest`.

    Returns None.
    """
    doc_id = str(manifest.get('_id', '<unknown>'))
    node_path = manifest.get('path')
    if not node_path:
        print('The manifest for ' + doc_id + ' has no `path` property, so its data node cannot be located.')
        return

    # Check whether the data node already exists.
    # NOTE(review): `find_one()` with a plain string queries the `_id` field;
    # that lookup is kept and only spelled out here. Confirm whether a query
    # on `path` was intended instead.
    if Corpus.find_one({'_id': node_path}) is not None:
        return

    # Drop the trailing "<datatype>," segment to get the collection's path.
    # (`str.rstrip` would strip a *set of characters*, not this suffix.)
    suffix = opts['datatype'] + ','
    if node_path.endswith(suffix):
        collection_path = node_path[:-len(suffix)]
    else:
        collection_path = node_path

    # Check whether the collection exists
    if Corpus.find_one({'_id': collection_path}) is None:
        msg = 'The database does not contain the collection listed in the `path` property for ' + doc_id + '. Please run the `create_collection` notebook.'
        print(msg)
        return

    # Create the data node. Errors raised here now propagate instead of
    # being reported as a missing collection.
    create_data_manifest(opts, schema)

        
def insert_doc_manifests(doc_paths, opts, schema):
    """
    Loops through a list of doc manifest files, ensures that the data node
    for each manifest exists in the database, and then inserts the manifest
    if it validates.

    Parameters:
    - doc_paths: iterable of paths to JSON doc manifest files.
    - opts: configuration mapping passed on to `detect_data_node`.
    - schema: data node schema passed on to `detect_data_node`.

    Returns None. An error is printed for each manifest that fails
    validation, and that manifest is not inserted.
    """
    for doc_path in doc_paths:
        with open(doc_path, 'r', encoding='utf-8') as f:
            manifest = json.load(f)
        detect_data_node(manifest, opts, schema)
        # Use the notebook's own validator: jsonschema's `validate` needs a
        # schema argument and returns None, so it can never equal True.
        if validate_manifest(manifest):
            Corpus.insert_one(manifest)
        else:
            print("Error: Could not produce a valid manifest.")

            
def validate_manifest(manifest):
    """
    Validates a manifest against the online manifest schema.

    Parameters:
    - manifest: dict to validate.

    Returns True if the manifest conforms to the schema, False if it does
    not. Errors while downloading or parsing the schema, and errors in the
    schema itself, are raised rather than reported as an invalid manifest.
    """
    # Imported here so the function does not depend on `requests`, which
    # this notebook never imports.
    from urllib.request import urlopen
    from jsonschema import ValidationError

    # NOTE(review): this is the schema for `Corpus/collection`; confirm it
    # is the right schema for the manifests passed in.
    schema_file = 'https://raw.githubusercontent.com/whatevery1says/manifest/master/schema/Corpus/collection.json'
    with urlopen(schema_file) as response:
        schema = json.loads(response.read().decode('utf-8'))
    try:
        validate(manifest, schema, format_checker=FormatChecker())
    except ValidationError:
        return False
    return True

## Execute Action

In [None]:
# Insert every doc manifest listed in `source_files`; the data node for each
# manifest is looked up (and created when needed) before the insert
insert_doc_manifests(source_files, opts, schema)