## Configuration

In [None]:
"""
WARNING! This notebook does not validate data manifests before inserting them in the database.
"""

# Data Node configuration: the node type is either RawData or ProcessedData

datatype = 'RawData' # string: "RawData" or "ProcessedData"; also assigned to `_id` in Basic Setup
collection = '' # string: The _id of the target collection; used to build the Data Node's `path`

# Optional Data Node Properties
# (these names are listed in `schema` in Basic Setup, which is used to build the Data Node manifest)

relationships = [] # list
OCR = True # Boolean
rights = "" # string

# Paths to Data Manifests

source_files = [] # list of paths to the data manifest files

## Basic Setup

In [None]:
# Import dependencies

import os, json, pymongo
from pymongo import MongoClient

# Connect to the local MongoDB server, select the `we1s` database, and bind its
# `Corpus` collection to the module-level name used by the API methods below.
client = MongoClient('mongodb://localhost:27017')
db = client.we1s
Corpus = db.Corpus

# Define the schema to be used to create the manifest.
# Each entry gives the name of a configuration variable to copy into the
# manifest and the type name that is passed to `validate_datatype` for it.
schema = [
    { "name": "path", "type": "string" },
    { "name": "relationships", "type": "list" },    
    { "name": "OCR", "type": "bool" },    
    { "name": "rights", "type": "string" }
]

# Auto-generate `_id` and `path` values for the Data Node.
# NOTE(review): `_id` is not listed in `schema`, so code that builds the
# manifest from `schema` will not copy it -- confirm whether that is intended.
_id = datatype
path = ',Corpus,' + collection + ','

# Pass the configurations to a variable.
# `globals()` returns the module's live namespace dict, so `opts['datatype']`,
# `opts['path']`, etc. resolve to the module-level variables defined above.
opts = globals()

## API Methods

In [None]:
def detect_data_node(manifest, opts, schema):
    """
    Ensure the data node that a doc manifest belongs to exists in the database.

    The manifest's `path` is expected to be the data node's path followed by
    the datatype and a comma, e.g. ',Corpus,<collection>,RawData,'. If a
    document is already stored under the data node's path, nothing is done.
    Otherwise the collection named in the path is looked up by `_id`: if it
    exists the data node is created with `create_data_manifest`; if it does
    not (or the manifest has no usable `path`), a message is printed.

    Parameters:
        manifest: dict loaded from a doc manifest; `path` and `_id` are read.
        opts: mapping of configuration values; `datatype` is read here and the
            whole mapping is forwarded to `create_data_manifest`.
        schema: schema list forwarded to `create_data_manifest`.

    Returns:
        None. Database errors are not swallowed; they propagate to the caller.
    """
    doc_path = manifest.get('path')
    collection_found = False
    if isinstance(doc_path, str):
        # Remove the trailing '<datatype>,' as a suffix. `str.rstrip` must not
        # be used for this: it strips a *set of characters* and would also eat
        # the end of the collection name.
        suffix = opts['datatype'] + ','
        if doc_path.endswith(suffix):
            node_path = doc_path[:-len(suffix)]
        else:
            node_path = doc_path

        # Check whether the data node already exists. The filter must be a
        # dict: a bare string passed to `find_one` is matched against `_id`.
        # NOTE(review): this matches any document stored under the path, the
        # same test `create_data_manifest` applies -- confirm that is enough
        # to identify the data node.
        if Corpus.find_one({'path': node_path}) is not None:
            return

        # Check whether the collection exists. The last path component is the
        # collection's `_id` (the path is built as ',Corpus,<collection>,').
        collection_id = node_path.strip(',').split(',')[-1]
        collection_found = Corpus.find_one({'_id': collection_id}) is not None

    if collection_found:
        # Create the data node
        create_data_manifest(opts, schema)
        return

    msg = 'The database does not contain the collection listed in the `path` property for ' + manifest['_id'] + '. Please run the `create_collection` notebook.'
    print(msg)

            
def create_data_manifest(opts, schema):
    """
    Create the RawData or ProcessedData node unless one is already stored.

    Looks for a document whose `path` equals `opts['path']`. If one is found
    nothing is done; otherwise a manifest is built from `schema` and inserted
    into the `Corpus` collection.

    Parameters:
        opts: mapping of configuration values; `path` and every name listed
            in `schema` are read from it.
        schema: list of dicts with `name` and `type` keys describing the
            properties to copy from `opts` into the manifest.

    Returns:
        None.

    Raises:
        KeyError: if `opts` lacks `path` or a name listed in `schema`.
    """
    # `find` returns a Cursor, which is truthy even when nothing matches, so
    # existence has to be tested with `find_one`, which returns None instead.
    if Corpus.find_one({'path': opts['path']}) is not None:
        return

    manifest = {}
    for item in schema:
        key = item['name']
        val = opts[key]
        # NOTE(review): `validate_datatype` is not defined in the visible part
        # of this notebook; it must be in scope before this branch runs.
        validate_datatype(key, val, item['type'])
        manifest[key] = val
    Corpus.insert_one(manifest)

        
def insert_doc_manifests(doc_paths, opts, schema):
    """
    Load each doc manifest file and insert it into the `Corpus` collection.

    For every path, the JSON manifest is read, `detect_data_node` is called so
    that the data node the manifest refers to can be detected or created, and
    the manifest is then inserted. The result of `detect_data_node` is not
    inspected, so the manifest is inserted whatever that check reports.

    Parameters:
        doc_paths: iterable of file paths to JSON doc manifests.
        opts: mapping of configuration values, forwarded to `detect_data_node`.
        schema: schema list, forwarded to `detect_data_node`.

    Returns:
        None.
    """
    for doc_path in doc_paths:
        # Read JSON as UTF-8 explicitly rather than relying on the platform's
        # default encoding, which is not UTF-8 everywhere.
        with open(doc_path, 'r', encoding='utf-8') as f:
            manifest = json.load(f)
        detect_data_node(manifest, opts, schema)
        Corpus.insert_one(manifest)

## Execute Action

In [None]:
# The manifest paths are configured as `source_files` in the Configuration
# cell; `doc_paths` is only the parameter name inside `insert_doc_manifests`
# and is not defined at module level.
insert_doc_manifests(source_files, opts, schema)