##  MAIN CONTROLLER

In [17]:
from config import sparqlTerms, mig_ns, sparql_mig_test, sparql_mig_simple, sparql_mig_dev, vocabs, types
from SPARQLWrapper import JSON, SPARQLWrapper
from utilities import removeNS, PrintException, cleanOutputs
import re, os, concurrent.futures, json, requests, time, datetime
from rdflib import URIRef, Literal, Namespace, Graph


In [18]:
def main():
    """Run the migration for every configured object type.

    For each entry in ``types`` a query object is built, its batch of CONSTRUCT
    queries is generated, and every query group is transformed on a pool of
    eight worker threads. Start and end wall-clock times are printed.

    A failing group does not stop the run: the group name is reported together
    with the exception details and the remaining groups keep going.
    """
    print(datetime.datetime.now().strftime('%H:%M:%S'))
    sparqlData = sparql_mig_simple
    # Iterate over every type of object that needs to be migrated.
    # This is the first splitting of the data for migration.
    cleanOutputs(types)
    for objectType in types:
        # A queryObject knows where it came from and has been split into groups:
        # one group for community, one for collection, and (per the original notes)
        # roughly a thousand each for thesis and generic objects, based on the
        # first folder in the fedora pair tree.
        queryObject = QueryFactory.getMigrationQuery(objectType, sparqlData)
        queryObject.generateQueries()
        print('%s queries generated' % (objectType))
        total = len(queryObject.queries)
        print('%i queries of %s objects to be transformed' % (total, objectType))
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            future_to_group = {
                executor.submit(parellelTransform, queryObject, group): group
                for group in queryObject.queries
            }
            completed = concurrent.futures.as_completed(future_to_group)
            for done, future in enumerate(completed, start=1):
                group = future_to_group[future]
                try:
                    future.result()
                    print("%i of %i %s queries transformed" % (done, total, objectType))
                except Exception:
                    # Say which group failed so it can be re-run on its own.
                    print("%s query group %s failed" % (objectType, group))
                    PrintException()
        #queryObject.postResults()
        print("%s objects transformation completed" % (objectType))
    print(datetime.datetime.now().strftime('%H:%M:%S'))

def parellelTransform(queryObject, group):
    """Transform one query group and upload the result.

    Builds a Data object for ``group`` from ``queryObject``, runs its
    transformation, then posts the resulting graph to the triplestore.
    """
    batch = Data(
        query=queryObject.queries[group],
        group=group,
        sparqlData=queryObject.sparqlData,
        sparqlTerms=sparqlTerms,
        queryObject=queryObject,
    )
    batch.transformData()
    batch.resultsToTriplestore()



##  TRANSFORMATIONS
#### Functions for handling data passed over by the data object. Each call takes a triple, detects what kind of action is needed based on the predicate, sends the triple to the appropriate transformation function, then returns the result to the data handler to be saved.

In [19]:
class Transformation():
    """Predicate-specific transformations for a single triple.

    Triples are dicts with 'subject', 'predicate' and 'object' keys, each holding
    a dict with 'value' and 'type'. Every public method takes ``(triple, objectType)``,
    appends zero or more triples to ``self.output`` and returns that list, or
    ``None`` when the triple is to be dropped.
    """

    _PUBLIC = "http://terms.library.ualberta.ca/public"
    _AUTHENTICATED = "http://terms.library.ualberta.ca/authenticated"
    _PRIVATE = "http://terms.library.ualberta.ca/private"
    _FEDORA_PAIRTREE = "http://gillingham.library.ualberta.ca:8080/fedora/rest/prod/%s/%s/%s/%s/%s"

    def __init__(self):
        self.output = []

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _mint(subject, predicate, value, valueType='uri'):
        """Build a new triple dict; subject and predicate are always typed 'uri'."""
        return {
            'subject': {'value': subject, 'type': 'uri'},
            'predicate': {'value': predicate, 'type': 'uri'},
            'object': {'value': value, 'type': valueType},
        }

    def _appendMapped(self, triple, vocabList):
        """Append one URI-valued triple for every vocab entry whose mapping contains the object value."""
        for vocab in vocabList:
            if triple['object']['value'] in vocab["mapping"]:
                self.output.append(
                    self._mint(triple['subject']['value'], triple['predicate']['value'], vocab["uri"])
                )

    def _replaceObject(self, triple, uri):
        """Overwrite the triple's object in place with ``uri``, append it, and return the output list."""
        triple['object']['value'] = uri
        triple['object']['type'] = 'uri'
        self.output.append(triple)
        return self.output

    def _expandPairTree(self, triple):
        """Turn a bare identifier object into a fedora pair-tree URI (in place), then append the triple."""
        value = triple['object']['value']
        if "http" not in value:
            triple['object']['value'] = self._FEDORA_PAIRTREE % (
                value[0:2], value[2:4], value[4:6], value[6:8], value)
            triple['object']['type'] = 'uri'
        self.output.append(triple)
        return self.output

    # ---------------------------------------------------------- dcterms:language

    def language(self, triple, objectType):
        """Normalise the language value to a URI using ``vocabs["language"]``."""
        self._appendMapped(triple, vocabs["language"])
        return self.output

    # ------------------------------------------------------------------ dc:rights

    def rights(self, triple, objectType):
        """Pass the triple through unchanged.

        Several different license values need to be coerced into one common value;
        per the original note this is still to be confirmed before it is written.
        """
        self.output.append(triple)
        return self.output

    # ------------------------------------------------- acl:visibilityAfterEmbargo

    def aclvisibilityAfterEmbargo(self, triple, objectType):
        """Map the post-embargo visibility to a controlled URI.

        Returns ``None`` (nothing appended) when the value is neither an "open"
        nor a "university_of_alberta" visibility.
        """
        value = triple['object']['value']
        if "open" in value:  # also matches "open access"
            return self._replaceObject(triple, self._PUBLIC)
        if "university_of_alberta" in value:
            return self._replaceObject(triple, self._AUTHENTICATED)
        return None

    # ------------------------------------------------------------ ual:institution

    def institution(self, triple, objectType):
        """Replace whatever institution value was supplied with a fixed LoC authority URI."""
        self.output.append(
            self._mint(
                triple['subject']['value'],
                triple['predicate']['value'],
                'http://id.loc.gov/authorities/names/n79058482',
            )
        )
        return self.output

    # ------------------------------------------------------------ dcterms:license

    # NOTE: the file previously defined ``license`` twice; the earlier pass-through
    # version was silently shadowed by this one and has been removed.
    def license(self, triple, objectType):
        """Convert a textual license to a URI using ``vocabs["license"]``.

        Returns ``None`` for the "publisher's license" placeholder text. When no
        vocabulary entry matches, the original text is kept as a literal on
        dc:rights instead.
        """
        value = triple['object']['value']
        if "I am required to use/link to a publisher's license" in value:
            return None
        # Compare against the length before this call so that triples already in
        # self.output are not mistaken for a license match.
        before = len(self.output)
        self._appendMapped(triple, vocabs["license"])
        if len(self.output) == before:
            self.output.append(
                self._mint(
                    triple['subject']['value'],
                    "http://purl.org/dc/elements/1.1/rights",
                    value,
                    'literal',
                )
            )
        return self.output

    # --------------------------------------------------------------- dcterms:type

    def type(self, triple, objectType):
        """Map generic item types through ``vocabs["type"]``; pass community/collection types through.

        Any other ``objectType`` appends nothing.
        """
        if objectType == 'generic':
            self._appendMapped(triple, vocabs["type"])
        elif objectType in ('community', 'collection'):
            self.output.append(triple)
        return self.output

    # ------------------------------------------------------ membership predicates

    def modelsmemberOf(self, triple, objectType):
        """Expand a bare identifier object into its fedora pair-tree URI."""
        return self._expandPairTree(triple)

    def modelshasMember(self, triple, objectType):
        """Expand a bare identifier object into its fedora pair-tree URI."""
        return self._expandPairTree(triple)

    # --------------------------------------------------------------- access rights

    def accessRights(self, triple, objectType):
        """Map a Hydra auth group to public / authenticated; anything else becomes private."""
        value = triple['object']['value']
        if "http://projecthydra.org/ns/auth/group#public" in value:
            return self._replaceObject(triple, self._PUBLIC)
        if ("http://projecthydra.org/ns/auth/group#university_of_alberta" in value
                or "http://projecthydra.org/ns/auth/group#registered" in value):
            return self._replaceObject(triple, self._AUTHENTICATED)
        return self._replaceObject(triple, self._PRIVATE)

    def available(self, triple, objectType):
        """Keep the triple and additionally assert public dcterms:accessRights for its subject."""
        self.output.append(triple)
        self.output.append(
            self._mint(
                triple['subject']['value'],
                "http://purl.org/dc/terms/accessRights",
                self._PUBLIC,
            )
        )
        return self.output

##  QUERY BUILDER
##### Pulls current mappings from the triplestore and dynamically builds queries in manageable sizes

In [20]:
class Query(object):
    """ Query objects are dynamically generated, and contain SPARQL CONSTRUCT queries with input from the jupiter application profile """

    def __init__(self, objectType, sparqlData, sparqlTerms=sparqlTerms):
        """Set up the three SPARQL clients, the prefix header, and load the property mappings.

        objectType -- name of the object type; also the named graph consulted by getMappings
        sparqlData -- endpoint URL of the triplestore to read data from (simple, test, or dev)
        sparqlTerms -- endpoint URL of the terms store (does not normally change)
        """
        # Previously only subclasses set this attribute; store the argument so the
        # base class also works when it is given the type directly.
        self.objectType = objectType
        self.mapping = []
        self.sparqlTerms = SPARQLWrapper(sparqlTerms)
        self.sparqlData = SPARQLWrapper(sparqlData)
        self.sparqlResults = SPARQLWrapper("http://206.167.181.123:9999/blazegraph/namespace/results/sparql")
        self.sparqlTerms.setMethod("POST")
        self.sparqlData.setMethod("POST")
        self.sparqlResults.setMethod("POST")
        self.queries = {}
        self.splitBy = {}
        self.filename = ""
        self.prefixes = "".join(
            " PREFIX %s: <%s> " % (ns['prefix'], ns['uri']) for ns in mig_ns
        )
        self.getMappings()

    def postResults(self):
        """Insert every file under results/<objectType> into the results triplestore."""
        directory = 'results/%s' % (self.objectType)
        self.sparqlResults.setReturnFormat(JSON)
        for (dirpath, dirnames, filenames) in os.walk(directory):
            for filename in filenames:
                # Read as text: interpolating bytes would embed the b'...' repr
                # in the SPARQL update instead of the triples themselves.
                with open(os.path.join(dirpath, filename), 'r', encoding='utf-8') as f:
                    query = "INSERT DATA {%s}" % (f.read())
                self.sparqlResults.setQuery(query)
                self.sparqlResults.query()

    def getMappings(self):
        """Load (newProperty, oldProperty) pairs for this object type from the terms store.

        Only the four metadata-bearing types have a mapping graph; for any other
        type ``self.mapping`` stays empty.
        """
        if self.objectType in ('collection', 'community', 'generic', 'thesis'):
            query = "prefix ual: <http://terms.library.ualberta.ca/>SELECT * WHERE {GRAPH ual:%s {?newProperty ual:backwardCompatibleWith ?oldProperty} }" % (self.objectType)
            self.sparqlTerms.setReturnFormat(JSON)
            self.sparqlTerms.setQuery(query)
            results = self.sparqlTerms.query().convert()
            for result in results['results']['bindings']:
                self.mapping.append((result['newProperty']['value'], result['oldProperty']['value']))

    def getSplitBy(self):
        """Run ``self.select`` and record one URI stem per group in ``self.splitBy``.

        The group key is the 7th '/'-separated segment of each resource URI and the
        stored value is the URI up to and including that segment.
        """
        # base query only needs 3 prefixes appended to the "select" statement defined by the object
        query = "prefix dcterm: <http://purl.org/dc/terms/> prefix info: <info:fedora/fedora-system:def/model#> prefix ual: <http://terms.library.ualberta.ca/> %s" % (self.select)
        self.sparqlData.setReturnFormat(JSON)
        self.sparqlData.setQuery(query)
        results = self.sparqlData.query().convert()
        for result in results['results']['bindings']:
            segments = result['resource']['value'].split('/')
            # later resources in the same group overwrite with an identical stem
            self.splitBy[segments[6]] = "/".join(segments[:7])

    def generateQueries(self):
        """Populate ``self.queries``; implemented by subclasses."""
        pass

    def writeQueries(self):
        """Dump the generated queries to cache/<objectType>.json for inspection."""
        filename = "cache/%s.json" % (self.objectType)
        with open(filename, 'w+') as f:
            json.dump([self.queries], f, sort_keys=True, indent=4, separators=(',', ': '))
               
class Collection(Query):
    """Builds the single CONSTRUCT query that migrates collection objects."""

    def __init__(self, sparqlData):
        self.objectType = 'collection'
        self.construct = "CONSTRUCT { ?resource info:hasModel 'IRItem'^^xsd:string ; rdf:type pcdm:Collection"
        self.where = ["WHERE { ?resource info:hasModel 'Collection'^^xsd:string . OPTIONAL { ?resource ualids:is_community 'false'^^xsd:boolean } . OPTIONAL { ?resource ualid:is_community 'false'^^xsd:boolean } . OPTIONAL { ?resource ual:is_community 'false'^^xsd:boolean }"]
        self.select = None
        super().__init__(self.objectType, sparqlData)

    def generateQueries(self):
        """Extend the CONSTRUCT/WHERE stems with one clause per mapped property, then cache the query."""
        for where in self.where:
            construct = self.construct
            for newProperty, oldProperty in self.mapping:
                # SPARQL variable name: last path segment of the new property,
                # stripped of '#', '-' and digits.
                variable = re.sub(r'[0-9]+', '', newProperty.split('/')[-1].replace('#', '').replace('-', ''))
                construct = "%s ; <%s> ?%s" % (construct, newProperty, variable)
                where = " %s . OPTIONAL { ?resource <%s> ?%s . FILTER (str(?%s)!='') }" % (where, oldProperty, variable, variable)
            self.queries['collection'] = "%s %s } %s }" % (self.prefixes, construct, where)
        # Was `self.writeQueries` without parentheses, so the cache file was never written.
        self.writeQueries()

class Community(Query):
    """Builds the single CONSTRUCT query that migrates community objects."""

    def __init__(self, sparqlData):
        self.objectType = 'community'
        self.select = None
        self.construct = "CONSTRUCT { ?resource info:hasModel 'IRItem'^^xsd:string ; rdf:type pcdm:Object; rdf:type ual:Community"
        self.where = ["WHERE { ?resource info:hasModel 'Collection'^^xsd:string ; OPTIONAL { ?resource ualids:is_community 'true'^^xsd:boolean } . OPTIONAL { ?resource ualid:is_community 'true'^^xsd:boolean } . OPTIONAL { ?resource ual:is_community 'true'^^xsd:boolean }"]
        super().__init__(self.objectType, sparqlData)

    def generateQueries(self):
        """Append one CONSTRUCT term and one OPTIONAL clause per mapped property, then cache the query."""
        for wherePart in self.where:
            constructPart = self.construct
            for newProp, oldProp in self.mapping:
                # variable name derived from the last path segment of the new property
                tail = newProp.split('/')[-1].replace('#', '').replace('-', '')
                var = re.sub(r'[0-9]+', '', tail)
                constructPart = "%s ; <%s> ?%s" % (constructPart, newProp, var)
                wherePart = " %s . OPTIONAL { ?resource <%s> ?%s . FILTER (str(?%s)!='') }" % (wherePart, oldProp, var, var)
            self.queries['community'] = "%s %s } %s }" % (self.prefixes, constructPart, wherePart)
        self.writeQueries()

class Generic(Query):
    """Builds one CONSTRUCT query per pair-tree group for non-thesis GenericFile objects."""

    def __init__(self, sparqlData):
        self.objectType = 'generic'
        self.construct = "CONSTRUCT { ?resource info:hasModel 'IRItem'^^xsd:string ; dcterm:available ?available ; dcterm:accessRights ?visibility; rdf:type works:Work; rdf:type pcdm:Object ; bibo:owner ?owner ; acl:embargoHistory ?history ; acl:visibilityAfterEmbargo ?visAfter"
        self.where = []
        self.select = "SELECT distinct ?resource WHERE { ?resource info:hasModel 'GenericFile'^^xsd:string ; dcterm:type ?type . filter(str(?type) != 'Thesis'^^xsd:string) }"
        super().__init__(self.objectType, sparqlData)

    def generateQueries(self):
        """Create a query for each group found by getSplitBy and cache the full set."""
        self.getSplitBy()
        # (an unused `query` string built from prefixes + select was removed here)
        for group, stem in self.splitBy.items():
            where = "WHERE {  ?resource info:hasModel 'GenericFile'^^xsd:string ; dcterm:type ?type . filter(str(?type) != 'Thesis'^^xsd:string) . FILTER (contains(str(?resource), '%s'))" % (stem)
            construct = self.construct
            for newProperty, oldProperty in self.mapping:
                # SPARQL variable name: last path segment of the new property,
                # stripped of '#', '-' and digits.
                variable = re.sub(r'[0-9]+', '', newProperty.split('/')[-1].replace('#', '').replace('-', ''))
                construct = "%s ; <%s> ?%s" % (construct, newProperty, variable)
                where = " %s . OPTIONAL { ?resource <%s> ?%s . FILTER (str(?%s)!='') }" % (where, oldProperty, variable, variable)
            self.queries[group] = "%s %s } %s . OPTIONAL {?permission webacl:accessTo ?resource ; webacl:mode webacl:Read ; webacl:agent ?visibility } . OPTIONAL {?permission webacl:accessTo ?resource ; webacl:mode webacl:Write ; webacl:agent ?owner } . OPTIONAL {?resource acl:hasEmbargo ?embargo . OPTIONAL {?embargo acl:embargoReleaseDate ?available } . OPTIONAL {?embargo acl:embargoHistory ?history } . OPTIONAL {?embargo acl:visibilityAfterEmbargo ?visAfter } } }" % (self.prefixes, construct, where)
        self.writeQueries()


class Thesis(Query):
    """Builds one CONSTRUCT query per pair-tree group for GenericFile objects typed 'Thesis'."""

    def __init__(self, sparqlData):
        self.objectType = 'thesis'
        self.construct = "CONSTRUCT { ?resource info:hasModel 'IRItem'^^xsd:string ; dcterm:available ?available ; dcterm:accessRights ?visibility; rdf:type works:Work ; rdf:type pcdm:Object ; rdf:type bibo:Thesis; bibo:owner ?owner ; acl:embargoHistory ?history ; acl:visibilityAfterEmbargo ?visAfter"
        self.where = []
        self.select = "SELECT distinct ?resource WHERE { ?resource info:hasModel 'GenericFile'^^xsd:string ; dcterm:type 'Thesis'^^xsd:string }"
        super().__init__(self.objectType, sparqlData)

    def generateQueries(self):
        """Create a query for each group found by getSplitBy and cache the full set."""
        self.getSplitBy()
        # (an unused `query` string built from prefixes + select was removed here)
        for group, stem in self.splitBy.items():
            where = "WHERE { ?resource info:hasModel 'GenericFile'^^xsd:string ; dcterm:type 'Thesis'^^xsd:string . FILTER (contains(str(?resource), '%s'))" % (stem)
            construct = self.construct
            for newProperty, oldProperty in self.mapping:
                # SPARQL variable name: last path segment of the new property,
                # stripped of '#', '-' and digits.
                variable = re.sub(r'[0-9]+', '', newProperty.split('/')[-1].replace('#', '').replace('-', ''))
                construct = "%s ; <%s> ?%s" % (construct, newProperty, variable)
                where = " %s . OPTIONAL { ?resource <%s> ?%s . FILTER (str(?%s)!='') } " % (where, oldProperty, variable, variable)
            self.queries[group] = "%s %s } %s . OPTIONAL {?permission webacl:accessTo ?resource ; webacl:mode webacl:Read ; webacl:agent ?visibility } . OPTIONAL {?permission webacl:accessTo ?resource ; webacl:mode webacl:Write ; webacl:agent ?owner } . OPTIONAL {?resource acl:hasEmbargo ?embargo . OPTIONAL {?embargo acl:embargoReleaseDate ?available } . OPTIONAL {?embargo acl:embargoHistory ?history } . OPTIONAL {?embargo acl:visibilityAfterEmbargo ?visAfter } } }" % (self.prefixes, construct, where)
        self.writeQueries()


class File(Query):
    """Base query for binary (ldp:NonRDFSource) resources whose URI ends with ``filterType``."""

    def __init__(self, sparqlData, objectType, filterType):
        """Build the CONSTRUCT/WHERE/SELECT stems for one kind of file.

        sparqlData -- endpoint URL of the triplestore to read from
        objectType -- name used for the cache file and results directory
        filterType -- URI suffix that selects the resources
        """
        # Use the constructor arguments rather than relying on the subclass having
        # set these attributes beforehand (subclasses pass the same values).
        self.sparqlData = sparqlData
        self.objectType = objectType
        self.filterType = filterType
        self.rdfType = "<http://www.w3.org/ns/ldp#NonRDFSource>"
        self.pcdmType = "pcdm:File"
        # The variable was spelled ?fixty here, which never matched the ?fixity
        # bound in generateQueries, so the fixity triple was never constructed.
        self.construct = "CONSTRUCT {?file rdf:type %s; rdf:type %s; pcdm:fileOf ?fileset ; iana:describedby ?fixity ; iana:describedby ?fcr ; fedora:hasParent ?fileset ; ?predicate ?object }" % (self.rdfType, self.pcdmType)
        self.where = "WHERE { ?resource rdf:type %s . FILTER ( strEnds(str(?resource), '%s') && " % (self.rdfType, self.filterType)
        self.select = "SELECT distinct ?resource WHERE  {?resource rdf:type %s . FILTER ( strEnds(str(?resource), '%s') ) }" % (self.rdfType, self.filterType)
        super().__init__(self.objectType, self.sparqlData)

    def generateQueries(self):
        """Create one query per group found by getSplitBy and cache the full set."""
        self.getSplitBy()
        for group in self.splitBy.keys():
            self.queries[group] = "%s %s %s strStarts(str(?resource), '%s') ) . ?resource ?predicate ?object . FILTER ( !contains(str(?predicate), 'http://www.iana.org/assignments/relation/') && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasFixityService')  && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasParent') && str(?object)!='' )  . BIND(URI(CONCAT(str(?resource), '/file')) AS ?file)  . BIND(URI(CONCAT(str(?resource), '/fileset')) AS ?fileset) . BIND(URI(CONCAT(str(?resource), '/file/fcr:fixity')) AS ?fixity)  . BIND(URI(CONCAT(str(?resource), '/file/fcr:metadata')) AS ?fcr)}" % (self.prefixes, self.construct, self.where, self.splitBy[group])
        self.writeQueries()


class Fileset(Query):
    """Base query for NonRdfSourceDescription resources whose URI ends with ``<filterType>/fcr:metadata``."""

    def __init__(self, sparqlData, objectType, filterType):
        """Build the CONSTRUCT/WHERE/SELECT stems for one kind of fileset.

        sparqlData -- endpoint URL of the triplestore to read from
        objectType -- name used for the cache file and results directory
        filterType -- path segment preceding '/fcr:metadata' that selects the resources
        """
        # Use the constructor arguments rather than relying on the subclass having
        # set these attributes beforehand (subclasses pass the same values).
        self.sparqlData = sparqlData
        self.objectType = objectType
        self.filterType = filterType
        # NOTE(review): was "works:Fileset"; every CONSTRUCT template in this file
        # spells the class works:FileSet, so the casing is aligned here.
        self.pcdmType = "works:FileSet"
        self.rdfType = "<http://fedora.info/definitions/v4/repository#NonRdfSourceDescription>"
        self.construct = "CONSTRUCT { ?parent pcdm:hasRelatedObject ?relatedObject . ?fileset rdf:type fedora:Container ; rdf:type fedora:Resource ; rdf:type pcdm:Object ; rdf:type works:FileSet; rdf:type <http://www.w3.org/ns/ldp#Container> ; rdf:type <http://www.w3.org/ns/ldp#RDFSource> ; pcdm:hasFile ?file ; pcdm:isMemberOf ?relatedObject ; fedora:hasParent ?relatedObject . ?relatedObject pcdm:relatedObjectOf ?parent ; rdf:type ual:%s ; rdf:type fedora:Container ; rdf:type fedora:Resource ; rdf:type pcdm:Object ; rdf:type works:Work ; rdf:type <http://www.w3.org/ns/ldp#Container> ; rdf:type <http://www.w3.org/ns/ldp#RDFSource> ; pcdm:hasMember ?fileset ; fedora:hasParent ?parent ; ?predicate ?object } " % (self.filterType)
        self.where = "WHERE { ?resource rdf:type %s . FILTER ( strEnds(str(?resource), '%s/fcr:metadata') && " % (self.rdfType, self.filterType)
        self.select = "SELECT distinct ?resource WHERE {?resource rdf:type %s . FILTER ( strEnds(str(?resource), '%s/fcr:metadata') ) }" % (self.rdfType, self.filterType)
        super().__init__(self.objectType, self.sparqlData)

    def generateQueries(self):
        """Create one query per group found by getSplitBy and cache the full set."""
        self.getSplitBy()
        for group in self.splitBy.keys():
            self.queries[group] = "%s %s %s strStarts(str(?resource), '%s')) . ?resource ?predicate ?object FILTER ( !contains(str(?predicate), 'http://www.iana.org/assignments/relation/') && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasFixityService') && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasParent') && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasVersions') && str(?object)!='' )  . BIND(URI(REPLACE(STR(?resource), '/%s/fcr:metadata', '/fileset')) AS ?fileset) . BIND(URI(REPLACE(STR(?resource), '/%s/fcr:metadata', '/file')) AS ?file)  . BIND(URI(REPLACE(STR(?resource), '/%s/fcr:metadata', '')) AS ?parent) . BIND(URI(REPLACE(STR(?resource), '/fcr:metadata', '/relatedObject')) AS ?relatedObject) }" % (self.prefixes, self.construct, self.where, self.splitBy[group], self.filterType, self.filterType, self.filterType)
        self.writeQueries()


class era1statsFile(File):
    """File query restricted to resources whose URI ends in 'era1stats'."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'era1statsFile', "era1stats"
        super().__init__(sparqlData, self.objectType, self.filterType)

        
class era1statsFileset(Fileset):
    """Fileset query restricted to resources whose URI ends in 'era1stats/fcr:metadata'."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'era1statsFileset', "era1stats"
        super().__init__(sparqlData, self.objectType, self.filterType)
        

class fedora3foxmlFile(File):
    """File query restricted to resources whose URI ends in 'fedora3foxml'."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'fedora3foxmlFile', "fedora3foxml"
        super().__init__(sparqlData, self.objectType, self.filterType)
        

class fedora3foxmlFileset(Fileset):
    """Fileset query restricted to resources whose URI ends in 'fedora3foxml/fcr:metadata'."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'fedora3foxmlFileset', "fedora3foxml"
        super().__init__(sparqlData, self.objectType, self.filterType)
        

class contentFile(File):
    """File query for resources whose URI ends in 'content', with its own CONSTRUCT template."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'contentFile', "content"
        super().__init__(sparqlData, self.objectType, self.filterType)
        # Replaces the template the parent initializer assigned.
        self.construct = (
            "CONSTRUCT { ?file rdf:type ldp:NonRDFSource ; rdf:type fedora:Binary ; "
            "rdf:type fedora:Resource ; rdf:type pcdm:File; pcdm:fileOf ?fileset; "
            "iana:describedby ?fcr ; ?predicate ?object }"
        )

        
class contentFileset(Fileset):
    """Fileset query for resources ending in 'content/fcr:metadata', with its own CONSTRUCT template."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'contentFileset', "content"
        super().__init__(sparqlData, self.objectType, self.filterType)
        # Replaces the template the parent initializer assigned.
        self.construct = (
            "CONSTRUCT { ?fileset rdf:type fedora:Container ; rdf:type fedora:Resource ; "
            "rdf:type pcdm:Object ; rdf:type works:FileSet; rdf:type ldp:Container ; "
            "rdf:type ldp:RDFSource ; pcdm:hasFile ?file ; pcdm:memberOf ?parent ; "
            "?predicate ?object }"
        )
        
        
class characterizationFile(File):
    """File query for resources whose URI ends in 'characterization'.

    Uses its own CONSTRUCT template and derives ?fileset by replacing the
    '/characterization' segment of the resource URI with '/fileset'.
    """

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'characterizationFile', "characterization"
        super().__init__(sparqlData, self.objectType, self.filterType)
        # Replaces the template the parent initializer assigned.
        self.construct = (
            "CONSTRUCT { ?file rdf:type ldp:NonRDFSource ; rdf:type fedora:Binary ; "
            "rdf:type fedora:Resource ; rdf:type pcdm:File; pcdm:fileOf ?fileset; "
            "iana:describedby ?fcr ; ?predicate ?object }"
        )

    def generateQueries(self):
        """Create one query per group found by getSplitBy and cache the full set."""
        template = "%s %s %s strStarts(str(?resource), '%s') ) . ?resource ?predicate ?object . FILTER ( !contains(str(?predicate), 'http://www.iana.org/assignments/relation/') && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasFixityService')  && !contains(str(?predicate), 'http://fedora.info/definitions/v4/repository#hasParent') && str(?object)!='' )  . BIND(URI(CONCAT(str(?resource), '/file')) AS ?file)  . BIND(URI(REPLACE(str(?resource), '/characterization', '/fileset')) AS ?fileset) . BIND(URI(CONCAT(str(?resource), '/file/fcr:fixity')) AS ?fixity)  . BIND(URI(CONCAT(str(?resource), '/file/fcr:metadata')) AS ?fcr)}"
        self.getSplitBy()
        for group, stem in self.splitBy.items():
            self.queries[group] = template % (self.prefixes, self.construct, self.where, stem)
        self.writeQueries()
    
class characterizationFileset(Fileset):
    """Fileset query for resources ending in 'characterization/fcr:metadata', typed additionally as ual:fits."""

    def __init__(self, sparqlData):
        # Set on the instance before delegating; the parent initializer builds
        # its query fragments from these values.
        self.sparqlData = sparqlData
        self.objectType, self.filterType = 'characterizationFileset', "characterization"
        super().__init__(sparqlData, self.objectType, self.filterType)
        # Replaces the parent's template; rdfType and pcdmType are the values the
        # parent initializer stored on the instance.
        template = "CONSTRUCT { ?fileset rdf:type %s; rdf:type %s; rdf:type ual:fits ; pcdm:hasFile ?file ; pcdm:memberOf ?parent ; iana:describes ?file ; ?predicate ?object }"
        self.construct = template % (self.rdfType, self.pcdmType)


##  DATA TRANSPORT OBJECTS
##### Runs a query, sends data to get transformed, saves data to appropriate file

In [21]:
class Data(object):
    """Runs one migration query, transforms its results and stores them.

    An instance holds the query for a single group of one object type. The
    transformed triples are collected in an rdflib ``Graph`` and serialised to
    ``results/<objectType>/<group>.nt``.
    """

    def __init__(self, query, group, sparqlData, sparqlTerms, queryObject):
        """Store the query context and make sure the output directory exists.

        Args:
            query: SPARQL query string executed by ``transformData``.
            group: key of the query group; used as the output file name.
            sparqlData: wrapper object used to run ``query``.
            sparqlTerms: stored on the instance; not used inside this class.
            queryObject: supplies ``prefixes`` and ``objectType``.
        """
        self.q = query
        self.prefixes = queryObject.prefixes
        self.group = group
        self.sparqlData = sparqlData
        self.sparqlTerms = sparqlTerms
        self.output = []
        self.graph = Graph()
        self.objectType = queryObject.objectType
        self.directory = "results/%s/" % (self.objectType)
        self.filename = "results/%s/%s.nt" % (self.objectType, group)
        # exist_ok replaces the former "check, then create" sequence: when two
        # workers build Data objects for the same object type at the same time,
        # both could pass the existence check and the slower one would fail
        # with FileExistsError.
        os.makedirs(self.directory, exist_ok=True)

    def transformData(self):
        """Run the query, transform every binding and write the graph as N-Triples."""
        # NOTE(review): these setters mutate self.sparqlData. If the same wrapper
        # object is shared by Data instances running in different threads, one
        # thread could replace another's query before it is executed -- confirm
        # that each worker uses its own wrapper.
        self.sparqlData.setReturnFormat(JSON)
        self.sparqlData.setQuery(self.q)
        # queries a batch of resources from this particular "group"
        results = self.sparqlData.query().convert()['results']['bindings']
        # iterates over each resource and performs transformations
        for result in results:
            result = TransformationFactory().getTransformation(result, self.objectType)
            # Anything that is not a list of triples is skipped.
            if isinstance(result, list):
                for triple in result:
                    s = URIRef(triple['subject']['value'])
                    p = URIRef(triple['predicate']['value'])
                    if triple['object']['type'] == 'uri':
                        o = URIRef(triple['object']['value'])
                    else:
                        # NOTE(review): every non-'uri' object becomes a plain
                        # Literal; a datatype or language tag present in the
                        # binding is not carried over -- confirm this is intended.
                        o = Literal(triple['object']['value'])
                    self.graph.add((s, p, o))
        self.graph.serialize(destination=self.filename, format='nt')

    def resultsToTriplestore(self, url='http://206.167.181.123:9999/blazegraph/namespace/results/sparql'):
        """POST the graph, serialised as N-Triples, to a triplestore endpoint.

        Args:
            url: endpoint that receives the upload; defaults to the address
                that was previously hard-coded in this method.

        Returns:
            The ``requests`` response object of the upload.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
        """
        files = {'file': ('result.nt', self.graph.serialize(format='nt'), 'text/turtle') }
        response = requests.post(url, files=files)
        # Surface failed uploads instead of silently discarding the response.
        response.raise_for_status()
        return response

In [22]:
class QueryFactory():
    @staticmethod
    def getMigrationQuery(objectType, sparqlData):
        """Build the query object registered for ``objectType``.

        Returns a new instance of the matching query class constructed with
        ``sparqlData``, or None when ``objectType`` is not one of the known names.
        """
        # Constructors are wrapped in lambdas so a class name is only resolved
        # when its object type is actually requested.
        builders = {
            "collection": lambda: Collection(sparqlData),
            "community": lambda: Community(sparqlData),
            "thesis": lambda: Thesis(sparqlData),
            "generic": lambda: Generic(sparqlData),
            "era1statsFile": lambda: era1statsFile(sparqlData),
            "era1statsFileset": lambda: era1statsFileset(sparqlData),
            "fedora3foxmlFile": lambda: fedora3foxmlFile(sparqlData),
            "fedora3foxmlFileset": lambda: fedora3foxmlFileset(sparqlData),
            "contentFile": lambda: contentFile(sparqlData),
            "contentFileset": lambda: contentFileset(sparqlData),
            "characterizationFile": lambda: characterizationFile(sparqlData),
            "characterizationFileset": lambda: characterizationFileset(sparqlData),
        }
        build = builders.get(objectType)
        if build is None:
            return None
        return build()

In [23]:
class TransformationFactory():
    @staticmethod
    def getTransformation(triple, objectType):
        """Route a result binding to the transformation registered for its predicate.

        The lookup key is the last '/'-separated segment of the predicate URI with
        '#', '-' and all digits removed. When a transformation is registered for
        that key its result is returned; otherwise the binding is returned
        unchanged, wrapped in a one-element list.
        """
        lastSegment = triple['predicate']['value'].split('/')[-1]
        withoutPunctuation = lastSegment.replace('#', '').replace('-', '')
        function = re.sub(r'[0-9]+', '', withoutPunctuation)
        # lookup key -> name of the Transformation method that handles it
        methodNames = {
            "accessRights": "accessRights",
            "modelsmemberOf": "modelsmemberOf",
            "modelshasMember": "modelshasMember",
            "language": "language",
            "type": "type",
            "rights": "rights",
            "license": "license",
            "ontologyinstitution": "institution",
            "available": "available",
            "aclvisibilityAfterEmbargo": "aclvisibilityAfterEmbargo",
        }
        if function not in methodNames:
            return [triple]
        handler = getattr(Transformation(), methodNames[function])
        return handler(triple, objectType)

In [24]:
if __name__ == "__main__":
    # Start the migration only when this code runs as the top-level program.
    main()

13:34:11
collection queries generated
1 queries of collection objects to be transformed
1 of 1 collection queries transformed
collection objects transformation completed
community queries generated
1 queries of community objects to be transformed
1 of 1 community queries transformed
community objects transformation completed
generic queries generated
1 queries of generic objects to be transformed
1 of 1 generic queries transformed
generic objects transformation completed
thesis queries generated
2 queries of thesis objects to be transformed
1 of 2 thesis queries transformed
2 of 2 thesis queries transformed
thesis objects transformation completed
era1statsFile queries generated
1 queries of era1statsFile objects to be transformed
1 of 1 era1statsFile queries transformed
era1statsFile objects transformation completed
era1statsFileset queries generated
1 queries of era1statsFileset objects to be transformed
1 of 1 era1statsFileset queries transformed
era1statsFileset objects transformati