In [9]:
#define function to read mdEditor files, and write out as csv 
#and shapefile with geography, if applicable
#by Tamatha A. Patterson; version 5; December 2022
#distribution section updated.
#extent merged with metadata and written to shapefile. #FeatureCollection type handled.
#associations section updated.

def mdEditor_read(metadataToRead, contact_md, csvname, workspace, resourcetype = True, geo = False):
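    #resourcetype selects the resource types to write: True keeps every record, otherwise only records whose type equals the given value (e.g. "project"); named options such as "all" are planned for future versions.
    #extent/geography is read when geo is True, skipped when False.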
    import os
    import json
    import csv
    import pandas as pd
    import geopandas as gpd
    import collections
    import fiona
    from shapely.geometry import Point, LineString, Polygon, MultiPolygon
    from datetime import date
    
    def removeComma(string):
        """Return ``string`` with every comma replaced by "; ".

        Callers apply this to free-text metadata values (title, abstract,
        purpose, ...) before those values are placed in the CSV row.

        Parameters
        ----------
        string : str or None
            Text to clean. ``None`` (the value ``dict.get`` yields for an
            absent key) is passed through unchanged instead of raising
            ``AttributeError``.

        Returns
        -------
        str or None
            The cleaned text, or ``None`` when ``None`` was given.
        """
        if string is None:
            # Nothing to clean; leave the missing value as-is for the caller.
            return None
        return string.replace(",", "; ")
    
    def listToString(s):
        """Join the items of ``s`` into a single "; "-separated string.

        Parameters
        ----------
        s : iterable
            Items to join, normally strings. ``None`` items are skipped and
            any other non-string item is converted with ``str()``, so a single
            missing value does not raise ``TypeError`` for the whole list.

        Returns
        -------
        str
            The joined text; an empty string when there is nothing to join.
        """
        separator = "; "
        return separator.join(str(item) for item in s if item is not None)
    
    def def_value():
        """Return the placeholder text "none"; takes no arguments."""
        placeholder = "none"
        return placeholder
    
    os.chdir(workspace) # assign working directory
    
    df = pd.read_json(metadataToRead)#read JSON metadata files into dataframe
    values = df.get('data') #assigns metadata to values
    for e in range(0, len(values)): #json file may have multiple metadata records
        element = values[e] #assign list value e containing the metadata #keys= id, attributes, type
        ID = element.get('id') #get metadata id
        attribute = element.get('attributes') # keys= profile, json, data-updated
        typpe = element.get('type') #get metadata type
        if typpe != 'records':  # skip if metadata is not a record and is a data dictionary, setting, schemas, custom-profiles...
            continue  #go to next record 
            
        #parse 'attribute' to create profile, json, and date-updated
        dateUpdate = attribute.get('date-updated') #create dateUpdate value--where does this date come from???
        jsondata = attribute.get('json') #create'json' data value
        profile = attribute.get('profile') #create 'profile' value
        
        #convert string to dictionary
        jsondatadict = json.loads(jsondata)  #3 keys = schema, metadata, mdDictionary
        schema = jsondatadict.get('schema')
        metadata = jsondatadict.get('metadata') #4 keys = metadataInfo, resourceInfo, associatedResource, resourceDistribution\n",
        mdDictionary = jsondatadict.get('mdDictionary')
        #get metadata key entries
        metadataInfo = metadata.get('metadataInfo') #6 Keys = metadataIdentifier, metadataContact, defaultMetadataLocale, metadataDate, parentMetadata, metadataStatus
        resourceInfo = metadata.get('resourceInfo') #12 keys = resourceType, citation, pointOfContact, abstract, shortAbstract, status, defaultResourceLocale, extent, keyword, purpose, taxonomy, timePeriod
        associatedResource = metadata.get('associatedResource') #is list
        resourceDistribution = metadata.get('resourceDistribution') #keys = n
        
        #parse resourceInfo resourceType key of 12: 'resourceType'
        resourceType = resourceInfo.get('resourceType')
        #parse resource Type info
        typelist = resourceType[0]
        typee = typelist.get('type')
        typeename = typelist.get('name')
        if typeename != None: 
            typeename = removeComma(typelist.get('name'))
            
        #check if desired resource type is record resource type, go to next record if not.
        if any([resourcetype == typee, resourcetype == True]):  
            print(f"Search record resource type is {resourcetype} and record resource type is {typee}.")
        else: 
            print(f'Not a {resourcetype}, moving on.')
            continue
            
        #parse metadataInfo dictionary 6 keys
        metadataIdentifier = metadataInfo.get('metadataIdentifier') #Harvest as ID to fields
        metadataContact = metadataInfo.get('metadataContact')
        defaultMetadataLocale = metadataInfo.get('defaultMetadataLocale')
        metadataDate = metadataInfo.get('metadataDate')
        parentMetadata = metadataInfo.get('parentMetadata')
        metadataStatus = metadataInfo.get('metadataStatus') #Harvest as status to fields

        #get metadata uuid identifier; autocreated in mdEditor
        if metadataIdentifier['namespace'] == 'urn:uuid':
                metaIdentifier = metadataIdentifier.get('identifier')

        #parse resourceInfo 11/12 keys: 'resourceType'(above), 'citation', 'pointOfContact', 'abstract', 'shortAbstract', 'status',
        #...'defaultResourceLocale', 'extent', 'keyword', 'purpose', 'taxonomy', 'timePeriod'
        citation = resourceInfo.get('citation')
        pointOfContact = resourceInfo.get('pointOfContact')
        abstract = removeComma(resourceInfo.get('abstract')) #harvested to fields
        if resourceInfo.get('shortAbstract') == None:
            shortAbstract = " "
        else:
            shortAbstract = removeComma(resourceInfo.get('shortAbstract'))#harvested as shortAbstract to fields
        statusList = resourceInfo.get('status')
        status = statusList[0]#harvest as status to fields
        defaultResourceLocale = resourceInfo.get('defaultResourceLocale')
        extent = resourceInfo.get('extent')
        keyword = resourceInfo.get('keyword')
        if resourceInfo.get('purpose') == None:
            purpose = " "
        else:
            purpose = removeComma(resourceInfo.get('purpose')) #harvested to fields
        taxonomy = resourceInfo.get('taxonomy')
        timePeriod = resourceInfo.get('timePeriod')
        
        #find last update date from metadataDate
        #Consider comparing this to last run date and only reading metadata updated after????
        if len(metadataDate) == 1:
            lastUpdate = metadataDate[0].get('date') 
            dateType = metadataDate[0].get('dateType')
        else:
            if len(metadataDate) > 1:
                for i in metadataDate:
                    if i.get('dateType') == "lastUpdate":
                        lastUpdate = (i.get('date')).split('T')[0]
                        dateType = 'last updated'
                    else: 
                        dateType = i.get('dateType')
                        lastUpdate = (i.get('date')).split('T')[0]                   

        #parse citation info
        title = removeComma(citation.get('title')) #harvested as title to fields
        dates = citation.get('date')
        responsibleParty = citation.get('responsibleParty')
        altTitle = citation.get('alternateTitle')
        if altTitle != None:
            altTitle = listToString(altTitle)#Harvested as altTitle to fields
            altTitle = removeComma(altTitle)
         
        #pull citation identifiers (added 10 April 2024)
        #print(citation)
        ids = []
        identify = citation.get('identifier')
        #print (identify)
        if 'identifier' in citation:
            for fier in identify:
                ns = fier.get('namespace')
                i = fier.get('identifier')
                if ns is not None:
                    nsi = str(ns +': '+i)
                    ids.append(nsi) #Harvested to fields
            ids = listToString(ids)
        else:
            ids = 'none'
        #print (ids)
        
        #Get and format startDate and endDate
        try:
            startDate = (timePeriod.get('startDateTime','None')).split('T')[0]
            end = timePeriod.get('endDateTime', 'None')
            if end == None:
                endDate = 'onGoing'
            else:
                endDate = end.split('T')[0]
        except:
            startDate = 'None'
            endDate = 'None'
        
        #KEYWORDS: create empty list for keywords
        klist =[]
        #loop through keyword thesaurus and add keywords to keyword list
        try:
            for g in range(0, len(keyword)):
                word = keyword[g]
                word1 = word.get('keyword')
                for h in range(0, len(word1)):
                    word2 = word1[h]
                    word3 = word2.get('keyword')
                    klist.append(word3)
            keywords = listToString(klist)
        except:
            keywords = 'None'
    
        #TAXONOMY: parse species names from taxonomy; may need to loop if more than one species. updated 14June2024
        if 'taxonomy' not in resourceInfo:# check for taxonomy entry
            taxnameList = 'none'
            comnameList = 'none'
        else:
            taxname = []
            comname = []
            commonname = ""
            taxdic = taxonomy[0] #list len 1 to dictionary
            taxClass = taxdic.get('taxonomicClassification') #list len1
            if len(taxClass) == 1:
                taxSys = taxdic.get('taxonomicSystem')
                if (taxSys[0].get('citation')).get('title')=='Integrated Taxonomic Information System (ITIS)': #if using ITIS taxonomy, continue.
                    taxClass1 = taxClass[0]
                    taxSysID = taxClass1.get('taxonoicSystemID')
                    taxLevel = taxClass1.get('taxonomicLevel')
                    taxName = taxClass1.get('taxonomicName')
                    taxSubClass = taxClass1.get('subClassification')
                    
                    for a in range(0, len(taxSubClass)): 
                        taxSubL = taxSubClass[a]
                        
                        for b in range (0, len(taxSubL)):
                            #taxSysID0 = taxSubL.get('taxonoicSystemID')
                            taxLevel0 = taxSubL.get('taxonomicLevel')
                            #taxName0 = taxSubL.get('taxonomicName')
                            taxSubClass0 = taxSubL.get('subClassification')
                            #taxIs0 = taxSub0.get('isITIS')
                                                
                            for c in range(0,len(taxSubClass0)):
                                subKingdom =taxSubClass0[c]
                                
                                for d in range(0,len(subKingdom)):
                                    infraKingdomL = subKingdom.get('subClassification')
                                    
                                    for e in range(0,len(infraKingdomL)):
                                        infraKingdom = infraKingdomL[0]
                                        phylumL = infraKingdom.get('subClassification')
                                               
                                        for f in range (0,len(phylumL)):
                                            phylum = phylumL[0]
                                            subphylumL = phylum.get('subClassification')
                                                                     
                                            for g in range(0,len(subphylumL)):        
                                                subphylum = subphylumL[0]
                                                infraphylumL = subphylum.get('subClassification')
                                                for h in range(0,len(infraphylumL)):
                                                    infraphylum = infraphylumL[0]
                                                    superclassL = infraphylum.get('subClassification')
                                                                      
                                                    for i in range(0,len(superclassL)):
                                                        superclass = superclassL[0]
                                                        classL = superclass.get('subClassification')
                 
                                                        if classL is not None:
                                                            for j in range(0,len(classL)): 
                                                                classD = classL[0]
                                                                                                                  
                                                                if classD is not None:
                                                                    for k in range(0,len(classD)):
                                                                        orderL = classD.get('subClassification')
                                                                        
                                                                        if orderL is not None:                                                            
                                                                            for l in range(0,len(orderL)):
                                                                                order = orderL[0]
                                                                                familyL = order.get('subClassification')
            
                                                                                if familyL is not None:
                                                                                    for m in range(0,len(familyL)):
                                                                                        family = familyL[m]
                                                                                        genusL = family.get('subClassification')

                                                                                        if genusL is not None:
                                                                                            for n in range(0,len(genusL)):
                                                                                                taxx = genusL[n]
                                                                                                tax = taxx.get('taxonomicName')
                                                                                                if taxx.get('subClassification','none') != 'none':
                                                                                                    sub = taxx.get('subClassification')
                                                                                                    for s in range(0,len(sub)):
                                                                                                        spec = sub[s]
                                                                                                        sp = spec.get('taxonomicName')
                                                                                                        if sp not in taxname:
                                                                                                            taxname.append(sp)
                                                                                                        commm = spec.get('commonName')
                                                                                                        if commm not in comname:
                                                                                                            comname.append(commm)
                                                                                                else:
                                                                                                    taxx = genusL[0]
                                                                                                    taxa = taxx.get('taxonomicName') 
                                                                                                    if taxa not in taxname:
                                                                                                        taxname.append(taxa) #taxname list harvested to output fields
                                                                                                    common = taxx.get('commonName')
                                                                                                    if common not in comname:
                                                                                                        comname.append(common) #comname list harvested to output fields     
                                
                else:
                    taxnameList = 'non-ITIS taxonomy'
                    comnameList = 'non-ITIS taxonomy' 

            #formating taxonomic output:    
            for i in range(len(taxname)):
                taxname[i]=taxname[i].replace("''","")
                
            taxnameList = listToString(taxname)
                
            c2=[]#print (type(comname))
            for c in comname:
                #print(c)# (list(c))
                if c != None and type(c) == list:
                    cstr = listToString(c)
                    cstr.rstrip("'")
                    cstr.rstrip('""')          
                    c2.append(cstr)
            comnameList = listToString(c2)                     
                                             
        #RESOURCE DISTRIBUTION: get resourceDistribution metadata
        resourceDistribution = metadata.get('resourceDistribution')
        distlist = {} #create empty distribution list
        #print (resourceDistribution)
        try:
            for d in range(0, len(resourceDistribution)): #iterate through resourceDistribution info list
                distributor = resourceDistribution[d]
                dist = dict(distributor)
                dist0 = dist.get("distributor")
                dist1 = dist0[0] #dictionary keys = 'contact', 'transferOption'
                contact = dist1.get('contact')
                order = dist1.get('orderProcess')
                transopt = dist1.get('transferOption')
                distrole =contact.get('role') #harvested as distributor role to distlist
                distparty = contact.get('party') #distrbutor contact identifiers
                #if len(distparty) > 1:
                    #for org in range(0,len(distparty)): 
                        #distID = distparty[0]
                        #distributorID = distID.get('contactId') #harvest distributor ID & compare with contact master list be
                #else:
                distID = distparty[0]
                distributorID = distID.get('contactId') #harvest distributor ID & compare with contact master list be
                transopt1 = transopt[0]
                transopt2 = transopt1.get('onlineOption')
                transopt3 = transopt2[0]
                onlineName = transopt3.get('name') #harvested to distlist
                onlineUri = transopt3.get('uri') #harvested to distlist
                distInfo =[distrole, distributorID, onlineName, onlineUri]
                distlist[d] = distInfo
                #print ("distlist = ", distlist)
                distInfoString = '; '.join(distInfo)
    
        except:
            #print(title, ' has NO distribution metadata')
            distInfoString = ' '      

        #Associated product list:
        assoclistString = 'tbd'

        #Extent bounding box
        northlat = 'empty'
        southlat = 'empty'
        eastlong = 'empty'
        westlong = 'enpty'
        geoDis = 'empty'
        if isinstance(extent, list) == True:
            extenlist = (extent[0])
            #Extent description, if present.
            if 'description' in extenlist:
                geoDis = extenlist['description']
            else:
                geoDis = 'noExtentDiscription'
            #Extent bounding box, if present.
            if 'geographicExtent' in extenlist:
                extenGeo = extenlist['geographicExtent']
                extenGeo1 = extenGeo[0]
                if 'boundingBox' in extenGeo1:
                    extenBox = extenGeo1['boundingBox']
                    northlat = extenBox.get('northLatitude')  #harvest for mbm national asset catalog
                    southlat = extenBox.get('southLatitude')  #harvest for mbm national asset catalog
                    eastlong = extenBox.get('eastLongitude')  #harvest for mbm national asset catalog
                    westlong = extenBox.get('westLongitude')  #harvest for mbm national asset catalog      
        else:
            northlat = 'na'
            southlat = 'na'
            eastlong = 'na'
            westlong = 'na'
            geoDis = 'none'

       #POINTS of CONTACT
        #read Master Contact JSON metadata file into dataframe
        contactmetadata = pd.read_json(contact_md)
        contactmd1 = dict(contactmetadata)
        contactmd2 = contactmd1.get('data')
        
        POC = collections.defaultdict(list) # create empty dictionary for contacts
        POCvalues = []
        count = 0

        #iterate through master contact metadata
        for k in contactmd2:
            contactmd3 = contactmd2[count]
            contactmd4 = contactmd3.get('attributes')
            contactmd5 = contactmd4.get('json')
            if contactmd5 is not None:
                contactmd6 = json.loads(contactmd5)
                contactmd7 = dict(contactmd6)
                contactIDmd = contactmd7.get('contactId') #harvest id#
                count += 1
            else:
                continue

            for id in distlist:
                if distlist[id][1] == contactIDmd:
                    contactisOrganizationmd = contactmd7.get('isOrganization')
                    contactName = contactmd7.get('name') #havest as distributor name to fields
                    contactMemberOf = contactmd7.get('memberOfOrganization')
                    contactemail = contactmd7.get('electronicMailAdddress')
                    contactType = contactmd7.get('contactType')
                    distlist[id][1] = contactName 

            # iterate through contacts from metadata
            for j in pointOfContact:
                party = j.get('party')
                for p in range(0, len(party)):
                    partyContact = party[p]
                    partyContactID = partyContact.get('contactId') #id to compare in master contact list
                    role = j.get('role')
    
                    #compare master list contact ID with metadata contact ID
                    if contactIDmd == partyContactID:
                        contactisOrganizationmd = contactmd7.get('isOrganization')
                        contactName = contactmd7.get('name')
                        contactMemberOf = contactmd7.get('memberOfOrganization')
                        contactemail = contactmd7.get('electronicMailAdddress')
                        contactType = contactmd7.get('contactType')
                        POC[role].append(contactName)
                    else:
                        continue

            owner = listToString(POC['owner'])
            PointOC = listToString(POC['pointOfContact'])
            princ = listToString(POC['principalInvestigator'])
            custodian = listToString(POC['custodian'])
            admin = listToString(POC['administrator'])
            originator = listToString(POC['originator'])
            contributor = listToString(POC['contributor'])
            #distlistString = '; '.join(distInfo) 


       #Write vales to CSV
        fields = [ID, ids, typee, title, altTitle, typeename, purpose, abstract, shortAbstract, 
                  PointOC, owner, princ, custodian, admin, originator, contributor, startDate, endDate, lastUpdate, status, 
                  metaIdentifier, metadataStatus, keywords, taxnameList, comnameList, distInfoString, assoclistString,
                  northlat, southlat, eastlong, westlong, geoDis]
        
        #write files to csv
        with open (csvname, 'a', newline = '') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(fields) 
            #print("CSV created./n","Number of contacts in master list = ", len(contactmd2))     


        
        #Extent to geodataframe if True
        if geo == True:  #only gather extents from projects?
            if 'geographicExtent' not in extenlist:
                print ('metadata has NO geographic extent.')
                continue
            else:
                extenGeo = extenlist['geographicExtent']
                extenGeo1 = extenGeo[0]
                if 'geographicElement' not in extenGeo1:  
                    print ('metadata has NO Geometry.')
                    continue
                else:
                    extenGeoElement = extenGeo1['geographicElement'] #type = list
                    geoInput =[] #empty list for geoinput to geodataframe
                    poly = gpd.GeoDataFrame(columns = ['ID', 'ids' 'name','geometry', 'type', 'title', 'altTitle', 'typename', 
                                                       'purpose', 'abstract', 'shortAb', 'PointOC', 'trustee', 'PI', 
                                                       'custodian', 'admin', 'origin', 'contrib', 'startDate', 'endDate', 'lastUpdat',
                                                       'status', 'metaIdent', 'metaStatus', 'keywords', 'taxname', 'comname',
                                                       'distib', 'assoc']) #gEid?
                    for ex in range(0, len(extenGeoElement)): #need id, name, descripiton, geometry
                        gElement = extenGeoElement[ex]  #=dict_keys(['type', 'id', 'geometry', 'properties']) or ['type', 'features']
                        if gElement.get('type') == "FeatureCollection":
                            extrastep = gElement.get('features')
                            nextstep = extrastep[0] #=dict_keys(['type', 'id', 'geometry', 'properties'])
                            gtype = nextstep.get('type') #Feature
                            gEid = nextstep.get('id') #harvest as GeoID to fields
                            gEgeometry = nextstep.get('geometry') #type=dict_keys(['type', 'coordinates'])
                            ggtype = gEgeometry.get('type')
                            gcoordinates = gEgeometry.get('coordinates')#list
                            #print (gcoordinates)
                            gEcoordinates = gcoordinates[0] #list length = 1
                            
                            if len(gEcoordinates) == 1:
                                gEcoordinates = gEcoordinates[0]
                        
                            #poly_coord = Polygon(gEcoordinates)
                            gEproperties = nextstep.get('properties')
                            gname = gEproperties.get ('name', 'NotNamed') #harvested to geodataframe
                            propertyDesc = gEproperties.get('description', 'NotDescribed')
                        
                            if ggtype == 'Polygon':
                                #gcoordinates = gEgeometry.get('coordinates')#list
                                #gEcoordinates = gcoordinates[0] #list
                                poly_coord = Polygon(gEcoordinates)
                                geoattributes = {'id':gEid, 'ids': ids, 'name':gname, 'geometry':poly_coord, 'type':typee, 'title':title, 
                                    'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdat':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #creating dict of geoattrit of geoattribute 
                                geoInput.append(geoattributes)
                            elif ggtype == 'Point':
                                ptcoordinates = gEgeometry.get('coordinates')
                                pt_coord = Point(ptcoordinates)
                                geoattributes = {'id':gEid, 'name':gname, 'geometry':pt_coord, 'type':typee, 'title':title, 
                                   'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdate':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoattributesgeoattributes = {'id':gEid, 'name':gname, 'geometry':pt_coord} #creating dict of geoattributes
                                geoInput.append(geoattributes)
                            elif ggtype == 'MultiPolygon':
                                print ('this is multipolygon')
                                mpoly_coord = MultiPolygon(gEcoordinates)
                                geoattributes = {'id':gEid, 'ids': ids, 'name':gname, 'geometry':mpoly_coord, 'type':typee, 'title':title, 
                                    'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdat':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoatttribut                        
                                geoInput.append(geoattributes)
                            else:
                                continue
                            
                        elif gElement.get('type') == "Feature":
                            gEtype = gElement.get('type') #dict_keys(['type', 'id', 'geometry', 'properties'])
                            gEid = gElement.get('id') #harvest as GeoID to fields
                            gEgeometry = gElement.get('geometry') #type=dict_keys(['type', 'coordinates'])
                            try:
                                gEproperties = gElement.get('properties')
                                gname = gEproperties.get ('name', 'NotNamed') #harvested to geodataframe
                                #propertyDesc = gEproperties.get('description')
                            except: 
                                gname = 'NotDefined'
                            gtype = gEgeometry.get('type') #indicates geometry type: Polygon, Point, line
                            if gtype == 'Polygon':
                                gcoordinates = gEgeometry.get('coordinates')#list
                                gEcoordinates = gcoordinates[0] #list
                                poly_coord = Polygon(gEcoordinates)
                                geoattributes = {'id':gEid, 'ids': ids, 'name':gname, 'geometry':poly_coord, 'type':typee, 'title':title, 
                                   'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdat':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoattributes
                                geoInput.append(geoattributes)
                            elif gtype == 'Point':
                                ptcoordinates = gEgeometry.get('coordinates')
                                pt_coord = Point(ptcoordinates)
                                geoattributes = {'id':gEid, 'ids': ids, 'name':gname, 'geometry':pt_coord, 'type':typee, 'title':title, 
                                   'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdat':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoattributes
                                geoInput.append(geoattributes)
                            elif gtype == 'LineString':
                                ptcoordinates = gEgeometry.get('coordinates')
                                ln_coord = LineString(ptcoordinates)
                                geoattributes = {'id':gEid, 'name':gname, 'geometry':ln_coord, 'type':typee, 'title':title, 
                                   'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdate':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoattributesgeoattributes = {'id':gEid, 'name':gname, 'geometry':pt_coord} #creating dict of geoattributes
                                geoInput.append(geoattributes)
                            elif gtype == 'MultiPolygon':
                                gcoordinates = gEgeometry.get('coordinates')#list
                                gEcoordinates = gcoordinates[0] #list
                                mpoly_coord = MultiPolygon(gEcoordinates)
                                geoattributes = {'id':gEid, 'ids': ids, 'name':gname, 'geometry':mpoly_coord, 'type':typee, 'title':title, 
                                    'altTitle':altTitle, 'typename':typeename, 'purpose':purpose, 'abstract':abstract, 'shortAb':shortAbstract, 
                                    'PointOC':PointOC, 'trustee':owner, 'PI':princ, 'custodian':custodian, 'admin':admin,
                                    'origin':originator, 'contrib':contributor, 'startDate':startDate, 
                                    'endDate':endDate, 'lastUpdat':lastUpdate, 'status':status, 
                                    'metaIdent':metaIdentifier, 'metaStatus':metadataStatus, 'keywords':keywords, 'taxname':taxnameList,
                                    'comname':comnameList, 'distrib':distInfoString, 'assoc':assoclistString} #gEid?} #creating dict of geoattributes
                                geoInput.append(geoattributes)
                            else:
                                continue    
            
                    poly = gpd.GeoDataFrame(geoInput, geometry = 'geometry', crs = "EPSG:4326")  #crs = lat, long designation 
                    polyname = str(workspace + title[0:12] +'.shp') #generate name for shapefile 
        
                            #if geography is desired, then merge metadata with extent and output shapefile
                            #if outShape == True:
                            #Create shapefile from csv with geographic info
                            #metadf = pd.read_csv(csvname, encoding = 'cp1252') #read completed csv into dataframe
            
                            #Subset for projects only
                            #projectOnly = pd.DataFrame(metadf.loc[metadf['typee'] == 'project'])
            
                            #merge geodataframe with metadata dataframe
                            #dfmerge = pd.merge(poly, projectOnly, how='cross')#,left_on='id',right_on='GeoID')
            
                    #write shapefile
                    poly.to_file(polyname, encoding = 'utf-8')#, driver = 'ESRI Shapefile', schema = {"geometry": "Polygon", "properties":{"id":"int"}})
                    print("Geometry found.") 
        else:
            continue
    return
    

In [None]:
# Search the Migratory Bird Management RDR folder for mdEditor JSON exports and
# run mdEditor_read on each one to extract its metadata (and extent geometry).
# Also counts how many mdEditor files were processed.
import os
RDR = '\\\\ifw7ro-file.fws.doi.net\\datamgt\\mbm'
MBMmetadataNo = 0
program = "Migratory Bird Management"

# Pathway to the contacts file you want to use to check against existing vs. new contacts; i.e., master AK contacts file
contact_md = 'C:\\Users\\tpatterson\\OneDrive - DOI\\Documents\\DM_Metadatafiles\\AK_contacts_profiles\\AK-contacts-mdeditor-20250122-150150.json'

# Pathway to the csv file the metadata is written to
csvname = 'C:\\Users\\tpatterson\\OneDrive - DOI\\Documents\\DM_Metadatafiles\\CatalogCSV\\catalogCSVPhase220250404.csv'

# Folder passed to mdEditor_read as its workspace (it writes the shapefiles there)
workspace = 'C:\\Users\\tpatterson\\OneDrive - DOI\\Documents\\DM_Metadatafiles\\CatalogCSV\\MBMExtentTest\\'

# Any folder whose full path contains one of these substrings is ignored.
EXCLUDED_FOLDER_TAGS = ('incoming', 'archive')

# Walk the RDR folder tree and process every mdEditor json file that is not
# under an incoming/archive folder and is not an "init" file.
for root, dirs, files in os.walk(RDR, topdown=True):
    if any(tag in root for tag in EXCLUDED_FOLDER_TAGS):
        # Every path below an excluded folder also contains the tag, so none of
        # it can qualify: empty `dirs` in place to stop os.walk descending
        # (only honoured because topdown=True) and skip this folder's files.
        dirs[:] = []
        continue
    for name in files:
        if 'mdeditor' in name and 'init' not in name and name.endswith('.json'):
            jfile = os.path.join(root, name)
            mdEditor_read(jfile, contact_md, csvname, workspace, resourcetype ='project', geo=True)
            MBMmetadataNo += 1
            print (jfile, " Done.")
# NOTE: this is a count of mdEditor *files* processed; a single file may hold
# several metadata records.
print ("Number of MBM completed mdeditor records in RDR = ", MBMmetadataNo)


Search record resource type is project and record resource type is project.
Geometry found.
Not a project, moving on.
Not a project, moving on.
\\ifw7ro-file.fws.doi.net\datamgt\mbm\mbmjv_001_SeaDuck_Key_Habitat_Atlas\metadata\mbm7jv_001_seaDuckAtlas_mdeditor-20241115-171152.json  Done.
Search record resource type is project and record resource type is project.
Geometry found.
Not a project, moving on.
Not a project, moving on.
Not a project, moving on.
Not a project, moving on.
Not a project, moving on.
\\ifw7ro-file.fws.doi.net\datamgt\mbm\mbmjv_002_SDJV158_Isotopes\metadata\mbm7jv_002_mdeditor-20250225-110237.json  Done.
Search record resource type is project and record resource type is project.
Geometry found.
Not a project, moving on.
Not a project, moving on.
Not a project, moving on.
Not a project, moving on.
\\ifw7ro-file.fws.doi.net\datamgt\mbm\mbmjv_004_SDJV176_ScoterIPM\metadata\mbm7jv_ScoterIPM_mdeditor-20250404-100466.json  Done.
Not a project, moving on.
Not a project, mo

In [30]:
import geopandas as gpd
import pandas as pd
import os

# Folder scanned for shapefiles to merge; merged.shp is written here as well
# (see the merge_polygon_shapefiles call at the end of this cell).
workspace = "C:\\Users\\tpatterson\\OneDrive - DOI\\Documents\\DM_Metadatafiles\\CatalogCSV\\MBMExtentTest\\"

def merge_polygon_shapefiles(directory, output='merged.shp', geom_types=('Polygon',)):
    """Merge the features of every shapefile in ``directory`` into one shapefile.

    Each ``*.shp`` file directly inside ``directory`` (sub-folders are not
    searched) is read with geopandas, reduced to the rows whose geometry type is
    listed in ``geom_types``, and the results are concatenated and written to
    ``output``.

    Parameters
    ----------
    directory : str
        Folder containing the shapefiles to merge.
    output : str, default 'merged.shp'
        Path of the shapefile to write. If this path lies inside ``directory``
        it is skipped while collecting inputs, so re-running the merge does not
        read the previous result back in and duplicate every feature.
    geom_types : iterable of str, default ('Polygon',)
        Geometry type names to keep. The default keeps only plain polygons,
        i.e. MultiPolygon, Point and LineString rows are dropped. A shapefile
        stores a single geometry class, so mixing e.g. points with polygons is
        expected to fail on write.

    Raises
    ------
    ValueError
        If ``directory`` contains no input shapefiles.
    """
    # Normalised absolute form of the output path, used to recognise the output
    # file among the directory entries regardless of how either path was spelled.
    output_path = os.path.normcase(os.path.abspath(output))
    wanted_types = list(geom_types)

    frames = []
    # Sorted so the row order of the merged file does not depend on the
    # arbitrary order in which the OS lists the directory.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.shp'):
            continue
        filepath = os.path.join(directory, filename)
        if os.path.normcase(os.path.abspath(filepath)) == output_path:
            continue  # a previous merge result is not an input
        layer = gpd.read_file(filepath)
        frames.append(layer[layer['geometry'].geom_type.isin(wanted_types)])

    if not frames:
        raise ValueError ("No shapefiles found in the directory.")

    merged_geo = pd.concat(frames, ignore_index=True)
    merged_geo.to_file(output)
    print(f"Successfully merged shapefiles to {output}")
    return

# Merge the shapefiles found in `workspace`; the result is written to merged.shp in that same folder.
merge_polygon_shapefiles(workspace, output= workspace + 'merged.shp')


Successfully merged shapefiles to C:\Users\tpatterson\OneDrive - DOI\Documents\DM_Metadatafiles\CatalogCSV\MBMExtentTest\merged.shp
