# Skip to content
# Go to file
# Cannot retrieve contributors at this time
# 177 lines (110 sloc) 5.18 KB
#more info
from rdflib import Namespace, Literal, URIRef, Graph
import csv, json, re
# RDF graph that collects every triple this script produces.
names = Graph()

# Lookup tables filled from the refined Tulane spreadsheet, all keyed by photo URL:
#   photo_dict  -> list of the Tulane names recorded against that photo
#   photo_place -> place URI of the photo (only stored when the cell is non-empty)
#   photo_date  -> date string of the photo (only stored when the cell is non-empty)
photo_dict = {}
photo_place = {}
photo_date = {}

# newline='' lets the csv module do its own newline handling (recommended by
# the csv documentation so quoted fields containing line breaks parse correctly).
with open('data/tulane-refined.csv', 'r', newline='') as csvfile2:
    tulane_refined = csv.reader(csvfile2, delimiter=',')
    # NOTE(review): every row is indexed, including a header row if the file
    # has one, and a row with fewer than 44 columns raises IndexError.
    for row in tulane_refined:
        full_name = row[4]
        date = row[11]
        photoURL = row[39]
        placeURI = row[43]

        # One photo can depict several people: append the name to the photo's
        # list, creating the list the first time the photo URL is seen.
        photo_dict.setdefault(photoURL, []).append(full_name)

        # If the row has a place URI / date, remember it for the photo.
        # A later row for the same photo overwrites the earlier value.
        if placeURI:
            photo_place[photoURL] = placeURI
        if date:
            photo_date[photoURL] = date
# matt: knows_of_dict mirrors photo_dict, but its values are the people's URIs
# instead of their Tulane names, so it looks something like
# { "<photo URL>": [URI_1, URI_2, URI_3] }
knows_of_dict = {}

# NOTE(review): every IRI string literal in this section ("" / '') is empty in
# this copy of the file; the original values appear to have been stripped and
# must be restored before the generated triples are meaningful.

# Now let's create some triples.
with open('data/tulane_results.json') as viaf_matches_data:
    viaf_matches = json.load(viaf_matches_data)

for a_match in viaf_matches:
    match_info = viaf_matches[a_match]

    # Only matches flagged as high quality are turned into triples.
    if match_info['mapping_quality'] != 'high':
        continue

    mappings = match_info['mapping'][0]
    tulane_name = match_info['tulane_name']

    # matt: these are reset on every iteration so that a value left over from
    # the previous match can never leak into the checks below.
    lc_source = None
    wkp_source = None
    subject = None

    # Each source is a "<PREFIX>|<identifier>" string; keep the identifier of
    # the WKP and LC entries (the last one wins if a prefix occurs repeatedly).
    for a_source in mappings['sources']:
        if 'WKP|' in a_source:
            wkp_source = a_source.split("|")[1]
        if 'LC|' in a_source:
            lc_source = a_source.split("|")[1]

    # Prefer the WKP identifier, fall back to the LC one, otherwise report the
    # match as having neither authority.
    if wkp_source is not None:
        subject = URIRef("" + wkp_source.replace('"', '%22'))
    elif lc_source is not None:
        subject = URIRef("" + lc_source.replace(' ', ''))
    else:
        print(tulane_name, "Non-Wiki or LC auth!", mappings['sources'])

    if not subject:
        continue

    names.add((subject, URIRef(""), URIRef("")))
    names.add((subject, URIRef(""), Literal(tulane_name, lang='en')))

    for key, value in photo_dict.items():
        # Only photos whose name list contains this person are relevant.
        if tulane_name not in value:
            continue

        names.add((subject, URIRef(""), URIRef(key)))

        # Record this person's URI under the photo, creating the list on first
        # use and never storing the same URI twice.
        people_in_photo = knows_of_dict.setdefault(key, [])
        if subject not in people_in_photo:
            people_in_photo.append(subject)

        # Does this photo have a place URI in the photo_place dict?
        if key in photo_place:
            names.add((URIRef(key), URIRef(""), URIRef(photo_place[key])))

        # Does this photo have a date in the photo_date dict?
        if key in photo_date:
            # Use the textual shape of the date (YYYY-MM-DD, YYYY-MM, YYYY or
            # anything else) to choose the datatype IRI of the literal.
            if re.match('^[0-9]{4}-[0-9]{2}-[0-9]{2}$', photo_date[key]):
                date_IRI = ''
            elif re.match('^[0-9]{4}-[0-9]{2}$', photo_date[key]):
                date_IRI = ''
            elif re.match('^[0-9]{4}$', photo_date[key]):
                date_IRI = ''
            else:
                date_IRI = ''
            names.add((URIRef(key), URIRef(""), Literal(photo_date[key], datatype=date_IRI)))
# matt: knows_of_dict now maps each photo URL to the URIs of the people that
# appear in it, so everyone sharing a photo can be linked to everyone else.
# NOTE(review): the predicate IRI below is an empty string in this copy of the file.
for photoURL, people_uris in knows_of_dict.items():
    # A photo with fewer than two people produces no pairs.
    if len(people_uris) < 2:
        continue

    # Visit every ordered pair, so the link is added in both directions.
    for person_x in people_uris:
        for person_y in people_uris:
            # Nobody is linked to themselves.
            if person_x == person_y:
                continue
            names.add((person_x, URIRef(""), person_y))

# For reference, photo_dict built from the large sheet has this shape:
# {
#     "<photo URL>": ["name1", "name2", "name3"]
# }

# Write the whole graph out as N-Triples.
names.serialize(destination="data/names.nt", format="nt")
# You can’t perform that action at this time.