This notebook creates two subgraphs from the Booksampo SW, 
- one for novels written in Finnish, and 
- one for novels written in another language but translated into Finnish



In [2]:
# import packages

from collections import defaultdict, OrderedDict, Counter
from datetime import datetime

from dateutil.relativedelta import relativedelta
import glob
from itertools import product, combinations
import matplotlib.pyplot as plt
import numpy    as np
#import networkx as nx
from operator import itemgetter
import pandas   as pd
import rdflib as rdflib
import re
from scipy.signal import convolve2d
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from SPARQLWrapper import SPARQLWrapper, JSON, POST, TURTLE
import sys
from rdflib.namespace import XSD, Namespace
from rdflib.term import URIRef

import IPython
import pprint
pp = pprint.PrettyPrinter(indent=4)



from constants import *
from helpers import *

In [3]:
# 
from rdflib import Graph

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')


In [4]:
# colors
import matplotlib.colors as mcolors
from matplotlib import cm
# set style
import seaborn as sns

# set theme for visualizations
sns.set_theme(style="white",palette="PuOr_r")

## Translated novels


In [5]:
# Query
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {

## novel
?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:langOrig ?kieliuri ;
    rel:hasAuthor ?author ;
    rel:pubYear ?vuosi_ .

## author
?author a rel:author ;
	skos:prefLabel ?authorname ;
	foaf:gender ?gender ;
	rel:authorNationality ?kansallisuusuri .

## language
?kieliuri a rel:lang ;
	skos:prefLabel ?kieli .

## nationality
?kansallisuusuri a rel:nationality ;
	skos:prefLabel ?kansallisuus. 

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  ) # original language not Finnish
  OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # translated into Finnish
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  
  # the novel has to have an author
  ?teos kaunokki:tekija ?author.
  ?author skos:prefLabel ?authorname .
OPTIONAL {?author kaunokki:kansallisuus ?kansallisuusuri . OPTIONAL { ?kansallisuusuri skos:prefLabel ?kansallisuus .} }
OPTIONAL  {?author foaf:gender ?gender }


} 
"""

In [6]:
# construct graph based on the query

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [7]:
# parse

from rdflib import Graph
g = Graph()
#g.parse(data=results, format="turtle") # "mygraph.ttl"
g.parse(data=results, format="turtle")

<Graph identifier=N15744ebf765c4cd9a1ed21b0aa1f95a8 (<class 'rdflib.graph.Graph'>)>

In [54]:
# g correct
# foreign
#g = Graph() 
#g.parse('../../data/nat_lang_graph.nt')

<Graph identifier=Nae544eb22f6243d2aabd69c1fd567e8c (<class 'rdflib.graph.Graph'>)>

In [20]:
# find all novels, see how many there are

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/') 

novels=[]
for s, p, o in g.triples((None,  None, None)):
    #print(o)
    if o ==rel.novel:
        #print(str(s))
        novels.append(str(s))
    #print(f"{s} is a  novel")
    
print("found" ,len(set(novels)), "novels")
print(novels[:5])


found 24963 novels
['http://www.yso.fi/onto/kaunokki#ateos_9249', 'http://data.kirjasampo.fi/abstractWork_4344055', 'http://seco.tkk.fi/saha3/u604b23fa-41eb-44f6-8368-1a09df1209d4', 'http://seco.tkk.fi/saha3/ud9041e8c-63c5-4808-9a7c-5d03826f9e3c', 'http://data.kirjasampo.fi/abstractWork_7177234']


In [14]:
# check one as an example to see that the stored information is correct

for s, p, o in g.triples((None,  None, None)):
    #print(o)
    if str(s) == 'http://www.yso.fi/onto/kaunokki#ateos_9249':
    
        print(s,p,o)
        
    


http://www.yso.fi/onto/kaunokki#ateos_9249 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://ldf.fi/relse/novel
http://www.yso.fi/onto/kaunokki#ateos_9249 http://ldf.fi/relse/langOrig http://lexvo.org/id/iso639-3/eng
http://www.yso.fi/onto/kaunokki#ateos_9249 http://www.w3.org/2004/02/skos/core#prefLabel Kolme morsianta
http://www.yso.fi/onto/kaunokki#ateos_9249 http://www.w3.org/2004/02/skos/core#prefLabel The Judas Kiss
http://www.yso.fi/onto/kaunokki#ateos_9249 http://ldf.fi/relse/hasAuthor http://www.yso.fi/onto/kaunokki#person_123175975389690
http://www.yso.fi/onto/kaunokki#ateos_9249 http://ldf.fi/relse/pubYear 1982
http://www.yso.fi/onto/kaunokki#ateos_9249 http://www.w3.org/2004/02/skos/core#prefLabel Judaskyssen


In [15]:
# title lookup

# query the database for novel titles

qres = g.query("""
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?id ?label
WHERE {
  ?id a rel:novel ; skos:prefLabel ?label
}  """)

title_lookup=dict([(str(url), str(label)) for url, label in qres])

list(title_lookup.items())[:10]

[('http://www.btj.fi/at_1678175', 'Petra Delicado ja merkityt tytöt'),
 ('http://data.kirjasampo.fi/abstractWork_9789518830927',
  'Piraatti-Pete ja hirmuiset merirosvot'),
 ('http://www.yso.fi/onto/kaunokki#ateos_9589', 'Ensimmäinen kotka'),
 ('http://www.yso.fi/onto/kaunokki#ateos_3912', 'Yön suojissa'),
 ('http://data.kirjasampo.fi/abstractWork_6834230', "The orphan's tale"),
 ('http://data.kirjasampo.fi/abstractWork_4124516', 'The ordinary princess'),
 ('http://www.yso.fi/onto/kaunokki#ateos_2216', 'Mieheni oli kommunisti'),
 ('http://data.kirjasampo.fi/abstractWork_9789511255659', 'Suden hetki'),
 ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8',
  'Matka avaruudessa'),
 ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a',
  'Kadonneet kaverukset')]

In [27]:
# gender lookup

# query gender for each author

qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?gender
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author foaf:gender ?gender .
}  """)

gender_lookup=defaultdict(set)

for url, label in qres:
    gender_lookup[str(url)].update([str(label)])

gendernovels=len(gender_lookup.keys()) 

gender_lookup2 = {novel:"unk" for novel in novels if novel not in gender_lookup.keys() }
gender_lookup.update(gender_lookup2)
print("Author gender known for", gendernovels,"out of" ,len(gender_lookup), "novels")
print(list(gender_lookup.items())[:10],list(gender_lookup.items())[-10:])

Author gender known for 24389 out of 24963 novels
[('http://www.btj.fi/at_1678175', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://www.yso.fi/onto/k

## Correct nationalities

Remove/unify duplicates and unnecessary nationalities found in the data

In [29]:
# nationality lookup

qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?nationality
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author rel:authorNationality ?nationality .# ?nationality skos:prefLabel ?label.
  #FILTER (LANG(?label)="fi")
}  """)

#novelnat_lookup=dict([(str(url), str(label)) for url, label in qres])
#print(list(gender_lookup.items())[-10:])
import collections 

novelnat=defaultdict(set)
for url, label in qres:
    novelnat[str(url)].update([str(label)])
novelnats=len(novelnat.keys())
novelnat_lookup2 = {novel:"unk" for novel in novels if novel not in novelnat.keys() }
novelnat.update(novelnat_lookup2)

# print out the total number of novels with author nationality
print("Author nationality known for", novelnats ,"/" ,len(novelnat), "novels \n")


print(list(novelnat.items())[:10],list(novelnat.items())[-10:])

Author nationality known for 23549 / 24963 novels 

[('http://www.btj.fi/at_1678175', {'http://www.yso.fi/onto/koko/p13629'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://www.yso.fi/onto/koko/p2224'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://www.yso.fi/onto/koko/p73101'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://www.yso.fi/onto/koko/p73101'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://www.yso.fi/onto/koko/p73101'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://www.yso.fi/onto/koko/p73101'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://www.yso.fi/onto/koko/p73101'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://www.yso.fi/onto/koko/p73101'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://www.yso.fi/onto/koko/p54872'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://www.yso.fi/onto/koko/p3437'})]

In [31]:
# nationality label lookup

qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?nationality (SAMPLE(?natlabel) AS ?natl)
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author rel:authorNationality ?nationality . ?nationality skos:prefLabel ?label.
  
  FILTER (LANG(?label)="fi") # only labels in Finnish
  ?nationality skos:prefLabel ?labelAny.
  #BIND( "unk" as ?label2)
  BIND(COALESCE(?label, ?labelAny) AS ?natlabel )
} GROUP BY ?nationality
""")

nat_lookup=defaultdict(set)
for url, label in qres:
    nat_lookup[str(url)].update([str(label)])
    

print(len(nat_lookup))
print(list(nat_lookup.items())[:10],list(nat_lookup.items())[-10:])

152
[('http://www.yso.fi/onto/koko/p13629', {'espanjalaiset'}), ('http://www.yso.fi/onto/koko/p2224', {'britit'}), ('http://www.yso.fi/onto/koko/p73101', {'yhdysvaltalaiset'}), ('http://www.yso.fi/onto/koko/p54872', {'kanadalaiset'}), ('http://www.yso.fi/onto/koko/p3437', {'ranskalaiset'}), ('http://www.yso.fi/onto/koko/p745', {'saksalaiset'}), ('http://www.yso.fi/onto/koko/p14632', {'alankomaalaiset'}), ('http://www.yso.fi/onto/koko/p6292', {'tanskalaiset'}), ('http://www.yso.fi/onto/koko/p607', {'englantilaiset'}), ('http://seco.tkk.fi/saha3/u996eb3eb-230f-4dc5-b607-8f06acbf66ad', {'libanonilaiset'})] [('http://seco.tkk.fi/saha3/uae9f2b4b-bf6b-4b38-afbe-8d92fa84372c', {'makedonialaiset'}), ('http://seco.tkk.fi/saha3/u4d0f8bfc-1dc4-4d8b-886f-8c4430553d48', {'jamaikalaiset'}), ('http://www.yso.fi/onto/koko/p3295', {'baltit'}), ('http://www.yso.fi/onto/koko/p35074', {'marit'}), ('http://seco.tkk.fi/saha3/ud472f914-dcc0-4ec3-9753-7815d6b15b93', {'norsunluurannikkolaiset'}), ('http://www.

In [32]:
natlabel_lookup=defaultdict(set)
for url, label in qres:
    natlabel_lookup[str(label)].update([str(url)])

In [33]:
# check which nationalities have multiple uris

for k,v in natlabel_lookup.items():
    if len(v)> 1:
        print(k,sorted(v))

sveitsiläiset ['http://seco.tkk.fi/onto/toimo/nationalities/Swiss', 'http://www.yso.fi/onto/koko/p7934']
portugalilaiset ['http://seco.tkk.fi/saha3/ud33f4204-1c6d-40a1-a3ee-2a8158f56921', 'http://www.yso.fi/onto/koko/p52159']
itävaltalaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Austrian', 'http://www.yso.fi/onto/koko/p5291']
skotit ['http://seco.tkk.fi/onto/toimo/nationalities/Scottish_Scots_', 'http://www.yso.fi/onto/koko/p16273']
unkarilaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Hungarian', 'http://www.yso.fi/onto/koko/p34461']
ukrainalaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Ukrainian', 'http://www.yso.fi/onto/koko/p9118']
irakilaiset ['http://seco.tkk.fi/saha3/u84b75979-9611-4eac-96b9-fec1d03bf051', 'http://www.yso.fi/onto/koko/p69907']


In [34]:
# correct these to only use the the yso.fi/... codes
g2=g

allres=[]
for k,v in natlabel_lookup.items():
    if len(v)> 1:
        print(k,sorted(v))
        old= sorted(v)[0]
        new=sorted(v)[1]
        query="""
                 DELETE {?author rel:authorNationality <OLD> }
                 INSERT { ?author rel:authorNationality  <NEW> }
                 WHERE { ?author rel:authorNationality <OLD> }
                 """
        q=query.replace("OLD",old).replace("NEW",new)
        g2.update(q)
        
        qres = g2.query("""

        PREFIX rel: <http://ldf.fi/relse/> 
        PREFIX dct: <http://purl.org/dc/terms/>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?s ?o WHERE { ?author rel:authorNationality ?s . ?s skos:prefLabel ?o. }

        """)
        allres+=qres


sveitsiläiset ['http://seco.tkk.fi/onto/toimo/nationalities/Swiss', 'http://www.yso.fi/onto/koko/p7934']
portugalilaiset ['http://seco.tkk.fi/saha3/ud33f4204-1c6d-40a1-a3ee-2a8158f56921', 'http://www.yso.fi/onto/koko/p52159']
itävaltalaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Austrian', 'http://www.yso.fi/onto/koko/p5291']
skotit ['http://seco.tkk.fi/onto/toimo/nationalities/Scottish_Scots_', 'http://www.yso.fi/onto/koko/p16273']
unkarilaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Hungarian', 'http://www.yso.fi/onto/koko/p34461']
ukrainalaiset ['http://seco.tkk.fi/onto/toimo/nationalities/Ukrainian', 'http://www.yso.fi/onto/koko/p9118']
irakilaiset ['http://seco.tkk.fi/saha3/u84b75979-9611-4eac-96b9-fec1d03bf051', 'http://www.yso.fi/onto/koko/p69907']


In [35]:
# nationality label lookup 2

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?nationality (SAMPLE(?natlabel) AS ?natl)
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author rel:authorNationality ?nationality . ?nationality skos:prefLabel ?label.
  FILTER (LANG(?label)="fi")
  ?nationality skos:prefLabel ?labelAny.
  #BIND( "unk" as ?label2)
  BIND(COALESCE(?label, ?labelAny) AS ?natlabel )
} GROUP BY ?nationality
""")

nat_lookup2=defaultdict(set)
for url, label in qres:
    nat_lookup2[str(url)].update([str(label)])
    

print("new number of different nationalities:",len(nat_lookup2),"\nold number of different nationalities:", len(nat_lookup))
print(list(nat_lookup2.items())[:10],list(nat_lookup2.items())[-10:])

new number of different nationalities: 145 
old number of different nationalities: 152
[('http://www.yso.fi/onto/koko/p13629', {'espanjalaiset'}), ('http://www.yso.fi/onto/koko/p2224', {'britit'}), ('http://www.yso.fi/onto/koko/p73101', {'yhdysvaltalaiset'}), ('http://www.yso.fi/onto/koko/p54872', {'kanadalaiset'}), ('http://www.yso.fi/onto/koko/p3437', {'ranskalaiset'}), ('http://www.yso.fi/onto/koko/p745', {'saksalaiset'}), ('http://www.yso.fi/onto/koko/p14632', {'alankomaalaiset'}), ('http://www.yso.fi/onto/koko/p6292', {'tanskalaiset'}), ('http://www.yso.fi/onto/koko/p607', {'englantilaiset'}), ('http://seco.tkk.fi/saha3/u996eb3eb-230f-4dc5-b607-8f06acbf66ad', {'libanonilaiset'})] [('http://seco.tkk.fi/saha3/uae9f2b4b-bf6b-4b38-afbe-8d92fa84372c', {'makedonialaiset'}), ('http://seco.tkk.fi/saha3/u4d0f8bfc-1dc4-4d8b-886f-8c4430553d48', {'jamaikalaiset'}), ('http://www.yso.fi/onto/koko/p3295', {'baltit'}), ('http://www.yso.fi/onto/koko/p35074', {'marit'}), ('http://seco.tkk.fi/saha3/

In [36]:
# double-check if some nationalities still have multiple uris
for k,v in nat_lookup2.items():
    if len(v)> 1:
        print(k,sorted(v))
        
# none yay!

In [37]:
# natlabel lookup updated

natlabel_lookup2=defaultdict(set)
for url, label in qres:
    natlabel_lookup2[str(label)].update([str(url)])

In [38]:
# print out all nationalities and examine their validity

alist=[]
for n in nat_lookup.values():
    alist += list(n)
print(sorted(alist))



['Burundi', 'Kroatia', 'Päiväntasaajan Guinea', 'Uusi-Seelanti', 'Venezuelan', 'Yhdysvallat', 'afganistanilaiset', 'ahvenanmaalaiset', 'alankomaalaiset', 'albaanit', 'algerialaiset', 'amerikankuubalaiset', 'amerikansuomalaiset', 'amerikkalaiset', 'angolalaiset', 'argentiinalaiset', 'armenialaiset', 'assyrialaiset', 'australialaiset', 'azerbaidžanilaiset', 'baltit', 'bangladeshiläiset', 'baskit', 'belgialainen', 'belgialaiset', 'bolivialaiset', 'bosniahertsegovinalaiset', 'botswanalaiset', 'brasilialaiset', 'britit', 'bulgarialaiset', 'chileläiset', 'dominikaanilaiset', 'egyptiläiset', 'englannin kieli', 'englantilaiset', 'espanjalaiset', 'eteläafrikkalaiset', 'eteläamerikkalaiset', 'etiopialaiset', 'flaamit', 'färingit', 'georgialaiset', 'ghanalaiset', 'grönlantilaiset', 'guadeloupelaiset', 'guatemalalaiset', 'haitilaiset', 'hantit', 'hollannin kieli', 'indonesialaiset', 'inkeriläiset', 'intialaiset', 'irakilaiset', 'irakilaiset', 'iranilaiset', 'irlantilaiset', 'islantilaiset', 'israe

In [39]:
# do the following manual corrections

# remove: toimittajat, sotilaat, jännityskirjallisuus

# update:
# amerikkalaiset = yhdysvaltalaiset
# yhdysvallat = yhdysvaltalaiset
# Uusi-Seelant = uusiseelantilaiset
# belgialainen = belgialaiset
# hollannin kieli = alankomaalaiset
# englannin kieli = britit



mylist= [("amerikkalaiset","yhdysvaltalaiset"),("englantilaiset","britit"), ("Yhdysvallat","yhdysvaltalaiset"),
         ("Uusi-Seelanti","uusiseelantilaiset"), ("belgialainen","belgialaiset"),("hollannin kieli","alankomaalaiset"),
         ("englannin kieli","britit"),("Kroatia","kroaatit"),("skotit","britit"),("walesilaiset","britit")]

In [40]:
# update based on mylist

for o,n in mylist:
    old=list(natlabel_lookup2[o])
    new=list(natlabel_lookup2[n])
    print(o, old, n, new)
    query="""
                 DELETE {?author rel:authorNationality <OLD> }
                 INSERT { ?author rel:authorNationality  <NEW> }
                 WHERE { ?author rel:authorNationality <OLD> }
                 """
    q=query.replace("OLD",old[0]).replace("NEW",new[0])
    g2.update(q)


amerikkalaiset ['http://www.yso.fi/onto/koko/p15010'] yhdysvaltalaiset ['http://www.yso.fi/onto/koko/p73101']
englantilaiset ['http://www.yso.fi/onto/koko/p607'] britit ['http://www.yso.fi/onto/koko/p2224']
Yhdysvallat ['http://www.yso.fi/onto/koko/p63178'] yhdysvaltalaiset ['http://www.yso.fi/onto/koko/p73101']
Uusi-Seelanti ['http://www.yso.fi/onto/koko/p57497'] uusiseelantilaiset ['http://seco.tkk.fi/saha3/u30d7a0cf-b7d3-439c-9cbb-47fede6352fd']
belgialainen ['http://seco.tkk.fi/onto/toimo/nationalities/Belgian'] belgialaiset ['http://www.yso.fi/onto/koko/p7333']
hollannin kieli ['http://www.yso.fi/onto/koko/p13543'] alankomaalaiset ['http://www.yso.fi/onto/koko/p14632']
englannin kieli ['http://www.yso.fi/onto/koko/p15528'] britit ['http://www.yso.fi/onto/koko/p2224']
Kroatia ['http://www.yso.fi/onto/koko/p61314'] kroaatit ['http://www.yso.fi/onto/koko/p2100']
skotit ['http://www.yso.fi/onto/koko/p16273'] britit ['http://www.yso.fi/onto/koko/p2224']
walesilaiset ['http://seco.tkk.f

In [41]:
# delete no-nationalities

for v in ["toimittajat","sotilaat","jännityskirjallisuus"]:
    old=list(natlabel_lookup[v])
    #new=list(natlabel_lookup[n])
    print(old)
    query="""
                 DELETE {?author rel:authorNationality <OLD> }
                 #INSERT { ?author rel:authorNationality  <NEW> }
                 WHERE { ?author rel:authorNationality <OLD> }
                 """
    q=query.replace("OLD",old[0])#.replace("NEW",new[0])
    g2.update(q)

['http://www.yso.fi/onto/koko/p35181']
['http://www.yso.fi/onto/koko/p31721']
['http://www.yso.fi/onto/koko/p1750']


In [43]:
# update lookup

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?nationality
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author rel:authorNationality ?nationality .# ?nationality skos:prefLabel ?label.
  #FILTER (LANG(?label)="fi")
}  """)

#novelnat_lookup=dict([(str(url), str(label)) for url, label in qres])
#print(list(gender_lookup.items())[-10:])
import collections 

novelnat=defaultdict(set)
for url, label in qres:
    novelnat[str(url)].update([str(label)])
    
novelnat_lookup2 = {novel:"unk" for novel in novels if novel not in novelnat.keys() }
novelnat.update(novelnat_lookup2)
print(len(novelnat))
print(list(novelnat.items())[:10],list(novelnat_lookup2.items())[-10:])

24963
[('http://www.btj.fi/at_1678175', {'http://www.yso.fi/onto/koko/p13629'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://www.yso.fi/onto/koko/p2224'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://www.yso.fi/onto/koko/p73101'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://www.yso.fi/onto/koko/p73101'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://www.yso.fi/onto/koko/p73101'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://www.yso.fi/onto/koko/p73101'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://www.yso.fi/onto/koko/p73101'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://www.yso.fi/onto/koko/p73101'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://www.yso.fi/onto/koko/p54872'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://www.yso.fi/onto/koko/p3437'})] [('http://www.yso.fi/onto/kaunokki#ateos_7919

In [46]:
# count novels per nationality
countdict=defaultdict(int)

for k,v in novelnat.items():
    if v == "unk":
        countdict[v] +=1
    else:
        for val in v:
            label=list(nat_lookup[val])
            #print(label)
            if len(label) ==1:
                countdict[label[0]] += 1


In [47]:
# count per nationality
pd.DataFrame(countdict.items()).sort_values(by=1,ascending=False).head(30)

Unnamed: 0,0,1
2,yhdysvaltalaiset,6993
1,britit,5664
9,ruotsalaiset,2840
132,unk,1414
4,ranskalaiset,1166
5,saksalaiset,1088
13,norjalaiset,844
10,suomalaiset,744
15,venäläiset,569
7,tanskalaiset,532


In [48]:
# count how many works we have to compare with the earlier version

Q_kans ="""
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?kansallisuus (SAMPLE(?klabel) AS ?label) (COUNT(DISTINCT ?author) as ?authorcount) WHERE {
  
  ?teos  a rel:novel ; rel:hasAuthor ?author .
  #?teos kaunokki:manifests_in ?julkaisu .
  ?teos rel:pubYear ?vuosi . #?vuosi skos:prefLabel ?vuosiluku .
  BIND(regex(str(?vuosi), "199" ) AS ?decade)
  FILTER(?decade = xsd:boolean("true"))
  ?author rel:authorNationality ?kansallisuus . OPTIONAL { ?kansallisuus skos:prefLabel ?klabel FILTER(LANG(?klabel)="fi") }
  } GROUP BY ?kansallisuus
ORDER BY DESC(xsd:integer(?authorcount))
"""

decades=[190,191,192,193,194,195,196,197,198,199,200,201]

qres = g2.query(Q_kans)

In [49]:
df_kans=pd.DataFrame()
for d in decades:
    print(d)
    #sparql = SPARQLWrapper("http://ldf.fi/kirjasampo/sparql")
    q=Q_kans.replace("199",str(d))
    qres = g2.query(q)
    for row in qres:
        nat = str(row.asdict()['kansallisuus'].toPython())   
        natname = str(row.asdict()['label'].toPython())   
        a = str(row.asdict()['authorcount'].toPython()) 
        temp=pd.DataFrame({"nationality":nat,"nlabel":natname,"authors":a}.values() ).T.rename({0:"nationality",1:"nlabel",2:"authors"},axis=1)
        temp["decade"] = d*10
        df_kans=pd.concat([df_kans,temp])
df_kans

190
191
192
193
194
195
196
197
198
199
200
201


Unnamed: 0,nationality,nlabel,authors,decade
0,http://www.yso.fi/onto/koko/p745,saksalaiset,36,1900
0,http://www.yso.fi/onto/koko/p2224,britit,22,1900
0,http://www.yso.fi/onto/koko/p16897,ruotsalaiset,21,1900
0,http://www.yso.fi/onto/koko/p3437,ranskalaiset,19,1900
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,14,1900
...,...,...,...,...
0,http://www.yso.fi/onto/koko/p60528,Burundi,1,2010
0,http://www.yso.fi/onto/koko/p53709,perulaiset,1,2010
0,http://seco.tkk.fi/saha3/ua57be8b1-e990-4896-b...,bosniahertsegovinalaiset,1,2010
0,http://seco.tkk.fi/saha3/ubac7413a-895d-41a5-b...,venezuelalaiset,1,2010


In [50]:
# for example, see how many American authors:
df_kans[df_kans["nlabel"]=="yhdysvaltalaiset"].sort_values(by="nlabel").head(30)

Unnamed: 0,nationality,nlabel,authors,decade
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,14,1900
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,26,1910
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,44,1920
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,39,1930
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,106,1940
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,144,1950
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,235,1960
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,282,1970
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,436,1980
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,502,1990


In [51]:
natlabel_lookup["australialaiset"]

{'http://www.yso.fi/onto/koko/p9135'}

In [52]:
# sum of authors per decade
Q_dec ="""
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT  (COUNT(DISTINCT ?author) as ?authorcount) WHERE {
  
  ?teos  a rel:novel ; rel:hasAuthor ?author .
  #?teos kaunokki:manifests_in ?julkaisu .
  ?teos rel:pubYear ?vuosi . #?vuosi skos:prefLabel ?vuosiluku .
  BIND(regex(str(?vuosi), "199" ) AS ?decade)
  FILTER(?decade = xsd:boolean("true"))
  ?author rel:authorNationality ?kansallisuus . OPTIONAL { ?kansallisuus skos:prefLabel ?klabel FILTER(LANG(?klabel)="fi") }
  } #GROUP BY ?kansallisuus
ORDER BY DESC(xsd:integer(?authorcount))
"""

decades=[197,198,199,200,201]

df_dec=pd.DataFrame()
for d in decades:
    #print(d)
    #sparql = SPARQLWrapper("http://ldf.fi/kirjasampo/sparql")
    q=Q_dec.replace("199",str(d))
    qres = g2.query(q)
    for row in qres:
        #nat = str(row.asdict()['kansallisuus'].toPython())   
        #natname = str(row.asdict()['label'].toPython())   
        a = str(row.asdict()['authorcount'].toPython()) 
        temp=pd.DataFrame({"authors":a}.values() ).T.rename({0:"authors"},axis=1)
        temp["decade"] = d*10
        df_dec=pd.concat([df_dec,temp])
df_dec

Unnamed: 0,authors,decade
0,999,1970
0,1283,1980
0,1316,1990
0,1700,2000
0,1990,2010


In [53]:
# total per nationality and gender

Q_kans ="""
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?kansallisuus ?genderinfo (SAMPLE(?klabel) AS ?label) (COUNT(DISTINCT ?author) as ?authorcount) WHERE {
  
  ?teos  a rel:novel ; rel:hasAuthor ?author ; rel:pubYear ?year .
  FILTER(?year > 1949)
  ?author rel:authorNationality ?kansallisuus . ?kansallisuus skos:prefLabel ?klabel FILTER(LANG(?klabel)="fi") .
  
  OPTIONAL { ?author foaf:gender ?gender }
  BIND("unk" as ?gender2)
  BIND(COALESCE(?gender, ?gender2) AS ?genderinfo)
  } GROUP BY ?kansallisuus ?genderinfo
ORDER BY DESC(xsd:integer(?authorcount))
"""

decades=[197,198,199,200,201]

qres = g2.query(Q_kans)
df_authors=pd.DataFrame()


In [54]:
# for a check, compare authors in my graph and the original SW
for row in qres:
    nat = str(row.asdict()['kansallisuus'].toPython())   
    natname = str(row.asdict()['label'].toPython())   
    a = str(row.asdict()['authorcount'].toPython()) 
    ge = str(row.asdict()['genderinfo'].toPython()) 
    temp=pd.DataFrame({"nationality":nat,"nlabel":natname,"authors":a,"gender":ge}.values() ).T.rename({0:"nationality",1:"nlabel",2:"authors",3:"gender"},axis=1)
    #temp["decade"] = d*10
    df_authors=pd.concat([df_authors,temp])
df_authors["authors"]=pd.to_numeric(df_authors["authors"])
df_authors.sort_values(by="authors", ascending=False).head(20)

Unnamed: 0,nationality,nlabel,authors,gender
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,1048,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p73101,yhdysvaltalaiset,923,http://www.yso.fi/onto/kaunokki#female
0,http://www.yso.fi/onto/koko/p2224,britit,728,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p2224,britit,610,http://www.yso.fi/onto/kaunokki#female
0,http://www.yso.fi/onto/koko/p16897,ruotsalaiset,307,http://www.yso.fi/onto/kaunokki#female
0,http://www.yso.fi/onto/koko/p16897,ruotsalaiset,274,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p3437,ranskalaiset,268,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p745,saksalaiset,207,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p8327,venäläiset,164,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p745,saksalaiset,135,http://www.yso.fi/onto/kaunokki#female


In [56]:
# e.g. df Brits
df_authors[df_authors["nlabel"]=="britit"].sort_values(by="authors", ascending=False)#.tail(20)


Unnamed: 0,nationality,nlabel,authors,gender
0,http://www.yso.fi/onto/koko/p2224,britit,728,http://www.yso.fi/onto/kaunokki#male
0,http://www.yso.fi/onto/koko/p2224,britit,610,http://www.yso.fi/onto/kaunokki#female
0,http://www.yso.fi/onto/koko/p2224,britit,12,unk


In [55]:
# SPARQL query the Booksampo SW

HC="""
SELECT  (COUNT(DISTINCT ?author) as ?authorcount) (GROUP_CONCAT(?author; separator=";") AS ?authorUris) WHERE {
  
  ?teos  a kaunokki:romaani ; kaunokki:tekija ?author ; skos:prefLabel ?title.
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  )
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  #BIND(regex(str(?vuosiluku), "197" ) AS ?decade)
  
  #FILTER(?decade = xsd:boolean("true"))
  FILTER(xsd:integer(?vuosiluku) > 1949)
  
  ?author kaunokki:kansallisuus ?kansallisuus . 
  OPTIONAL { ?author foaf:gender ?gender }
  VALUES ?kansallisuus { <http://www.yso.fi/onto/koko/p2224> <http://www.yso.fi/onto/koko/p607> 	<http://www.yso.fi/onto/koko/p15528> <http://www.yso.fi/onto/koko/p16273> <http://seco.tkk.fi/onto/toimo/nationalities/Scottish_Scots_> <http://seco.tkk.fi/saha3/u47895875-438a-47bb-9565-aec890451cb8> }
  #VALUES ?kansallisuus { <http://www.yso.fi/onto/koko/p15010> <http://www.yso.fi/onto/koko/p73101> <http://www.yso.fi/onto/koko/p63178> }
  OPTIONAL { ?kansallisuus skos:prefLabel ?klabel FILTER(LANG(?klabel)="fi") }
  } GROUP BY ?gender
  """


sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")
#sparql = SPARQLWrapper("http://ldf.fi/kirjasampo/sparql")
sparql.setQuery(PREFIXES + HC)
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])
results = sparql.query().convert()

In [58]:
df_check = JSON2Pandas2(results)
df_check

# the numbers seem to match

Unnamed: 0,authorcount,authorUris
0,12,http://www.yso.fi/onto/kaunokki#person_1231759...
1,728,http://www.yso.fi/onto/kaunokki#person_1231759...
2,610,http://www.yso.fi/onto/kaunokki#person_1231759...


In [38]:
# seems like g2 got the nationalities right!

# save work until now

#g2.serialize(destination="../../data/nationality_graph.ttl", format="turtle")


<Graph identifier=N7d19e4acd9ab4cad97a82c5c437beb63 (<class 'rdflib.graph.Graph'>)>

## Correct languages



The process is somewhat similar to nationalities: print out all languages, 
see if there are duplicates or incorrect annotations, and clean them up

In [63]:
# initial language lookup
# all books should have a language to be included
qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  #?author foaf:gender ?gender .
}  """)

lang_lookup=defaultdict(set)
for url, label in qres:
    lang_lookup[str(url)].update([str(label)])
#print(list(gender_lookup.items())[-10:])
print(len(lang_lookup))
lang_lookup2 = {novel:"unk" for novel in novels if novel not in lang_lookup.keys() }
lang_lookup.update(lang_lookup2)
print(len(lang_lookup)) 
print(list(lang_lookup.items())[:10],list(lang_lookup.items())[-10:])

24963
24963
[('http://www.btj.fi/at_1678175', {'http://lexvo.org/id/iso639-3/spa'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://lexvo.org/id/iso639-3/eng'})] [('http://seco.tkk.fi/saha3/ue876b69b-66d0-4f4d-b7f0-1fef463e98c9', {'htt

In [64]:
# split the name from the last slahs to get the language code

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  
}  """)

langlabel_lookup = {}

for l in qres:
    lang=l["lang"].toPython()
    #print(lang)
    if lang not in langlabel_lookup.keys():
        langlabel_lookup[lang]= lang.split("/")[-1]
    else:
        print("oop",lang)

In [89]:
# examine the keys
print(langlabel_lookup.keys())

dict_keys(['http://lexvo.org/id/iso639-3/spa', 'http://lexvo.org/id/iso639-3/eng', 'http://lexvo.org/id/iso639-3/fra', 'http://lexvo.org/id/iso639-3/deu', 'http://lexvo.org/id/iso639-3/dan', 'http://lexvo.org/id/iso639-3/swe', 'http://lexvo.org/id/iso639-3/ita', 'http://lexvo.org/id/iso639-3/sme', 'http://lexvo.org/id/iso639-3/nor', 'http://lexvo.org/id/iso639-3/rus', 'http://lexvo.org/id/iso639-3/jpn', 'http://lexvo.org/id/iso639-3/ara', 'http://lexvo.org/id/iso639-3/heb', 'http://lexvo.org/id/iso639-3/ell', 'http://lexvo.org/id/iso639-3/isl', 'http://lexvo.org/id/iso639-3/est', 'http://lexvo.org/id/iso639-3/por', 'http://lexvo.org/id/iso639-3/zho', 'http://lexvo.org/id/iso639-3/pol', 'http://lexvo.org/id/iso639-3/afr', 'http://lexvo.org/id/iso639-3/nld', 'http://lexvo.org/id/iso639-3/ces', 'http://lexvo.org/id/iso639-3/lav', 'http://lexvo.org/id/iso639-3/hun', 'http://lexvo.org/id/iso639-3/yid', 'http://lexvo.org/id/iso639-3/bul', 'http://lexvo.org/id/iso639-3/lat', 'http://lexvo.org

In [77]:
# get the not-lexvo codes
oldlang=[]
for k,v in langlabel_lookup.items():
    if "lexvo" not in k:
        print(k,v)
        oldlang.append(k)

http://www.lingvoj.org/lang/smi smi
http://www.lingvoj.org/lang/en-gb en-gb
http://www.lingvoj.org/lang/bnt bnt
http://www.lingvoj.org/lang/fiu fiu
http://www.lingvoj.org/lang/be-x-old be-x-old


In [68]:
# correct vec, as it is a mistake in the annotations
oldlang=[]
for k,v in langlabel_lookup.items():
    if "vec" in k:
        print(k,v)
        oldlang.append(k)

http://lexvo.org/id/iso639-3/vec vec


In [69]:
# correct list
# smi http://www.lexvo.org/page/iso639-5/smi
# en-gb http://lexvo.org/id/iso639-3/eng
# bnt http://www.lexvo.org/page/iso639-5/bnt
# fiu http://www.lexvo.org/page/iso639-5/fiu
# be-x-old http://www.lexvo.org/page/iso639-3/bel

newlang=["http://www.lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://www.lexvo.org/page/iso639-5/bnt",
        "http://www.lexvo.org/page/iso639-5/fiu","http://www.lexvo.org/page/iso639-3/bel"]
updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/page/iso639-3/bel"]
updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/page/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/id/iso639-3/bel"]

In [73]:
# vec to rus
for o,n in zip(["http://lexvo.org/id/iso639-3/vec"],["http://lexvo.org/id/iso639-3/rus"]):
    
    print(o,n)
    old= o
    new=n.replace("page","id")
    print(o,new)
    query="""
    PREFIX rel: <http://ldf.fi/relse/>
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    g2.update(q)

http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus


In [78]:
# correct my own mistake to have the right URIs with "id" instade of "page" in the URI
for o,n in zip(oldlang,newlang):
    
    print(o,n)
    #old= n
    #new=n.replace("page","id")
    print("after correction:\n",o,n)
    query="""
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",o).replace("NEW",n)
    g2.update(q)

http://www.lingvoj.org/lang/smi http://www.lexvo.org/page/iso639-5/smi
after correction:
 http://www.lingvoj.org/lang/smi http://www.lexvo.org/page/iso639-5/smi
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
after correction:
 http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://www.lingvoj.org/lang/bnt http://www.lexvo.org/page/iso639-5/bnt
after correction:
 http://www.lingvoj.org/lang/bnt http://www.lexvo.org/page/iso639-5/bnt
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/page/iso639-5/fiu
after correction:
 http://www.lingvoj.org/lang/fiu http://www.lexvo.org/page/iso639-5/fiu
http://www.lingvoj.org/lang/be-x-old http://www.lexvo.org/page/iso639-3/bel
after correction:
 http://www.lingvoj.org/lang/be-x-old http://www.lexvo.org/page/iso639-3/bel


In [79]:
# correct my own mistake to have the right URIs with "id" instade of "page" in the URI
for o,n in zip(newlang,updated):
    
    print(o,n)
    old= n
    new=n.replace("page","id")
    print("after correction:\n",o,new)
    query="""
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    g2.update(q)
    
    

http://www.lexvo.org/page/iso639-5/smi http://lexvo.org/page/iso639-5/smi
after correction:
 http://www.lexvo.org/page/iso639-5/smi http://lexvo.org/id/iso639-5/smi
http://lexvo.org/id/iso639-3/eng http://lexvo.org/id/iso639-3/eng
after correction:
 http://lexvo.org/id/iso639-3/eng http://lexvo.org/id/iso639-3/eng
http://www.lexvo.org/page/iso639-5/bnt http://lexvo.org/page/iso639-5/bnt
after correction:
 http://www.lexvo.org/page/iso639-5/bnt http://lexvo.org/id/iso639-5/bnt
http://www.lexvo.org/page/iso639-5/fiu http://lexvo.org/page/iso639-5/fiu
after correction:
 http://www.lexvo.org/page/iso639-5/fiu http://lexvo.org/id/iso639-5/fiu
http://www.lexvo.org/page/iso639-3/bel http://lexvo.org/page/iso639-3/bel
after correction:
 http://www.lexvo.org/page/iso639-3/bel http://lexvo.org/id/iso639-3/bel


In [80]:
for n in newlang:
    g2.add((URIRef(n), RDF.type, rel.lang)) # make sure the type lang exists

In [81]:
# new language lookup

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang  ; rel:pubYear ?year .
  FILTER(?year > 1899)
  #?author foaf:gender ?gender .
}  """)

lang_lookup=defaultdict(set)
for url, label in qres:
    lang_lookup[str(url)].update([str(label)])

print(len(lang_lookup))

print(list(lang_lookup.items())[:10],list(lang_lookup.items())[-10:])

24680
[('http://www.btj.fi/at_1678175', {'http://lexvo.org/id/iso639-3/spa'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://lexvo.org/id/iso639-3/eng'})] [('http://seco.tkk.fi/saha3/ue876b69b-66d0-4f4d-b7f0-1fef463e98c9', {'http://le

In [82]:
oldlang=[]
for k,v in langlabel_lookup.items():
    if "page"  in k:
        print(k,v)
        oldlang.append(k)
oldlang # so all with correct "id" URIs

[]

In [83]:
# count languages

countdict=defaultdict(int)
n=0
for k,v in lang_lookup.items():
    if v == "unk":
        print(v)
        countdict[v] +=1
    else:
        for val in v:
            label=val.split("/")[-1]
            #print(label)
            #if len(label) ==1:
            countdict[label] += 1
            #elif len(label) > 1:
             #   print(label)

In [92]:

#see the tail

natdf=pd.DataFrame(countdict.items())
natdf.sort_values(by=1,ascending=False).tail(30)

Unnamed: 0,0,1
53,fas,5
54,grc,5
37,hrv,5
51,slk,4
26,lat,4
38,kat,3
28,ukr,3
49,lit,3
48,nno,3
39,ind,2


In [91]:

# SPARQL checks

HC="""
SELECT  (COUNT(DISTINCT ?teos) as ?authorcount) (GROUP_CONCAT(?author; separator=";") AS ?authorUris) WHERE {
  
  ?teos  a kaunokki:romaani ; kaunokki:tekija ?author ; skos:prefLabel ?title.
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  )
  VALUES ?kieliuri { <http://lexvo.org/id/iso639-3/ces> }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  #BIND(regex(str(?vuosiluku), "197" ) AS ?decade)
  
  #FILTER(?decade = xsd:boolean("true"))
  FILTER(xsd:integer(?vuosiluku) > 1899)
  
  
  }
  """


sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")
#sparql = SPARQLWrapper("http://ldf.fi/kirjasampo/sparql")
sparql.setQuery(PREFIXES + HC)
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])
results = sparql.query().convert()

df_check = JSON2Pandas2(results)
df_check

Unnamed: 0,authorcount,authorUris
0,52,http://www.yso.fi/onto/kaunokki#person_1231759...


In [93]:
#matches with my amount
natdf[natdf[0]=="ces"]

Unnamed: 0,0,1
21,ces,52


In [94]:
# remove all language labels
for s, p, o in g2.triples((None, RDF.type, rel.lang)):
    #g.add((s, FOAF['name'], o))
    print(s)
    g2.remove((s, SKOS.prefLabel, None))

http://lexvo.org/id/iso639-3/zho
http://lexvo.org/id/iso639-3/hin
http://lexvo.org/id/iso639-3/slv
http://lexvo.org/id/iso639-3/fit
http://lexvo.org/id/iso639-3/jpn
http://lexvo.org/id/iso639-3/ben
http://lexvo.org/id/iso639-3/sqi
http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/swe
http://www.lingvoj.org/lang/be-x-old
http://lexvo.org/id/iso639-3/spa
http://lexvo.org/id/iso639-3/afr
http://lexvo.org/id/iso639-3/hye
http://lexvo.org/id/iso639-3/dan
http://lexvo.org/id/iso639-3/som
http://lexvo.org/id/iso639-3/pol
http://lexvo.org/id/iso639-3/vec
http://lexvo.org/id/iso639-3/hrv
http://lexvo.org/id/iso639-3/chm
http://lexvo.org/id/iso639-3/nor
http://www.lingvoj.org/lang/en-gb
http://lexvo.org/id/iso639-3/heb
http://lexvo.org/id/iso639-3/tur
http://lexvo.org/id/iso639-3/kor
http://lexvo.org/id/iso639-3/yid
http://lexvo.org/id/iso639-3/eng
http://lexvo.org/id/iso639-3/hun
http://lexvo.org/id/iso639-3/slk
http://lexvo.org/id/iso639-3/vie
http://lexvo.org/id/iso639-3/hbs
http:

In [78]:
#save!

#g2.serialize(destination="../../data/nat_lang_graph2.nt",format="nt")




<Graph identifier=Nae544eb22f6243d2aabd69c1fd567e8c (<class 'rdflib.graph.Graph'>)>

# Finnish graph

In [96]:
# Query
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:langOrig ?kieliuri ;
    rel:hasAuthor ?author ;
    rel:pubYear ?vuosi_ .


?author a rel:author ;
	skos:prefLabel ?authorname ;
	foaf:gender ?gender ;
	rel:authorNationality ?kansallisuusuri .

?kieliuri a rel:lang .
	#skos:prefLabel ?kieli .

?kansallisuusuri a rel:nationality ;
	skos:prefLabel ?kansallisuus. 

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri =  <http://lexvo.org/id/iso639-3/fin>  ) # original language Finnish
  
  #OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # publication language Finnish
  
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  ?julkaisu kaunokki:onEnsimmainenVersio ?true .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  FILTER(?vuosi_ > 1969)
  ?teos kaunokki:tekija ?author.
  ?author skos:prefLabel ?authorname .
OPTIONAL {?author kaunokki:kansallisuus ?kansallisuusuri . OPTIONAL { ?kansallisuusuri skos:prefLabel ?kansallisuus .} }
OPTIONAL  {?author foaf:gender ?gender }


} 
"""


# construct graph

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [97]:
f = Graph()
#g.parse(data=results, format="turtle") # "mygraph.ttl"
f.parse(data=results, format="turtle")

<Graph identifier=N367f0fe58cf34b0a92b6cc8c078a9ef3 (<class 'rdflib.graph.Graph'>)>

In [98]:
Q_fi ="""

SELECT  (COUNT(DISTINCT ?teosfi) AS ?novels) (COUNT(DISTINCT ?author) AS ?authors) #?vuosi #(MIN(?vuosi) AS ?julkaisuvuosi)  
{
?teosfi a rel:novel .
  ?teosfi rel:langOrig <http://lexvo.org/id/iso639-3/fin> .
    ?teosfi rel:hasAuthor ?author .
  ?teosfi rel:pubYear ?vuosi . #?vuosir skos:prefLabel ?vuosiluku .
  #BIND(xsd:integer(?vuosiluku) AS ?vuosi)
  BIND(regex(str(?vuosi), "199" ) AS ?decade)
  FILTER(?decade = xsd:boolean("true"))
  
  } #GROUP BY ?vuosi
  
"""

decades=[197,198,199,200,201]
df_authorsfi=pd.DataFrame()
for d in decades:
    q=Q_fi.replace("199", str(d))
    qres = f.query(q)
    
    for row in qres:
        n = str(row.asdict()['novels'].toPython())   
        #natname = str(row.asdict()['label'].toPython())   
        a = str(row.asdict()['authors'].toPython()) 
        temp=pd.DataFrame({"authors":a, "novels":n}.values() ).T.rename({0:"authors",1:"novels"},axis=1)
        temp["decade"] = d*10
    df_authorsfi=pd.concat([df_authorsfi,temp])

In [99]:
df_authorsfi

Unnamed: 0,authors,novels,decade
0,707,1455,1970
0,813,1891,1980
0,974,2281,1990
0,1569,3609,2000
0,2926,6409,2010


In [100]:
# nationalities

Q_kans ="""
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?kansallisuus (SAMPLE(?klabel) AS ?label) (COUNT(DISTINCT ?author) as ?authorcount) (COUNT(DISTINCT ?teos) as ?workcount) WHERE {
  
  ?teos  a rel:novel ; rel:hasAuthor ?author .
  
  ?teos rel:pubYear ?vuosi . #?vuosi skos:prefLabel ?vuosiluku .
  #BIND(regex(str(?vuosi), "199" ) AS ?decade)
  #FILTER(?decade = xsd:boolean("true"))
  ?author rel:authorNationality ?kansallisuus . OPTIONAL { ?kansallisuus skos:prefLabel ?klabel FILTER(LANG(?klabel)="fi") }
  } GROUP BY ?kansallisuus
ORDER BY DESC(xsd:integer(?authorcount))
"""

decades=[197,198,199,200,201]

qres = f.query(Q_kans)

In [101]:
fidf=pd.DataFrame()
for row in qres:
    n = str(row.asdict()['kansallisuus'].toPython())   
    natname = str(row.asdict()['label'].toPython())   
    a = str(row.asdict()['authorcount'].toPython()) 
    w = str(row.asdict()['workcount'].toPython()) 
    temp=pd.DataFrame({"authors":a, "novels":w,"nationality":n, "natname":natname}.values() ).T.rename({0:"authors",1:"novels",2:"nationality",3:"label"},axis=1)
    fidf=pd.concat([fidf,temp])
    
fidf.sort_values(by="label")

Unnamed: 0,authors,novels,nationality,label
0,1,2,http://www.yso.fi/onto/koko/p57886,afganistanilaiset
0,1,1,http://www.yso.fi/onto/koko/p14632,alankomaalaiset
0,1,2,http://www.yso.fi/onto/koko/p9135,australialaiset
0,1,1,http://www.yso.fi/onto/koko/p34688,inkeriläiset
0,1,1,http://www.yso.fi/onto/koko/p40957,inkerinsuomalaiset
0,2,2,http://www.yso.fi/onto/koko/p5227,intialaiset
0,2,4,http://www.yso.fi/onto/koko/p53753,iranilaiset
0,1,2,http://seco.tkk.fi/saha3/u5aa143b0-c88f-47a0-8...,israelilaiset
0,4,10,http://www.yso.fi/onto/koko/p54872,kanadalaiset
0,3,7,http://www.yso.fi/onto/koko/p34058,karjalaiset


In [11]:
fidf=pd.DataFrame()
for row in qres:
    n = str(row.asdict()['kansallisuus'].toPython())   
    natname = str(row.asdict()['label'].toPython())   
    a = str(row.asdict()['authorcount'].toPython()) 
    w = str(row.asdict()['workcount'].toPython()) 
    temp=pd.DataFrame({"authors":a, "novels":w,"nationality":n, "natname":natname}.values() ).T.rename({0:"authors",1:"novels",2:"nationality",3:"label"},axis=1)
    fidf=pd.concat([fidf,temp])
    
fidf.sort_values(by="label")

Unnamed: 0,authors,novels,nationality,label
0,1,2,http://www.yso.fi/onto/koko/p57886,afganistanilaiset
0,1,1,http://www.yso.fi/onto/koko/p14632,alankomaalaiset
0,1,2,http://www.yso.fi/onto/koko/p9135,australialaiset
0,1,1,http://www.yso.fi/onto/koko/p34688,inkeriläiset
0,1,1,http://www.yso.fi/onto/koko/p40957,inkerinsuomalaiset
0,2,2,http://www.yso.fi/onto/koko/p5227,intialaiset
0,2,4,http://www.yso.fi/onto/koko/p53753,iranilaiset
0,1,2,http://seco.tkk.fi/saha3/u5aa143b0-c88f-47a0-8...,israelilaiset
0,4,10,http://www.yso.fi/onto/koko/p54872,kanadalaiset
0,3,7,http://www.yso.fi/onto/koko/p34058,karjalaiset


In [102]:
# corrections similar to the translated works

In [103]:
# englantilaiset => britit
# skotit => britit

tocorrect = [('http://www.yso.fi/onto/koko/p607','http://www.yso.fi/onto/koko/p2224'),('http://seco.tkk.fi/onto/toimo/nationalities/Scottish_Scots_','http://www.yso.fi/onto/koko/p2224'),
             ('http://www.yso.fi/onto/koko/p16273','http://www.yso.fi/onto/koko/p2224'),('http://seco.tkk.fi/onto/toimo/nationalities/Hungarian','http://www.yso.fi/onto/koko/p34461')]

# remove: opettajat, nainen, metsätyöntekijät
todelete=["http://www.yso.fi/onto/koko/p36891","http://www.yso.fi/onto/koko/p52909","http://www.yso.fi/onto/koko/p33980"]

In [104]:
for o,n in tocorrect:
    old=o
    new=n
    print( old, new)
    query="""
                 DELETE {?author rel:authorNationality <OLD> }
                 INSERT { ?author rel:authorNationality  <NEW> }
                 WHERE { ?author rel:authorNationality <OLD> }
                 """
    q=query.replace("OLD",old).replace("NEW",new)
    f.update(q)

http://www.yso.fi/onto/koko/p607 http://www.yso.fi/onto/koko/p2224
http://seco.tkk.fi/onto/toimo/nationalities/Scottish_Scots_ http://www.yso.fi/onto/koko/p2224
http://www.yso.fi/onto/koko/p16273 http://www.yso.fi/onto/koko/p2224
http://seco.tkk.fi/onto/toimo/nationalities/Hungarian http://www.yso.fi/onto/koko/p34461


In [105]:

for v in todelete:
    old=v
    #new=list(natlabel_lookup[n])
    print(old)
    query="""
                 DELETE {?author rel:authorNationality <OLD> }
                 #INSERT { ?author rel:authorNationality  <NEW> }
                 WHERE { ?author rel:authorNationality <OLD> }
                 """
    q=query.replace("OLD",old)#.replace("NEW",new[0])
    f.update(q)

http://www.yso.fi/onto/koko/p36891
http://www.yso.fi/onto/koko/p52909
http://www.yso.fi/onto/koko/p33980


In [16]:
# save the graph

#f.serialize(destination="../../data/finnish_graph_new.nt",format="nt",encoding="utf8")#.decode('utf8')




<Graph identifier=N11e2fd8b4a624308bc0614b116ae860e (<class 'rdflib.graph.Graph'>)>

## correct things in theme graph

In [44]:
# cluster per genre information
# load genre graphs
l = Graph() 
l.parse("../../data/finnish_genre_theme.nt",format="nt")

k = Graph() 
k.parse("../../data/translated_genre_theme.nt",format="nt")

<Graph identifier=N5dfcfa9489f14d8fa8cf9daab2c991a1 (<class 'rdflib.graph.Graph'>)>

In [45]:
# initial language lookup

qres = k.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  
}  """)

langlabel_lookup = {}

for l in qres:
    lang=l["lang"].toPython()
    #print(lang)
    if lang not in langlabel_lookup.keys():
        langlabel_lookup[lang]= lang.split("/")[-1]
    else:
        print("oop",lang)

In [46]:
#
print(langlabel_lookup.items())

dict_items([('http://lexvo.org/id/iso639-3/eng', 'eng'), ('http://lexvo.org/id/iso639-3/ita', 'ita'), ('http://lexvo.org/id/iso639-3/deu', 'deu'), ('http://lexvo.org/id/iso639-3/isl', 'isl'), ('http://lexvo.org/id/iso639-3/swe', 'swe'), ('http://lexvo.org/id/iso639-3/rus', 'rus'), ('http://lexvo.org/id/iso639-3/fra', 'fra'), ('http://lexvo.org/id/iso639-3/dan', 'dan'), ('http://lexvo.org/id/iso639-3/spa', 'spa'), ('http://lexvo.org/id/iso639-3/tur', 'tur'), ('http://lexvo.org/id/iso639-3/ces', 'ces'), ('http://lexvo.org/id/iso639-3/por', 'por'), ('http://lexvo.org/id/iso639-3/nor', 'nor'), ('http://lexvo.org/id/iso639-3/hin', 'hin'), ('http://lexvo.org/id/iso639-3/kor', 'kor'), ('http://lexvo.org/id/iso639-3/lav', 'lav'), ('http://lexvo.org/id/iso639-3/est', 'est'), ('http://lexvo.org/id/iso639-3/yid', 'yid'), ('http://lexvo.org/id/iso639-3/hun', 'hun'), ('http://lexvo.org/id/iso639-3/nld', 'nld'), ('http://lexvo.org/id/iso639-3/pol', 'pol'), ('http://lexvo.org/id/iso639-3/kat', 'kat')

In [49]:
oldlang=[]
for key,v in langlabel_lookup.items():
    if "lexvo" not in key:
        print(key,v)
        oldlang.append(key)

http://www.lingvoj.org/lang/be-x-old be-x-old
http://www.lingvoj.org/lang/bnt bnt
http://www.lingvoj.org/lang/smi smi
http://www.lingvoj.org/lang/en-gb en-gb
http://www.lingvoj.org/lang/fiu fiu


In [50]:
for key,v in langlabel_lookup.items():
    if "vec" in key:
        print(key,v)

http://lexvo.org/id/iso639-3/vec vec


In [51]:
# correct list
# smi http://www.lexvo.org/page/iso639-5/smi
# en-gb http://lexvo.org/id/iso639-3/eng
# bnt http://www.lexvo.org/page/iso639-5/bnt
# fiu http://www.lexvo.org/page/iso639-5/fiu
# be-x-old http://www.lexvo.org/page/iso639-3/bel
# 'http://lexvo.org/id/iso639-3/vec': 'vec',

#newlang=["http://www.lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://www.lexvo.org/page/iso639-5/bnt",
#        "http://www.lexvo.org/page/iso639-5/fiu","http://www.lexvo.org/page/iso639-3/bel"]
#updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
#        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/page/iso639-3/bel"]
updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu"]
newlang={"http://www.lingvoj.org/lang/be-x-old":"http://lexvo.org/page/iso639-3/bel","http://www.lingvoj.org/lang/bnt":"http://lexvo.org/page/iso639-5/bnt","http://www.lingvoj.org/lang/smi":"http://lexvo.org/id/iso639-5/smi","http://www.lingvoj.org/lang/fiu":"http://www.lexvo.org/page/iso639-5/fiu","http://www.lingvoj.org/lang/en-gb":"http://lexvo.org/id/iso639-3/eng","http://lexvo.org/id/iso639-3/vec":'http://lexvo.org/id/iso639-3/rus'}

In [52]:
for key,val in newlang.items():
    
    print(key,val)
    old= key
    new=val.replace("page","id")
    print(old,new)
    query="""
    PREFIX rel: <http://ldf.fi/relse/>
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    k.update(q)
    #print("After second update:")
    

http://www.lingvoj.org/lang/be-x-old http://lexvo.org/page/iso639-3/bel
http://www.lingvoj.org/lang/be-x-old http://lexvo.org/id/iso639-3/bel
http://www.lingvoj.org/lang/bnt http://lexvo.org/page/iso639-5/bnt
http://www.lingvoj.org/lang/bnt http://lexvo.org/id/iso639-5/bnt
http://www.lingvoj.org/lang/smi http://lexvo.org/id/iso639-5/smi
http://www.lingvoj.org/lang/smi http://lexvo.org/id/iso639-5/smi
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/page/iso639-5/fiu
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/id/iso639-5/fiu
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus


In [53]:
# save
k.serialize(destination="../../data/foreign_theme_genre.nt",format="nt")




<Graph identifier=N5dfcfa9489f14d8fa8cf9daab2c991a1 (<class 'rdflib.graph.Graph'>)>