# create a graph for genre information

In [2]:
# import 

from collections import defaultdict, OrderedDict, Counter
from datetime import datetime

from dateutil.relativedelta import relativedelta
import glob
from itertools import product, combinations
import matplotlib.pyplot as plt
import numpy    as np
#import networkx as nx
from operator import itemgetter
import pandas   as pd
import rdflib as rdflib
import re
from scipy.signal import convolve2d
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from SPARQLWrapper import SPARQLWrapper, JSON, POST, TURTLE
import sys
from rdflib.namespace import XSD, Namespace
from rdflib.term import URIRef

import IPython
import pprint
pp = pprint.PrettyPrinter(indent=4)



from constants import *
from helpers import *

In [3]:

from rdflib import Graph

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')


In [4]:
# colors
import matplotlib.colors as mcolors
from matplotlib import cm
# set style
import seaborn as sns

#set_theme() 
sns.set_theme(style="white",palette="PuOr_r")

In [6]:
# Query
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:langOrig ?kieliuri ;
    rel:hasAuthor ?author ;
    rel:pubYear ?vuosi_ ;
    rel:hasGenre ?genre ;
    rel:hasTheme ?theme.

?author a rel:author ;
    foaf:gender ?gender.

?kieliuri a rel:lang ;
	skos:prefLabel ?kieli .

    
?theme a rel:theme ;
    rel:inNovel ?teos ;
    skos:prefLabel ?themelabel.

?genre a rel:genre ;
    rel:inNovel ?teos ;
    skos:prefLabel ?genrelabel .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  )
  OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  FILTER(xsd:integer(?vuosiluku) > 1970)
  ?teos kaunokki:tekija ?author.
  ?author skos:prefLabel ?authorname .
  OPTIONAL { ?author foaf:gender ?gender . }

 OPTIONAL {?teos kaunokki:genre ?genre . }#?genre skos:prefLabel ?genrelabel . FILTER(LANG(?genrelabel)="fi") }
 OPTIONAL {?teos kaunokki:teema ?theme . }#?theme skos:prefLabel ?themelabel . FILTER(LANG(?themelabel)="fi")}
} 
"""

In [7]:
# construct graph

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [8]:
# parse

from rdflib import Graph
g = Graph()

g.parse(data=results, format="turtle")

<Graph identifier=Na535c912ebbc4d159a1e44b622c1f140 (<class 'rdflib.graph.Graph'>)>

In [9]:
# find all novels

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')

novels=[]
for s, p, o in g.triples((None,  None, None)):
    #print(o)
    if o ==rel.novel:
        #print(str(s))
        novels.append(str(s))
    #print(f"{s} is a  novel")
    
print("found" ,len(set(novels)), "novels")
print(novels[:5])


found 18035 novels
['http://www.yso.fi/onto/kaunokki#ateos_26904', 'http://www.yso.fi/onto/kaunokki#ateos_9216', 'http://www.btj.fi/at_1796592', 'http://www.btj.fi/at_1944861', 'http://www.yso.fi/onto/kaunokki#ateos_2239']


In [10]:
# check turtschaninoff for an example
# https://www.kirjasampo.fi/fi/kulsa/http%253A%252F%252Fdata.kirjasampo.fi%252FabstractWork_9789513172398

novels=[]
for s, p, o in g.triples((None,  None, None)):
    #print(o)
    if str(s) =="http://data.kirjasampo.fi/abstractWork_9789513172398":
        print(s,p,o)
        #novels.append(str(s))
    #print(f"{s} is a  novel")
    


http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasAuthor http://seco.tkk.fi/saha3/u407917fd-84d0-4ad5-a092-23a1ab44885a
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasTheme http://www.yso.fi/onto/koko/p49911
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/langOrig http://lexvo.org/id/iso639-3/swe
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasGenre http://www.yso.fi/onto/koko/p50553
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasTheme http://www.yso.fi/onto/koko/p18265
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasTheme http://www.yso.fi/onto/koko/p35623
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasTheme http://www.yso.fi/onto/koko/p72664
http://data.kirjasampo.fi/abstractWork_9789513172398 http://ldf.fi/relse/hasTheme http://www.yso.fi/onto/koko/p31779
http://data.kirjasampo.fi/abstractWor

In [11]:
# title lookup

# query the database for novel titles

qres = g.query("""
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?id ?label
WHERE {
  ?id a rel:novel ; skos:prefLabel ?label
}  """)

title_lookup=dict([(str(url), str(label)) for url, label in qres])

list(title_lookup.items())[:10]

[('http://www.btj.fi/at_1678175', 'Ritos de muerte'),
 ('http://data.kirjasampo.fi/abstractWork_9789518830927',
  'Piraatti-Pete ja hirmuiset merirosvot'),
 ('http://www.yso.fi/onto/kaunokki#ateos_9589', 'The first eagle'),
 ('http://www.yso.fi/onto/kaunokki#ateos_3912', 'Yön suojissa'),
 ('http://data.kirjasampo.fi/abstractWork_6834230', 'Sirkustyttö'),
 ('http://data.kirjasampo.fi/abstractWork_4124516', 'The ordinary princess'),
 ('http://www.yso.fi/onto/kaunokki#ateos_2216', 'Gift med en kommunist'),
 ('http://data.kirjasampo.fi/abstractWork_9789511255659', 'Suden hetki'),
 ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a',
  'De försvunna kompisarna'),
 ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', 'Hannah')]

In [12]:
# gender lookup

qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?gender
WHERE {
  ?teos a rel:novel ; rel:hasAuthor ?author. 
  ?author foaf:gender ?gender .
}  """)

gender_lookup=defaultdict(set)
for url, label in qres:
    gender_lookup[str(url)].update([str(label)])
#print(list(gender_lookup.items())[-10:])
print(len(gender_lookup))
gender_lookup2 = {novel:"unk" for novel in novels if novel not in gender_lookup.keys() }
gender_lookup.update(gender_lookup2)
print(len(gender_lookup))
print(list(gender_lookup.items())[:10],list(gender_lookup.items())[-10:])

17644
17644
[('http://www.btj.fi/at_1678175', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://www.yso.fi/onto/kaunokki#female'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://www.yso.fi/onto/kaunokki#male'}), ('http://seco.tkk.fi/saha3/ue552c4bb-ecd8-495d-be5d-00c45f9caaa8', {'http://www.yso.fi/onto/kaunokki#male'})] [('http://www.yso.fi/

# correct languages



In [13]:
# initial language lookup

qres = g.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  #?author foaf:gender ?gender .
}  """)

lang_lookup=defaultdict(set)
for url, label in qres:
    lang_lookup[str(url)].update([str(label)])
#print(list(gender_lookup.items())[-10:])
print(len(lang_lookup))
lang_lookup2 = {novel:"unk" for novel in novels if novel not in lang_lookup.keys() }
lang_lookup.update(lang_lookup2)
print(len(lang_lookup))
print(list(lang_lookup.items())[:10],list(lang_lookup.items())[-10:])

18035
18035
[('http://www.btj.fi/at_1678175', {'http://lexvo.org/id/iso639-3/spa'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://lexvo.org/id/iso639-3/fra'})] [('http://www.yso.fi/onto/kaunokki#ateos_708', {'http://lexvo.org/id/iso6

In [14]:
g2 =g

In [15]:
# split the name and do something

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  
}  """)

langlabel_lookup = {}

for l in qres:
    lang=l["lang"].toPython()
    #print(lang)
    if lang not in langlabel_lookup.keys():
        langlabel_lookup[lang]= lang.split("/")[-1]
    else:
        print("oop",lang)

In [16]:
oldlang=[]
for k,v in langlabel_lookup.items():
    if "lexvo" not in k:
        print(k,v)
        oldlang.append(k)

http://www.lingvoj.org/lang/smi smi
http://www.lingvoj.org/lang/en-gb en-gb
http://www.lingvoj.org/lang/bnt bnt
http://www.lingvoj.org/lang/fiu fiu
http://www.lingvoj.org/lang/be-x-old be-x-old


In [17]:
# correct list
# smi http://www.lexvo.org/page/iso639-5/smi
# en-gb http://lexvo.org/id/iso639-3/eng
# bnt http://www.lexvo.org/page/iso639-5/bnt
# fiu http://www.lexvo.org/page/iso639-5/fiu
# be-x-old http://www.lexvo.org/page/iso639-3/bel

newlang=["http://www.lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://www.lexvo.org/page/iso639-5/bnt",
        "http://www.lexvo.org/page/iso639-5/fiu","http://www.lexvo.org/page/iso639-3/bel"]

updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/page/iso639-3/bel"]
updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/page/iso639-3/bel"]

In [18]:
# correct vec to Russian
for o,n in zip(["http://lexvo.org/id/iso639-3/vec"],["http://lexvo.org/id/iso639-3/rus"]):
    
    print(o,n)
    old= o
    new=n.replace("page","id")
    print(o,new)
    query="""
    PREFIX rel: <http://ldf.fi/relse/>
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    g2.update(q)

http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus


In [19]:
for o,n in zip(oldlang,newlang):
    
    #print(o,n)
    old= o
    new=n.replace("page","id")
    print(old,new)
    query="""
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    g2.update(q)
    #print("After second update:")
    

http://www.lingvoj.org/lang/smi http://www.lexvo.org/id/iso639-5/smi
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://www.lingvoj.org/lang/bnt http://www.lexvo.org/id/iso639-5/bnt
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/id/iso639-5/fiu
http://www.lingvoj.org/lang/be-x-old http://www.lexvo.org/id/iso639-3/bel


In [20]:
for n in newlang:
    new=n.replace("page","id")
    g2.add((URIRef(new), RDF.type, rel.lang))

In [21]:
# new language lookup

qres = g2.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT ?teos ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang  ; rel:pubYear ?year .
  FILTER(?year > 1899)
  #?author foaf:gender ?gender .
}  """)

lang_lookup=defaultdict(set)
for url, label in qres:
    lang_lookup[str(url)].update([str(label)])
#print(list(gender_lookup.items())[-10:])
print(len(lang_lookup))
#lang_lookup2 = {novel:"unk" for novel in novels if novel not in lang_lookup.keys() }
#lang_lookup.update(lang_lookup2)
print(len(lang_lookup))
print(list(lang_lookup.items())[:10],list(lang_lookup.items())[-10:])

18035
18035
[('http://www.btj.fi/at_1678175', {'http://lexvo.org/id/iso639-3/spa'}), ('http://data.kirjasampo.fi/abstractWork_9789518830927', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_9589', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_3912', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_6834230', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_4124516', {'http://lexvo.org/id/iso639-3/eng'}), ('http://www.yso.fi/onto/kaunokki#ateos_2216', {'http://lexvo.org/id/iso639-3/eng'}), ('http://data.kirjasampo.fi/abstractWork_9789511255659', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/ucf938d75-fc5e-4637-9715-73092ba5482a', {'http://lexvo.org/id/iso639-3/eng'}), ('http://seco.tkk.fi/saha3/u99617114-2455-49e4-8bed-05851eb04a3f', {'http://lexvo.org/id/iso639-3/fra'})] [('http://www.yso.fi/onto/kaunokki#ateos_708', {'http://lexvo.org/id/iso6

In [23]:
oldlang=[]
for k,v in langlabel_lookup.items():
    if "page"  in k:
        print(k,v)
        oldlang.append(k)
        
oldlang

[]

In [24]:
# count languages

countdict=defaultdict(int)
n=0
for k,v in lang_lookup.items():
    if v == "unk":
        print(v)
        countdict[v] +=1
    else:
        for val in v:
            label=val.split("/")[-1]
            #print(label)
            #if len(label) ==1:
            countdict[label] += 1
            #elif len(label) > 1:
             #   print(label)

In [25]:


pd.DataFrame(countdict.items()).sort_values(by=1,ascending=False).tail(30)

Unnamed: 0,0,1
22,bul,5
17,afr,5
48,fas,5
39,ron,4
24,ukr,3
50,grc,3
44,lit,3
34,kat,3
35,srp,3
59,hye,2


In [26]:
langlabel_lookup.keys()

dict_keys(['http://lexvo.org/id/iso639-3/spa', 'http://lexvo.org/id/iso639-3/eng', 'http://lexvo.org/id/iso639-3/fra', 'http://lexvo.org/id/iso639-3/dan', 'http://lexvo.org/id/iso639-3/swe', 'http://lexvo.org/id/iso639-3/deu', 'http://lexvo.org/id/iso639-3/ita', 'http://lexvo.org/id/iso639-3/sme', 'http://lexvo.org/id/iso639-3/nor', 'http://lexvo.org/id/iso639-3/rus', 'http://lexvo.org/id/iso639-3/jpn', 'http://lexvo.org/id/iso639-3/ara', 'http://lexvo.org/id/iso639-3/heb', 'http://lexvo.org/id/iso639-3/isl', 'http://lexvo.org/id/iso639-3/por', 'http://lexvo.org/id/iso639-3/est', 'http://lexvo.org/id/iso639-3/pol', 'http://lexvo.org/id/iso639-3/afr', 'http://lexvo.org/id/iso639-3/nld', 'http://lexvo.org/id/iso639-3/ell', 'http://lexvo.org/id/iso639-3/lav', 'http://lexvo.org/id/iso639-3/hun', 'http://lexvo.org/id/iso639-3/bul', 'http://lexvo.org/id/iso639-3/cat', 'http://lexvo.org/id/iso639-3/ukr', 'http://lexvo.org/id/iso639-3/ces', 'http://lexvo.org/id/iso639-3/yid', 'http://lexvo.org

In [28]:
# looks right
# SPARQL checks for Bulgarian

HC="""
SELECT  (COUNT(DISTINCT ?teos) as ?authorcount) (GROUP_CONCAT(?author; separator=";") AS ?authorUris) WHERE {
  
  ?teos  a kaunokki:romaani ; kaunokki:tekija ?author ; skos:prefLabel ?title.
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  )
  VALUES ?kieliuri { <http://lexvo.org/id/iso639-3/bul> }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  #BIND(regex(str(?vuosiluku), "197" ) AS ?decade)
  
  #FILTER(?decade = xsd:boolean("true"))
  FILTER(xsd:integer(?vuosiluku) > 1970)
 
  }
  """


sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")
#sparql = SPARQLWrapper("http://ldf.fi/kirjasampo/sparql")
sparql.setQuery(PREFIXES + HC)
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])
results = sparql.query().convert()

df_check = JSON2Pandas2(results)
df_check # corresponds to my number

Unnamed: 0,authorcount,authorUris
0,5,http://www.yso.fi/onto/kaunokki#person_1231760...


In [29]:
# remove all language labels
for s, p, o in g2.triples((None, RDF.type, rel.lang)):
    #g.add((s, FOAF['name'], o))
    print(s)
    g2.remove((s, SKOS.prefLabel, None))

http://lexvo.org/id/iso639-3/zho
http://lexvo.org/id/iso639-3/hin
http://lexvo.org/id/iso639-3/slv
http://lexvo.org/id/iso639-3/fit
http://lexvo.org/id/iso639-3/jpn
http://lexvo.org/id/iso639-3/ben
http://lexvo.org/id/iso639-3/sqi
http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/swe
http://www.lingvoj.org/lang/be-x-old
http://lexvo.org/id/iso639-3/spa
http://lexvo.org/id/iso639-3/afr
http://lexvo.org/id/iso639-3/hye
http://lexvo.org/id/iso639-3/dan
http://lexvo.org/id/iso639-3/som
http://lexvo.org/id/iso639-3/pol
http://lexvo.org/id/iso639-3/vec
http://lexvo.org/id/iso639-3/hrv
http://lexvo.org/id/iso639-3/chm
http://lexvo.org/id/iso639-3/nor
http://www.lingvoj.org/lang/en-gb
http://lexvo.org/id/iso639-3/heb
http://lexvo.org/id/iso639-3/tur
http://lexvo.org/id/iso639-3/kor
http://lexvo.org/id/iso639-3/yid
http://lexvo.org/id/iso639-3/eng
http://lexvo.org/id/iso639-3/hun
http://lexvo.org/id/iso639-3/slk
http://lexvo.org/id/iso639-3/vie
http://lexvo.org/id/iso639-3/hbs
http:

In [36]:
#save

#g2.serialize(destination="../../data/translated_theme_genre_new.nt",format="nt")




<Graph identifier=N476beea9994d4ceebe49e22e54265029 (<class 'rdflib.graph.Graph'>)>

# Finnish graph

In [30]:
# Query
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    #rel:langOrig ?kieliuri ;
    rel:hasAuthor ?author ;
    rel:pubYear ?vuosi_ ;
    rel:hasGenre ?genre ;
    rel:hasTheme ?theme.

?author a rel:author ;
    foaf:gender ?gender.

#?kieliuri a rel:lang ;
#    skos:prefLabel ?kieli .

    
?theme a rel:theme .
    #rel:inNovel ?teos ;
   # skos:prefLabel ?themelabel.

?genre a rel:genre .
    #rel:inNovel ?teos ;
    #skos:prefLabel ?genrelabel .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli  <http://lexvo.org/id/iso639-3/fin>  . # alkukieli suomi
  #OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # suomenkielinen julkaisu
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  ?julkaisu kaunokki:onEnsimmainenVersio ?true .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  FILTER(?vuosi_ > 1970)
  ?teos kaunokki:tekija ?author.
  ?author skos:prefLabel ?authorname .
OPTIONAL {?author kaunokki:kansallisuus ?kansallisuusuri . OPTIONAL { ?kansallisuusuri skos:prefLabel ?kansallisuus .} }
OPTIONAL  {?author foaf:gender ?gender }


 OPTIONAL {?teos kaunokki:genre ?genre  } #?genre skos:prefLabel ?genrelabel . FILTER(LANG(?genrelabel)="fi") }
 OPTIONAL {?teos kaunokki:teema ?theme  } #?theme skos:prefLabel ?themelabel . FILTER(LANG(?themelabel)="fi")}


} 
"""


# construct graph

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [31]:
f = Graph()
#g.parse(data=results, format="turtle") # "mygraph.ttl"
f.parse(data=results, format="turtle")

<Graph identifier=Ndb5480985be74649a5a7497cf43f5d9c (<class 'rdflib.graph.Graph'>)>

In [32]:
Q_fi ="""

SELECT  (COUNT(DISTINCT ?teosfi) AS ?novels) (COUNT(DISTINCT ?author) AS ?authors) #?vuosi #(MIN(?vuosi) AS ?julkaisuvuosi)  
{
?teosfi a rel:novel .
  #?teosfi rel:langOrig <http://lexvo.org/id/iso639-3/fin> .
    ?teosfi rel:hasAuthor ?author .
  ?teosfi rel:pubYear ?vuosi . #?vuosir skos:prefLabel ?vuosiluku .
  #BIND(xsd:integer(?vuosiluku) AS ?vuosi)
  BIND(regex(str(?vuosi), "199" ) AS ?decade)
  FILTER(?decade = xsd:boolean("true"))
  
  } #GROUP BY ?vuosi
  
"""

decades=[197,198,199,200,201]
df_authorsfi=pd.DataFrame()
for d in decades:
    q=Q_fi.replace("199", str(d))
    qres = f.query(q)
    
    for row in qres:
        n = str(row.asdict()['novels'].toPython())   
        #natname = str(row.asdict()['label'].toPython())   
        a = str(row.asdict()['authors'].toPython()) 
        temp=pd.DataFrame({"authors":a, "novels":n}.values() ).T.rename({0:"authors",1:"novels"},axis=1)
        temp["decade"] = d*10
    df_authorsfi=pd.concat([df_authorsfi,temp])
df_authorsfi

Unnamed: 0,authors,novels,decade
0,645,1315,1970
0,813,1891,1980
0,974,2281,1990
0,1569,3609,2000
0,2926,6409,2010


In [33]:
df_authorsfi

Unnamed: 0,authors,novels,decade
0,645,1315,1970
0,813,1891,1980
0,974,2281,1990
0,1569,3609,2000
0,2926,6409,2010


In [42]:
# save
#f.serialize(destination="../../data/finnish_genre_theme_new.nt",format="nt",encoding="utf8")#.decode('utf8')




<Graph identifier=Nf22e48df1ad9426cbf8ea05b2414f809 (<class 'rdflib.graph.Graph'>)>

## correct things in theme graph

In [44]:
# cluster per genre information
# load genre graphs
l = Graph() 
l.parse("../../data/finnish_genre_theme.nt",format="nt")

k = Graph() 
k.parse("../../data/translated_genre_theme.nt",format="nt")

<Graph identifier=N5dfcfa9489f14d8fa8cf9daab2c991a1 (<class 'rdflib.graph.Graph'>)>

In [45]:
# initial language lookup

qres = k.query("""
#PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX xml: <http://www.w3.org/XML/1998/namespace>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?lang
WHERE {
  ?teos a rel:novel ; rel:langOrig ?lang. 
  
}  """)

langlabel_lookup = {}

for l in qres:
    lang=l["lang"].toPython()
    #print(lang)
    if lang not in langlabel_lookup.keys():
        langlabel_lookup[lang]= lang.split("/")[-1]
    else:
        print("oop",lang)

In [46]:
#
print(langlabel_lookup.items())

dict_items([('http://lexvo.org/id/iso639-3/eng', 'eng'), ('http://lexvo.org/id/iso639-3/ita', 'ita'), ('http://lexvo.org/id/iso639-3/deu', 'deu'), ('http://lexvo.org/id/iso639-3/isl', 'isl'), ('http://lexvo.org/id/iso639-3/swe', 'swe'), ('http://lexvo.org/id/iso639-3/rus', 'rus'), ('http://lexvo.org/id/iso639-3/fra', 'fra'), ('http://lexvo.org/id/iso639-3/dan', 'dan'), ('http://lexvo.org/id/iso639-3/spa', 'spa'), ('http://lexvo.org/id/iso639-3/tur', 'tur'), ('http://lexvo.org/id/iso639-3/ces', 'ces'), ('http://lexvo.org/id/iso639-3/por', 'por'), ('http://lexvo.org/id/iso639-3/nor', 'nor'), ('http://lexvo.org/id/iso639-3/hin', 'hin'), ('http://lexvo.org/id/iso639-3/kor', 'kor'), ('http://lexvo.org/id/iso639-3/lav', 'lav'), ('http://lexvo.org/id/iso639-3/est', 'est'), ('http://lexvo.org/id/iso639-3/yid', 'yid'), ('http://lexvo.org/id/iso639-3/hun', 'hun'), ('http://lexvo.org/id/iso639-3/nld', 'nld'), ('http://lexvo.org/id/iso639-3/pol', 'pol'), ('http://lexvo.org/id/iso639-3/kat', 'kat')

In [49]:
oldlang=[]
for key,v in langlabel_lookup.items():
    if "lexvo" not in key:
        print(key,v)
        oldlang.append(key)

http://www.lingvoj.org/lang/be-x-old be-x-old
http://www.lingvoj.org/lang/bnt bnt
http://www.lingvoj.org/lang/smi smi
http://www.lingvoj.org/lang/en-gb en-gb
http://www.lingvoj.org/lang/fiu fiu


In [50]:
for key,v in langlabel_lookup.items():
    if "vec" in key:
        print(key,v)

http://lexvo.org/id/iso639-3/vec vec


In [51]:
# correct list
# smi http://www.lexvo.org/page/iso639-5/smi
# en-gb http://lexvo.org/id/iso639-3/eng
# bnt http://www.lexvo.org/page/iso639-5/bnt
# fiu http://www.lexvo.org/page/iso639-5/fiu
# be-x-old http://www.lexvo.org/page/iso639-3/bel
# 'http://lexvo.org/id/iso639-3/vec': 'vec',

#newlang=["http://www.lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://www.lexvo.org/page/iso639-5/bnt",
#        "http://www.lexvo.org/page/iso639-5/fiu","http://www.lexvo.org/page/iso639-3/bel"]
#updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
#        "http://lexvo.org/page/iso639-5/fiu","http://lexvo.org/page/iso639-3/bel"]
updated= ["http://lexvo.org/page/iso639-5/smi","http://lexvo.org/id/iso639-3/eng","http://lexvo.org/page/iso639-5/bnt",
        "http://lexvo.org/page/iso639-5/fiu"]
newlang={"http://www.lingvoj.org/lang/be-x-old":"http://lexvo.org/page/iso639-3/bel","http://www.lingvoj.org/lang/bnt":"http://lexvo.org/page/iso639-5/bnt","http://www.lingvoj.org/lang/smi":"http://lexvo.org/id/iso639-5/smi","http://www.lingvoj.org/lang/fiu":"http://www.lexvo.org/page/iso639-5/fiu","http://www.lingvoj.org/lang/en-gb":"http://lexvo.org/id/iso639-3/eng","http://lexvo.org/id/iso639-3/vec":'http://lexvo.org/id/iso639-3/rus'}

In [52]:
for key,val in newlang.items():
    
    print(key,val)
    old= key
    new=val.replace("page","id")
    print(old,new)
    query="""
    PREFIX rel: <http://ldf.fi/relse/>
             DELETE {?teos rel:langOrig <OLD> }
             INSERT { ?teos rel:langOrig  <NEW> }
             WHERE { ?teos rel:langOrig <OLD> }
             """
    q=query.replace("OLD",old).replace("NEW",new)
    k.update(q)
    #print("After second update:")
    

http://www.lingvoj.org/lang/be-x-old http://lexvo.org/page/iso639-3/bel
http://www.lingvoj.org/lang/be-x-old http://lexvo.org/id/iso639-3/bel
http://www.lingvoj.org/lang/bnt http://lexvo.org/page/iso639-5/bnt
http://www.lingvoj.org/lang/bnt http://lexvo.org/id/iso639-5/bnt
http://www.lingvoj.org/lang/smi http://lexvo.org/id/iso639-5/smi
http://www.lingvoj.org/lang/smi http://lexvo.org/id/iso639-5/smi
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/page/iso639-5/fiu
http://www.lingvoj.org/lang/fiu http://www.lexvo.org/id/iso639-5/fiu
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://www.lingvoj.org/lang/en-gb http://lexvo.org/id/iso639-3/eng
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus
http://lexvo.org/id/iso639-3/vec http://lexvo.org/id/iso639-3/rus


In [53]:
# save
#k.serialize(destination="../../data/foreign_theme_genre.nt",format="nt")




<Graph identifier=N5dfcfa9489f14d8fa8cf9daab2c991a1 (<class 'rdflib.graph.Graph'>)>