In [1]:
# import 

from collections import defaultdict, OrderedDict, Counter
from datetime import datetime

from dateutil.relativedelta import relativedelta
import glob
from itertools import product, combinations
import matplotlib.pyplot as plt
import numpy    as np
#import networkx as nx
from operator import itemgetter
import pandas   as pd
import rdflib as rdflib
import re
from scipy.signal import convolve2d
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from SPARQLWrapper import SPARQLWrapper, JSON, POST, TURTLE
import sys
from rdflib.namespace import XSD, Namespace
from rdflib.term import URIRef

import IPython
import pprint
pp = pprint.PrettyPrinter(indent=4)



from constants import *
from helpers import *


from rdflib import Graph

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')

In [2]:
# helper functions

def to_df(reslist, sort_column):
    my_df=pd.DataFrame(reslist).sort_values(by=sort_column)
    if "year" in my_df.columns:
        my_df["year"]=pd.to_numeric(my_df["year"])
    return my_df


# add period function
def add_period(df_fo, columnname="year"):
    df_fo['period'] = pd.cut(df_fo[columnname], bins=bins, include_lowest=True, precision=0)
    df_fo['period'] = df_fo['period'].astype("str")

    df_fo["period"]=df_fo["period"].apply(lambda x: int(x.split(",")[0].replace("[","")))
    return df_fo

# decade function
def add_decade(df_fi, columname="year"):
    df_fi["decade"]=(df_fi["year"]-1)- ((df_fi["year"]-1)%10)
    
    return df_fi

In [64]:
# Query

# translated
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:hasPublication ?julkaisu ;
    rel:pubYear ?vuosi_ .

?julkaisu a rel:publication ;
    rel:hasPublisher ?publisher ;   # here we need the physical manifestation to connect with the publishers
    rel:pubYear ?vuosi_ .


?publisher a rel:publisher ;
	skos:prefLabel ?publisher_name .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  ) # original language not Finnish
  OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # translated into Finnish
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  ?julkaisu kaunokki:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?publisher_name }


} 
"""

In [65]:
# construct graph

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [66]:
# parse

from rdflib import Graph
g_k = Graph()

g_k.parse(data=results, format="turtle")

<Graph identifier=N1250e6d4a834426b99702436d394ab74 (<class 'rdflib.graph.Graph'>)>

In [8]:
# same for Finnish literature


Q_fi = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:hasPublication ?julkaisu ;
    rel:pubYear ?vuosi_ .

?julkaisu a rel:publication ;
    rel:hasPublisher ?publisher ;
    rel:pubYear ?vuosi_ .


?publisher a rel:publisher ;
	skos:prefLabel ?publisher_name .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli <http://lexvo.org/id/iso639-3/fin>  .
  #OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # Finnish publication
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  ?julkaisu kaunokki:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?publisher_name }


} 
"""

In [9]:
# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

sparql.setQuery(PREFIXES + Q_fi)

sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [10]:
# Finnish graph with publishers
f_k = Graph()

f_k.parse(data=results, format="turtle")

<Graph identifier=Ne2017f7517c4461983cda88e3ce97954 (<class 'rdflib.graph.Graph'>)>

In [16]:
# existing graphs


f = Graph()
f.parse('../../data/finnish_graph2.nt')

<Graph identifier=Ncea187a55d244759b6dcdbe92c4717c5 (<class 'rdflib.graph.Graph'>)>

In [15]:
# foreign
g = Graph() 

g.parse('../../data/nat_lang_graph.nt')

<Graph identifier=N923e653f27c44ae49ee367bf9e8d18b6 (<class 'rdflib.graph.Graph'>)>

In [19]:
# genre

# load genre graphs
g_fi = Graph() 
g_fi.parse("../../data/finnish_genre_theme.nt",format="nt")

g_fo = Graph() 
g_fo.parse("../../data/translated_theme_genre_new.nt",format="nt")

<Graph identifier=N512deb3617664963927072c8210a7a5b (<class 'rdflib.graph.Graph'>)>

In [22]:
# load genre df
genre_df= pd.read_csv("../../data/genre_df.csv",sep="\t", index_col=0)
genre_df.head()

Unnamed: 0,genre,name,subgenres,novels,genrename
0,http://www.yso.fi/onto/koko/p14999,hevoskirjat,0,1424,elain
1,http://www.yso.fi/onto/koko/p17804,eläinkertomukset,0,813,elain
2,http://www.yso.fi/onto/koko/p9137,eläinsadut,0,64,elain
3,http://www.yso.fi/onto/koko/p17961,faabelit,0,9,elain
4,http://www.yso.fi/onto/koko/p1399,eläinrunot,0,1,elain


In [26]:
# make a genre dict
# {genrename: [genre,genre,genre]}
genre_dict=genre_df.groupby("genrename")["genre"].unique().to_dict()
# see one category for example
genre_dict["elain"]

'http://www.yso.fi/onto/koko/p14999'

In [27]:
from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')

# add the broader genre to the values of the dict values


# Finnish
g2=g_fi
check_q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?broader (COUNT(DISTINCT ?teos) AS ?novels) WHERE 
    { ?genre rel:broaderCategory <CAT> .  ?teos rel:hasGenre ?genre . BIND (<CAT> as ?broader ) }
    GROUP BY ?broader

    """

allres={key:[] for key in genre_dict.keys()}
for key,val in genre_dict.items():
    #if len(v)> 1:
    print(key)
    # insert broader
    
    
    g2.add((URIRef(key), RDF.type, rel.broaderCategory))
    
    
    for v in val:
        g2.add((URIRef(v),  rel.broaderCategory, URIRef(key)))
            
    
    c_q=check_q.replace("CAT", key)
    qres = g2.query(c_q)
    allres[key] += qres
    
    
category="lapset"
print("Found", allres[category], "novels of category", category)

elain
era
erotiikka
fantasia
historia
huumori
jannitys
kauhu
lapset
nuoret
rakkaus
scifi
sota
urheilu
uskonto
Found [(rdflib.term.URIRef('lapset'), rdflib.term.Literal('986', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))] novels of category lapset


In [28]:
# add broader category to translated

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')

# add the broader genre to these
g_fo2=g_fo
check_q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?broader (COUNT(DISTINCT ?teos) AS ?novels) WHERE 
    { ?genre rel:broaderCategory <CAT> .  ?teos rel:hasGenre ?genre . BIND (<CAT> as ?broader ) }
    GROUP BY ?broader

    """

allres={key:[] for key in genre_dict.keys()}
for key,val in genre_dict.items():
    #if len(v)> 1:
    print(key)
    # insert broader
    
    
    g_fo2.add((URIRef(key), RDF.type, rel.broaderCategory))
    
    
    for v in val:
        g_fo2.add((URIRef(v),  rel.broaderCategory, URIRef(key)))
            
    
    c_q=check_q.replace("CAT", key)
    qres = g_fo2.query(c_q)
    allres[key] += qres

category="lapset"
print("Found", allres[category], "novels of category", category)

elain
era
erotiikka
fantasia
historia
huumori
jannitys
kauhu
lapset
nuoret
rakkaus
scifi
sota
urheilu
uskonto
Found [(rdflib.term.URIRef('lapset'), rdflib.term.Literal('960', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))] novels of category lapset


## query all these to DFs and merge

### Finnish

In [57]:
# step 1: query from the f and g networks [teos, author, lang, opt(gender), opt(nat) ]



In [29]:
# Finnish
Q_auth ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT DISTINCT ?year ?teos  ?author ?nat ?gender 

WHERE {
  ?teos a rel:novel ; rel:pubYear ?year .
  
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
 
  ?teos rel:hasAuthor ?author. 
    OPTIONAL { ?author rel:authorNationality ?nationality. }
    BIND(COALESCE(?nationality, "nonat") AS ?nat)
    
    OPTIONAL { ?author foaf:gender ?foafgender }
    BIND(COALESCE(?foafgender, "unk") AS ?gender)

  
} GROUP BY  ?author ?nationality ?teos ?year ?gender

"""

qres = f.query(Q_auth)
# convert to dict
au_fi=[{"year":str(row.asdict()['year'].toPython()),"author":str(row.asdict()['author'].toPython()),"nat":str(row.asdict()['nat'].toPython()),"gender":str(row.asdict()['gender'].toPython()),"novel":str(row.asdict()['teos'].toPython())}  for row in qres]


In [31]:
df_fi=pd.DataFrame(au_fi).sort_values(by="year")

In [32]:
# check the df
print(len(df_fi), df_fi.novel.nunique(), df_fi.author.nunique())

17050 16506 5485


In [35]:
# step 2: get publisher information

# kaikki suomenkieliset


Q_y ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT DISTINCT ?year ?teos  ?publisher ?pname  
WHERE {
  ?teos a rel:novel ; rel:hasPublication ?pub .
  ?pub rel:pubYear ?year .
  ?pub rel:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?label .  }
  BIND(COALESCE(?label, "noname") AS ?pname)
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  


  
} GROUP BY ?year ?publisher ?author ?nationality ?teos ?pub

"""

qres = f_k.query(Q_y)
# convert to dict
pub_fi=[{"year":str(row.asdict()['year'].toPython()),"publisher":str(row.asdict()['publisher'].toPython()),"pubName":str(row.asdict()['pname'].toPython()),"novel":str(row.asdict()['teos'].toPython())}  for row in qres]


In [38]:
df_pub_fi=pd.DataFrame(pub_fi)#.sort_values(by="year")
df_pub_fi.sort_values(by="year").head()


Unnamed: 0,year,publisher,pubName,novel
4199,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,http://www.yso.fi/onto/kaunokki#ateos_34519
5037,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,http://www.yso.fi/onto/kaunokki#ateos_30894
5004,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,RV-kirjat,http://www.yso.fi/onto/kaunokki#ateos_29177
15866,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Karisto Oy,http://www.yso.fi/onto/kaunokki#ateos_44439
4919,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://www.yso.fi/onto/kaunokki#ateos_37518


In [46]:
print(df_pub_fi.novel.nunique())

16245


In [39]:
# step 3: genre information

## viihde: select rakkaus, jannitys ja huumori

# Finnish
Q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT DISTINCT ?broader ?teos ?year WHERE 
    { ?genre rel:broaderCategory <CAT> .  
        ?teos rel:hasGenre ?genre . 
        BIND (<CAT> as ?broader ) 
        ?teos rel:pubYear ?year .
        #?teos rel:hasAuthor ?author.
       #?author foaf:gender ?gender .
       # OPTIONAL {?author rel:authorNationality ?nat1 . }
        
        #BIND(COALESCE(?gender1, "unk") AS ?gender)
        
        
        }
    GROUP BY ?teos ?year ?broader # ?author

    """

viihde_genres_fi=[]
for cat in ["rakkaus","jannitys","huumori"]:
    print(cat)
    q= Q.replace("CAT",cat)
    qres = g2.query(q)
    genre =[{"genre":str(row.asdict()['broader'].toPython()),"year":str(row.asdict()['year'].toPython()),"novel":str(row.asdict()['teos'].toPython())}  for row in qres]
    viihde_genres_fi.append(genre)

rakkaus
jannitys
huumori


In [48]:
df_fi_genre= pd.DataFrame()

for a in viihde_genres_fi:
    df_fi_genre= pd.concat([df_fi_genre, pd.DataFrame(a)])

print(len(df_fi_genre))
print(df_fi_genre.novel.nunique())
df_fi_genre["year"]=pd.to_numeric(df_fi_genre["year"])
df_fi_genre["decade"]=(df_fi_genre["year"]-1)- ((df_fi_genre["year"]-1)%10)
df_fi_genre.sort_values(by="year", inplace=True)
print(df_fi_genre.novel.nunique(), len(df_fi_genre))
df_fi_genre.head()

4687
4503
4503 4687


Unnamed: 0,genre,year,novel,decade
1291,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_45406,1970
830,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_28462,1970
1679,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_40845,1970
255,rakkaus,1971,http://www.yso.fi/onto/kaunokki#ateos_49172,1970
2341,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_44793,1970


In [54]:
df_fi_genre.groupby("novel").size().sort_values()

novel
http://data.kirjasampo.fi/abstractWork_1531529    1
http://www.yso.fi/onto/kaunokki#ateos_23011       1
http://www.yso.fi/onto/kaunokki#ateos_23051       1
http://www.yso.fi/onto/kaunokki#ateos_23105       1
http://www.yso.fi/onto/kaunokki#ateos_23120       1
                                                 ..
http://www.yso.fi/onto/kaunokki#ateos_46856       2
http://www.yso.fi/onto/kaunokki#ateos_46753       2
http://www.yso.fi/onto/kaunokki#ateos_17067       2
http://www.yso.fi/onto/kaunokki#ateos_15996       2
http://www.yso.fi/onto/kaunokki#ateos_18311       2
Length: 4503, dtype: int64

In [51]:
# merge all these together 

# df_fi and publishers
df_fi_all=df_fi.merge(df_pub_fi, on=["novel","year"], suffixes=["","_pub"], how="left")
# make sure that also the NAs are there so that we don't lose any novels
print(df_fi_all.novel.nunique())
df_fi_all[df_fi_all["publisher"].isna()]

16506


Unnamed: 0,year,author,nat,gender,novel,publisher,pubName
12,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_54281,,
24,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_24186,,
47,1971,http://www.seco.tkk.fi/applications/saha#Insta...,http://www.yso.fi/onto/koko/p16897,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_9510039748,,
49,1971,http://www.yso.fi/onto/kaunokki#person_1232726...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_49339,,
51,1971,http://www.yso.fi/onto/kaunokki#person_1231760...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/ue3308952-f397-4f36-a...,,
...,...,...,...,...,...,...,...
11347,2013,http://seco.tkk.fi/saha3/u760c0712-1593-40a6-8...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://seco.tkk.fi/saha3/uc280bec2-6d07-4a74-b...,,
14540,2017,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_4454545,,
14557,2017,http://seco.tkk.fi/saha3/u161e1513-7f91-4502-a...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_4454545,,
14863,2018,http://www.btj.fi/actor_Piiroinen%2C+Aulis,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6880667,,


In [52]:
df_fi_all["year"]=pd.to_numeric(df_fi_all["year"])


In [53]:
# finally genre
# df_fi and publishers
df_fi_all=df_fi_all.merge(df_fi_genre, on=["novel","year"], suffixes=["","_genre"], how="left")
# make sure that also the NAs are there so that we don't lose any novels
print(df_fi_all.novel.nunique())
df_fi_all[df_fi_all["genre"].isna()]

16506


Unnamed: 0,year,author,nat,gender,novel,publisher,pubName,genre,decade
0,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u1b33a416-df8e-4bd0-a...,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,
1,1971,http://www.yso.fi/onto/kaunokki#person_1231760...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u69e83573-b30a-44df-a...,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,,
2,1971,http://www.yso.fi/onto/kaunokki#Ruuska_Jalmari,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_36637,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Eget förlag,,
4,1971,http://www.yso.fi/onto/kaunokki#person_1232726...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_21907,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,
5,1971,http://www.yso.fi/onto/kaunokki#person_1232726...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_44504,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,
...,...,...,...,...,...,...,...,...,...
17235,2020,http://data.kirjasampo.fi/actor_Iso-Kamula%2C+...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_7093735,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Compania Comder HB,,
17236,2020,http://www.yso.fi/onto/kaunokki#writer_37,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_7111974,http://www.btj.fi/actor_Icasos,Icasos,,
17237,2020,http://www.yso.fi/onto/kaunokki#Raisanen_Veikko,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_7085948,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,BoD - Books on Demand,,
17238,2020,http://data.kirjasampo.fi/actor_Rautkorpi%2C+Emmi,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_7128775,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,BoD - Books on Demand,,


In [55]:
# check one novel
# looks right
df_fi_all[df_fi_all["novel"]=="http://www.yso.fi/onto/kaunokki#ateos_18311"]

Unnamed: 0,year,author,nat,gender,novel,publisher,pubName,genre,decade
9563,2009,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_18311,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Tammi,jannitys,2000.0
9564,2009,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_18311,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Tammi,huumori,2000.0


In [56]:
# save 
df_fi_all.to_csv("../../data/df_fi_all2.csv", sep="\t")

### Translated works



In [57]:
# step 1: query from the f and g networks [teos, author, lang, opt(gender), opt(nat) ]



In [60]:

Q_auth ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT ?year ?teos  ?author ?nat ?gender ?lang #(COUNT(DISTINCT ?lang) as ?langCount)# (COUNT(DISTINCT ?nationality) as ?nationalityCount) #?gender #(COUNT(DISTINCT ?authorF) as ?f_authors) (COUNT(DISTINCT ?authorM) as ?m_authors)  
WHERE {
  ?teos a rel:novel ; rel:pubYear ?year .
  
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
   #languages
  ?teos rel:langOrig ?lang . 
  
 
  ?teos rel:hasAuthor ?author. 
    OPTIONAL { ?author rel:authorNationality ?nationality. }
    BIND(COALESCE(?nationality, "nonat") AS ?nat)
    
    OPTIONAL { ?author foaf:gender ?foafgender }
    BIND(COALESCE(?foafgender, "unk") AS ?gender)

  
} GROUP BY  ?author ?nationality ?teos ?year ?gender

"""

qres = g.query(Q_auth)
# convert to dict
au_fo=[{"year":str(row.asdict()['year'].toPython()),"author":str(row.asdict()['author'].toPython()),"nat":str(row.asdict()['nat'].toPython()),"gender":str(row.asdict()['gender'].toPython()),"novel":str(row.asdict()['teos'].toPython()),"lang":str(row.asdict()['lang'].toPython())}  for row in qres]


In [61]:
df_fo=pd.DataFrame(au_fo).sort_values(by="year")

In [62]:
# check the df
print(len(df_fo), df_fo.novel.nunique(), df_fo.author.nunique())

18820 17317 6025


In [67]:
# step 2: get publisher information

# kaikki suomenkieliset


Q_y ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT DISTINCT ?year ?teos  ?publisher ?pname  
WHERE {
  ?teos a rel:novel ; rel:hasPublication ?pub .
  ?pub rel:pubYear ?year .
  ?pub rel:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?label .  }
  BIND(COALESCE(?label, "noname") AS ?pname)
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  


  
} GROUP BY ?year ?publisher ?author ?nationality ?teos ?pub

"""

qres = g_k.query(Q_y)
# convert to dict
pub_fo=[{"year":str(row.asdict()['year'].toPython()),"publisher":str(row.asdict()['publisher'].toPython()),"pubName":str(row.asdict()['pname'].toPython()),"novel":str(row.asdict()['teos'].toPython())}  for row in qres]


In [69]:
df_pub_fo=pd.DataFrame(pub_fo)#.sort_values(by="year")
df_pub_fo.sort_values(by="year").head()


Unnamed: 0,year,publisher,pubName,novel
14668,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://www.yso.fi/onto/kaunokki#ateos_9531
12223,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://www.yso.fi/onto/kaunokki#ateos_6956
12213,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,http://www.yso.fi/onto/kaunokki#ateos_12739
9393,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Weilin + Göös,http://www.yso.fi/onto/kaunokki#ateos_6328
12057,1971,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://www.yso.fi/onto/kaunokki#ateos_2809


In [70]:
print(df_pub_fo.novel.nunique())

17210


In [71]:
# step 3: genre information

## viihde: select rakkaus, jannitys ja huumori

# Finnish
Q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT DISTINCT ?broader ?teos ?year WHERE 
    { ?genre rel:broaderCategory <CAT> .  
        ?teos rel:hasGenre ?genre . 
        BIND (<CAT> as ?broader ) 
        ?teos rel:pubYear ?year .
        #?teos rel:hasAuthor ?author.
       #?author foaf:gender ?gender .
       # OPTIONAL {?author rel:authorNationality ?nat1 . }
        
        #BIND(COALESCE(?gender1, "unk") AS ?gender)
        
        
        }
    GROUP BY ?teos ?year ?broader # ?author

    """

viihde_genres_fo=[]
for cat in ["rakkaus","jannitys","huumori"]:
    print(cat)
    q= Q.replace("CAT",cat)
    qres = g_fo2.query(q)
    genre =[{"genre":str(row.asdict()['broader'].toPython()),"year":str(row.asdict()['year'].toPython()),"novel":str(row.asdict()['teos'].toPython())}  for row in qres]
    viihde_genres_fo.append(genre)

rakkaus
jannitys
huumori


In [72]:
df_fo_genre= pd.DataFrame()

for a in viihde_genres_fo:
    df_fo_genre= pd.concat([df_fo_genre, pd.DataFrame(a)])

print(len(df_fo_genre))
print(df_fo_genre.novel.nunique())
df_fo_genre["year"]=pd.to_numeric(df_fo_genre["year"])
df_fo_genre["decade"]=(df_fo_genre["year"]-1)- ((df_fo_genre["year"]-1)%10)
df_fo_genre.sort_values(by="year", inplace=True)
print(df_fo_genre.novel.nunique(), len(df_fo_genre))
df_fo_genre.head()

9073
8502
8502 9073


Unnamed: 0,genre,year,novel,decade
1373,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_2505,1970
3328,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_3427,1970
2694,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_2383,1970
4894,jannitys,1971,http://seco.tkk.fi/saha3/u1b23ca86-f031-4a88-a...,1970
4897,jannitys,1971,http://seco.tkk.fi/saha3/uf5c32aec-6eb9-4262-8...,1970


In [73]:
df_fo_genre.groupby("novel").size().sort_values()

novel
http://data.kirjasampo.fi/abstractWorkPart_4213098_5627860f-92b8-4eba-b3b4-a53014267cd4    1
http://www.yso.fi/onto/kaunokki#ateos_2985                                                 1
http://www.yso.fi/onto/kaunokki#ateos_2984                                                 1
http://www.yso.fi/onto/kaunokki#ateos_2981                                                 1
http://www.yso.fi/onto/kaunokki#ateos_298                                                  1
                                                                                          ..
http://www.yso.fi/onto/kaunokki#ateos_12794                                                3
http://www.yso.fi/onto/kaunokki#ateos_3984                                                 4
http://www.yso.fi/onto/kaunokki#ateos_9740                                                 4
http://www.yso.fi/onto/kaunokki#ateos_9989                                                 4
http://www.yso.fi/onto/kaunokki#ateos_4129                      

In [74]:
# merge all these together 

# df_fo and publishers
df_fo_all=df_fo.merge(df_pub_fo, on=["novel","year"], suffixes=["","_pub"], how="left")
# make sure that also the NAs are there so that we don't lose any novels
print(df_fo_all.novel.nunique())
df_fo_all[df_fo_all["publisher"].isna()]

17317


Unnamed: 0,year,author,nat,gender,novel,lang,publisher,pubName
40,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u79c830a6-1b29-484a-8...,http://lexvo.org/id/iso639-3/eng,,
55,1971,http://seco.tkk.fi/saha3/u842b5cb8-e370-4b1a-9...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://seco.tkk.fi/saha3/u04c6578c-9c5d-4d4f-8...,http://lexvo.org/id/iso639-3/eng,,
131,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,unk,http://www.yso.fi/onto/kaunokki#ateos_9621,http://lexvo.org/id/iso639-3/eng,,
443,1973,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_5565,http://lexvo.org/id/iso639-3/eng,,
480,1973,http://www.yso.fi/onto/kaunokki#character_1231...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_3666,http://lexvo.org/id/iso639-3/eng,,
...,...,...,...,...,...,...,...,...
14764,2012,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p32570,http://www.yso.fi/onto/kaunokki#female,http://seco.tkk.fi/saha3/u47e87ada-677f-4fc9-b...,http://lexvo.org/id/iso639-3/sme,,
15955,2015,http://data.kirjasampo.fi/actor_Piruzyan%2C+Anait,http://www.yso.fi/onto/koko/p8327,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4461900,http://lexvo.org/id/iso639-3/rus,,
17131,2017,http://data.kirjasampo.fi/actor_Crammond%2C+Linda,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_4495503,http://lexvo.org/id/iso639-3/eng,,
18520,2020,http://seco.tkk.fi/saha3/ub191b13f-862c-4301-8...,http://www.yso.fi/onto/koko/p16897,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_6951008,http://lexvo.org/id/iso639-3/swe,,


In [75]:
df_fo_all["year"]=pd.to_numeric(df_fo_all["year"])


In [76]:
# finally genre
# df_fi and publishers
df_fo_all=df_fo_all.merge(df_fo_genre, on=["novel","year"], suffixes=["","_genre"], how="left")
# make sure that also the NAs are there so that we don't lose any novels
print(df_fo_all.novel.nunique())
df_fo_all[df_fo_all["genre"].isna()]

17317


Unnamed: 0,year,author,nat,gender,novel,lang,publisher,pubName,genre,decade
0,1971,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_8518,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,,
3,1971,http://seco.tkk.fi/saha3/u93bae653-78a3-4a7a-a...,http://www.yso.fi/onto/koko/p16897,http://www.yso.fi/onto/kaunokki#female,http://seco.tkk.fi/saha3/u7d29e490-f7c3-48ca-b...,http://lexvo.org/id/iso639-3/swe,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,
4,1971,http://www.yso.fi/onto/kaunokki#person_1231760...,http://www.yso.fi/onto/koko/p12201,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_6328,http://lexvo.org/id/iso639-3/ita,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Weilin + Göös,,
7,1971,http://www.yso.fi/onto/kaunokki#person_1231760...,http://www.yso.fi/onto/koko/p8327,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_11472,http://lexvo.org/id/iso639-3/rus,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Edistys,,
11,1971,http://www.seco.tkk.fi/applications/saha#Insta...,http://www.yso.fi/onto/koko/p6011,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_6393,http://lexvo.org/id/iso639-3/spa,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,,
...,...,...,...,...,...,...,...,...,...,...
19389,2020,http://www.yso.fi/onto/kaunokki#character_1231...,http://www.yso.fi/onto/koko/p16897,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6986774,http://lexvo.org/id/iso639-3/swe,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Like,,
19392,2020,http://data.kirjasampo.fi/actor_Griffiths%2C+Andy,http://www.yso.fi/onto/koko/p9135,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4310734,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,
19393,2020,http://www.yso.fi/onto/kaunokki#person_1232726...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_7029362,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Tammi,,
19394,2020,http://data.kirjasampo.fi/actor_Fricke%2C+Lucy,http://www.yso.fi/onto/koko/p745,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_7104489,http://lexvo.org/id/iso639-3/deu,http://data.kirjasampo.fi/actor_Kustantamo+Huippu,Kustantamo Huippu,,


In [78]:
# check one novel
# looks right
df_fo_all[df_fo_all["novel"]=="http://www.yso.fi/onto/kaunokki#ateos_3984"]

Unnamed: 0,year,author,nat,gender,novel,lang,publisher,pubName,genre,decade
6853,1995,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_3984,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Tammi,jannitys,1990.0
6854,1995,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_3984,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Tammi,huumori,1990.0
12779,2008,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_3984,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,huumori,2000.0
12780,2008,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_3984,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,jannitys,2000.0


In [79]:
# save

df_fo_all.to_csv("../../data/df_fo_all2.csv", sep="\t")

In [81]:
print(len(df_fo.drop_duplicates(["novel","year"])),  
     len(df_fo_all.drop_duplicates(["novel","year"])),
      len(df_pub_fo.drop_duplicates(["novel","year"])),
      len(df_fo_genre.drop_duplicates(["novel","year"]))
     )


17539 17539 17429 8599


In [82]:
print(len(df_fi.drop_duplicates(["novel","year"])),  
     len(df_fi_all.drop_duplicates(["novel","year"])),
      len(df_pub_fi.drop_duplicates(["novel","year"])),
      len(df_fi_genre.drop_duplicates(["novel","year"]))
     )


16568 16568 16303 4526
