In [1]:
# import 

from collections import defaultdict, OrderedDict, Counter
from datetime import datetime

from dateutil.relativedelta import relativedelta
import glob
from itertools import product, combinations
import matplotlib.pyplot as plt
import numpy    as np
#import networkx as nx
from operator import itemgetter
import pandas   as pd
import rdflib as rdflib
import re
from scipy.signal import convolve2d
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from SPARQLWrapper import SPARQLWrapper, JSON, POST, TURTLE
import sys
from rdflib.namespace import XSD, Namespace
from rdflib.term import URIRef

import IPython
import pprint
pp = pprint.PrettyPrinter(indent=4)



from constants import *
from helpers import *


from rdflib import Graph

from rdflib.namespace import RDF, SKOS
rel = rdflib.Namespace('http://ldf.fi/relse/')

In [2]:
# helper functions

def to_df(reslist, sort_column):
    """Build a DataFrame from a list of result records and sort it.

    Parameters
    ----------
    reslist : list of dict
        One mapping per result row.
    sort_column : str or list of str
        Column name(s) passed to ``DataFrame.sort_values(by=...)``.

    Returns
    -------
    pandas.DataFrame
        The sorted frame. A ``year`` column, when present, is numeric.
    """
    my_df = pd.DataFrame(reslist)
    # Convert before sorting: sorting the raw strings would order them
    # lexicographically ("100" < "99") when sort_column is "year".
    if "year" in my_df.columns:
        my_df["year"] = pd.to_numeric(my_df["year"])
    # An empty result list yields a frame without columns; sort_values would
    # raise KeyError on the missing sort column, so return it as is.
    if my_df.empty:
        return my_df
    return my_df.sort_values(by=sort_column)


def add_period(df_fo, columnname="year"):
    """Add an integer ``period`` column derived from binning ``columnname``.

    The values are cut with ``pd.cut`` using the module-level ``bins``
    (not defined in this cell; presumably provided by the star imports —
    TODO confirm). Each interval label is turned into text and the number
    before the first comma, with any ``[`` removed, becomes the period.

    The frame is modified in place and also returned.
    """
    def _leading_number(label):
        # "<lower>, <upper>" -> text before the first comma, '[' stripped.
        head = label.split(",")[0]
        return int(head.replace("[", ""))

    binned = pd.cut(df_fo[columnname], bins=bins, include_lowest=True, precision=0)
    df_fo["period"] = binned
    df_fo["period"] = df_fo["period"].astype("str")
    df_fo["period"] = df_fo["period"].apply(_leading_number)
    return df_fo

def add_decade(df_fi, columname="year"):
    """Add a ``decade`` column computed from a numeric year column.

    Years are grouped so that a decade ends on the round year: 1971-1980
    map to 1970, 1981-1990 map to 1980, and so on.

    Parameters
    ----------
    df_fi : pandas.DataFrame
        Frame holding the year column; it is modified in place.
    columname : str, default "year"
        Name of the numeric year column. (Previously this argument was
        ignored and "year" was always used.)

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the ``decade`` column added.
    """
    shifted = df_fi[columname] - 1
    df_fi["decade"] = shifted - (shifted % 10)
    return df_fi

In [3]:
# SPARQL CONSTRUCT template for translated novels: works typed
# kaunokki:romaani whose original language is not Finnish and that have a
# Finnish-language publication. Matches are re-expressed in the rel:
# vocabulary as novel -> publication -> publisher, with the publication year
# cast to an integer.
# Only the rel: prefix is declared in the string; it is sent as PREFIXES + Q,
# so kaunokki:, skos: and xsd: presumably come from PREFIXES — TODO confirm.
# rel:pubYear is attached to the novel as well as to each publication, so a
# novel with several matching publications receives several year values.
# NOTE: the name Q is reused for a different query template further down.
Q = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:hasPublication ?julkaisu ;
    rel:pubYear ?vuosi_ .

?julkaisu a rel:publication ;
    rel:hasPublisher ?publisher ;
    rel:pubYear ?vuosi_ .


?publisher a rel:publisher ;
	skos:prefLabel ?publisher_name .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli ?kieliuri .
  FILTER (?kieliuri !=  <http://lexvo.org/id/iso639-3/fin>  )
  OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  ?julkaisu kaunokki:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?publisher_name }


} 
"""

In [4]:
# Run the translated-novels CONSTRUCT query (Q) against the BookSampo endpoint.

# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

# PREFIXES is prepended because Q itself only declares the rel: prefix.
sparql.setQuery(PREFIXES + Q)

# Send the first (name, value) pair of AUTHORIZATION_HEADER as a custom HTTP header.
# NOTE(review): the endpoint URL is plain http, so this header is presumably
# sent unencrypted — confirm whether an https endpoint can be used.
sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

# Request Turtle; the converted response is kept in `results` for the parsing cell below.
sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [5]:
# Parse the Turtle returned by the translated-novels CONSTRUCT query into its
# own graph. Graph is already imported in the first cell.
# NOTE: `results` is overwritten by the Finnish query further down, so this
# cell must run before that one.
g_k = Graph()
g_k.parse(data=results, format="turtle")

<Graph identifier=N29325a315a60471cbdf71b2169841ef4 (<class 'rdflib.graph.Graph'>)>

In [6]:
# Same CONSTRUCT template for Finnish literature: works typed
# kaunokki:romaani whose original language is Finnish and that have a
# Finnish-language publication. The output shape is identical to Q
# (novel -> publication -> publisher, integer publication year).
# Only the rel: prefix is declared in the string; it is sent as
# PREFIXES + Q_fi, so the other prefixes presumably come from PREFIXES —
# TODO confirm.


Q_fi = """
PREFIX rel: <http://ldf.fi/relse/>

CONSTRUCT {


?teos a rel:novel ;
    skos:prefLabel ?title ;
    rel:hasPublication ?julkaisu ;
    rel:pubYear ?vuosi_ .

?julkaisu a rel:publication ;
    rel:hasPublisher ?publisher ;
    rel:pubYear ?vuosi_ .


?publisher a rel:publisher ;
	skos:prefLabel ?publisher_name .

}

WHERE {

  ?teos a kaunokki:romaani ; skos:prefLabel ?title .
  ?teos kaunokki:alkukieli <http://lexvo.org/id/iso639-3/fin>  .
  #OPTIONAL {?kieliuri skos:prefLabel ?kieli . }
  ?teos kaunokki:manifests_in ?julkaisu .
  ?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin> . # käännetty suomeksi
  FILTER EXISTS {?julkaisu kaunokki:kieli <http://lexvo.org/id/iso639-3/fin>  }
  ?julkaisu kaunokki:ilmestymisvuosi ?vuosi . ?vuosi skos:prefLabel ?vuosiluku .
  BIND(xsd:integer(?vuosiluku) AS ?vuosi_)
  ?julkaisu kaunokki:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?publisher_name }


} 
"""

In [7]:
# Run the Finnish-novels CONSTRUCT query (Q_fi) against the same endpoint.
# NOTE: this rebinds `sparql` and `results`; the translated-novels response
# must already have been parsed into g_k before this cell runs.
# query graph
print('\n\n*** constructing the graph')
sparql = SPARQLWrapper("http://ldf.fi/booksampo-2022/sparql")

# PREFIXES is prepended because Q_fi itself only declares the rel: prefix.
sparql.setQuery(PREFIXES + Q_fi)

# Send the first (name, value) pair of AUTHORIZATION_HEADER as a custom HTTP header.
sparql.addCustomHttpHeader(*list(AUTHORIZATION_HEADER.items())[0])

# Request Turtle and keep the converted response for parsing in the next cell.
sparql.setReturnFormat(TURTLE)
results = sparql.query().convert()



*** constructing the graph


In [8]:
# Parse the Turtle response of the Finnish-novels query into its own graph.
f_k = Graph()

f_k.parse(data=results, format="turtle")

<Graph identifier=N5aefa24a68e244e0bf61f4c56cec02ff (<class 'rdflib.graph.Graph'>)>

In [9]:
# Load the previously built Finnish graph from disk.
# No format is passed, so rdflib presumably infers N-Triples from the
# .nt extension — TODO confirm for the installed rdflib version.


f = Graph()
f.parse('../../data/finnish_graph_new.nt')

<Graph identifier=Ncaf91d23c98b4cf486c78ca286132f0b (<class 'rdflib.graph.Graph'>)>

In [10]:
# Load the previously built graph for foreign (translated) works from disk.
g = Graph()  # an older comment here pointed at an absolute local path to graph2.ttl
# (earlier variant parsed ../../data/finnish_graph2.nt into f; kept out of the run)

g.parse('../../data/nat_lang_graph.nt')

<Graph identifier=N6e7c7b50129e40548c4440db888fffbb (<class 'rdflib.graph.Graph'>)>

In [11]:
# Genre lookup: map each broader genre name to the list of genre URIs that
# belong to it, read from a tab-separated file with columns
# `genrename` and `genre`.
genre = pd.read_csv("../../data/genre_df.csv", sep="\t", index_col=0)

# Iterate the grouped `genre` column directly instead of
# groupby(...).apply(...).iteritems(): Series.iteritems() no longer exists
# in pandas 2.x, and this form yields the same name -> list mapping with
# group names in sorted order.
genre_dict = {
    name: list(uris) for name, uris in genre.groupby("genrename")["genre"]
}

# Genre/theme graphs: one for Finnish works, one for translated works.
g_fi = Graph()
g_fi.parse("../../data/finnish_genre_theme.nt", format="nt")

# NOTE(review): this path starts with ../data while every other data file in
# the notebook is read from ../../data — confirm it is intentional.
g_fo = Graph()
g_fo.parse("../data/translated_theme_genre_new.nt", format="nt")

<Graph identifier=Na95170cdd8c24829bb21dd025d811cd9 (<class 'rdflib.graph.Graph'>)>

In [12]:
# Combine the Finnish graph loaded from disk (f) with the publication
# triples fetched from the endpoint (f_k).

F = f + f_k

In [13]:
# Combine the foreign-works graph loaded from disk (g) with the publication
# triples fetched from the endpoint (g_k).
G = g+g_k

In [14]:
# Attach broader genre categories to the Finnish genre graph and count the
# novels per category. RDF and `rel` come from the first cell, so they are
# not re-imported / re-bound here.
# NOTE: g2 is an alias of g_fi (not a copy); triples added below also end up
# in g_fi.
g2 = g_fi
check_q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?broader (COUNT(DISTINCT ?teos) AS ?novels) WHERE 
    { ?genre rel:broaderCategory <CAT> .  ?teos rel:hasGenre ?genre . BIND (<CAT> as ?broader ) }
    GROUP BY ?broader

    """

# One result list per broader category name.
allres = {key: [] for key in genre_dict}
for key, val in genre_dict.items():
    print(key)

    # Declare the category node itself ...
    g2.add((URIRef(key), RDF.type, rel.broaderCategory))

    # ... and link every member genre URI to it.
    for v in val:
        g2.add((URIRef(v), rel.broaderCategory, URIRef(key)))

    # The template uses the literal text CAT as placeholder for the category.
    c_q = check_q.replace("CAT", key)
    qres = g2.query(c_q)
    allres[key].extend(qres)


category = "lapset"
print("Found", allres[category], "novels of category", category)

elain
era
erotiikka
fantasia
historia
huumori
jannitys
kauhu
lapset
nuoret
rakkaus
scifi
sota
urheilu
uskonto
Found [(rdflib.term.URIRef('lapset'), rdflib.term.Literal('986', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))] novels of category lapset


In [15]:
# Attach broader genre categories to the translated-works genre graph and
# count the novels per category. RDF and `rel` come from the first cell, so
# they are not re-imported / re-bound here.
# NOTE: g_fo2 is an alias of g_fo (not a copy); triples added below also end
# up in g_fo.
g_fo2 = g_fo
check_q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?broader (COUNT(DISTINCT ?teos) AS ?novels) WHERE 
    { ?genre rel:broaderCategory <CAT> .  ?teos rel:hasGenre ?genre . BIND (<CAT> as ?broader ) }
    GROUP BY ?broader

    """

# One result list per broader category name.
allres = {key: [] for key in genre_dict}
for key, val in genre_dict.items():
    print(key)

    # Declare the category node itself ...
    g_fo2.add((URIRef(key), RDF.type, rel.broaderCategory))

    # ... and link every member genre URI to it.
    for v in val:
        g_fo2.add((URIRef(v), rel.broaderCategory, URIRef(key)))

    # The template uses the literal text CAT as placeholder for the category.
    c_q = check_q.replace("CAT", key)
    qres = g_fo2.query(c_q)
    allres[key].extend(qres)

category = "lapset"
print("Found", allres[category], "novels of category", category)

elain
era
erotiikka
fantasia
historia
huumori
jannitys
kauhu
lapset
nuoret
rakkaus
scifi
sota
urheilu
uskonto
Found [(rdflib.term.URIRef('lapset'), rdflib.term.Literal('960', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#integer')))] novels of category lapset


In [16]:
# step 1: query from the F and G networks [teos, publication, year, publisher]

# step 2: query from the author and lang networks df[teos,author,nat,lang]

# step 3: query from the genre network df[teos,genre]



In [17]:
# All Finnish-language works.
# Publishers of Finnish-language "marginal literature".
# Selects (year, novel, publisher, publisher name, author) for publications
# with 1970 < year < 2021; publishers without a label get the name "noname".

Q_y ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT ?year ?teos  ?publisher ?pname ?author  #(COUNT(DISTINCT ?lang) as ?langCount)# (COUNT(DISTINCT ?nationality) as ?nationalityCount) #?gender #(COUNT(DISTINCT ?authorF) as ?f_authors) (COUNT(DISTINCT ?authorM) as ?m_authors)  
WHERE {
  ?teos a rel:novel ; rel:hasPublication ?pub .
  ?pub rel:pubYear ?year .
  ?pub rel:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?label .  }
  BIND(COALESCE(?label, "noname") AS ?pname)
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
 
  ?teos rel:hasAuthor ?author. 

  
} GROUP BY ?year ?publisher ?author ?nationality ?teos

"""

qres = F.query(Q_y)
# One plain-string record per result row; every value is stringified.
div_fi = [
    {
        "year": str(rec["year"].toPython()),
        "publisher": str(rec["publisher"].toPython()),
        "pubName": str(rec["pname"].toPython()),
        "author": str(rec["author"].toPython()),
        "novel": str(rec["teos"].toPython()),
    }
    for rec in (row.asdict() for row in qres)
]


In [18]:
# Publisher records for Finnish works as a frame, ordered by the (string) year.
df_fi = pd.DataFrame(div_fi).sort_values("year")
# Number of distinct publishers and total number of rows.
print(df_fi["publisher"].nunique(), len(df_fi))
df_fi.drop_duplicates().tail(10)

778 16496


Unnamed: 0,year,publisher,pubName,author,novel
6290,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Teos,http://www.yso.fi/onto/kaunokki#person_1231760...,http://data.kirjasampo.fi/abstractWork_7086810
6300,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Tammi,http://www.yso.fi/onto/kaunokki#person_1231759...,http://data.kirjasampo.fi/abstractWork_7029269
6319,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Tammi,http://seco.tkk.fi/saha3/u161e1513-7f91-4502-a...,http://data.kirjasampo.fi/abstractWork_9789513...
6320,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Tammi,http://www.yso.fi/onto/kaunokki#person_1231759...,http://data.kirjasampo.fi/abstractWork_9789513...
15067,2020,http://data.kirjasampo.fi/actor_Sanoma+Pro,Sanoma Pro,http://www.yso.fi/onto/kaunokki#person_1231759...,http://data.kirjasampo.fi/abstractWork_7094068
6327,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Karisto Oy,http://seco.tkk.fi/saha3/ue814a7ea-cc2f-4dfd-9...,http://data.kirjasampo.fi/abstractWork_7045181
6378,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://data.kirjasampo.fi/actor_Rantala%2C+Heli,http://data.kirjasampo.fi/abstractWork_7081511
6379,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,BoD - Books on Demand,http://www.yso.fi/onto/kaunokki#person_1231759...,http://data.kirjasampo.fi/abstractWork_7151242
6282,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Omakustanne,http://data.kirjasampo.fi/actor_Kivi%2C+Jouko,http://data.kirjasampo.fi/abstractWork_7148070
16495,2020,http://www.btj.fi/actor_Aurinko,Aurinko Kustannus,http://seco.tkk.fi/saha3/uf6c4db79-57f6-49a5-b...,http://data.kirjasampo.fi/abstractWork_7057637


In [19]:
# Finnish authors: gender and nationality

In [20]:
# Author query for Finnish works: (year, novel, author, nationality, gender)
# for novels with 1970 < year < 2021. Missing nationality becomes "nonat",
# missing gender becomes "unk". Runs on the on-disk graph f (not on F).
Q_auth ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT ?year ?teos  ?author ?nat ?gender #(COUNT(DISTINCT ?lang) as ?langCount)# (COUNT(DISTINCT ?nationality) as ?nationalityCount) #?gender #(COUNT(DISTINCT ?authorF) as ?f_authors) (COUNT(DISTINCT ?authorM) as ?m_authors)  
WHERE {
  ?teos a rel:novel ; rel:pubYear ?year .
  
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
  # languages
  #?teos rel:langOrig ?lang . 
  
 
  ?teos rel:hasAuthor ?author. 
    OPTIONAL { ?author rel:authorNationality ?nationality. }
    BIND(COALESCE(?nationality, "nonat") AS ?nat)
    
    OPTIONAL { ?author foaf:gender ?foafgender }
    BIND(COALESCE(?foafgender, "unk") AS ?gender)

  
} GROUP BY  ?author ?nationality ?teos ?year ?gender

"""

qres = f.query(Q_auth)
# One plain-string record per result row; every value is stringified.
au_fi = [
    {
        "year": str(rec["year"].toPython()),
        "author": str(rec["author"].toPython()),
        "nat": str(rec["nat"].toPython()),
        "gender": str(rec["gender"].toPython()),
        "novel": str(rec["teos"].toPython()),
    }
    for rec in (row.asdict() for row in qres)
]


In [21]:
# Author records for Finnish works as a frame.
# The rename has no effect on the records built above (they already use the
# key "novel"); it is kept so the cell behaves the same for any input.
audf_fi = pd.DataFrame(au_fi).rename(columns={"teos": "novel"})
# Distinct authors, total rows, distinct (novel, author) and
# distinct (novel, author, year) combinations.
print(
    audf_fi["author"].nunique(),
    len(audf_fi),
    len(audf_fi.drop_duplicates(subset=["novel", "author"])),
    len(audf_fi.drop_duplicates(subset=["novel", "author", "year"])),
)
audf_fi.drop_duplicates().tail(10)

5380 16816 16687 16738


Unnamed: 0,year,author,nat,gender,novel
16806,1973,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,unk,http://www.yso.fi/onto/kaunokki#ateos_45153
16807,1993,http://seco.tkk.fi/saha3/ufd455299-db24-4d0d-8...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://seco.tkk.fi/saha3/u9b5b7c82-881f-4232-a...
16808,1978,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u52018df8-5fc4-4f4d-9...
16809,2018,http://data.kirjasampo.fi/actor_Aalto%2C+Jorma,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6846349
16810,2008,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_31972
16811,2019,http://data.kirjasampo.fi/actor_Nikki%2C+Teemu,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6969400
16812,2019,http://data.kirjasampo.fi/actor_P%C3%B6s%C3%B6...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6969400
16813,2020,http://data.kirjasampo.fi/actor_Silvan%2C+Heidi,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_7037934
16814,1990,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_38615
16815,2000,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_15464


In [22]:
# Left join: keep every author row and attach publisher columns where the
# (novel, author, year) key has a match in df_fi.
df_fi2 = pd.merge(
    audf_fi,
    df_fi,
    how="left",
    on=["novel", "author", "year"],
    suffixes=("_au", ""),
)

In [23]:
# Display the merged author + publisher frame for Finnish works.
df_fi2

Unnamed: 0,year,author,nat,gender,novel,publisher,pubName
0,2017,http://data.kirjasampo.fi/actor_Sovij%C3%A4rvi...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4492055,http://data.kirjasampo.fi/actor_Arsmat,Arsmat
1,2013,http://www.btj.fi/actor_Piiroinen%2C+Aulis,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4014830,http://www.seco.tkk.fi/applications/saha#Insta...,Mediapinta Oy
2,1972,http://seco.tkk.fi/saha3/uc84440c2-42e7-4ba7-b...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_20145,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Weilin + Göös
3,2010,http://data.kirjasampo.fi/actor_Vinnurva%2C+Pe...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_9789529...,http://data.kirjasampo.fi/actor_IPE+Tuoteapu,IPE Tuoteapu
4,2017,http://www.btj.fi/actor_Matilainen%2C+Pekka,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4409323,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Atena
...,...,...,...,...,...,...,...
16845,2019,http://data.kirjasampo.fi/actor_Nikki%2C+Teemu,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6969400,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava
16846,2019,http://data.kirjasampo.fi/actor_P%C3%B6s%C3%B6...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_6969400,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava
16847,2020,http://data.kirjasampo.fi/actor_Silvan%2C+Heidi,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#female,http://data.kirjasampo.fi/abstractWork_7037934,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Myllylahti
16848,1990,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_38615,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava


In [24]:
# Sanity check: number of distinct (novel, year) pairs in the publisher
# frame, the merged frame and the author frame.
print(*(
    len(frame.drop_duplicates(["novel", "year"]))
    for frame in (df_fi, df_fi2, audf_fi)
))

# df_fi2 has all!

16087 16363 16363


In [25]:
# df_fi2 is the correct one to use



In [26]:
#df_fi2.to_csv("../../data/fi_au_pub.csv", sep="\t")

In [27]:
## Entertainment ("viihde"): select the broader genres rakkaus (romance),
## jannitys (suspense) and huumori (humour).

# Fetch all works of each category. The template uses the literal text CAT
# as placeholder for the category name.
# NOTE: this rebinds Q, which held the CONSTRUCT query earlier in the notebook.

Q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT DISTINCT ?broader ?teos ?year WHERE 
    { ?genre rel:broaderCategory <CAT> .  
        ?teos rel:hasGenre ?genre . 
        BIND (<CAT> as ?broader ) 
        ?teos rel:pubYear ?year .
        #?teos rel:hasAuthor ?author.
       #?author foaf:gender ?gender .
       # OPTIONAL {?author rel:authorNationality ?nat1 . }
        
        #BIND(COALESCE(?gender1, "unk") AS ?gender)
        
        
        }
    GROUP BY ?teos ?year ?broader # ?author

    """

# One list of row records per category, in the order queried.
viihde_genres_fi = []
for cat in ("rakkaus", "jannitys", "huumori"):
    print(cat)
    q = Q.replace("CAT", cat)
    qres = g2.query(q)
    # NOTE: `genre` is rebound here to a list of records and no longer refers
    # to the DataFrame read from genre_df.csv earlier in the notebook.
    genre = [
        {
            "genre": str(rec["broader"].toPython()),
            "year": str(rec["year"].toPython()),
            "novel": str(rec["teos"].toPython()),
        }
        for rec in (row.asdict() for row in qres)
    ]
    viihde_genres_fi.append(genre)

rakkaus
jannitys
huumori


In [28]:
# Stack the per-category record lists into one frame with a single concat
# (growing a frame inside a loop re-copies all earlier rows on every pass).
# The original row indices are kept, as before.
df_fi_genre = pd.concat([pd.DataFrame(rows) for rows in viihde_genres_fi])

# Total rows vs. distinct novels (a novel can appear under several categories).
print(len(df_fi_genre))
print(df_fi_genre.novel.nunique())

# Years arrive as strings; make them numeric before deriving the decade.
df_fi_genre["year"] = pd.to_numeric(df_fi_genre["year"])
# Reuse the notebook's add_decade helper (same formula as was inlined here)
# and sort into a new frame instead of sorting in place.
df_fi_genre = add_decade(df_fi_genre).sort_values(by="year")
df_fi_genre.head()

4687
4503


Unnamed: 0,genre,year,novel,decade
1291,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_45406,1970
830,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_28462,1970
1679,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_40845,1970
255,rakkaus,1971,http://www.yso.fi/onto/kaunokki#ateos_49172,1970
2341,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_44793,1970


In [29]:
# Left join on novel only: every author/publisher row is kept, and a novel
# matching several genre rows is repeated once per match. Overlapping genre
# columns get the suffix "_genre".
df_fi_final = pd.merge(
    df_fi2,
    df_fi_genre,
    how="left",
    on=["novel"],
    suffixes=("", "_genre"),
)

In [30]:
# Distinct (novel, year) pairs in the publisher, merged, author and final
# frames.
print(*(
    len(frame.drop_duplicates(["novel", "year"]))
    for frame in (df_fi, df_fi2, audf_fi, df_fi_final)
))


16087 16363 16363 16363


In [31]:
#df_fi_final.to_csv("../../data/df_fi_all.csv", sep="\t")

In [32]:
# Same for foreign works.
# All non-Finnish-language works: (year, novel, publisher, publisher name,
# author) for publications with 1970 < year < 2021; publishers without a
# label get the name "noname".
# NOTE: this rebinds Q_y, which held the Finnish-works query above.



Q_y ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT ?year ?teos  ?publisher ?pname ?author  #(COUNT(DISTINCT ?lang) as ?langCount)# (COUNT(DISTINCT ?nationality) as ?nationalityCount) #?gender #(COUNT(DISTINCT ?authorF) as ?f_authors) (COUNT(DISTINCT ?authorM) as ?m_authors)  
WHERE {
  ?teos a rel:novel ; rel:hasPublication ?pub .
  ?pub rel:pubYear ?year .
  ?pub rel:hasPublisher ?publisher .
  OPTIONAL { ?publisher skos:prefLabel ?label .  }
  BIND(COALESCE(?label, "noname") AS ?pname)
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
 
  ?teos rel:hasAuthor ?author. 

  
} GROUP BY ?year ?publisher ?author ?teos

"""

qres = G.query(Q_y)
# One plain-string record per result row; every value is stringified.
div_fo = [
    {
        "year": str(rec["year"].toPython()),
        "publisher": str(rec["publisher"].toPython()),
        "pubName": str(rec["pname"].toPython()),
        "author": str(rec["author"].toPython()),
        "novel": str(rec["teos"].toPython()),
    }
    for rec in (row.asdict() for row in qres)
]


In [33]:
# Publisher records for foreign works as a frame, ordered by the (string) year.
df_fo = pd.DataFrame(div_fo).sort_values("year")

# Number of distinct publishers and total number of rows.
print(df_fo["publisher"].nunique(), len(df_fo))
df_fo.drop_duplicates().tail(10)

323 18100


Unnamed: 0,year,publisher,pubName,author,novel
107,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,http://data.kirjasampo.fi/actor_Bengtsdotter%2...,http://data.kirjasampo.fi/abstractWork_7084706
7081,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,http://data.kirjasampo.fi/actor_Lundberg%2C+Sofia,http://data.kirjasampo.fi/abstractWork_6984203
13997,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Tammi,http://data.kirjasampo.fi/actor_Aciman%2C+Andr,http://data.kirjasampo.fi/abstractWork_6980724
1342,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,http://data.kirjasampo.fi/actor_Luiselli%2C+Va...,http://data.kirjasampo.fi/abstractWork_7056178
10681,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,http://data.kirjasampo.fi/actor_Offill%2C+Jenny,http://data.kirjasampo.fi/abstractWork_7089780
10396,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,http://data.kirjasampo.fi/actor_Hurley%2C+Andr...,http://data.kirjasampo.fi/abstractWork_7081432
7077,2020,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Readme.fi,http://seco.tkk.fi/saha3/uc3d9a509-6b8e-4e36-b...,http://data.kirjasampo.fi/abstractWork_7039492
9915,2020,http://data.kirjasampo.fi/actor_Tiberius+kirjat,Tiberius kirjat,http://www.yso.fi/onto/kaunokki#person_1231759...,http://seco.tkk.fi/saha3/u4f745a51-f364-44aa-b...
8386,2020,http://data.kirjasampo.fi/actor_HarperCollins+...,HarperCollins Nordic AB,http://www.yso.fi/onto/kaunokki#person_1231759...,http://data.kirjasampo.fi/abstractWork_4293866
12834,2020,http://data.kirjasampo.fi/actor_Aula+%26+Co,Aula & Co,http://data.kirjasampo.fi/actor_Colling+Nielse...,http://data.kirjasampo.fi/abstractWork_6847141


In [34]:
#df_fo[df_fo["publisher"].isna()]

In [36]:
# Author query for foreign works: (year, novel, author, nationality, gender,
# original language) for novels with 1970 < year < 2021. Missing nationality
# becomes "nonat", missing gender becomes "unk". Runs on the on-disk graph g
# (not on G).
# NOTE: this rebinds Q_auth, which held the Finnish-works query above.
Q_auth ="""

PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
PREFIX kaunokki: <http://www.yso.fi/onto/kaunokki#> 
PREFIX rel: <http://ldf.fi/relse/> 
PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT ?year ?teos  ?author ?nat ?gender ?lang #(COUNT(DISTINCT ?lang) as ?langCount)# (COUNT(DISTINCT ?nationality) as ?nationalityCount) #?gender #(COUNT(DISTINCT ?authorF) as ?f_authors) (COUNT(DISTINCT ?authorM) as ?m_authors)  
WHERE {
  ?teos a rel:novel ; rel:pubYear ?year .
  
  FILTER(?year < 2021)

  FILTER(?year > 1970)
  
   #languages
  ?teos rel:langOrig ?lang . 
  
 
  ?teos rel:hasAuthor ?author. 
    OPTIONAL { ?author rel:authorNationality ?nationality. }
    BIND(COALESCE(?nationality, "nonat") AS ?nat)
    
    OPTIONAL { ?author foaf:gender ?foafgender }
    BIND(COALESCE(?foafgender, "unk") AS ?gender)

  
} GROUP BY  ?author ?nationality ?teos ?year ?gender

"""

qres = g.query(Q_auth)
# One plain-string record per result row; every value is stringified.
au_fo = [
    {
        "year": str(rec["year"].toPython()),
        "author": str(rec["author"].toPython()),
        "nat": str(rec["nat"].toPython()),
        "gender": str(rec["gender"].toPython()),
        "novel": str(rec["teos"].toPython()),
        "lang": str(rec["lang"].toPython()),
    }
    for rec in (row.asdict() for row in qres)
]


In [45]:
# Author records for foreign works as a frame.
# The rename has no effect on the records built above (they already use the
# key "novel"); it is kept so the cell behaves the same for any input.
audf_fo = pd.DataFrame(au_fo).rename(columns={"teos": "novel"})
# Distinct novels and rows in the publisher frame, then distinct authors,
# rows, distinct novels and distinct (novel, author, year) in the author frame.
print(
    df_fo["novel"].nunique(),
    len(df_fo),
    audf_fo["author"].nunique(),
    len(audf_fo),
    len(audf_fo.drop_duplicates(subset=["novel"])),
    len(audf_fo.drop_duplicates(subset=["novel", "author", "year"])),
)
audf_fo.drop_duplicates().tail(10)

17189 18100 6025 18820 17317 18156


Unnamed: 0,year,author,nat,gender,novel,lang
18810,2017,http://www.btj.fi/actor_Strandberg%2C+Mats,http://www.yso.fi/onto/koko/p16897,http://www.yso.fi/onto/kaunokki#male,http://data.kirjasampo.fi/abstractWork_4437454,http://lexvo.org/id/iso639-3/swe
18811,2009,http://www.btj.fi/actor_Verhulst%2C+Dimitri,http://www.yso.fi/onto/koko/p7333,http://www.yso.fi/onto/kaunokki#male,http://www.btj.fi/at_1399348,http://lexvo.org/id/iso639-3/nld
18812,1994,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_5064,http://lexvo.org/id/iso639-3/eng
18813,1978,http://www.yso.fi/onto/kaunokki#person_1231760...,http://seco.tkk.fi/saha3/ube6cbdbc-1b7d-4e42-9...,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_7658,http://lexvo.org/id/iso639-3/spa
18814,1981,http://seco.tkk.fi/saha3/ud929b2e8-1ab3-4f9f-9...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/ufc9799ee-59d2-437b-8...,http://lexvo.org/id/iso639-3/eng
18815,2011,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.btj.fi/at_1591998,http://lexvo.org/id/iso639-3/swe
18816,1996,http://www.yso.fi/onto/kaunokki#character_1231...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_5288,http://lexvo.org/id/iso639-3/eng
18817,1996,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,unk,http://www.yso.fi/onto/kaunokki#ateos_5288,http://lexvo.org/id/iso639-3/eng
18818,1978,http://seco.tkk.fi/saha3/u0548430a-0211-458d-8...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u66fb0c24-e465-4722-b...,http://lexvo.org/id/iso639-3/eng
18819,2006,http://seco.tkk.fi/saha3/udb595e9a-651d-4e99-b...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u011edfe4-1abc-4397-8...,http://lexvo.org/id/iso639-3/eng


In [38]:
# Left join: keep every author row and attach publisher columns where the
# (novel, author, year) key has a match in df_fo.
df_fo2 = pd.merge(
    audf_fo,
    df_fo,
    how="left",
    on=["novel", "author", "year"],
    suffixes=("_au", ""),
)

In [47]:
# Author rows that found no publisher match in the left join.
df_fo2.loc[df_fo2["publisher"].isna()]

Unnamed: 0,year,author,nat,gender,novel,lang,publisher,pubName
27,1982,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_6298,http://lexvo.org/id/iso639-3/eng,,
101,1997,http://www.yso.fi/onto/kaunokki#person_1232726...,http://seco.tkk.fi/saha3/u5aa143b0-c88f-47a0-8...,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_40961,http://lexvo.org/id/iso639-3/eng,,
546,1988,http://www.yso.fi/onto/kaunokki#person_1232726...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_6765,http://lexvo.org/id/iso639-3/eng,,
604,1973,http://www.yso.fi/onto/kaunokki#character_1231...,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_3666,http://lexvo.org/id/iso639-3/eng,,
677,1977,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_12744,http://lexvo.org/id/iso639-3/eng,,
...,...,...,...,...,...,...,...,...
18676,2001,http://www.yso.fi/onto/kaunokki#person_1231759...,nonat,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_3255,http://lexvo.org/id/iso639-3/eng,,
18740,2001,http://www.yso.fi/onto/kaunokki#person_1231760...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_11079,http://lexvo.org/id/iso639-3/eng,,
18756,1984,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p8327,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_10368,http://lexvo.org/id/iso639-3/rus,,
18862,1994,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p53709,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_2047,http://lexvo.org/id/iso639-3/eng,,


In [48]:
# Sanity check: number of distinct (novel, year) pairs in the publisher
# frame, the merged frame and the author frame.
print(*(
    len(frame.drop_duplicates(["novel", "year"]))
    for frame in (df_fo, df_fo2, audf_fo)
))

# df_fo2 has all!

17407 17539 17539


In [49]:
## Entertainment ("viihde"): select the broader genres rakkaus (romance),
## jannitys (suspense) and huumori (humour) for translated works.

# Fetch all works of each category, with original language and author. The
# template uses the literal text CAT as placeholder for the category name.
# NOTE: this rebinds Q once more.

Q= """

    PREFIX rel: <http://ldf.fi/relse/> 
    PREFIX dct: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT DISTINCT ?broader ?teos ?year ?lang  ?author WHERE 
    { ?genre rel:broaderCategory <CAT> .  
        ?teos rel:hasGenre ?genre . 
        BIND (<CAT> as ?broader ) 
        ?teos rel:pubYear ?year .
        ?teos rel:hasAuthor ?author.
       #?author foaf:gender ?gender .
       # OPTIONAL {?author rel:authorNationality ?nat1 . }
        
        #BIND(COALESCE(?gender1, "unk") AS ?gender)
        ?teos rel:langOrig ?lang .
        
        }
    GROUP BY ?teos ?lang ?year ?broader  ?author

    """

# One list of row records per category, in the order queried.
viihde_genres = []
for cat in ("rakkaus", "jannitys", "huumori"):
    print(cat)
    q = Q.replace("CAT", cat)
    qres = g_fo2.query(q)
    # NOTE: `genre` is rebound here to a list of records and no longer refers
    # to the DataFrame read from genre_df.csv earlier in the notebook.
    genre = [
        {
            "genre": str(rec["broader"].toPython()),
            "year": str(rec["year"].toPython()),
            "novel": str(rec["teos"].toPython()),
            "lang": str(rec["lang"].toPython()),
            "author": str(rec["author"].toPython()),
        }
        for rec in (row.asdict() for row in qres)
    ]
    viihde_genres.append(genre)

rakkaus
jannitys
huumori


In [50]:
# Now add genre: stack the per-category record lists into one frame with a
# single concat (growing a frame inside a loop re-copies all earlier rows on
# every pass). The original row indices are kept, as before.
df_fo_genre = pd.concat([pd.DataFrame(rows) for rows in viihde_genres])

# Total rows vs. distinct novels.
print(len(df_fo_genre))
print(df_fo_genre.novel.nunique())

# Years arrive as strings; make them numeric before deriving the decade.
df_fo_genre["year"] = pd.to_numeric(df_fo_genre["year"])
# Reuse the notebook's add_decade helper (same formula as was inlined here)
# and sort into a new frame instead of sorting in place.
df_fo_genre = add_decade(df_fo_genre).sort_values(by="year")
df_fo_genre.head()

9379
8502


Unnamed: 0,genre,year,novel,lang,author,decade
2300,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_7501,http://lexvo.org/id/iso639-3/fra,http://www.yso.fi/onto/kaunokki#person_1231760...,1970
1237,rakkaus,1971,http://www.yso.fi/onto/kaunokki#ateos_3931,http://lexvo.org/id/iso639-3/fra,http://www.yso.fi/onto/kaunokki#person_1231760...,1970
2418,jannitys,1971,http://www.yso.fi/onto/kaunokki#ateos_4288,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#person_1231759...,1970
2424,jannitys,1971,http://seco.tkk.fi/saha3/u6774e1fb-9c17-40b8-b...,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#person_1231759...,1970
4723,jannitys,1971,http://www.btj.fi/at_1479755,http://lexvo.org/id/iso639-3/swe,http://seco.tkk.fi/saha3/u0e364bf4-73f6-446a-9...,1970


In [51]:
# Left-join the genre rows onto df_fo2 by novel URI. Columns present in both
# frames keep their name for the left side and get a "_genre" suffix for the
# right side; novels without a genre row get NaN in the right-hand columns.
# NOTE(review): "novel" is not unique in df_fo_genre (more rows than distinct
# novels), so a matched novel can fan out into several rows, including
# author / author_genre pairs that do not agree. Confirm this is intended.
df_fo_final = pd.merge(
    df_fo2,
    df_fo_genre,
    how="left",
    on="novel",
    suffixes=("", "_genre"),
)

In [52]:
# Sanity check: number of distinct (novel, year) pairs in each intermediate
# frame, printed on one line in the order df_fo, df_fo2, audf_fo, df_fo_final.
print(*(
    len(frame.drop_duplicates(["novel", "year"]))
    for frame in (df_fo, df_fo2, audf_fo, df_fo_final)
))


17407 17539 17539 17539


In [53]:
# Display the merged frame; rows without a genre match show NaN in the
# genre, *_genre and decade columns.
df_fo_final

Unnamed: 0,year,author,nat,gender,novel,lang,publisher,pubName,genre,year_genre,lang_genre,author_genre,decade
0,1984,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p35259,http://www.yso.fi/onto/kaunokki#male,http://www.yso.fi/onto/kaunokki#ateos_8105,http://lexvo.org/id/iso639-3/swe,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,WSOY,,,,,
1,2005,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_1981,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,,,,
2,2005,http://www.yso.fi/onto/kaunokki#person_1231759...,http://seco.tkk.fi/saha3/u4efb5155-076b-4800-9...,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_1981,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Kustannusosakeyhtiö Otava,,,,,
3,2015,http://www.yso.fi/onto/kaunokki#person_1231758...,http://www.yso.fi/onto/koko/p6292,http://www.yso.fi/onto/kaunokki#male,http://www.btj.fi/at_1674759,http://lexvo.org/id/iso639-3/dan,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,jannitys,2015.0,http://lexvo.org/id/iso639-3/dan,http://www.yso.fi/onto/kaunokki#person_1231758...,2010.0
4,2011,http://www.btj.fi/actor_Hogan%2C+Chuck,http://www.yso.fi/onto/koko/p73101,http://www.yso.fi/onto/kaunokki#male,http://www.btj.fi/at_1553291,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Tammi,jannitys,2011.0,http://lexvo.org/id/iso639-3/eng,http://www.btj.fi/actor_Hogan%2C+Chuck,2010.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20242,1996,http://www.yso.fi/onto/kaunokki#character_1231...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#female,http://www.yso.fi/onto/kaunokki#ateos_5288,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Karisto Oy,rakkaus,1996.0,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#character_1231...,1990.0
20243,1996,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,unk,http://www.yso.fi/onto/kaunokki#ateos_5288,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Karisto Oy,rakkaus,1996.0,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#person_1231759...,1990.0
20244,1996,http://www.yso.fi/onto/kaunokki#person_1231759...,http://www.yso.fi/onto/koko/p2224,unk,http://www.yso.fi/onto/kaunokki#ateos_5288,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Karisto Oy,rakkaus,1996.0,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#character_1231...,1990.0
20245,1978,http://seco.tkk.fi/saha3/u0548430a-0211-458d-8...,http://www.yso.fi/onto/koko/p2224,http://www.yso.fi/onto/kaunokki#male,http://seco.tkk.fi/saha3/u66fb0c24-e465-4722-b...,http://lexvo.org/id/iso639-3/eng,http://www.yso.fi/onto/kaunokki#Kustantaja_123...,Gummerus Kustannus Oy,jannitys,1978.0,http://lexvo.org/id/iso639-3/eng,http://seco.tkk.fi/saha3/u0548430a-0211-458d-8...,1970.0


In [54]:
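# Export is disabled so that re-running the notebook does not overwrite the file.
# Uncomment to write the merged table (tab-separated, despite the .csv extension).
#df_fo_final.to_csv("../../data/df_fo_all.csv", sep="\t")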