In [1]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '/Users/yameitu/Documents/Yamei/Research/2024PVIS/python_code/')
import utils
from rdflib import URIRef
from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.namespace import FOAF, RDF
from rdflib.namespace import XSD
import pandas as pd
import ast
import networkx as nx 
# !pip install pydotplus
# !pip install graphviz
import io
from IPython.display import display, Image
from rdflib.tools.rdf2dot import rdf2dot
import math 
import json
import pydotplus
import sys

In [2]:
# Prefix that processOnePaper prepends to an article's DOI to build the paper node IRI.
DOI_prefix = 'https://doi.org/'
# Affiliation lookup: 'raw_affiliation' column -> 'wikidata' column of the CSV.
# NOTE(review): blank 'wikidata' cells presumably load as NaN (a float);
# addAuthorEntity only treats non-empty *string* values as usable URLs -- confirm.
temp = pd.read_csv('../kg_data/0_affiliation2name_v3.csv')
AFF_Mapping = dict(zip(temp['raw_affiliation'].tolist(), temp['wikidata'].tolist()))
# Phrase -> DBpedia lookup table, read by mapPhrase2DB.
mapping = utils.loadjson('../kg_data/1_phrase2dbpedia_cleaned.json')
# Keyword -> DBpedia lookup table, read by mapKeyword2DB.
mapping_keyword  = utils.loadjson('../kg_data/1_keyword2dbpedia_cleaned.json')


In [3]:
# RDF namespaces used while building the knowledge graph.
# Attribute access on a Namespace (e.g. TVCG_P.seeAlso) appends the term to the
# base IRI verbatim, so every base must end with '/' or '#'.

# Project vocabulary: properties and classes.
TVCG_P = Namespace('http://tvcg.org/property/')
TVCG_C = Namespace('http://tvcg.org/class/')
PRISM = Namespace("http://prismstandard.org/namespaces/basic/2.0/")
DCTERMS = Namespace("http://purl.org/dc/terms/")
# MAKG vocabulary: MAGC supplies the Paper / Author / Affiliation classes.
MAG = Namespace("https://makg.org/property/")
MAGC = Namespace("https://makg.org/class/")
# Trailing slash added: without it DATACITE.<term> would expand to
# 'http://purl.org/spar/datacite<term>' instead of '.../datacite/<term>'.
DATACITE = Namespace("http://purl.org/spar/datacite/")
FABIO = Namespace("http://purl.org/spar/fabio/")
VISSURVEY = Namespace("http://vissurvey.org/property/")
ORG = Namespace('http://www.w3.org/ns/org#')
CITO = Namespace("http://purl.org/spar/cito/")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
DBO = Namespace("http://dbpedia.org/ontology/")

In [4]:
# Entity-to-entity relations: (source concept key, target concept key) -> relation name.
# Only referenced from the commented-out section of addSemantics, where each
# entry would be handed to addIntraEnt (which appends the relation name to
# 'http://tvcg.org/property/').
intra_rel = {
    ('task', 'application'): "applied_in",
    ('name', 'technique'): "uses",
    ('task', 'name'): 'solved_by',
    ('name', 'input_data'): 'feeds_in',
    ('name', 'output_data'): 'generates',
    ('name', 'evaluation_data'):'evaluated_on',
    ('name', 'evaluation_technique'):'evaluated_by',
    ('name', 'evaluation_baseline'):'evaluated_with'
}

In [5]:
# Paper-to-entity relations: concept key -> property name.
# addSemantics looks up each key of one_paper['llm']['scientific_concepts']
# here (keys that are absent are skipped); addInterEnt appends the property
# name to 'http://tvcg.org/property/' and the key to 'http://tvcg.org/class/'.
inter_rel = {
    'task': 'has_task',
    'application': 'has_background',
    'name': 'has_method',
    'technique': 'has_technique',
    'evaluation_data': 'has_evaluation_data',
    'evaluation_technique': 'has_evaluation_technique',
    'evaluation_baseline': 'has_evaluation_baseline',
    'input_data': 'has_input_data',
    'output_data': 'has_output_data',
    'source_data': 'has_code'
}

In [7]:
def visualize(g):
    """Render an RDF graph inline as a PNG image.

    The graph is written as DOT text by ``rdf2dot``, parsed with
    ``pydotplus`` and the resulting PNG bytes are shown through IPython's
    ``display``.

    Args:
        g: the graph to draw (passed straight to ``rdf2dot``).

    Returns:
        None. The image is displayed as a side effect.
    """
    stream = io.StringIO()
    # No options are passed: a set containing the ``display`` function is not a
    # meaningful ``opts`` value, so rdf2dot's own default is used instead.
    rdf2dot(g, stream)
    dg = pydotplus.graph_from_dot_data(stream.getvalue())
    png = dg.create_png()
    display(Image(png))
def mapKeyword2DB(keyword):
    """Return the DBpedia entries mapped to a keyword, if any.

    Reads the module-level ``mapping_keyword`` table.

    Args:
        keyword: lookup key.

    Returns:
        The mapped value when it is non-empty; ``None`` when the keyword is
        missing from the table or maps to an empty (or ``None``) value.
    """
    # .get(): a keyword absent from the table means "no mapping", not KeyError.
    matches = mapping_keyword.get(keyword)
    return matches if matches else None
def mapPhrase2DB(phrase):
    """Return the DBpedia entries mapped to a phrase, if any.

    Reads the module-level ``mapping`` table.

    Args:
        phrase: lookup key.

    Returns:
        The mapped value when it is non-empty; ``None`` when the phrase is
        missing from the table or maps to an empty (or ``None``) value.
    """
    # .get(): an unmapped phrase must not raise KeyError, because addInterEnt
    # calls this after it has already added the entity triples to the graph.
    matches = mapping.get(phrase)
    return matches if matches else None
def findPairs(ele, total, prefix=None):
    """Collect leaf values of a nested structure under '<parent>_<key>' names.

    A leaf is a string, or a non-empty list whose first item is a string.
    Leaves are stored in ``total`` as ``prefix + '_' + key`` where ``prefix``
    is the key of the enclosing dict; leaves met while ``prefix`` is empty or
    ``None`` are not stored. Non-empty lists whose first item is a dict are
    walked item by item with the current prefix.

    Args:
        ele: dict / list to walk (anything else is ignored).
        total: dict that receives the collected pairs (mutated in place).
        prefix: key of the parent dict, or None at the top level.

    Returns:
        ``total``.
    """
    def _is_leaf(value):
        if isinstance(value, str):
            return True
        return isinstance(value, list) and len(value) > 0 and isinstance(value[0], str)

    if isinstance(ele, dict):
        for name, value in ele.items():
            if not _is_leaf(value):
                # Descend; the current key becomes the prefix of nested leaves.
                findPairs(value, total, name)
            elif prefix:
                total[prefix + '_' + name] = value
        return total

    if isinstance(ele, list) and len(ele) > 0 and isinstance(ele[0], dict):
        for child in ele:
            findPairs(child, total, prefix)
    return total
def addSemantics(g, one_paper, paper_node):
    """Link a paper node to the concept entities extracted for it.

    Reads ``one_paper['llm']['scientific_concepts']``. For every key that is
    also a key of ``inter_rel``, ``addInterEnt`` is called once per value:
    once for a string value, once per item for a list value. Keys missing
    from ``inter_rel`` and values of any other type are skipped.

    Args:
        g: graph being built.
        one_paper: record holding the ``llm`` section.
        paper_node: node of the paper the entities are attached to.

    Returns:
        The graph, as handed back by ``addInterEnt``.
    """
    # Earlier variant, kept for reference:
#     total = findPairs(one_paper['llm'], {})
    concepts = one_paper['llm']['scientific_concepts']

    ## add inter rel between ent and paper
    for concept_type in concepts:
        if concept_type not in inter_rel:
            continue
        relation = inter_rel[concept_type]
        value = concepts[concept_type]
        if isinstance(value, str):
            g = addInterEnt(g, value, paper_node, relation, concept_type)
        elif isinstance(value, list):
            for item in value:
                g = addInterEnt(g, item, paper_node, relation, concept_type)

    ## add intra rel between ent (currently disabled)
#     for key, val in intra_rel.items():
#         if key[0] in total and key[1] in total: ## exists both entities 
#             s = total[key[0]]
#             t = total[key[1]]
#             if isinstance(s, str) and isinstance(t, str):
#                 addIntraEnt(g,s, t, key[0], key[1], val)
#             elif isinstance(s, list) and isinstance(t, list):
#                 for s_ in s:
#                     for t_ in t:
#                         addIntraEnt(g,s_, t_, key[0], key[1], val)
#             elif isinstance(s, str) and isinstance(t, list):
#                 for t_ in t:
#                     addIntraEnt(g,s, t_, key[0], key[1], val)
#             elif isinstance(s, list) and isinstance(t, str):
#                 for s_ in s:
#                     addIntraEnt(g,s_, t, key[0], key[1], val)
    return g
def searchKG(g, keyword, RDF_type_string):
    """Find nodes of a given RDF type whose foaf:name equals ``keyword``.

    Args:
        g: graph object exposing ``query(sparql_text)``.
        keyword: name to match; compared as an ``xsd:string`` literal.
        RDF_type_string: the class term exactly as it should appear in the
            query, e.g. ``"<http://tvcg.org/class/task>"``.

    Returns:
        A list of the query's result rows (callers take ``rows[0][0]`` as the
        matching node), or ``[]`` when nothing matches or the query fails.
    """
    # Escape what may not appear raw inside a double-quoted SPARQL string.
    # Without this, a name containing a quote or backslash makes the query
    # unparsable, the failure is reported as "not found", and callers then
    # create a duplicate node for an entity that already exists.
    escaped = (
        str(keyword)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    sparql_ = f"""
        SELECT *
        where {{
            ?target rdf:type  {RDF_type_string} .
            ?target foaf:name "{escaped}"^^xsd:string .
        }}
    """
    try:
        return list(g.query(sparql_))
    except Exception as exc:
        # Still best-effort (report and treat as "not found"), but no longer
        # swallows KeyboardInterrupt/SystemExit and shows what went wrong.
        print('ERROR searchKG:', keyword, '-', repr(exc))
        return []

In [8]:
def addInterEnt(g, ele, paper_node, rel_name, entType):
    """Attach one concept entity to a paper, creating the entity when needed.

    An existing node of class ``http://tvcg.org/class/<entType>`` named
    ``ele`` is reused when ``searchKG`` finds one; otherwise a blank node with
    that type and ``foaf:name`` is created. The paper is then linked to the
    node through ``http://tvcg.org/property/<rel_name>``. Every DBpedia entry
    returned by ``mapPhrase2DB(ele)`` is added as a ``seeAlso`` target with
    its own name and type triples.

    Args:
        g: graph being built (mutated).
        ele: entity name.
        paper_node: node of the paper.
        rel_name: property name appended to the property namespace.
        entType: class name appended to the class namespace.

    Returns:
        ``g``.
    """
    class_iri = "http://tvcg.org/class/"+entType
    matches = searchKG(g, ele, "<"+class_iri+">")
    if len(matches) != 0:
        # First column of the first result row is the matching node.
        ent_node = matches[0][0]
    else:
        ent_node = BNode()
        g.add((ent_node, RDF.type, URIRef(class_iri)))
        g.add((ent_node, FOAF.name, Literal(ele, datatype=XSD.string)))
    g.add((paper_node, URIRef("http://tvcg.org/property/"+rel_name), ent_node))

    dbpedia_names = mapPhrase2DB(ele)
    if dbpedia_names != None:
        for page in dbpedia_names:
            page_node = URIRef("https://dbpedia.org/page/"+page)
            g.add((ent_node, TVCG_P.seeAlso, page_node))
            g.add((page_node, FOAF.name, Literal(page, datatype=XSD.string)))
            # NOTE(review): this class IRI uses https:// while the entity
            # classes above use http://tvcg.org/class/ -- confirm intended.
            g.add((page_node, RDF.type, URIRef("https://tvcg.org/class/DBPedia")))
    return g

def addIntraEnt(g, s, t, s_type, t_type, rel):
    """Add a relation between two entities that already exist in the graph.

    Both entities are looked up by name and class with ``searchKG``. When both
    are found, the triple ``source --http://tvcg.org/property/<rel>--> target``
    is added.

    Args:
        g: graph being built (mutated).
        s: name of the source entity.
        t: name of the target entity.
        s_type: class name of the source (appended to http://tvcg.org/class/).
        t_type: class name of the target (appended to http://tvcg.org/class/).
        rel: relation name appended to http://tvcg.org/property/.

    Returns:
        ``g``. It is unchanged when either entity cannot be found.
    """
    s_matches = searchKG(g, s, "<http://tvcg.org/class/"+s_type+">")
    t_matches = searchKG(g, t, "<http://tvcg.org/class/"+t_type+">")
    if not s_matches or not t_matches:
        # An empty result used to surface as an opaque IndexError on [0][0];
        # report the missing endpoint and skip the relation instead.
        print('WARNING addIntraEnt: entity not found, relation skipped:', s, rel, t)
        return g
    g.add((s_matches[0][0], URIRef("http://tvcg.org/property/"+rel), t_matches[0][0]))
    return g
def flatDataDict(ele, output):
    """Flatten nested dicts into ``output``, keeping only text leaves.

    A value is stored under its own key when it is a string, or a non-empty
    list whose first item is a string. Any other value is passed back into
    this function, which only descends further when that value is a dict.

    Args:
        ele: structure to flatten (non-dict input is ignored).
        output: dict receiving the leaves (mutated; later keys overwrite
            earlier ones with the same name).

    Returns:
        ``output``.
    """
    if not isinstance(ele, dict):
        return output
    for name, value in ele.items():
        is_text = isinstance(value, str)
        is_text_list = isinstance(value, list) and len(value) > 0 and isinstance(value[0], str)
        if is_text or is_text_list:
            output[name] = value
        else:
            flatDataDict(value, output)
    return output
def addAuthorEntity(g, paper_node, row):
    """Add the authors of one paper (and their affiliations) to the graph.

    For every entry of ``row['data']['article']['authors']`` whose
    ``fullName`` is not None:

    * an existing ``MAGC.Author`` node with that name is reused when
      ``searchKG`` finds one;
    * otherwise a blank Author node is created and, when the author has an
      affiliation, linked through ``ORG.memberOf`` to an Affiliation node.
      That node is ``URIRef(url)`` when ``AFF_Mapping`` holds a non-empty
      string for the affiliation, and a blank node otherwise.

    The paper is linked to each author with ``DCTERMS.created``. Affiliations
    are only attached when the author node is newly created.

    Args:
        g: graph being built (mutated).
        paper_node: node of the paper.
        row: one merged paper record.

    Returns:
        ``g``.
    """
    authors = row['data']['article']['authors']

    # `or []`: tolerate a record whose author list is None.
    for author in authors or []:
        name = author['fullName']
        if name is None:
            continue

        search_author = searchKG(g, name, '<https://makg.org/class/Author>')
        if search_author:
            c = search_author[0][0]
        else:
            c = BNode()
            g.add((c, RDF.type, MAGC.Author))
            g.add((c, FOAF.name, Literal(name, datatype=XSD.string)))
            aff = author['affiliation']
            if aff is not None:
                # .get(): an affiliation absent from the lookup table is
                # handled like an unmapped one instead of raising KeyError
                # (which nothing upstream catches).
                url = AFF_Mapping.get(aff)
                # Anything that is not a non-empty string (None, NaN, '')
                # means "no known IRI": fall back to a blank node.
                a = URIRef(url) if isinstance(url, str) and url != "" else BNode()
                g.add((a, RDF.type, MAGC.Affiliation))
                g.add((a, FOAF.name, Literal(aff, datatype=XSD.string)))
                g.add((c, ORG.memberOf, a)) ##  Author --> Affiliation
        g.add((paper_node, DCTERMS.created, c))
    return g

In [9]:
def processOnePaper(g, row):
    """Add one paper, its metadata, authors and extracted concepts to the graph.

    The paper node is ``URIRef(DOI_prefix + doi)`` when the article has a
    DOI and a blank node otherwise. Each non-empty metadata field becomes a
    typed literal on that node. Authors are added with ``addAuthorEntity``
    and concepts with ``addSemantics``; a failure while adding concepts is
    reported and does not abort the paper.

    Args:
        g: graph being built (mutated).
        row: one merged paper record (``row['data']['article']`` plus the
            ``llm`` section read by ``addSemantics``).

    Returns:
        The graph.
    """
    articleInfo = row['data']['article']
    # Truthiness rather than `!= None`: an empty DOI string would otherwise
    # map every such paper onto the single node <https://doi.org/>.
    if articleInfo['doi']:
        paper_node = URIRef(DOI_prefix+articleInfo['doi'])
    else:
        paper_node = BNode()
    g.add((paper_node, RDF.type, MAGC.Paper))

    ## add meta info: (record key, property, literal datatype)
    literal_fields = (
        ('id', DCTERMS.identifier, XSD.string),
        ('normalizedAbstract', DCTERMS.abstract, XSD.string),
        ('normalizedTitle', DCTERMS.title, XSD.string),
        ('pubDate', PRISM.publicationDate, XSD.date),
        ('idPrefix', PRISM.issueIdentifier, XSD.string),
    )
    for key, prop, dtype in literal_fields:
        value = articleInfo[key]
        if value:
            g.add((paper_node, prop, Literal(value, datatype=dtype)))

    ## add author:
    g = addAuthorEntity(g, paper_node, row)

    ## add extracted concepts (best effort)
    try:
        g = addSemantics(g, row, paper_node)
    except Exception as exc:
        # Records without usable LLM output are expected; report which one and
        # why. row.get() so the handler cannot itself fail on a missing 'llm'.
        print('ERROR: add Semantics', row.get('llm'), '-', repr(exc))
#         sys.exit("Error message")

#     g = addKeywords(g, row, paper_node)

    ## and journal (currently disabled)

#     if 'issue' in row['data']:
#         g = addIssueEntity(g, paper_node, row)
#     else:
#         g = addProceedEntity(g, paper_node, row)

    return g

def addKeywords(g, row, paper_node):
    """Attach the keywords of one paper to its node.

    For each entry of ``row['data']['article']['keywords']`` an existing
    ``http://tvcg.org/class/keyword`` node with that name is reused when
    ``searchKG`` finds one. Otherwise a blank keyword node is created and
    linked via ``seeAlso`` to every DBpedia entry returned by
    ``mapKeyword2DB``. The paper is linked to the keyword node with
    ``PRISM.keyword``.

    Args:
        g: graph being built (mutated).
        row: one merged paper record.
        paper_node: node of the paper.

    Returns:
        ``g``.
    """
    keywords =  row['data']['article']['keywords']
    # `or []`: a record without keywords (None) contributes nothing instead of
    # raising TypeError on iteration.
    for k in keywords or []:
        mapped = mapKeyword2DB(k)
        search_concept = searchKG(g, k, "<http://tvcg.org/class/keyword>")
        if search_concept:
            c = search_concept[0][0]
        else:
            c = BNode()
            g.add((c, FOAF.name, Literal(k, datatype=XSD.string)))
            g.add((c, RDF.type, TVCG_C.keyword))
            if mapped is not None:
                for d in mapped:
                    page_node = URIRef("https://dbpedia.org/page/"+d)
                    g.add((c, TVCG_P.seeAlso, page_node))
                    g.add((page_node, FOAF.name, Literal(d, datatype=XSD.string)))
                    g.add((page_node, RDF.type, URIRef("https://tvcg.org/class/DBPedia")))

        g.add((paper_node, PRISM.keyword, c))

    return g
def searchKG(g, keyword, RDF_type_string):
    """Find nodes of a given RDF type whose foaf:name equals ``keyword``.

    NOTE: this notebook defines ``searchKG`` in two cells; when the cells are
    run in order, this later definition is the one in effect.

    Args:
        g: graph object exposing ``query(sparql_text)``.
        keyword: name to match; compared as an ``xsd:string`` literal.
        RDF_type_string: the class term exactly as it should appear in the
            query, e.g. ``"<http://tvcg.org/class/keyword>"``.

    Returns:
        A list of the query's result rows (callers take ``rows[0][0]`` as the
        matching node), or ``[]`` when nothing matches or the query fails.
    """
    # Escape what may not appear raw inside a double-quoted SPARQL string.
    # Without this, a name containing a quote or backslash makes the query
    # unparsable, the failure is reported as "not found", and callers then
    # create a duplicate node for an entity that already exists.
    escaped = (
        str(keyword)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    sparql_ = f"""
        SELECT *
        where {{
            ?target rdf:type  {RDF_type_string} .
            ?target foaf:name "{escaped}"^^xsd:string .
        }}
    """
    try:
        return list(g.query(sparql_))
    except Exception as exc:
        # Still best-effort (report and treat as "not found"), but no longer
        # swallows KeyboardInterrupt/SystemExit and shows what went wrong.
        print('ERROR: search Keyword', keyword, '-', repr(exc))
        return []
    

In [10]:
# Build the knowledge graph: one processOnePaper call per merged paper record.
# (Graph is already imported in the notebook's first cell.)
data = utils.loadjson('../kg_data/2_merged_data_llm.json')
g = Graph()

for i, ele in enumerate(data):
    # Progress marker every 100 papers.
    if i % 100 == 0:
        print(i)
    g = processOnePaper(g, ele)

# g.serialize(destination = "../kg_data/3_kg_v3.ttl")

0
100
200
300
ERROR: add Semantics {}
400
500
600
700
800
ERROR: add Semantics {}
900
1000
ERROR: add Semantics {}
ERROR: add Semantics {}
ERROR: add Semantics {}
1100
1200
ERROR: add Semantics {}
ERROR: add Semantics {}
1300
ERROR: add Semantics {}
1400
1500
1600
1700
1800
ERROR: add Semantics {}
1900
2000
ERROR: add Semantics {}
ERROR: add Semantics {}
2100
2200
2300
ERROR: add Semantics {}
2400
2500
2600
ERROR: add Semantics {}
2700
2800
ERROR: add Semantics {}
2900
3000
3100
ERROR: add Semantics {}
ERROR: add Semantics {}
3200
3300
3400
3500
ERROR: add Semantics {}
3600
ERROR: add Semantics {}
ERROR: add Semantics {}
3700
ERROR: add Semantics {}
3800
3900
4000
4100
4200
ERROR: add Semantics {}
4300
4400
4500
4600
4700
ERROR: add Semantics {}
4800
ERROR: add Semantics {}
ERROR: add Semantics {}
ERROR: add Semantics {}
4900


In [11]:
# Write the graph built above to disk. The format is stated explicitly so the
# .ttl file is Turtle regardless of the installed rdflib's default.
# NOTE(review): this cell sits before the cell that adds the recommendedTo
# edges -- confirm which edges kg_v6 is meant to contain.
g.serialize(destination = "../kg_data/kg_v6.ttl", format="turtle")

<Graph identifier=Nd688d79bf4bb4ad28621010878b75918 (<class 'rdflib.graph.Graph'>)>

In [139]:
# Article ids of every loaded paper, in the same order as `data`.
all_paper_idx = [paper['data']['article']['id'] for paper in data]

In [140]:
# Add paper -> paper `recommendedTo` edges for recommendations that point at a
# paper contained in this data set.
# NOTE: searchPaper is defined in a later cell of this notebook; that cell has
# to be run before this one.
known_paper_ids = set(all_paper_idx)  # set: O(1) membership test inside the nested loop

for i, paper in enumerate(data):
    # Progress marker every 100 papers.
    if i%100==0:
        print(i)
    recommendations = paper['data']['recommendedArticles']

    current_node = searchPaper(g, paper['data']['article']['id'])
    for r in recommendations:
        # The recommended article's id is the last path segment of its URL.
        idx = r['abstractUrl'].split('/')[-1]
        if idx in known_paper_ids:
            target_node = searchPaper(g, idx)

            g.add((current_node, TVCG_P.recommendedTo, target_node))
    

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900


In [141]:
# Write the graph (including any recommendedTo edges added so far) to disk.
# The format is stated explicitly so the .ttl file is Turtle regardless of the
# installed rdflib's default.
g.serialize(destination = "../kg_data/kg_v3.ttl", format="turtle")

<Graph identifier=N1ca7c07abd4d4fdd9d2edca3ef244992 (<class 'rdflib.graph.Graph'>)>

In [78]:
def searchPaper(g, idx):
    """Return the Paper node whose dcterms:identifier equals ``idx``.

    Args:
        g: graph object exposing ``query(sparql_text)``.
        idx: article id, matched as an ``xsd:string`` literal.

    Returns:
        The node bound to ``?target`` in the first result row.

    Raises:
        IndexError: no Paper node carries that identifier.
    """
    # Escape what may not appear raw inside a single-quoted SPARQL string so an
    # id containing a quote or backslash cannot break the query.
    escaped = (
        str(idx)
        .replace('\\', '\\\\')
        .replace("'", "\\'")
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    sparql_ = f"""
        SELECT *
        where {{
            ?target rdf:type <https://makg.org/class/Paper>;
            <http://purl.org/dc/terms/identifier> '{escaped}'^^xsd:string.
        }}
    """
    rows = list(g.query(sparql_))
    if not rows:
        # Same exception type an empty result produced before ([0] on an empty
        # list), now saying which id was not found.
        raise IndexError(f"searchPaper: no Paper node with identifier {idx!r}")
    return rows[0][0]

In [83]:
# searchPaper(g, '1haUx0fpghW')

In [84]:
# data[100]['data']['recommendedArticles']