# Publication-Graph (Lab1)

In [404]:
#to activate venv: venv\Scripts\activate

## CSV processing

- created csv files with basic commands using the github recommended csv converter: XMLToCSV.py --annotate --neo4j data/dblp.xml data/dblp.dtd data/output.csv

In [405]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import random
import lipsum

### Article Preprocessing

In [406]:
#helper function
STOP_WORDS = set(stopwords.words('english'))
DATABASE_COMMUNITY_KEYWORDS = ["data management","indexing", "data modeling", "big data", "data processing", "data storage","data querying"]

def create_keywords(title):
    title = re.sub(r'[^\w\s]', "", title) #remove punctuation
    word_tokens = word_tokenize(title)
    keywords = set([w.lower() for w in word_tokens if not w.lower() in STOP_WORDS])

    for kw in DATABASE_COMMUNITY_KEYWORDS:
        if kw in title:
            keywords.add(kw)

    #add random database community kw to approx 30% of papers
    if(random.randrange(0,10,1)<= 3):
        keywords.add(random.sample(DATABASE_COMMUNITY_KEYWORDS,1)[0])

    return "|".join(keywords)

def trim_title(title):
    if len(title)>300:
        return title[:300]
    
    return title


def generate_affiliation(row):
    affil = {
        1: "Company",
        2: "University"
    }

    return f"{affil.get(random.randint(1,2))} {random.randint(1,30)}"

def generate_publisher(publisher):
    return f"Publisher {random.randint(1,10)}"

In [407]:
KEEP_HEADERS_ARTICLE = [
    "article:ID",
    "author:string[]",
    "crossref:string",#volume-key
    "editor:string[]",
    "ee:string[]",
    "journal:string",#journal:name
    "key:string", #paper:key
    "number:string",#volume of the year
    "pages:string",
    "title:string", #paper:title
    "volume:string",#consecutive issue number
    "year:int", #volume:year,
    "publisher:string"
]

In [408]:
header = """article:ID;author:string[];author-aux:string;author-orcid:string[];booktitle:string;cdate:date;cdrom:string;cite:string[];cite-label:string[];crossref:string;editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string[];journal:string;key:string;mdate:date;month:string;note:string[];note-label:string;note-type:string[];number:string;pages:string;publisher:string;publnr:string;publtype:string;sub:string[];sup:string[];title:string;title-bibtex:string;tt:string[];url:string[];volume:string;year:int""".split(";")
articles = pd.read_csv("data/output_article.csv", nrows=10000, sep=";", names=header)
articles = articles[KEEP_HEADERS_ARTICLE]

#typing columns
articles["crossref:string"] = articles["crossref:string"].astype("string")
articles["title:string"] = articles["title:string"].astype("string")
articles["author:string[]"] = articles["author:string[]"].astype("string")
pd.to_numeric(articles["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(articles["year:int"], downcast="integer")

#filter down to year > 1985
articles = articles.dropna(subset="year:int")
articles = articles[articles["year:int"] > 1985]
print(len(articles))

9978


In [409]:
def doi_or_na(ee):
    try:
        doi = ee.split("|")[0]
        #returns the text if it is a doi, na otherwise
        if("doi" in ee):
            return ee
        
    except:
        print("no str found")
    
    return pd.NA

In [410]:
articles["doi"] = articles["ee:string[]"].apply(lambda x: doi_or_na(str(x)))

In [411]:
#drop rows with no journal name
articles = articles.dropna(subset=["journal:string"])

#drop articles without author
articles = articles.dropna(subset=["author:string[]"])

#drop with no pages
articles = articles.dropna(subset=["pages:string"])

#ensure title lenth
articles = articles.dropna(subset=["title:string"])
articles["title:string"] = articles["title:string"].map(trim_title)

#drop rows with missing year
articles = articles.dropna(subset=["year:int"])

#assert volume 1 for missing volume number
articles["number:string"] = articles["number:string"].fillna(1) #set volume number to 1 for missing ones

#create volumekey
articles["volume_key"] = articles["journal:string"] + articles["year:int"].astype("string") + articles["number:string"].astype("string")

#fill missing crossref
articles["crossref:string"] = articles["crossref:string"].fillna(articles["volume_key"])

#create corresponding_author
articles["corresponding_author"] = articles["author:string[]"].apply(lambda x: x.split("|")[0])

In [412]:
#generate keywords
articles["keywords"] = articles["title:string"].map(create_keywords)
articles["in_db_community"] = articles["keywords"].apply(lambda x: len(set(DATABASE_COMMUNITY_KEYWORDS).intersection(set(x.split("|"))))>0)

In [413]:
#generate publishers
articles["publisher"] = articles["publisher:string"].apply(generate_publisher)

In [414]:
articles.head()
#print(len(articles))

Unnamed: 0,article:ID,author:string[],crossref:string,editor:string[],ee:string[],journal:string,key:string,number:string,pages:string,title:string,volume:string,year:int,publisher:string,doi,volume_key,corresponding_author,keywords,in_db_community,publisher
558,577,Hugo Hellebrand|Markus Casper|Ralf Merz|Rita Ley,Hydrology and Earth System Sciences20111,,https://doi.org/10.5194/hess-15-2947-2011,Hydrology and Earth System Sciences,persons/LeyCHM11,1,2947-2962,Catchment classification by runoff behaviour w...,15,2011,,https://doi.org/10.5194/hess-15-2947-2011,Hydrology and Earth System Sciences20111,Hugo Hellebrand,selforganizing|maps|behaviour|classification|s...,False,Publisher 3
562,583,Markus Tresch,"ETH Zurich, Department of Computer Science / T...",,https://doi.org/10.3929/ethz-a-006651652,"ETH Zurich, Department of Computer Science / T...",persons/Tresch96,1,1-27,Principles of Distributed Object Database Lang...,248,1996,,https://doi.org/10.3929/ethz-a-006651652,"ETH Zurich, Department of Computer Science / T...",Markus Tresch,languages|object|distributed|data management|d...,True,Publisher 7
564,585,Andreas Rock|Gayane Grigoryan|Günther Heineman...,Hydrology and Earth System Sciences20121,,https://doi.org/10.5194/hess-16-409-2012,Hydrology and Earth System Sciences,persons/CasperGGGHLR12,1,409-421,Analysis of projected hydrological behavior of...,16,2012,,https://doi.org/10.5194/hess-16-409-2012,Hydrology and Earth System Sciences20121,Andreas Rock,indices|data storage|behavior|analysis|hydrolo...,True,Publisher 4
565,28670,Sarah R. Davies,Sci. Eng. Ethics20194,,https://doi.org/10.1007/s11948-018-0064-y|http...,Sci. Eng. Ethics,journals/see/Davies19,4,1235-1253,An Ethics of the System: Talking to Scientists...,25,2019,,https://doi.org/10.1007/s11948-018-0064-y|http...,Sci. Eng. Ethics20194,Sarah R. Davies,integrity|scientists|research|talking|system|e...,False,Publisher 3
566,28671,Colleen Murphy|Paolo Gardoni,Sci. Eng. Ethics20081,,https://doi.org/10.1007/s11948-007-9031-8|http...,Sci. Eng. Ethics,journals/see/MurphyG08,1,77-92,The Acceptability and the Tolerability of Soci...,14,2008,,https://doi.org/10.1007/s11948-007-9031-8|http...,Sci. Eng. Ethics20081,Colleen Murphy,acceptability|tolerability|approach|risks|capa...,False,Publisher 9


In [415]:
print(sum(articles["publisher:string"].isna()))

9282


In [416]:
articles.to_csv("data/articles_preprocessed.csv", index=False)

### Inproceedings preprocessing

In [417]:
KEEP_HEADER_INPROCEEDING = [
    "inproceedings:ID",
    "author:string[]",
    "booktitle:string",#conference/forum title
    "crossref:string[]",#proceeding key
    "editor:string[]",
    "ee:string[]",
    "key:string",#inproceedings key
    "number:string",
    "pages:string",
    "title:string",
    "volume:int",
    "year:int"#year_held
]

In [418]:
header = """inproceedings:ID;author:string[];author-aux:string[];author-orcid:string[];booktitle:string;cdrom:string[];cite:string[];cite-label:string[];crossref:string[];editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string[];key:string;mdate:date;month:string;note:string;note-type:string;number:string;pages:string;publtype:string;sub:string[];sup:string[];title:string;title-bibtex:string;tt:string;url:string;volume:int;year:int""".split(";")
inproceedings = pd.read_csv("data/output_inproceedings.csv", nrows=10000, sep=";", names=header)
inproceedings = inproceedings[KEEP_HEADER_INPROCEEDING]

#typing columns
inproceedings["crossref:string[]"] = inproceedings["crossref:string[]"].astype("string")
inproceedings["title:string"] = inproceedings["title:string"].astype("string")
pd.to_numeric(inproceedings["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(inproceedings["year:int"], downcast="integer")
inproceedings["author:string[]"] = inproceedings["author:string[]"].astype("string")

#only keep > 1985
inproceedings = inproceedings.dropna(subset="year:int")
inproceedings = inproceedings[inproceedings["year:int"] > 1985]
print(len(inproceedings))

9764


In [419]:
inproceedings.head()
#print(inproceedings["booktitle:string"].unique())

Unnamed: 0,inproceedings:ID,author:string[],booktitle:string,crossref:string[],editor:string[],ee:string[],key:string,number:string,pages:string,title:string,volume:int,year:int
0,555,Arnon Rosenthal,SWEE,conf/swee/1998,,http://www.mitre.org/support/swee/rosenthal.html,www/org/mitre/future,,,The Future of Classic Data Administration: Obj...,,1998
2,159865,Sven Lorenz|Toni Bollinger|Udo Pletat,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_72,journals/lncs/BollingerLP91,,402-427,The LILOG Inference Engine.,,1991
3,159866,Geoffrey Simmons|Kai-Uwe Carstensen,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_83,journals/lncs/CarstensenS91,,632-644,Why a Hill Can't be a Valley: Representing Ges...,,1991
4,159867,David W. Flater|Yelena Yesha,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_13,journals/lncs/FlaterY93,,259-276,Towards Flexible Distributed Information Retri...,,1993
5,159868,Claus-Rainer Rollinger|Otthein Herzog,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_46,journals/lncs/RollingerH91,,3-13,Introducing LILOG.,,1991


In [420]:
#crossref is required to get the information from proceedings - thus crossref na are dropped
inproceedings = inproceedings.dropna(subset=["crossref:string[]"])

#drop articles without author
inproceedings = inproceedings.dropna(subset=["author:string[]"])

#create corresponding_author
inproceedings["corresponding_author"] = inproceedings["author:string[]"].apply(lambda x: x.split("|")[0])

#we have enough elements so drop the ones without pages
inproceedings = inproceedings.dropna(subset=["pages:string"])

#create doi
inproceedings["doi"] = inproceedings["ee:string[]"].apply(doi_or_na)

#ensure title lenth
inproceedings = inproceedings.dropna(subset=["title:string"])
inproceedings["title:string"] = inproceedings["title:string"].map(trim_title)

no str found
no str found
no str found


In [421]:
print(len(inproceedings))

9651


In [422]:
#generate keywords
inproceedings["keywords"] = inproceedings["title:string"].map(create_keywords)

In [423]:
conference_crossrefs = inproceedings["crossref:string[]"].unique()
print(len(conference_crossrefs))
print(conference_crossrefs)

823
<StringArray>
[      'journals/lncs/1991-546',       'journals/lncs/1993-759',
       'journals/lncs/1994-910',            'conf/nips/2008coa',
            'conf/aistats/2012',            'conf/aistats/2010',
            'conf/aistats/2011',            'conf/aistats/2007',
               'conf/colt/2012',            'conf/aistats/2009',
 ...
        'journals/corr/BeekL15',  'journals/corr/abs-1203-5423',
     'journals/corr/DuboisMM17',  'journals/corr/abs-1108-4077',
  'journals/corr/abs-1007-4993',       'journals/corr/Danvyd16',
  'journals/corr/abs-1204-5796', 'journals/corr/abs-1805-04255',
 'journals/corr/abs-1709-00049', 'journals/corr/abs-2107-01544']
Length: 823, dtype: string


In [424]:
inproceedings.head()

Unnamed: 0,inproceedings:ID,author:string[],booktitle:string,crossref:string[],editor:string[],ee:string[],key:string,number:string,pages:string,title:string,volume:int,year:int,corresponding_author,doi,keywords
2,159865,Sven Lorenz|Toni Bollinger|Udo Pletat,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_72,journals/lncs/BollingerLP91,,402-427,The LILOG Inference Engine.,,1991,Sven Lorenz,https://doi.org/10.1007/3-540-54594-8_72,data modeling|engine|inference|lilog
3,159866,Geoffrey Simmons|Kai-Uwe Carstensen,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_83,journals/lncs/CarstensenS91,,632-644,Why a Hill Can't be a Valley: Representing Ges...,,1991,Geoffrey Simmons,https://doi.org/10.1007/3-540-54594-8_83,representing|cant|properties|objects|object|va...
4,159867,David W. Flater|Yelena Yesha,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_13,journals/lncs/FlaterY93,,259-276,Towards Flexible Distributed Information Retri...,,1993,David W. Flater,https://doi.org/10.1007/3-540-57507-3_13,information|distributed|retrieval|towards|flex...
5,159868,Claus-Rainer Rollinger|Otthein Herzog,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_46,journals/lncs/RollingerH91,,3-13,Introducing LILOG.,,1991,Claus-Rainer Rollinger,https://doi.org/10.1007/3-540-54594-8_46,lilog|data management|introducing
6,159869,Bharat K. Bhargava|Jagannathan Srinivasan|Pras...,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_5,journals/lncs/BhargavaJSD93,,87-103,Transition From A Relation To Object Model Imp...,,1993,Bharat K. Bhargava,https://doi.org/10.1007/3-540-57507-3_5,model|transition|implementation|object|relation


In [425]:
inproceedings.to_csv("data/inproceedings_preprocessed.csv", index=False)

### preprocess proceedings csv

In [426]:
KEEP_HEADER_PROCEEDING = [
    "proceedings:ID",
    "booktitle:string",
    "editor:string[]",
    "ee:string[]",
    "key:string",
    "number:string",
    "title:string",
    "volume:string",
    "year:int"
]

In [427]:
header = """proceedings:ID;address:string;author:string[];booktitle:string;cite:string[];cite-label:string[];editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string;isbn:string[];isbn-type:string[];journal:string;key:string;mdate:date;note:string[];note-type:string;number:string;pages:string;publisher:string[];publisher-href:string;publtype:string;school:string;series:string[];series-href:string[];sub:string;sup:string[];title:string;url:string[];volume:string;year:int""".split(";")
proceedings = pd.read_csv("data/output_proceedings.csv", nrows=100000, sep=";", names=header)
proceedings = proceedings[KEEP_HEADER_PROCEEDING]

#typing columns
proceedings["title:string"] = proceedings["title:string"].astype("string")
pd.to_numeric(proceedings["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(proceedings["year:int"], downcast="integer")

  proceedings = pd.read_csv("data/output_proceedings.csv", nrows=100000, sep=";", names=header)


0        1999
1        2015
2        2013
3        2014
4        2019
         ... 
53876    2013
53877    2007
53878    1999
53879    2017
53880    2010
Name: year:int, Length: 53881, dtype: int16

In [428]:
proceedings.head()

Unnamed: 0,proceedings:ID,booktitle:string,editor:string[],ee:string[],key:string,number:string,title:string,volume:string,year:int
0,461,MMB (Kurzvorträge),Dieter Baum|Norbert Th. Müller|Richard Rödler,,tr/trier/MI99-17,,"MMB '99, Messung, Modellierung und Bewertung v...",99-16,1999
1,2240,,Amir Hossein Alavi|Amir Hossein Gandomi|Conor ...,https://doi.org/10.1007/978-3-319-20883-1,reference/genetic/2015,,Handbook of Genetic Programming Applications,,2015
2,13516,,Ankur Agarwal|Borko Furht,https://doi.org/10.1007/978-1-4614-8495-0,reference/med/2013,,Handbook of Medical and Healthcare Technologies,,2013
3,103589,Trans. Computational Collective Intelligence,Ngoc Thanh Nguyen,https://doi.org/10.1007/978-3-662-44509-9,journals/tcci/2014-14,,Transactions on Computational Collective Intel...,8615,2014
4,103594,,Marcin Hernes|Ngoc Thanh Nguyen|Ryszard Kowalczyk,https://doi.org/10.1007/978-3-662-58611-2,journals/tcci/2019-32,,Transactions on Computational Collective Intel...,11370,2019


In [429]:
#drop where title not present
proceedings = proceedings.dropna(subset=["title:string"])

In [430]:
#filter proceedings down to relevant subset from selected papers
proceedings["in_selected_ip_subset"] = proceedings["key:string"].apply(lambda x: True if x in conference_crossrefs else pd.NA)
proceedings = proceedings.dropna(subset=["in_selected_ip_subset"])
print(sum(proceedings["in_selected_ip_subset"]))

#create doi
proceedings["doi"] = proceedings["ee:string[]"].apply(doi_or_na)

823


In [431]:
# get edition where possible
def get_edition(volume):
    try:
        return volume.split("-")[1]
    except Exception:
        return None
    
proceedings["edition"] = proceedings["volume:string"].map(get_edition)
proceedings["edition"] = proceedings["edition"].fillna(proceedings["volume:string"])


In [432]:
proceedings.to_csv("data/proceedings_preprocessed.csv", index=False)

### Create citations

In [433]:
#paper keys and years from articles
article_keys = articles[["key:string","year:int"]]
print(article_keys)

ip_keys = inproceedings[["key:string","year:int"]]
print(ip_keys)

## union both
cite_keys = pd.concat([article_keys, ip_keys])
#cite_keys["year:int"].dtype

                       key:string  year:int
558              persons/LeyCHM11      2011
562              persons/Tresch96      1996
564        persons/CasperGGGHLR12      2012
565         journals/see/Davies19      2019
566        journals/see/MurphyG08      2008
...                           ...       ...
9995        journals/cce/VidalM15      2015
9996          journals/cce/LiuK08      2008
9997        journals/cce/ApioBT18      2018
9998  journals/cce/BattistiCMMM20      2020
9999  journals/cce/RebughiniCDM17      2017

[9282 rows x 2 columns]
                        key:string  year:int
2      journals/lncs/BollingerLP91      1991
3      journals/lncs/CarstensenS91      1991
4          journals/lncs/FlaterY93      1993
5       journals/lncs/RollingerH91      1991
6      journals/lncs/BhargavaJSD93      1993
...                            ...       ...
9995       journals/corr/BlancoM15      2015
9996  journals/corr/abs-2109-08303      2021
9997   journals/corr/abs-1108-1865      20

In [434]:
def generate_citations(df, year):
    no_citations = random.randint(5,15)

    df = df[df["year:int"] < year]

    try:
        citation_keys = df["key:string"].sample(no_citations, random_state=42).tolist()
    except ValueError:
        print(f"could not make sample for year {year}")
        citation_keys = []

    return citation_keys    

In [435]:
cite_keys["cites"] = cite_keys["year:int"].apply(lambda year: generate_citations(cite_keys, year))

could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for year 1986
could not make sample for ye

In [436]:
cite_keys.head()

Unnamed: 0,key:string,year:int,cites
558,persons/LeyCHM11,2011,"[journals/entcs/KerjeanKST06, journals/jet/Ehl..."
562,persons/Tresch96,1996,"[journals/oopsm/MalenfantLV91, journals/lncs/K..."
564,persons/CasperGGGHLR12,2012,"[journals/entcs/RebernakMHP06, journals/proced..."
565,journals/see/Davies19,2019,"[journals/entcs/Merro07, journals/jet/Dokumaci..."
566,journals/see/MurphyG08,2008,"[journals/entcs/RaymondRJ08, journals/sigcse/W..."


In [437]:
cite_keys["cites"] = cite_keys["cites"].apply(lambda x: "|".join(x))
cite_keys.head()

Unnamed: 0,key:string,year:int,cites
558,persons/LeyCHM11,2011,journals/entcs/KerjeanKST06|journals/jet/Ehler...
562,persons/Tresch96,1996,journals/oopsm/MalenfantLV91|journals/lncs/Khe...
564,persons/CasperGGGHLR12,2012,journals/entcs/RebernakMHP06|journals/procedia...
565,journals/see/Davies19,2019,journals/entcs/Merro07|journals/jet/DokumaciS1...
566,journals/see/MurphyG08,2008,journals/entcs/RaymondRJ08|journals/sigcse/Wal...


In [438]:
cite_keys.to_csv("data/citations.csv", index=False)

### create editor and chairperson relationships

In [439]:
#make authors a list
articles["author_list"] = articles["author:string[]"].apply(lambda x: x.split("|"))

#
# expand authors to separate rows
editors = articles.explode("author_list")

#group by researcher and journal
editors = editors.groupby(["author_list", "journal:string"]).agg(publish_count=("author_list","count"))

In [440]:
editors = editors.sort_values("publish_count", ascending=False)
#get top 3 authors of each journal
top_publishers = editors.groupby(["journal:string"]).head(3)

In [441]:
top_publishers = top_publishers.rename(columns={"author_list": "editor", "journal:string": "journal"})
top_publishers.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,publish_count
author_list,journal:string,Unnamed: 2_level_1
Ignacio E. Grossmann,Comput. Chem. Eng.,119
Rafiqul Gani,Comput. Chem. Eng.,59
Efstratios N. Pistikopoulos,Comput. Chem. Eng.,53
Max Mulder,IEEE Trans. Hum. Mach. Syst.,19
Luciano Floridi,Sci. Eng. Ethics,16
Michael D. Mumford,Sci. Eng. Ethics,16
Drew Fudenberg,J. Econ. Theory,15
Stephanie J. Bird,Sci. Eng. Ethics,15
Massimo Marinacci,J. Econ. Theory,15
Marinus Maria van Paassen,IEEE Trans. Hum. Mach. Syst.,15


In [442]:
top_publishers.to_csv("data/editors.csv")

### Conference Chairpersons

In [443]:
#make authors a list
inproceedings["author_list"] = inproceedings["author:string[]"].apply(lambda x: x.split("|"))

#
# expand authors to separate rows
editors = inproceedings.explode("author_list")

#group by researcher and journal
editors = editors.groupby(["author_list", "crossref:string[]"]).agg(publish_count=("author_list","count"))

In [444]:
editors = editors.sort_values("publish_count", ascending=False)
#get top 3 authors of each journal
top_publishers = editors.groupby(["crossref:string[]"]).head(3)

In [445]:
top_publishers = top_publishers.rename(columns={"author_list": "chair", "crossref:string[]": "conference_edition"})
top_publishers.sort_values("crossref:string[]").head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,publish_count
author_list,crossref:string[],Unnamed: 2_level_1
Pablo Amaya,conf/abmb/2005,1
Mehmet Aksit,conf/abmb/2005,1
Perla Velasco Elizondo,conf/abmb/2005,1
Niels Joncheere,conf/abmb/2006,1
Mathieu Braem,conf/abmb/2006,1
Mehmet Aksit,conf/abmb/2006,1
Hartmut Ehrig,conf/accat/2007,2
Leen Lambers,conf/accat/2007,1
Ulrike Prange,conf/accat/2007,3
Mostafa Ajallooeian,conf/acml/2010,1


In [446]:
top_publishers.to_csv("data/conference_chairs.csv")

### no of reviewers per journal/conference

In [447]:
journal_list = articles[["journal:string"]].drop_duplicates()
conference_list = inproceedings[["booktitle:string"]].drop_duplicates()

conference_list.columns = journal_list.columns

journal_conference_list = pd.concat([journal_list, conference_list])


journal_conference_list["no_reviewers"] = journal_conference_list["journal:string"].apply(lambda x: random.randint(1,4) if random.randint(1,10) < 4 else 3)
journal_conference_list.head(10)

Unnamed: 0,journal:string,no_reviewers
558,Hydrology and Earth System Sciences,3
562,"ETH Zurich, Department of Computer Science / T...",3
565,Sci. Eng. Ethics,3
1960,J. Econ. Theory,3
4628,Web Intell. Agent Syst.,2
4631,Web Intell.,4
5094,IEEE Trans. Hum. Mach. Syst.,3
5889,Comput. Chem. Eng.,3
2,Text Understanding in LILOG,3
4,Advanced Database Systems,3


In [448]:
def get_reviewer_count(journal_name):
    return journal_conference_list[journal_conference_list["journal:string"] == journal_name]["no_reviewers"].values[0]

### Reviews

In [449]:
art_short = articles[["key:string","journal:string", "author_list"]]
ip_short = inproceedings[["key:string","booktitle:string", "author_list"]]

ip_short.columns = art_short.columns

In [450]:
article_authors = pd.concat([ip_short, art_short])
article_authors.head(10)

Unnamed: 0,key:string,journal:string,author_list
2,journals/lncs/BollingerLP91,Text Understanding in LILOG,"[Sven Lorenz, Toni Bollinger, Udo Pletat]"
3,journals/lncs/CarstensenS91,Text Understanding in LILOG,"[Geoffrey Simmons, Kai-Uwe Carstensen]"
4,journals/lncs/FlaterY93,Advanced Database Systems,"[David W. Flater, Yelena Yesha]"
5,journals/lncs/RollingerH91,Text Understanding in LILOG,"[Claus-Rainer Rollinger, Otthein Herzog]"
6,journals/lncs/BhargavaJSD93,Advanced Database Systems,"[Bharat K. Bhargava, Jagannathan Srinivasan, P..."
7,journals/lncs/DorreR91,Text Understanding in LILOG,"[Ingo Raasch, Jochen Dörre]"
8,journals/lncs/Emde91,Text Understanding in LILOG,[Werner Emde]
9,journals/lncs/Ridoux94,Constraint Programming,[Olivier Ridoux]
10,journals/lncs/Blasius91,Text Understanding in LILOG,[Karl-Hans Bläsius]
11,journals/lncs/LuckP91,Text Understanding in LILOG,"[Kai von Luck, Thomas Pirlein]"


In [451]:
affiliated_authors = article_authors.groupby('journal:string').agg({'author_list': "sum"})
affiliated_authors["author_list"] = affiliated_authors["author_list"].apply(lambda x: set(x))



In [452]:
affiliated_authors_dict = affiliated_authors.to_dict("index")
all_authors = set()

for key, v in affiliated_authors_dict.items():
    all_authors = all_authors.union(v["author_list"])

print(len(all_authors))

31736


In [453]:
class Key:
    PREVIOUS_KEY = "Comput. Chem. Eng."

def get_reviewers(key):
    try:
        author_sample = random.sample(affiliated_authors_dict[key]["author_list"], 5)
        Key.PREVIOUS_KEY = key
    except ValueError:
        author_sample = random.sample(affiliated_authors_dict[Key.PREVIOUS_KEY]["author_list"], 5)
    
    return set(author_sample)

In [454]:
#get sample of affiliated reviewers
article_authors["reviewers"] = article_authors["journal:string"].apply(lambda x: get_reviewers(x))
article_authors.head(3)

since Python 3.9 and will be removed in a subsequent version.
  author_sample = random.sample(affiliated_authors_dict[key]["author_list"], 5)
since Python 3.9 and will be removed in a subsequent version.
  author_sample = random.sample(affiliated_authors_dict[Key.PREVIOUS_KEY]["author_list"], 5)


Unnamed: 0,key:string,journal:string,author_list,reviewers
2,journals/lncs/BollingerLP91,Text Understanding in LILOG,"[Sven Lorenz, Toni Bollinger, Udo Pletat]","{Hans-Joachim Novak, Gert Smolka, Karl-Hans Bl..."
3,journals/lncs/CarstensenS91,Text Understanding in LILOG,"[Geoffrey Simmons, Kai-Uwe Carstensen]","{Gregor Erbach, Gudrun Klose, Bernd Walter, Di..."
4,journals/lncs/FlaterY93,Advanced Database Systems,"[David W. Flater, Yelena Yesha]","{H. V. Jagadish, Martin Andersson, Yelena Yesh..."


In [455]:
#Remove paper authors from reviewers
article_authors["reviewers"] = article_authors.apply(lambda row: row["reviewers"].difference(set(row["author_list"])), axis=1)

In [456]:
#create random reviewers where not enough
def make_reviewers(current_reviewers:set, authors:set, journal:str):
    no_reviewers = get_reviewer_count(journal)

    while len(current_reviewers) < no_reviewers:
        #add new sample
        current_reviewers = current_reviewers.union(set(random.sample(all_authors, 1)))
        #remove current authors
        current_reviewers = current_reviewers.difference(authors)

    if len(current_reviewers) >no_reviewers:
        current_reviewers = set(random.sample(current_reviewers, no_reviewers))

    return current_reviewers

In [457]:
article_authors["reviewers"] = article_authors.apply(lambda row: make_reviewers(row["reviewers"], set(row["author_list"]), row["journal:string"]), axis=1)

since Python 3.9 and will be removed in a subsequent version.
  current_reviewers = set(random.sample(current_reviewers, no_reviewers))
since Python 3.9 and will be removed in a subsequent version.
  current_reviewers = current_reviewers.union(set(random.sample(all_authors, 1)))


In [458]:
reviewers = article_authors[["key:string", "reviewers"]].explode("reviewers")
reviewers.head()

Unnamed: 0,key:string,reviewers
2,journals/lncs/BollingerLP91,Petra Steffens
2,journals/lncs/BollingerLP91,Karl-Hans Bläsius
2,journals/lncs/BollingerLP91,Gert Smolka
3,journals/lncs/CarstensenS91,Bernd Walter
3,journals/lncs/CarstensenS91,Dieter Landes


In [459]:
reviewers["review_text"] = reviewers["reviewers"].apply(lambda x: lipsum.generate_words(30))
reviewers["suggested_decision"] = reviewers["review_text"].apply(lambda x: random.sample(["acceptance", "conditional acceptance", "conditional rejection", "outright rejection"],1)[0] if random.randint(1,10) < 2 else "acceptance")

reviewers["suggested_decision"].unique()

array(['acceptance', 'outright rejection', 'conditional acceptance',
       'conditional rejection'], dtype=object)

In [460]:
reviewers["supports_acceptance"] = reviewers["suggested_decision"].apply(lambda x : x in ["acceptance", "conditional acceptance"])

In [461]:
acceptance_statistic = reviewers.groupby(["key:string"]).agg(acceptance_count=("supports_acceptance","sum"), no_reviewers=("reviewers", "count"))
acceptance_statistic["accepted"] = acceptance_statistic["acceptance_count"] > acceptance_statistic["no_reviewers"]/2
#acceptance_statistic[acceptance_statistic["accepted"] == False]
len(reviewers)

56250

In [462]:
reviewers.to_csv("data/reviewers.csv", index=False)

### Affiliations

In [463]:
affiliations = pd.concat([articles[["author_list"]], inproceedings[["author_list"]]])
affiliations = affiliations.explode("author_list")
affiliations["affiliation"] = affiliations["author_list"].apply(generate_affiliation)
affiliations["organization_type"] = affiliations["affiliation"].apply(lambda x: x.split(" ")[0])
affiliations.head()

Unnamed: 0,author_list,affiliation,organization_type
558,Hugo Hellebrand,Company 20,Company
558,Markus Casper,University 20,University
558,Ralf Merz,Company 16,Company
558,Rita Ley,University 2,University
562,Markus Tresch,University 17,University


In [464]:
affiliations.to_csv("data/affiliations.csv", index=False)

## Useful resources
import guide: https://neo4j.com/developer/desktop-csv-import/ -> access import folder via UI -> DB -> Open folder -> import

# Data Loading
## Initial setup

Most things will be done via the Python Neo4j API. However, a few steps were taken before:
1. creation of the db via the UI. Name:publication-graph, PW: publication-graph
2. started db via the UI
3. preprocessed files for import placed in the respective input folder of the db

In [18]:
from py2neo import Graph, ClientError

In [19]:
#set connection variables
PORT = "7687" #database running on this port for bolt connections
USER = "neo4j" #standard user
PASSWORD = "publication-graph" #db password

In [20]:
#connect to database
try:
    graph = Graph('bolt://localhost:'+PORT, auth=(USER, PASSWORD))
    print('SUCCESS: Connected to the Neo4j Database.')
except Exception as e:
    print('ERROR: Could not connect to the Neo4j Database. See console for details.')
    raise SystemExit(e)

SUCCESS: Connected to the Neo4j Database.


In [21]:
def run_query(query:str):
    try:
        return graph.run(query)
    except ClientError as e:
        print(e.message)

def _reset_graph():
    q = """
        call{
        Match (n)
        detach delete n} in transactions
    """
    return run_query(q)

def _find_import_problems():
    q = """
    MATCH (n) WHERE size(labels(n)) = 0 RETURN n
    """

    return run_query(q)

In [339]:
#_reset_graph()

### Create constraints 
- to increase performance during loading

In [340]:
QUERY_1 = """ 

//Create constraints on the identifiers of the nodes

CREATE CONSTRAINT FOR (p:Paper) REQUIRE p.paper_key IS UNIQUE;
CREATE CONSTRAINT FOR (r:Researcher) REQUIRE r.name IS UNIQUE;
CREATE CONSTRAINT FOR (j:Journal) REQUIRE j.journal_name IS UNIQUE;
CREATE CONSTRAINT FOR (v:Volume) REQUIRE v.volume_key IS UNIQUE;
CREATE CONSTRAINT FOR (k:Keyword) REQUIRE k.keyword IS UNIQUE;
CREATE CONSTRAINT FOR (ce:ConferenceEdition) REQUIRE ce.conference_edition_key IS UNIQUE;
CREATE CONSTRAINT for (c:Conference) require c.conference_name is unique;

"""

In [341]:
constraints = QUERY_1.split(";")

try:
    
    for c in constraints:
        run_query(c)

except ClientError as e:
    print(e.message)

An equivalent constraint already exists, 'Constraint( id=4, name='constraint_a7d3fea0', type='UNIQUENESS', schema=(:Paper {paper_key}), ownedIndex=3 )'.
An equivalent constraint already exists, 'Constraint( id=6, name='constraint_c5e92188', type='UNIQUENESS', schema=(:Researcher {name}), ownedIndex=5 )'.
An equivalent constraint already exists, 'Constraint( id=8, name='constraint_c8139076', type='UNIQUENESS', schema=(:Journal {journal_name}), ownedIndex=7 )'.
An equivalent constraint already exists, 'Constraint( id=10, name='constraint_10a078f', type='UNIQUENESS', schema=(:Volume {volume_key}), ownedIndex=9 )'.
An equivalent constraint already exists, 'Constraint( id=12, name='constraint_b93297e0', type='UNIQUENESS', schema=(:Keyword {keyword}), ownedIndex=11 )'.
An equivalent constraint already exists, 'Constraint( id=14, name='constraint_6460932c', type='UNIQUENESS', schema=(:ConferenceEdition {conference_edition_key}), ownedIndex=13 )'.
An equivalent constraint already exists, 'Cons

### 2. Load information from articles file
This query loads most nodes from the articles csv

In [402]:
QUERY_2 = """
Load csv with headers from
'file:///articles_preprocessed.csv' AS line

FIELDTERMINATOR ','

call{
with line

MERGE (paper:Paper {paper_key: line.`key:string`})
  ON CREATE
    SET paper.title = line.`title:string`
    SET paper.doi = line.doi

MERGE (volume:Volume {volume_key:line.`crossref:string`})
  ON CREATE
    SET volume.year = line.`year:int`,
        volume.volume_no = line.`number:string`,
        volume.consecutive_issue_no = line.`volume:string`

WITH line, SPLIT(line.keywords, '|') as keywords
  UNWIND keywords as kw
    MERGE (keyword:Keyword {keyword:kw})

WITH line
MERGE (journal:Journal {journal_name: line.`journal:string`})
set journal.publisher = line.publisher

WITH line, SPLIT(line.`author:string[]`, '|') as authors
  UNWIND authors as a
  MERGE (author:Researcher {name:a})

} in transactions
"""

In [403]:
run_query(QUERY_2)

In [344]:
#This query creates the author of relationship

QUERY_3 = """
Load csv with headers from
'file:///articles_preprocessed.csv' AS line

FIELDTERMINATOR ','

call{

  WITH line
  WITH line, SPLIT(line.`author:string[]`, '|') as authors
  UNWIND authors as a
  Match (r:Researcher {name:a}), (paper:Paper {paper_key: line.`key:string`})
  MERGE (r)-[:AUTHOR_OF]->(paper)

}in transactions
"""

In [345]:
run_query(QUERY_3)

In [346]:
#set corresponding author
QUERY_4 = """
Load csv with headers from
'file:///articles_preprocessed.csv' AS line

FIELDTERMINATOR ','

call{
  with line
  match (r:Researcher {name:line.corresponding_author}),(paper:Paper {paper_key: line.`key:string`})
  Merge (r)-[authorof:AUTHOR_OF]->(paper)
  Set authorof.corresponding_author = true
}in transactions

"""

In [347]:
run_query(QUERY_4)

In [348]:
#published in relationship, issues relationship, main topic relationship
QUERY_5 ="""
Load csv with headers from
  'file:///articles_preprocessed.csv' AS line

    FIELDTERMINATOR ','

call{
  with line
  Match (p:Paper {paper_key: line.`key:string`}), (volume:Volume {volume_key:line.`crossref:string`}), (journal:Journal {journal_name: line.`journal:string`})
  Merge  (p)- [publishedin:PUBLISHED_IN] -> (volume)
    ON CREATE
      Set publishedin.pages = line.`pages:string`

  Merge  (journal)-[:ISSUES] ->(volume)

  WITH line, SPLIT(line.keywords, '|') as keywords, p
  Unwind keywords as kw
    Match (k:Keyword {keyword:kw})
    Merge (p)-[:MAIN_TOPIC]->(k)

}in TRANSACTIONS

"""

In [349]:
run_query(QUERY_5)

### Load information from inproceedings file

In [350]:
IP_QUERY_1 = """

LOAD CSV WITH HEADERS FROM
'file:///inproceedings_preprocessed.csv' AS line

FIELDTERMINATOR ','
call{
    with line
    Merge (paper:Paper {paper_key: line.`key:string`})
    set paper.title = coalesce(paper.title, line.`title:string`)

    with line, paper
    Merge (ce:ConferenceEdition {conference_edition_key: line.`crossref:string[]`})
    Set ce.year_held = coalesce(ce.year_held, line.year)

    with line, ce,paper
    merge (c:Conference {conference_name:line.`booktitle:string`})
    merge (c)-[:HOLDS]->(ce)

    WITH line, SPLIT(line.keywords, '|') as keywords,paper
    Unwind keywords as kw
    MERGE (keyword:Keyword {keyword:kw})
    
    with line, keyword,paper
    MERGE (paper)-[:MAIN_TOPIC]->(keyword)

    WITH line, SPLIT(line.`author:string[]`, '|') as authors, paper
    unwind authors as a
    MERGE (author:Researcher {name:a})
    Merge (author)-[:AUTHOR_OF]->(paper)



}in transactions
"""

In [351]:
run_query(IP_QUERY_1)

In [352]:
IP_QUERY_2 = """
LOAD CSV WITH HEADERS FROM
'file:///inproceedings_preprocessed.csv' AS line

FIELDTERMINATOR ','
call{
    with line
    match (ce:ConferenceEdition {conference_edition_key: line.`crossref:string[]`}),(paper:Paper {paper_key: line.`key:string`})
    merge (paper)-[pi:PUBLISHED_IN]-(ce)
    set pi.pages = coalesce (pi.pages,line.`pages:string[]` )

    with line, paper
    Match (r:Researcher {name:line.corresponding_author})
    Merge (r)-[authorof:AUTHOR_OF]->(paper)
    Set authorof.corresponding_author = true
}in transactions

"""

In [353]:
run_query(IP_QUERY_2)

### 3. Load citation information

In [354]:
C_QUERY_1 = """
LOAD CSV WITH HEADERS FROM
'file:///citations.csv' AS line

FIELDTERMINATOR ','

call{

    with line
    with line, SPLIT(line.cites, '|') as citations

    unwind citations as c
    Match (paper:Paper {paper_key:line.`key:string`}), (p:Paper {paper_key:c})
    Merge (paper)-[:CITES]->(p)

}in transactions
"""

In [355]:
run_query(C_QUERY_1)

### 4. Load Auxiliary Relationships

In [356]:
A_QUERY_1_COMPOSED = """
//Publications of a journal
Match (p:Paper)-[:PUBLISHED_IN]->(v:Volume)<-[:ISSUES]-(j:Journal)
Merge (p)<-[pub:JOURNAL_PUBLICATION]-(j)
Set pub.year = v.year;


//Citation Count Aid
Match (v1:Volume)<-[:PUBLISHED_IN]-(p1:Paper)-[:CITES]->(p2:Paper)-[:PUBLISHED_IN]->(:Volume)<-[:ISSUES]-(j:Journal)
Merge (p1)-[cit:JOURNAL_CITATION]->(j)
Set cit.year = v1.year
"""
A_QUERY_2 ="""
//chairs and editors
Load csv with headers from
    'file:///editors.csv' AS line

    FIELDTERMINATOR ','

    call{
with line
Match(j:Journal {journal_name: line.`journal:string`}), (r:Researcher {name: line.author_list})
Merge (r)-[:EDITOR]-(j)

}in transactions;
"""

A_QUERY_3 ="""
//chairs and editors
Load csv with headers from
    'file:///conference_chairs.csv' AS line

    FIELDTERMINATOR ','

    call{
with line
Match(ce:ConferenceEdition {conference_edition_key: line.`crossref:string[]`}), (r:Researcher {name: line.author_list})
Merge (r)-[:CHAIRPERSON]-(ce)

}in transactions

"""

A_QUERY_4 = """
//abstract same as title for now
Match(n:Paper)
Set n.abstract = n.title;
"""

A_QUERY_5 = """
//reviewers
Load csv with headers from
    'file:///reviewers.csv' AS line

    FIELDTERMINATOR ','

    call{
      with line
      Match (p:Paper {paper_key: line.`key:string`})<-[:JOURNAL_PUBLICATION]-(j:Journal) //
      Merge (rev:Review)-[:REVIEW_SUBJECT]->(p)

      with line, p, rev, j
      Match (a:Researcher {name:line.reviewers})
      Merge (a)-[:PARTICIPATES_IN]->(rev)
      Merge (rev)-[:SUBMITTED_TO]->(j)

    }in transactions;


"""

A_QUERY_6 = """
//reviewers
Load csv with headers from
    'file:///reviewers.csv' AS line

    FIELDTERMINATOR ','

    call{
      with line
      Match (p:Paper {paper_key: line.`key:string`})-[:PUBLISHED_IN]->(ce:ConferenceEdition)
      Merge (rev:Review)-[:REVIEW_SUBJECT]->(p)
      with line, p, rev, ce

      Match (a:Researcher {name:line.reviewers})
      Merge (a)-[:PARTICIPATES_IN]->(rev)
      Merge (rev)-[:SUBMITTED_TO]->(ce)

    }in transactions;


"""

In [357]:
for q in A_QUERY_1_COMPOSED.split(";"):
    run_query(q)

In [358]:
run_query(A_QUERY_2)

In [359]:
run_query(A_QUERY_3)

In [360]:
run_query(A_QUERY_4)

In [361]:
run_query(A_QUERY_5)
run_query(A_QUERY_6)

### 5. Load Proceeding informaiton

In [362]:
P_QUERY_1 = """

LOAD CSV WITH HEADERS FROM
'file:///proceedings_preprocessed.csv' AS line

FIELDTERMINATOR ','

call{
    with line
    MERGE(ce:ConferenceEdition {conference_edition_key: line.`key:string`})

        SET ce.year_published = line.`year:int`,
            ce.edition = line.edition,
            ce.proceeding_title = line.`title:string`


}in transactions   

"""

P_QUERY_2 = """
LOAD CSV WITH HEADERS FROM
'file:///inproceedings_preprocessed.csv' AS line

FIELDTERMINATOR ','

call{
    with line
    match (ce:ConferenceEdition {conference_edition_key: line.`crossref:string[]`})

    set ce.year_held = coalesce(ce.year_held, tointeger(line.`year:int`))

}in transactions

"""

In [363]:
run_query(P_QUERY_1)

In [364]:
run_query(P_QUERY_2)

In [365]:
#todo
#ask about conference or proceeding
# check journal_publication relationship

### Load review content

In [366]:
REVIEW_CONTENT_QUERY = """
//reviewers
Load csv with headers from
'file:///reviewers.csv' AS line

FIELDTERMINATOR ','

    call{
      with line
      Match (paper:Paper {paper_key: line.`key:string`})<-[:REVIEW_SUBJECT]-(review:Review)
      Match (r:Researcher {name: line.reviewers})-[pi:PARTICIPATES_IN]->(review)

      Set pi.suggested_decision = line.suggested_decision,
          pi.review_content = line.review_text,
          pi.supports_acceptance = toBoolean(line.supports_acceptance)

    }in transactions

"""

In [367]:
run_query(REVIEW_CONTENT_QUERY)

In [368]:
REVIEW_DECISION_QUERY = """

match (r:Review)-[:REVIEW_SUBJECT]->(x:Paper), (r)<-[pi:PARTICIPATES_IN]-(:Researcher)
with r,collect(pi) as reviews, count(pi) as total
with r,reviews,total 
unwind reviews as review
match ()-[review {supports_acceptance:TRUE}]->(r)
with r, count(review) as subtotal, total
with r, tofloat(subtotal) / total as acceptance_rate

set r.decision = 
case acceptance_rate > 0.5
when  TRUE then "ACCEPTED"
    else "REJECTED" end

"""

REMOVE_RELATED_RELATIONSHIPS = """
match (review:Review {decision:"REJECTED"})-[:REVIEW_SUBJECT]->(paper:Paper)
optional match (paper)-[pi:PUBLISHED_IN]->()
optional match (paper)<-[jp:JOURNAL_PUBLICATION]-(:Journal)
optional match (paper)-[jc:JOURNAL_CITATION]->(:Journal)

delete pi, jp, jc
"""

In [None]:
run_query(REVIEW_DECISION_QUERY)
run_query(REMOVE_RELATED_RELATIONSHIPS)

In [370]:
organization_constraint = "CREATE CONSTRAINT FOR (o:Organization) REQUIRE o.organization_name IS UNIQUE"

In [371]:
run_query(organization_constraint)

In [387]:
AFFILIATION = """
Load csv with headers from
  'file:///affiliations.csv' AS line

    FIELDTERMINATOR ','

call{
  with line
  
  Match (r:Researcher {name: line.author_list})
  Merge (o:Organization {organization_name: line.affiliation})
  set o.organization_type = line.organization_type

  Merge (r)-[:AFFILIATED_WITH]->(o)
  
}in transactions


"""

In [389]:
run_query(AFFILIATION)

DB Community

In [83]:
#updated_journal citation relationship
journal_citation ="""

//publication year of the paper that is cited
// year the citation happend

Match(paper:Paper)-[:CITES]->(cited_paper:Paper),(cited_paper)-[:PUBLISHED_IN]->(volume:Volume),(cited_paper)<-[:JOURNAL_PUBLICATION]-(journal:Journal), (paper)-[:PUBLISHED_IN]-(pub), (paper)-[jc:JOURNAL_CITATION]->(journal)

SET jc.year_cited = coalesce(pub.year, pub.year_published),
    jc.publication_year_cited_paper = volume.year


"""

impact_factor = """
//Impact factor (citations(year x) of papers published in last two year)/(publications in last two years)

Match (:Paper)-[jc:JOURNAL_CITATION]->(journal:Journal)
where jc.year_cited = <YEAR> AND (jc.publication_year_cited_paper=<YEAR-2> Or jc.publication_year_cited_paper=<YEAR-1>)
with journal, count(jc) as no_citations

match (:Paper)<-[jp:JOURNAL_PUBLICATION]-(journal)
where jp.year = <YEAR-2> or jp.year = <YEAR-1>

with journal, no_citations, count(jp) as no_publications
return journal.journal_name, no_citations, no_publications, (tofloat(no_citations) / no_publications) as impact_factor

"""

In [84]:
def get_impact_factor(year:int):
    q = impact_factor.replace("<YEAR>", str(year)).replace("<YEAR-1>", str(year-1)).replace("<YEAR-2>", str(year-2))

    print(run_query(q))

In [86]:
get_impact_factor(2016)

 journal.journal_name | no_citations | no_publications |     impact_factor 
----------------------|--------------|-----------------|-------------------
 J. Econ. Theory      |          454 |             261 | 1.739463601532567 



In [90]:
h_index = """
Match (a:Researcher)-[ao:AUTHOR_OF]->(paper:Paper)
optional match (paper)<-[ci:CITES]-(citing_paper:Paper)

With  a, paper, count(citing_paper) as no_cit
With a, collect([a,paper,no_cit]) as papers

unwind range(0, size(papers)-1) as ind

//where no_cit > no_pub

with a, papers[ind][0] as auth, papers[ind][1] as pap,ind,papers[ind][2] as no_cit, case papers[ind][2] >= ind +1 when true then 1 else 0 end as in_count
//where  no_cit >= ind +1
//return a.name, pap.title, ind + 1 , no_cit, in_count
order by ind desc, no_cit
return a.name, sum( in_count) as h_index
order by h_index desc

"""

In [91]:
cursor = run_query(h_index)

In [92]:
df = cursor.to_data_frame()

In [94]:
len(df)

31729