# Part A2 
This notebook containts the preprocessing and data generation steps that were performed to obtain the graph data.

First step: created csv files with default commands using the github recommended csv converter: XMLToCSV.py --annotate --neo4j data/dblp.xml data/dblp.dtd data/output.csv

## CSV processing

In [40]:
#import of required packages


import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import random
import lipsum

### 0. Helper functions

In [41]:
#stopwords for stopword removal in keyword generation
STOP_WORDS = set(stopwords.words('english')) 

#collection of database community keywords
DATABASE_COMMUNITY_KEYWORDS = ["data management","indexing", "data modeling", "big data", "data processing", "data storage","data querying"]

def create_keywords(title, for_db_community=False):
    """
    This functions takes in a title string and returns keywords based on that title. This is done by splitting the title into tokens
    and removing the stopwords. Since the database community keywords are usually 2 words long, they are not included this way. To have 
    enough papers with database community keywords, 1 database community keyword is added randomly to ~30% of papers.
    """
    title = re.sub(r'[^\w\s]', "", title) #remove punctuation
    word_tokens = word_tokenize(title)
    keywords = set([w.lower() for w in word_tokens if not w.lower() in STOP_WORDS])

    for kw in DATABASE_COMMUNITY_KEYWORDS:
        if kw in title:
            keywords.add(kw)

    if for_db_community:
        keywords.add(random.sample(DATABASE_COMMUNITY_KEYWORDS,1)[0])

    #add random database community kw to approx 10% of papers
    if(random.randrange(0,10,1)<= 1):
        keywords.add(random.sample(DATABASE_COMMUNITY_KEYWORDS,1)[0])

    return "|".join(keywords)

def trim_title(title):
    """
    During data loading we discovered that there are very few papers that have an extremly long title string which causes problems during 
    data loading. Therefore the title character limit is checked and set to 300 characters
    """
    if len(title)>300:
        return title[:300]
    
    return title


def generate_affiliation(row):
    """
    Generates a random affiliation tuple.
    """
    affil = {
        1: "Company",
        2: "University"
    }

    return f"{affil.get(random.randint(1,2))} {random.randint(1,30)}"

def generate_publisher(publisher):
    """
    Generates a random publisher string.
    """
    return f"Publisher {random.randint(1,10)}"

In [42]:
def doi_or_na(ee):
    try:
        doi = ee.split("|")[0]
        #returns the text if it is a doi, na otherwise
        if("doi" in ee):
            return ee
        
    except:
        print("no str found")
    
    return pd.NA

### 1. Article Preprocessing

extracting relevant article information from the articles csv.

In [43]:
#headers to keep from the csv
KEEP_HEADERS_ARTICLE = [
    "article:ID",
    "author:string[]",
    "crossref:string",#volume-key
    "editor:string[]",
    "ee:string[]",
    "journal:string",#journal:name
    "key:string", #paper:key
    "number:string",#volume of the year
    "pages:string",
    "title:string", #paper:title
    "volume:string",#consecutive issue number
    "year:int", #volume:year,
    "publisher:string"
]

In [44]:
header = """article:ID;author:string[];author-aux:string;author-orcid:string[];booktitle:string;cdate:date;cdrom:string;cite:string[];cite-label:string[];crossref:string;editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string[];journal:string;key:string;mdate:date;month:string;note:string[];note-label:string;note-type:string[];number:string;pages:string;publisher:string;publnr:string;publtype:string;sub:string[];sup:string[];title:string;title-bibtex:string;tt:string[];url:string[];volume:string;year:int""".split(";")

#load data with header
articles = pd.read_csv("data/output_article.csv", nrows=100000, sep=";", names=header)
articles = articles[KEEP_HEADERS_ARTICLE]

#typing columns
articles["crossref:string"] = articles["crossref:string"].astype("string")
articles["title:string"] = articles["title:string"].astype("string")
articles["author:string[]"] = articles["author:string[]"].astype("string")
pd.to_numeric(articles["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(articles["year:int"], downcast="integer")

#filter down to year > 1985
articles = articles.dropna(subset="year:int")
articles = articles[articles["year:int"] > 1985]
print(len(articles))

  articles = pd.read_csv("data/output_article.csv", nrows=100000, sep=";", names=header)


98652


In [45]:
#get doi
articles["doi"] = articles["ee:string[]"].apply(lambda x: doi_or_na(str(x)))

In [46]:
#make database community

database_journals = ['Decis. Support Syst.', 'J. Assoc. Inf. Sci. Technol.', 'Big Data Res.']
all_journals = ['Web Intell. Agent Syst.', 'Web Intell.' 'IEEE Trans. Hum. Mach. Syst.', 
                'Comput. Chem. Eng.', 'IEEE Control. Syst. Lett.', 'Inf. Knowl. Syst. Manag.', 'Internet Math.',
                'Int. J. Web Inf. Syst.', 'Manag. Sci.' 'Simul.', 'Rev. Iberoam. de Tecnol. del Aprendiz.', 'Dyn. Games Appl.',
                'Decis. Support Syst.', 'J. Assoc. Inf. Sci. Technol.', 'Big Data Res.',
                'Web Intell. Agent Syst.', 'Web Intell.', 'IEEE Trans. Hum. Mach. Syst.']


#filter journals
articles["keep"] = articles["journal:string"].apply(lambda x: x in all_journals)
articles["db_community"] = articles["journal:string"].apply(lambda x: x in database_journals)

articles = articles[articles["keep"]]
print(len(articles))

18030


In [47]:
#drop rows with no journal name
articles = articles.dropna(subset=["journal:string"])

#drop articles without author
articles = articles.dropna(subset=["author:string[]"])

#drop with no pages
articles = articles.dropna(subset=["pages:string"])

#ensure title lenth
articles = articles.dropna(subset=["title:string"])
articles["title:string"] = articles["title:string"].map(trim_title)

#drop rows with missing year
articles = articles.dropna(subset=["year:int"])

#assert volume 1 for missing volume number
articles["number:string"] = articles["number:string"].fillna(1) #set volume number to 1 for missing ones

#create volumekey
articles["volume_key"] = articles["journal:string"] + articles["year:int"].astype("string") + articles["number:string"].astype("string")

#fill missing crossref
articles["crossref:string"] = articles["crossref:string"].fillna(articles["volume_key"])

#create corresponding_author
articles["corresponding_author"] = articles["author:string[]"].apply(lambda x: x.split("|")[0])

In [49]:
#generate keywords
articles["keywords"] = articles.apply(lambda row: create_keywords(row["title:string"], row["db_community"]), axis=1)
articles["in_db_community"] = articles["keywords"].apply(lambda x: len(set(DATABASE_COMMUNITY_KEYWORDS).intersection(set(x.split("|"))))>0)

In [50]:
#generate publishers
articles["publisher"] = articles["publisher:string"].apply(generate_publisher)

In [53]:
articles[articles["in_db_community"]].head()
#print(len(articles))

Unnamed: 0,article:ID,author:string[],crossref:string,editor:string[],ee:string[],journal:string,key:string,number:string,pages:string,title:string,...,year:int,publisher:string,doi,keep,db_community,volume_key,corresponding_author,keywords,in_db_community,publisher
4634,32739,Alain Gaetan Njimolu Anyouzoa|Theo D'Hondt,Web Intell. Agent Syst.20052,,http://content.iospress.com/articles/web-intel...,Web Intell. Agent Syst.,journals/wias/AnyouzoaD05,2,85-95,On the stability of a dynamic stochastic capac...,...,2005,,,True,False,Web Intell. Agent Syst.20052,Alain Gaetan Njimolu Anyouzoa,pricing|capacity|resource|scheme|allocation|mu...,True,Publisher 2
4637,32742,Antonín Komenda|Jirí Vokrínek|Michal Pechoucek,Web Intell. Agent Syst.20112,,https://doi.org/10.3233/WIA-2011-0210,Web Intell. Agent Syst.,journals/wias/KomendaVP11,2,123-133,Plan representation and execution in multi-act...,...,2011,,https://doi.org/10.3233/WIA-2011-0210,True,False,Web Intell. Agent Syst.20112,Antonín Komenda,means|scenarios|multiactor|social|representati...,True,Publisher 5
4641,32746,Eugénio C. Oliveira|Lennart E. Nacke|Pedro Alv...,Web Intell.20153,,https://doi.org/10.3233/WEB-150321|https://www...,Web Intell.,journals/wias/NogueiraRON15,3,195-214,Modelling human emotion in interactive environ...,...,2015,,https://doi.org/10.3233/WEB-150321|https://www...,True,False,Web Intell.20153,Eugénio C. Oliveira,physiological|data management|modelling|intera...,True,Publisher 5
4643,32748,Haralambos Mouratidis|Manuel Kolp|Paolo Giorgi...,Web Intell. Agent Syst.20101,,https://doi.org/10.3233/WIA-2010-0182,Web Intell. Agent Syst.,journals/wias/MouratidisKGF10,1,99-122,An architectural description language for secu...,...,2010,,https://doi.org/10.3233/WIA-2010-0182,True,False,Web Intell. Agent Syst.20101,Haralambos Mouratidis,systems|multiagent|language|description|secure...,True,Publisher 2
4651,32756,Brian Thoms|Evren Eryilmaz|Nicole Dubin|Rafael...,Web Intell.20201,,https://doi.org/10.3233/WEB-200425,Web Intell.,journals/wias/ThomsEDHC20,1,1-13,Real-time visualization to improve quality in ...,...,2020,,https://doi.org/10.3233/WEB-200425,True,False,Web Intell.20201,Brian Thoms,improve|realtime|visualization|data processing...,True,Publisher 1


In [54]:
#save to file
articles.to_csv("data/articles_preprocessed.csv", index=False)

### 2. Inproceedings preprocessing

In [5]:
KEEP_HEADER_INPROCEEDING = [
    "inproceedings:ID",
    "author:string[]",
    "booktitle:string",#conference/forum title
    "crossref:string[]",#proceeding key
    "editor:string[]",
    "ee:string[]",
    "key:string",#inproceedings key
    "number:string",
    "pages:string",
    "title:string",
    "volume:int",
    "year:int"#year_held
]

In [6]:
header = """inproceedings:ID;author:string[];author-aux:string[];author-orcid:string[];booktitle:string;cdrom:string[];cite:string[];cite-label:string[];crossref:string[];editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string[];key:string;mdate:date;month:string;note:string;note-type:string;number:string;pages:string;publtype:string;sub:string[];sup:string[];title:string;title-bibtex:string;tt:string;url:string;volume:int;year:int""".split(";")
inproceedings = pd.read_csv("data/output_inproceedings.csv", nrows=10000, sep=";", names=header)
inproceedings = inproceedings[KEEP_HEADER_INPROCEEDING]

#typing columns
inproceedings["crossref:string[]"] = inproceedings["crossref:string[]"].astype("string")
inproceedings["title:string"] = inproceedings["title:string"].astype("string")
pd.to_numeric(inproceedings["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(inproceedings["year:int"], downcast="integer")
inproceedings["author:string[]"] = inproceedings["author:string[]"].astype("string")

#only keep > 1985
inproceedings = inproceedings.dropna(subset="year:int")
inproceedings = inproceedings[inproceedings["year:int"] > 1985]
print(len(inproceedings))

9764


In [7]:
inproceedings.head()
#print(inproceedings["booktitle:string"].unique())

Unnamed: 0,inproceedings:ID,author:string[],booktitle:string,crossref:string[],editor:string[],ee:string[],key:string,number:string,pages:string,title:string,volume:int,year:int
0,555,Arnon Rosenthal,SWEE,conf/swee/1998,,http://www.mitre.org/support/swee/rosenthal.html,www/org/mitre/future,,,The Future of Classic Data Administration: Obj...,,1998
2,159865,Sven Lorenz|Toni Bollinger|Udo Pletat,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_72,journals/lncs/BollingerLP91,,402-427,The LILOG Inference Engine.,,1991
3,159866,Geoffrey Simmons|Kai-Uwe Carstensen,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_83,journals/lncs/CarstensenS91,,632-644,Why a Hill Can't be a Valley: Representing Ges...,,1991
4,159867,David W. Flater|Yelena Yesha,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_13,journals/lncs/FlaterY93,,259-276,Towards Flexible Distributed Information Retri...,,1993
5,159868,Claus-Rainer Rollinger|Otthein Herzog,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_46,journals/lncs/RollingerH91,,3-13,Introducing LILOG.,,1991


In [8]:
#crossref is required to get the information from proceedings - thus crossref na are dropped
inproceedings = inproceedings.dropna(subset=["crossref:string[]"])

#drop articles without author
inproceedings = inproceedings.dropna(subset=["author:string[]"])

#create corresponding_author
inproceedings["corresponding_author"] = inproceedings["author:string[]"].apply(lambda x: x.split("|")[0])

#we have enough elements so drop the ones without pages
inproceedings = inproceedings.dropna(subset=["pages:string"])

#create doi
inproceedings["doi"] = inproceedings["ee:string[]"].apply(doi_or_na)

#ensure title lenth
inproceedings = inproceedings.dropna(subset=["title:string"])
inproceedings["title:string"] = inproceedings["title:string"].map(trim_title)

no str found
no str found
no str found


In [9]:
print(len(inproceedings))

9651


In [10]:
#generate keywords
inproceedings["keywords"] = inproceedings["title:string"].map(create_keywords)
inproceedings["is_workshop"] = inproceedings["title:string"].apply(lambda x: "workshop" in x.lower())

In [12]:
print(len(inproceedings[inproceedings["is_workshop"]]))

49


In [13]:
inproceedings.head()

Unnamed: 0,inproceedings:ID,author:string[],booktitle:string,crossref:string[],editor:string[],ee:string[],key:string,number:string,pages:string,title:string,volume:int,year:int,corresponding_author,doi,keywords,is_workshop
2,159865,Sven Lorenz|Toni Bollinger|Udo Pletat,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_72,journals/lncs/BollingerLP91,,402-427,The LILOG Inference Engine.,,1991,Sven Lorenz,https://doi.org/10.1007/3-540-54594-8_72,lilog|data processing|inference|engine,False
3,159866,Geoffrey Simmons|Kai-Uwe Carstensen,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_83,journals/lncs/CarstensenS91,,632-644,Why a Hill Can't be a Valley: Representing Ges...,,1991,Geoffrey Simmons,https://doi.org/10.1007/3-540-54594-8_83,representing|position|objects|cant|object|vall...,False
4,159867,David W. Flater|Yelena Yesha,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_13,journals/lncs/FlaterY93,,259-276,Towards Flexible Distributed Information Retri...,,1993,David W. Flater,https://doi.org/10.1007/3-540-57507-3_13,information|flexible|distributed|retrieval|tow...,False
5,159868,Claus-Rainer Rollinger|Otthein Herzog,Text Understanding in LILOG,journals/lncs/1991-546,,https://doi.org/10.1007/3-540-54594-8_46,journals/lncs/RollingerH91,,3-13,Introducing LILOG.,,1991,Claus-Rainer Rollinger,https://doi.org/10.1007/3-540-54594-8_46,lilog|introducing,False
6,159869,Bharat K. Bhargava|Jagannathan Srinivasan|Pras...,Advanced Database Systems,journals/lncs/1993-759,,https://doi.org/10.1007/3-540-57507-3_5,journals/lncs/BhargavaJSD93,,87-103,Transition From A Relation To Object Model Imp...,,1993,Bharat K. Bhargava,https://doi.org/10.1007/3-540-57507-3_5,relation|object|model|data storage|implementat...,False


In [14]:
#save to file
inproceedings.to_csv("data/inproceedings_preprocessed.csv", index=False)

### 3. Proceedings csv Preprocessing

In [None]:
KEEP_HEADER_PROCEEDING = [
    "proceedings:ID",
    "booktitle:string",
    "editor:string[]",
    "ee:string[]",
    "key:string",
    "number:string",
    "title:string",
    "volume:string",
    "year:int"
]

In [None]:
header = """proceedings:ID;address:string;author:string[];booktitle:string;cite:string[];cite-label:string[];editor:string[];editor-orcid:string[];ee:string[];ee-type:string[];i:string;isbn:string[];isbn-type:string[];journal:string;key:string;mdate:date;note:string[];note-type:string;number:string;pages:string;publisher:string[];publisher-href:string;publtype:string;school:string;series:string[];series-href:string[];sub:string;sup:string[];title:string;url:string[];volume:string;year:int""".split(";")
proceedings = pd.read_csv("data/output_proceedings.csv", nrows=100000, sep=";", names=header)
proceedings = proceedings[KEEP_HEADER_PROCEEDING]

#typing columns
proceedings["title:string"] = proceedings["title:string"].astype("string")
pd.to_numeric(proceedings["number:string"], errors="coerce", downcast="integer")
pd.to_numeric(proceedings["year:int"], downcast="integer")

  proceedings = pd.read_csv("data/output_proceedings.csv", nrows=100000, sep=";", names=header)


0        1999
1        2015
2        2013
3        2014
4        2019
         ... 
53876    2013
53877    2007
53878    1999
53879    2017
53880    2010
Name: year:int, Length: 53881, dtype: int16

In [None]:
proceedings.head()

Unnamed: 0,proceedings:ID,booktitle:string,editor:string[],ee:string[],key:string,number:string,title:string,volume:string,year:int
0,461,MMB (Kurzvorträge),Dieter Baum|Norbert Th. Müller|Richard Rödler,,tr/trier/MI99-17,,"MMB '99, Messung, Modellierung und Bewertung v...",99-16,1999
1,2240,,Amir Hossein Alavi|Amir Hossein Gandomi|Conor ...,https://doi.org/10.1007/978-3-319-20883-1,reference/genetic/2015,,Handbook of Genetic Programming Applications,,2015
2,13516,,Ankur Agarwal|Borko Furht,https://doi.org/10.1007/978-1-4614-8495-0,reference/med/2013,,Handbook of Medical and Healthcare Technologies,,2013
3,103589,Trans. Computational Collective Intelligence,Ngoc Thanh Nguyen,https://doi.org/10.1007/978-3-662-44509-9,journals/tcci/2014-14,,Transactions on Computational Collective Intel...,8615,2014
4,103594,,Marcin Hernes|Ngoc Thanh Nguyen|Ryszard Kowalczyk,https://doi.org/10.1007/978-3-662-58611-2,journals/tcci/2019-32,,Transactions on Computational Collective Intel...,11370,2019


In [None]:
#drop where title not present
proceedings = proceedings.dropna(subset=["title:string"])

In [None]:
#filter proceedings down to relevant subset from selected papers
proceedings["in_selected_ip_subset"] = proceedings["key:string"].apply(lambda x: True if x in conference_crossrefs else pd.NA)
proceedings = proceedings.dropna(subset=["in_selected_ip_subset"])
print(sum(proceedings["in_selected_ip_subset"]))

#create doi
proceedings["doi"] = proceedings["ee:string[]"].apply(doi_or_na)

823


In [None]:
# get edition where possible
def get_edition(volume):
    try:
        return volume.split("-")[1]
    except Exception:
        return None
    
proceedings["edition"] = proceedings["volume:string"].map(get_edition)
proceedings["edition"] = proceedings["edition"].fillna(proceedings["volume:string"])


In [None]:
proceedings.to_csv("data/proceedings_preprocessed.csv", index=False)

## Data Generation

To be able to reflect everything that is required as described in the Lab text, we have to generate some additional data

### 1. Create citations

In [None]:
#paper keys and years from articles
article_keys = articles[["key:string","year:int"]]
#print(article_keys)

ip_keys = inproceedings[["key:string","year:int"]]
#print(ip_keys)

## union both
cite_keys = pd.concat([article_keys, ip_keys])
#cite_keys["year:int"].dtype

                       key:string  year:int
558              persons/LeyCHM11      2011
562              persons/Tresch96      1996
564        persons/CasperGGGHLR12      2012
565         journals/see/Davies19      2019
566        journals/see/MurphyG08      2008
...                           ...       ...
9995        journals/cce/VidalM15      2015
9996          journals/cce/LiuK08      2008
9997        journals/cce/ApioBT18      2018
9998  journals/cce/BattistiCMMM20      2020
9999  journals/cce/RebughiniCDM17      2017

[9282 rows x 2 columns]
                        key:string  year:int
2      journals/lncs/BollingerLP91      1991
3      journals/lncs/CarstensenS91      1991
4          journals/lncs/FlaterY93      1993
5       journals/lncs/RollingerH91      1991
6      journals/lncs/BhargavaJSD93      1993
...                            ...       ...
9995       journals/corr/BlancoM15      2015
9996  journals/corr/abs-2109-08303      2021
9997   journals/corr/abs-1108-1865      20

In [None]:
def generate_citations(df, year):
    """
    Selects 5 to 15 random citations from papers that have been published earlier than the current paper.
    """
    no_citations = random.randint(5,15)

    df = df[df["year:int"] < year]

    try:
        citation_keys = df["key:string"].sample(no_citations, random_state=42).tolist()
    except ValueError:
        #for the oldest papers obviously no citations can be selected, thus they won't have citations.
        print(f"could not make sample for year {year}")
        citation_keys = []

    return citation_keys    

In [15]:
#create citations across the board
cite_keys["cites"] = cite_keys["year:int"].apply(lambda year: generate_citations(cite_keys, year))

#create citations to journal articles, so that impact factor can calculated for more years
cite_keys["cites_journal_specific"] = cite_keys["year:int"].apply(lambda year: generate_citations(article_keys, year))

NameError: name 'cite_keys' is not defined

In [None]:
cite_keys.head()

Unnamed: 0,key:string,year:int,cites,cites_journal_specific
558,persons/LeyCHM11,2011,"[journals/entcs/KerjeanKST06, journals/jet/Ehl...","[journals/jet/FudenbergI06, journals/cce/Kokos..."
562,persons/Tresch96,1996,"[journals/oopsm/MalenfantLV91, journals/lncs/K...",[]
564,persons/CasperGGGHLR12,2012,"[journals/entcs/RebernakMHP06, journals/proced...","[journals/cce/ZhangHCW11, journals/jet/MorenoW..."
565,journals/see/Davies19,2019,"[journals/entcs/Merro07, journals/jet/Dokumaci...","[journals/thms/LeeCLPKK17, journals/cce/Rafiei..."
566,journals/see/MurphyG08,2008,"[journals/entcs/RaymondRJ08, journals/sigcse/W...","[journals/wias/YenFV04, journals/cce/BezzoMP04..."


In [None]:
#list concatenation before writing list to file
cite_keys["cites"] = cite_keys["cites"].apply(lambda x: "|".join(x))
cite_keys["cites_journal_specific"] = cite_keys["cites_journal_specific"].apply(lambda x: "|".join(x))

cite_keys.head()

Unnamed: 0,key:string,year:int,cites,cites_journal_specific
558,persons/LeyCHM11,2011,journals/entcs/KerjeanKST06|journals/jet/Ehler...,journals/jet/FudenbergI06|journals/cce/Kokossi...
562,persons/Tresch96,1996,journals/oopsm/MalenfantLV91|journals/lncs/Khe...,
564,persons/CasperGGGHLR12,2012,journals/entcs/RebernakMHP06|journals/procedia...,journals/cce/ZhangHCW11|journals/jet/MorenoW02...
565,journals/see/Davies19,2019,journals/entcs/Merro07|journals/jet/DokumaciS1...,journals/thms/LeeCLPKK17|journals/cce/Rafiei-S...
566,journals/see/MurphyG08,2008,journals/entcs/RaymondRJ08|journals/sigcse/Wal...,journals/wias/YenFV04|journals/cce/BezzoMP04|j...


In [None]:
cite_keys.to_csv("data/citations.csv", index=False)

### 2. Create editor relationships

In [None]:
#make authors a list
articles["author_list"] = articles["author:string[]"].apply(lambda x: x.split("|"))

#
# expand authors to separate rows
editors = articles.explode("author_list")

#group by researcher and journal
editors = editors.groupby(["author_list", "journal:string"]).agg(publish_count=("author_list","count"))

In [None]:
editors = editors.sort_values("publish_count", ascending=False)
#get top 3 authors of each journal
top_publishers = editors.groupby(["journal:string"]).head(3)

In [None]:
top_publishers = top_publishers.rename(columns={"author_list": "editor", "journal:string": "journal"})
top_publishers.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,publish_count
author_list,journal:string,Unnamed: 2_level_1
Ignacio E. Grossmann,Comput. Chem. Eng.,119
Rafiqul Gani,Comput. Chem. Eng.,59
Efstratios N. Pistikopoulos,Comput. Chem. Eng.,53
Max Mulder,IEEE Trans. Hum. Mach. Syst.,19
Luciano Floridi,Sci. Eng. Ethics,16
Michael D. Mumford,Sci. Eng. Ethics,16
Drew Fudenberg,J. Econ. Theory,15
Stephanie J. Bird,Sci. Eng. Ethics,15
Massimo Marinacci,J. Econ. Theory,15
Marinus Maria van Paassen,IEEE Trans. Hum. Mach. Syst.,15


In [None]:
top_publishers.to_csv("data/editors.csv")

### 3. Create Conference Chairpersons Relationships

In [None]:
#make authors a list
inproceedings["author_list"] = inproceedings["author:string[]"].apply(lambda x: x.split("|"))

#
# expand authors to separate rows
editors = inproceedings.explode("author_list")

#group by researcher and journal
editors = editors.groupby(["author_list", "crossref:string[]"]).agg(publish_count=("author_list","count"))

In [None]:
editors = editors.sort_values("publish_count", ascending=False)
#get top 3 authors of each journal
top_publishers = editors.groupby(["crossref:string[]"]).head(3)

In [None]:
top_publishers = top_publishers.rename(columns={"author_list": "chair", "crossref:string[]": "conference_edition"})
top_publishers.sort_values("crossref:string[]").head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,publish_count
author_list,crossref:string[],Unnamed: 2_level_1
Pablo Amaya,conf/abmb/2005,1
Mehmet Aksit,conf/abmb/2005,1
Perla Velasco Elizondo,conf/abmb/2005,1
Niels Joncheere,conf/abmb/2006,1
Mathieu Braem,conf/abmb/2006,1
Mehmet Aksit,conf/abmb/2006,1
Hartmut Ehrig,conf/accat/2007,2
Leen Lambers,conf/accat/2007,1
Ulrike Prange,conf/accat/2007,3
Mostafa Ajallooeian,conf/acml/2010,1


In [None]:
top_publishers.to_csv("data/conference_chairs.csv")

### 4. Create review information 

First the no of reviewers per journal/conference is determined. Usually this will be 3, however there is some variation introduced.

In [None]:
journal_list = articles[["journal:string"]].drop_duplicates()
conference_list = inproceedings[["booktitle:string"]].drop_duplicates()

conference_list.columns = journal_list.columns

journal_conference_list = pd.concat([journal_list, conference_list])


journal_conference_list["no_reviewers"] = journal_conference_list["journal:string"].apply(lambda x: random.randint(1,4) if random.randint(1,10) < 4 else 3)
journal_conference_list.head(10)

Unnamed: 0,journal:string,no_reviewers
558,Hydrology and Earth System Sciences,3
562,"ETH Zurich, Department of Computer Science / T...",3
565,Sci. Eng. Ethics,3
1960,J. Econ. Theory,3
4628,Web Intell. Agent Syst.,2
4631,Web Intell.,4
5094,IEEE Trans. Hum. Mach. Syst.,3
5889,Comput. Chem. Eng.,3
2,Text Understanding in LILOG,3
4,Advanced Database Systems,3


In [None]:
def get_reviewer_count(journal_name):
    return journal_conference_list[journal_conference_list["journal:string"] == journal_name]["no_reviewers"].values[0]

Then the Reviews are created. for a paper the researchers participating in that review are determined

In [None]:
art_short = articles[["key:string","journal:string", "author_list"]]
ip_short = inproceedings[["key:string","booktitle:string", "author_list"]]

ip_short.columns = art_short.columns

In [None]:
article_authors = pd.concat([ip_short, art_short])
article_authors.head(10)

Unnamed: 0,key:string,journal:string,author_list
2,journals/lncs/BollingerLP91,Text Understanding in LILOG,"[Sven Lorenz, Toni Bollinger, Udo Pletat]"
3,journals/lncs/CarstensenS91,Text Understanding in LILOG,"[Geoffrey Simmons, Kai-Uwe Carstensen]"
4,journals/lncs/FlaterY93,Advanced Database Systems,"[David W. Flater, Yelena Yesha]"
5,journals/lncs/RollingerH91,Text Understanding in LILOG,"[Claus-Rainer Rollinger, Otthein Herzog]"
6,journals/lncs/BhargavaJSD93,Advanced Database Systems,"[Bharat K. Bhargava, Jagannathan Srinivasan, P..."
7,journals/lncs/DorreR91,Text Understanding in LILOG,"[Ingo Raasch, Jochen Dörre]"
8,journals/lncs/Emde91,Text Understanding in LILOG,[Werner Emde]
9,journals/lncs/Ridoux94,Constraint Programming,[Olivier Ridoux]
10,journals/lncs/Blasius91,Text Understanding in LILOG,[Karl-Hans Bläsius]
11,journals/lncs/LuckP91,Text Understanding in LILOG,"[Kai von Luck, Thomas Pirlein]"


In [None]:
affiliated_authors = article_authors.groupby('journal:string').agg({'author_list': "sum"})
affiliated_authors["author_list"] = affiliated_authors["author_list"].apply(lambda x: set(x))

In [None]:
affiliated_authors_dict = affiliated_authors.to_dict("index")
all_authors = set()

for key, v in affiliated_authors_dict.items():
    all_authors = all_authors.union(v["author_list"])

print(len(all_authors))

31736


In [None]:
class Key:
    PREVIOUS_KEY = "Comput. Chem. Eng."

def get_reviewers(key):
    try:
        author_sample = random.sample(affiliated_authors_dict[key]["author_list"], 5)
        Key.PREVIOUS_KEY = key
    except ValueError:
        author_sample = random.sample(affiliated_authors_dict[Key.PREVIOUS_KEY]["author_list"], 5)
    
    return set(author_sample)

In [None]:
#get sample of affiliated reviewers
article_authors["reviewers"] = article_authors["journal:string"].apply(lambda x: get_reviewers(x))
article_authors.head(3)

since Python 3.9 and will be removed in a subsequent version.
  author_sample = random.sample(affiliated_authors_dict[key]["author_list"], 5)
since Python 3.9 and will be removed in a subsequent version.
  author_sample = random.sample(affiliated_authors_dict[Key.PREVIOUS_KEY]["author_list"], 5)


Unnamed: 0,key:string,journal:string,author_list,reviewers
2,journals/lncs/BollingerLP91,Text Understanding in LILOG,"[Sven Lorenz, Toni Bollinger, Udo Pletat]","{Hans-Joachim Novak, Gert Smolka, Karl-Hans Bl..."
3,journals/lncs/CarstensenS91,Text Understanding in LILOG,"[Geoffrey Simmons, Kai-Uwe Carstensen]","{Gregor Erbach, Gudrun Klose, Bernd Walter, Di..."
4,journals/lncs/FlaterY93,Advanced Database Systems,"[David W. Flater, Yelena Yesha]","{H. V. Jagadish, Martin Andersson, Yelena Yesh..."


In [None]:
#Remove paper authors from reviewers
article_authors["reviewers"] = article_authors.apply(lambda row: row["reviewers"].difference(set(row["author_list"])), axis=1)

In [None]:
#create random reviewers where not enough
def make_reviewers(current_reviewers:set, authors:set, journal:str):
    no_reviewers = get_reviewer_count(journal)

    while len(current_reviewers) < no_reviewers:
        #add new sample
        current_reviewers = current_reviewers.union(set(random.sample(all_authors, 1)))
        #remove current authors
        current_reviewers = current_reviewers.difference(authors)

    if len(current_reviewers) >no_reviewers:
        current_reviewers = set(random.sample(current_reviewers, no_reviewers))

    return current_reviewers

In [None]:
article_authors["reviewers"] = article_authors.apply(lambda row: make_reviewers(row["reviewers"], set(row["author_list"]), row["journal:string"]), axis=1)

since Python 3.9 and will be removed in a subsequent version.
  current_reviewers = set(random.sample(current_reviewers, no_reviewers))
since Python 3.9 and will be removed in a subsequent version.
  current_reviewers = current_reviewers.union(set(random.sample(all_authors, 1)))


In [None]:
reviewers = article_authors[["key:string", "reviewers"]].explode("reviewers")
reviewers.head()

Unnamed: 0,key:string,reviewers
2,journals/lncs/BollingerLP91,Petra Steffens
2,journals/lncs/BollingerLP91,Karl-Hans Bläsius
2,journals/lncs/BollingerLP91,Gert Smolka
3,journals/lncs/CarstensenS91,Bernd Walter
3,journals/lncs/CarstensenS91,Dieter Landes


In [None]:
reviewers["review_text"] = reviewers["reviewers"].apply(lambda x: lipsum.generate_words(30))
reviewers["suggested_decision"] = reviewers["review_text"].apply(lambda x: random.sample(["acceptance", "conditional acceptance", "conditional rejection", "outright rejection"],1)[0] if random.randint(1,10) < 2 else "acceptance")

reviewers["suggested_decision"].unique()

array(['acceptance', 'outright rejection', 'conditional acceptance',
       'conditional rejection'], dtype=object)

In [None]:
reviewers["supports_acceptance"] = reviewers["suggested_decision"].apply(lambda x : x in ["acceptance", "conditional acceptance"])

In [None]:
acceptance_statistic = reviewers.groupby(["key:string"]).agg(acceptance_count=("supports_acceptance","sum"), no_reviewers=("reviewers", "count"))
acceptance_statistic["accepted"] = acceptance_statistic["acceptance_count"] > acceptance_statistic["no_reviewers"]/2
#acceptance_statistic[acceptance_statistic["accepted"] == False]
len(reviewers)

56250

In [None]:
reviewers.to_csv("data/reviewers.csv", index=False)

### Affiliations

In [None]:
affiliations = pd.concat([articles[["author_list"]], inproceedings[["author_list"]]])
affiliations = affiliations.explode("author_list")
affiliations["affiliation"] = affiliations["author_list"].apply(generate_affiliation)
affiliations["organization_type"] = affiliations["affiliation"].apply(lambda x: x.split(" ")[0])
affiliations.head()

Unnamed: 0,author_list,affiliation,organization_type
558,Hugo Hellebrand,Company 20,Company
558,Markus Casper,University 20,University
558,Ralf Merz,Company 16,Company
558,Rita Ley,University 2,University
562,Markus Tresch,University 17,University


In [None]:
affiliations.to_csv("data/affiliations.csv", index=False)