## Parse, extract and analyze dblp database records

In [1]:
# import libraries

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import xml.sax
import re
pd.set_option('display.max_columns', None)
import gzip
import shutil

### Download dblp.dtd and dblp.xml.gz files from http://dblp.uni-trier.de/xml/.

In [2]:
# Read in contents of the dblp.dtd file to view desired info

with open("./dblp.dtd", mode="rt") as infile:
    for line in infile:
        if line.find('<!ELEMENT dblp')>=0 or line.find('data)*>')>=0:
            pubnames = line.strip()
            print(pubnames)
        elif line.find('<!ENTITY % field')>=0 or line.find('rel">')>=0:
            print()
            fieldnames = line.strip()
            print(fieldnames)

<!ELEMENT dblp (article|inproceedings|proceedings|book|incollection|
phdthesis|mastersthesis|www|person|data)*>

<!ENTITY % field "author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|publisher|note|crossref|isbn|series|school|chapter|publnr|stream|rel">


In [None]:
# Unzip the dblp.xml.gz file

with gzip.open('./dblp.xml.gz', 'rb') as f_in:
    with open('./dblp.xml', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

## User-defined functions to extract all raw data available in dblp.xml file

Read in the dblp.xml file and produce two output files: pubFile.txt and fieldFile.txt

In [None]:
# User-defined functions

class DBLPContentHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.ContentHandler.__init__(self)

    def startElement(self, name, attrs):
        if name == "dblp":
            self.pubFile = open('pubFile.txt', 'w',  encoding="utf-8")
            self.fieldFile = open('fieldFile.txt', 'w',  encoding="utf-8")
            self.pubList = ["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www", "person", "data"]
            self.fieldList = ["author", "editor", "title", "booktitle", "pages", "year", "address", "journal", "volume", "number", "month", "url", "ee", "cdrom", "cite", "publisher", "note", "crossref", "isbn", "series", "school", "chapter", "publnr", "stream", "rel"]
            self.content = ""
        elif name in self.pubList:
            self.key = attrs.getValue("key")
            self.pub = name
            self.fieldCount = 0
            self.content = ""
        elif name in self.fieldList:
            self.field = name
            self.content = ""

    def endElement(self, name):
        if name in self.pubList:
            self.pubFile.write(self.key)
            self.pubFile.write("\t")
            self.pubFile.write(self.pub)
            self.pubFile.write("\n")
        elif name in self.fieldList:
            self.fieldFile.write(self.key)
            self.fieldFile.write("\t")
            self.fieldFile.write(str(self.fieldCount))
            self.fieldFile.write( "\t")
            self.fieldFile.write(self.field)
            self.fieldFile.write("\t")
            self.fieldFile.write(self.content)
            self.fieldFile.write("\n")
            self.fieldCount += 1

    def characters(self, content):
        self.content += content

In [None]:
# Place both dblp.xml and dblp.dtd files in the same folder
# Parse the dblp.xml file and extract the desired info
# 2 output files: pubFile.txt and fieldFile.txt are generated

parser = xml.sax.make_parser()
handler = DBLPContentHandler()
parser.setContentHandler(handler)
parser.parse('./dblp.xml')

## Read in the text files

In [3]:
# Read in the pubFile.txt file

df_pubtype = pd.read_csv('pubFile.txt', header=None, names='PubKey PubType'.split(), delimiter = "\t")
df_pubtype

Unnamed: 0,PubKey,PubType
0,tr/meltdown/s18,article
1,tr/meltdown/m18,article
2,tr/acm/CS2013,book
3,tr/gte/TR-0263-08-94-165,article
4,tr/gte/TR-0222-10-92-165,article
...,...,...
10152026,phd/Smolka89,phdthesis
10152027,phd/Dobry87,phdthesis
10152028,phd/Ghemawat95,phdthesis
10152029,phd/Rothkugel2002,phdthesis


In [4]:
df_pubtype['PubType'].unique()

array(['article', 'book', 'proceedings', 'inproceedings', 'www',
       'mastersthesis', 'incollection', 'data', 'phdthesis'], dtype=object)

In [5]:
# Read in the fieldFile.txt file

df_fields = pd.read_csv('fieldFile.txt', header=None, names='PubKey Field# Field Entry'.split(), delimiter = "\t")
df_fields

Unnamed: 0,PubKey,Field#,Field,Entry
0,tr/meltdown/s18,0,author,Paul Kocher
1,tr/meltdown/s18,1,author,Daniel Genkin
2,tr/meltdown/s18,2,author,Daniel Gruss
3,tr/meltdown/s18,3,author,Werner Haas 0004
4,tr/meltdown/s18,4,author,Mike Hamburg
...,...,...,...,...
79523510,phd/sk/Frisch2009,2,year,2009
79523511,phd/sk/Frisch2009,3,school,"Bratislava, Univ."
79523512,phd/sk/Frisch2009,4,pages,1-151
79523513,phd/sk/Frisch2009,5,isbn,978-3-8300-4753-7


In [6]:
df_fields['Field'].unique()

array(['author', 'title', 'journal', 'year', 'ee', 'publisher', 'isbn',
       'volume', 'month', 'url', 'note', 'cdrom', 'editor', 'booktitle',
       'series', 'pages', 'crossref', 'school', 'cite', 'number',
       'publnr', 'rel', 'chapter', 'address'], dtype=object)

In [7]:
# inner join df_fields and df_pubtype

df_merged = df_fields.merge(df_pubtype, how='inner', on='PubKey')
df_merged.index = df_merged.index + 1
df_merged

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
1,tr/meltdown/s18,0,author,Paul Kocher,article
2,tr/meltdown/s18,1,author,Daniel Genkin,article
3,tr/meltdown/s18,2,author,Daniel Gruss,article
4,tr/meltdown/s18,3,author,Werner Haas 0004,article
5,tr/meltdown/s18,4,author,Mike Hamburg,article
...,...,...,...,...,...
79523511,phd/sk/Frisch2009,2,year,2009,phdthesis
79523512,phd/sk/Frisch2009,3,school,"Bratislava, Univ.",phdthesis
79523513,phd/sk/Frisch2009,4,pages,1-151,phdthesis
79523514,phd/sk/Frisch2009,5,isbn,978-3-8300-4753-7,phdthesis


### Example of a proceedings (DBLP website / Project Description file)

\<dblp>

\<proceedings key="conf/kdd/2017" mdate="2017-08-15">

\<title> Proceedings of the 23rd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Halifax, NS, Canada, August 13 - 17, 2017 \</title>

\<booktitle> KDD \</booktitle>

\<publisher> ACM \</publisher>

\<year> 2017 \</year>

\<isbn> 978-1-4503-4887-4 \</isbn>

\<ee> http://doi.acm.org/10.1145/3097983 \</ee>

\<url> db/conf/kdd/kdd2017.html \</url>

\</proceedings>

\</dblp>

In [8]:
## Demo query for a proceedings

df_merged[df_merged['PubKey'] == "conf/kdd/2017"]

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
56478352,conf/kdd/2017,0,title,Proceedings of the 23rd ACM SIGKDD Internation...,proceedings
56478353,conf/kdd/2017,1,booktitle,KDD,proceedings
56478354,conf/kdd/2017,2,publisher,ACM,proceedings
56478355,conf/kdd/2017,3,year,2017,proceedings
56478356,conf/kdd/2017,4,isbn,978-1-4503-4887-4,proceedings
56478357,conf/kdd/2017,5,ee,https://doi.org/10.1145/3097983,proceedings
56478358,conf/kdd/2017,6,ee,https://www.wikidata.org/entity/Q99800617,proceedings
56478359,conf/kdd/2017,7,url,db/conf/kdd/kdd2017.html,proceedings


### Example of a publication or paper in the proceedings (DBLP website / Project Description file)

\<dblp>

\<inproceedings key="conf/kdd/FayyadCRPCL17" mdate="2017-08-25">

\<author> Usama M. Fayyad \</author>

\<author> Arno Candel \</author>

\<author> Eduardo Ariño de la Rubia \</author>

\<author> Szilárd Pafka \</author>

\<author> Anthony Chong \</author>

\<author> Jeong-Yoon Lee \</author>

\<title> Benchmarks and Process Management in Data Science: Will We Ever Get Over the Mess? \</title>

\<pages> 31-32 \</pages>

\<year> 2017 \</year>

\<booktitle> KDD \</booktitle>

\<ee> https://doi.org/10.1145/3097983.3120998 \</ee>

\<crossref> conf/kdd/2017 \</crossref>

\<url> db/conf/kdd/kdd2017.html#FayyadCRPCL17 \</url>

\</inproceedings>

\</dblp>

In [9]:
## Demo query for a publication or paper in the proceedings

df_merged[df_merged['PubKey'] == "conf/kdd/FayyadCRPCL17"]

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
56499347,conf/kdd/FayyadCRPCL17,0,author,Usama M. Fayyad,inproceedings
56499348,conf/kdd/FayyadCRPCL17,1,author,Arno Candel,inproceedings
56499349,conf/kdd/FayyadCRPCL17,2,author,Eduardo Ario de la Rubia,inproceedings
56499350,conf/kdd/FayyadCRPCL17,3,author,Szilrd Pafka,inproceedings
56499351,conf/kdd/FayyadCRPCL17,4,author,Anthony Chong,inproceedings
56499352,conf/kdd/FayyadCRPCL17,5,author,Jeong-Yoon Lee,inproceedings
56499353,conf/kdd/FayyadCRPCL17,6,title,Benchmarks and Process Management in Data Scie...,inproceedings
56499354,conf/kdd/FayyadCRPCL17,7,pages,31-32,inproceedings
56499355,conf/kdd/FayyadCRPCL17,8,year,2017,inproceedings
56499356,conf/kdd/FayyadCRPCL17,9,booktitle,KDD,inproceedings


In [10]:
# Find all unique values in 'Field' column grouped by 'PubType'

df_merged_grouped = df_merged.groupby(['PubType'])['Field'].unique()
df_merged_grouped

PubType
article          [author, title, journal, year, ee, volume, mon...
book             [title, publisher, year, ee, isbn, pages, auth...
data             [author, title, year, month, ee, publisher, re...
incollection     [author, title, pages, year, booktitle, ee, cr...
inproceedings    [author, title, booktitle, year, url, crossref...
mastersthesis              [author, title, year, school, ee, note]
phdthesis        [author, title, year, pages, publisher, series...
proceedings      [editor, title, booktitle, series, volume, yea...
www              [title, ee, year, author, editor, note, url, c...
Name: Field, dtype: object

### Raw Data Exploration and Understanding

In [11]:
df_merged['PubType'].unique()

array(['article', 'book', 'proceedings', 'inproceedings', 'www',
       'mastersthesis', 'incollection', 'data', 'phdthesis'], dtype=object)

In [12]:
for record in df_merged['PubType'].unique().tolist():
    display(df_merged[df_merged['PubType'] == "{}".format(record)])

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
1,tr/meltdown/s18,0,author,Paul Kocher,article
2,tr/meltdown/s18,1,author,Daniel Genkin,article
3,tr/meltdown/s18,2,author,Daniel Gruss,article
4,tr/meltdown/s18,3,author,Werner Haas 0004,article
5,tr/meltdown/s18,4,author,Mike Hamburg,article
...,...,...,...,...,...
58666116,conf/valuetools/Coppa14,4,volume,2,article
58666117,conf/valuetools/Coppa14,5,journal,EAI Endorsed Trans. Ubiquitous Environ.,article
58666118,conf/valuetools/Coppa14,6,number,7,article
58666119,conf/valuetools/Coppa14,7,ee,https://doi.org/10.4108/icst.valuetools.2014.2...,article


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
29,tr/acm/CS2013,0,title,Computer Science Curricula 2013,book
30,tr/acm/CS2013,1,publisher,ACM Press and IEEE Computer Society Press,book
31,tr/acm/CS2013,2,year,2013,book
32,tr/acm/CS2013,3,ee,https://doi.org/10.1145/2534860,book
33,tr/acm/CS2013,4,ee,https://www.wikidata.org/entity/Q107021707,book
...,...,...,...,...,...
79505218,phd/it/Metelli21,6,volume,361,book
79505219,phd/it/Metelli21,7,isbn,978-1-64368-362-1,book
79505220,phd/it/Metelli21,8,isbn,978-1-64368-363-8,book
79505221,phd/it/Metelli21,9,ee,https://doi.org/10.3233/FAIA361,book


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
2933,tr/trier/MI99-17,0,editor,Dieter Baum,proceedings
2934,tr/trier/MI99-17,1,editor,Norbert Th. Mller,proceedings
2935,tr/trier/MI99-17,2,editor,Richard Rdler,proceedings
2936,tr/trier/MI99-17,3,title,"MMB '99, Messung, Modellierung und Bewertung v...",proceedings
2937,tr/trier/MI99-17,4,booktitle,MMB (Kurzvortrge),proceedings
...,...,...,...,...,...
78802828,conf/jsspp/2010,6,isbn,978-3-642-16504-7,proceedings
78802829,conf/jsspp/2010,7,booktitle,JSSPP,proceedings
78802830,conf/jsspp/2010,8,series,Lecture Notes in Computer Science,proceedings
78802831,conf/jsspp/2010,9,publisher,Springer,proceedings


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
3547,www/org/mitre/future,0,author,Arnon Rosenthal,inproceedings
3548,www/org/mitre/future,1,title,The Future of Classic Data Administration: Obj...,inproceedings
3549,www/org/mitre/future,2,booktitle,SWEE,inproceedings
3550,www/org/mitre/future,3,year,1998,inproceedings
3551,www/org/mitre/future,4,url,db/conf/swee/swee1998.html,inproceedings
...,...,...,...,...,...
78802902,conf/jsspp/KrevatCM02,5,year,2002,inproceedings
78802903,conf/jsspp/KrevatCM02,6,crossref,conf/jsspp/2002,inproceedings
78802904,conf/jsspp/KrevatCM02,7,booktitle,JSSPP,inproceedings
78802905,conf/jsspp/KrevatCM02,8,ee,https://doi.org/10.1007/3-540-36180-4_3,inproceedings


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
3554,www/org/w3/style-xsl,0,title,W3C: Extensible Stylesheet Language (XSL),www
3555,www/org/w3/style-xsl,1,ee,http://www.w3.org/Style/XSL/,www
3556,www/org/w3/style-xsl,2,year,2001,www
3557,www/org/w3/http1-1,0,author,Roy T. Fielding,www
3558,www/org/w3/http1-1,1,author,Henrik Frystyk Nielsen,www
...,...,...,...,...,...
43830435,homepages/16/6180,1,title,Home Page,www
43830436,homepages/16/8515,0,author,Yen-Chang Pan,www
43830437,homepages/16/8515,1,title,Home Page,www
43830438,homepages/16/8852,0,author,Roberto Del Pero,www


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
3743,ms/Vollmer2006,0,author,Stephan Vollmer,mastersthesis
3744,ms/Vollmer2006,1,title,Portierung des DBLP-Systems auf ein relational...,mastersthesis
3745,ms/Vollmer2006,2,year,2006,mastersthesis
3746,ms/Vollmer2006,3,school,"Diplomarbeit, Universitt Trier, FB IV, Informatik",mastersthesis
3747,ms/Vollmer2006,4,ee,http://dbis.uni-trier.de/Diplomanden/Vollmer/v...,mastersthesis
...,...,...,...,...,...
79455706,phd/basesearch/Li13i,6,note,base-search.net (ftunivhongkonghu:oai:hub.hku....,mastersthesis
79488415,phd/Ylonen94,0,author,Tatu Ylnen,mastersthesis
79488416,phd/Ylonen94,1,title,Shadow Paging Is Feasible.,mastersthesis
79488417,phd/Ylonen94,2,year,1994,mastersthesis


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
3772,reference/crypt/Bhanu11,0,author,Bir Bhanu,incollection
3773,reference/crypt/Bhanu11,1,title,Ear Shape for Biometric Identification.,incollection
3774,reference/crypt/Bhanu11,2,pages,372-378,incollection
3775,reference/crypt/Bhanu11,3,year,2011,incollection
3776,reference/crypt/Bhanu11,4,booktitle,Encyclopedia of Cryptography and Security (2nd...,incollection
...,...,...,...,...,...
78320760,conf/cascon/SpenceM10,4,year,2010,incollection
78320761,conf/cascon/SpenceM10,5,booktitle,The Smart Internet,incollection
78320762,conf/cascon/SpenceM10,6,ee,https://doi.org/10.1007/978-3-642-16599-3_7,incollection
78320763,conf/cascon/SpenceM10,7,crossref,conf/cascon/2010smart,incollection


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
35954247,data/10/AbdelliGETP22,0,author,Khouloud Abdelli,data
35954248,data/10/AbdelliGETP22,1,author,Helmut Griesser,data
35954249,data/10/AbdelliGETP22,2,author,Peter Ehrle,data
35954250,data/10/AbdelliGETP22,3,author,Carsten Tropschug,data
35954251,data/10/AbdelliGETP22,4,author,Stephan Pachnicke,data
...,...,...,...,...,...
35986942,data/10/FernandesCF17a,3,title,Quality Assessment of Digital Colposcopies.,data
35986943,data/10/FernandesCF17a,4,year,2017,data
35986944,data/10/FernandesCF17a,5,month,March,data
35986945,data/10/FernandesCF17a,6,ee,https://doi.org/10.24432/C5C022,data


Unnamed: 0,PubKey,Field#,Field,Entry,PubType
36101767,series/faia/2005-148,0,author,Kristian Kersting,phdthesis
36101768,series/faia/2005-148,1,title,An Inductive Logic Programming Approach to Sta...,phdthesis
36101769,series/faia/2005-148,2,year,2005,phdthesis
36101770,series/faia/2005-148,3,pages,1-228,phdthesis
36101771,series/faia/2005-148,4,publisher,IOS Press,phdthesis
...,...,...,...,...,...
79523511,phd/sk/Frisch2009,2,year,2009,phdthesis
79523512,phd/sk/Frisch2009,3,school,"Bratislava, Univ.",phdthesis
79523513,phd/sk/Frisch2009,4,pages,1-151,phdthesis
79523514,phd/sk/Frisch2009,5,isbn,978-3-8300-4753-7,phdthesis


## Data Preprocessing

In [13]:
df_merged['Field'].unique()

array(['author', 'title', 'journal', 'year', 'ee', 'publisher', 'isbn',
       'volume', 'month', 'url', 'note', 'cdrom', 'editor', 'booktitle',
       'series', 'pages', 'crossref', 'school', 'cite', 'number',
       'publnr', 'rel', 'chapter', 'address'], dtype=object)

In [14]:
# List of selected fields for answering questions

selected_fields = ['author', 'title', 'journal', 'year', 'publisher', 'isbn','volume', 'month', 
                   'editor', 'booktitle', 'series', 'pages', 'crossref', 'number']

In [15]:
# Subsetting selected fields for answering questions

df_merged_selected = df_merged[df_merged['Field'].isin(selected_fields)]
df_merged_selected

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
1,tr/meltdown/s18,0,author,Paul Kocher,article
2,tr/meltdown/s18,1,author,Daniel Genkin,article
3,tr/meltdown/s18,2,author,Daniel Gruss,article
4,tr/meltdown/s18,3,author,Werner Haas 0004,article
5,tr/meltdown/s18,4,author,Mike Hamburg,article
...,...,...,...,...,...
79523509,phd/sk/Frisch2009,0,author,Guido Frisch,phdthesis
79523510,phd/sk/Frisch2009,1,title,Using open source software to develop E-busine...,phdthesis
79523511,phd/sk/Frisch2009,2,year,2009,phdthesis
79523513,phd/sk/Frisch2009,4,pages,1-151,phdthesis


In [16]:
# Check for inconsistent/abnormal data string length in 'Entry' column

df_merged_toclean = df_merged_selected[df_merged_selected['Entry'].str.len() > 500]
df_merged_toclean

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
813914,journals/jasis/Shifiett85,1,title,"Beyond Beyond ""1984"": The future of library te...",article
2177011,journals/giq/McMahon01,1,title,Evaluation of Presidential Candidates Websites...,article
4499064,journals/ker/Parsons94c,1,title,Textbooks for artificial intelligence Essentia...,article
4566807,journals/sigpro/Gabbouj91,1,title,Speech production and speech modelling: Procee...,article
8465441,journals/pacmhci/SlovakTCCBDCI18,8,title,I just let him cry...: Designing Socio-Technic...,article
...,...,...,...,...,...
79284299,phd/hal/Kourtesis20,1,title,Immersive Virtual Reality Methods in Cognitive...,phdthesis
79300993,phd/hal/Faham18,1,title,L'instrumentation des processus de « Dcouverte...,phdthesis
79302176,phd/hal/Guillot17,1,title,La Reprsentation Intermdiaire et Abstraite de ...,phdthesis
79488936,phd/hu/Piroska15,1,title,Az infokommunikcis technolgia hatsnak elemzse ...,phdthesis


In [17]:
# Clean up inconsistent/abnormal data in the 'Entry' column

def cleaned_string(key):
    """returns the truncated title"""
    if key == 'journals/jasis/Shifiett85': 
        return 'Beyond Beyond "1984": The future of library technical services.'
    elif key == 'journals/giq/McMahon01': 
        return 'Evaluation of Presidential Candidates'
    elif key == 'journals/ker/Parsons94c': 
        return 'Textbooks for artificial intelligence. Essentials of artificial intelligence'
    elif key == 'journals/sigpro/Gabbouj91': 
        return 'Speech production and speech modelling'
    elif key == 'journals/pacmhci/SlovakTCCBDCI18': 
        return 'I just let him cry...: Designing Socio-Technical Interventions in Families to Prevent Mental Health Disorders'
    elif key == 'journals/bsl/000115': 
        return 'Itay Neeman. Aronszajn trees and failure of the Singular Cardinal Hypothesis.' 
    elif key == 'journals/bsl/Appleby18': 
        return 'Choice Sequences and Knowledge States: Extending the Notion of Finite Information to Produce a Clearer Foundation for Intuitionistic Analysis'
    elif key == 'journals/bsl/Yaacov15': 
        return 'Model theory of operator algebras I: stability.'
    elif key == 'journals/bsl/Dobrinen14': 
        return 'Lecture Note Series of the London Mathematical Society.'    
    elif key == 'journals/bsl/Hirschfeldt16': 
        return 'Five papers on reverse mathematics and Ramsey-theoretic principles.'  
    elif key == 'journals/bsl/Chernikov18': 
        return 'Model theory, Keisler measures, and groups.'
    elif key == 'journals/bsl/Urquhart16': 
        return 'The Once and Future Turing: Computing the World.'
    elif key == 'journals/bsl/Keisler17': 
        return 'Cofinality spectrum problems in model theory, set theory and general topology.'    
    elif key == 'journals/bsl/Gomes18': 
        return 'On the history of paraconsistency and da Costa\'s work: the establishment of paraconsistent logic.' 
    elif key == 'journals/tse/DeavoursS98': 
        return 'Solution Techniques for Stochastic Petri Nets and Extensions.'
    elif key == 'journals/nms/Erickson09': 
        return 'To be networked, hyperlinked, portable.' 
    elif key == 'journals/nms/Mawyer11': 
        return 'The game\'s afoot, Watson: Culture and crisis in play: Mary Flanagan, Critical Play: Radical Game Design.'    
    elif key == 'journals/nms/EricksonA10': 
        return 'Making, breaking promises? Civic spheres and virtual engagements.'
    elif key == 'journals/nms/Mawyer09': 
        return 'Gameworlds, lifecraft and warplay: Jim Rossignol, This Gaming Life: Travels in Three Cities.'
    elif key == 'journals/nms/Ytreberg11': 
        return 'Convergence: Essentially confused?: Tim Dwyer, Media Convergence.'
    elif key == 'journals/nms/Herman11': 
        return 'New media law and policy: Helen Nissenbaum, Privacy in Context: Technology, Policy, and the Integrity of Social Life.'   
    elif key == 'journals/nms/Nielsen09': 
        return 'Uneven accelerations: John Tomlinson, The Culture of Speed: The Coming of Immediacy.'
    elif key == 'journals/nms/Zube07': 
        return 'Mediated democracy: Andrew Chadwick, Internet Politics: States Citizens, and New Communication Technologies.'
    elif key == 'journals/nms/Griffiths07': 
        return 'Future assemblies: theorizing mobilities and users'
    elif key == 'journals/nms/Schackman09': 
        return 'Exploring the new frontiers of collaborative community'   
    elif key == 'journals/nms/Muri07': 
        return 'Traversing the territories: when humanists engage with biotechnology and technoscience'  
    elif key == 'journals/nms/Mosco09': 
        return 'Approaching digital democracy'   
    elif key == 'journals/nms/Poor11': 
        return 'Mobile Media in the Asia-Pacific: Gender and the Art of Being Mobile.' 
    elif key == 'journals/tkde/ZhangQLS05': 
        return 'Missing Values in Cost-Sensitive Decision Trees.'   
    elif key == 'journals/jgo/Pardalos06': 
        return 'Variational Analysis and Generalized Differentiation I: Basic Theory Series' 
    elif key == 'journals/jolis/Gibbs07': 
        return 'Collaborative Library Lessons for the Primary Grades'  
    elif key == 'journals/nfd/Hohner14': 
        return 'Academic integrity and plagiarism control as spheres of activity for universities and their libraries - Thoughts from the second Mainz conference on academic integrity'  
    elif key == 'journals/nfd/StangN13': 
        return 'Children Media World as a research focus.'    
    elif key == 'journals/robotica/Andrew93b': 
        return 'Neural Networks: Advances And Applications' 
    elif key == 'journals/robotica/Andrew91c': 
        return 'Uncertainty In Artificial Intelligence 4'    
    elif key == 'journals/robotica/Fox91c': 
        return 'Low Cost Automation: Techniques, Components and Instruments Applications' 
    elif key == 'journals/robotica/Andrew87n': 
        return 'Parallel Distributed Processing: Explorations in the Microstructures of Cognition.'     
    elif key == 'journals/robotica/Owen92b': 
        return 'Numerical Recipes Book (PASCAL)'  
    elif key == 'journals/robotica/Owen91r': 
        return 'Dynamics and Control of Multibody/Robotic Systems With Space Applications'    
    elif key == 'journals/cbm/GratzeFHGPWSKS98': 
        return 'A software package for non-invasive, real-time beat-to-beat monitoring of stroke volume, blood pressure, total peripheral resistance and for assessment of autonomic function'  
    elif key == 'journals/jcmc/Bughin01': 
        return 'E-Push or e-Pull? Laggards and First-Movers in European On-Line Banking.'    
    elif key == 'journals/tis/Rafaeli96': 
        return 'Who Owns Information? From Privacy to Public Access'
    elif key == 'journals/comj/Scheirer00': 
        return 'Studies in Cognitive and Systematic Musicology'
    elif key == 'journals/comj/Burt99c': 
        return 'Miniatures concrtes'    
    elif key == 'journals/comj/Palombini99a': 
        return 'A Career in Research Paperback'  
    elif key == 'journals/crossroads/StrohmayerM17': 
        return 'We had tough times, but we\'ve sort of sewn our way through it: the partnership quilt.'    
    elif key == 'journals/sigapl/PolivkaM00': 
        return 'A focus on J: past, present and future.'
    elif key == 'journals/stvr/Whitty95': 
        return 'Software in Safety-Related Systems'  
    elif key == 'books/hal/Bahroun23': 
        return 'Contributions to the building of video summaries: application to generic object search and facial recognition'    
    elif key == 'conf/alife/Penn16': 
        return 'Artificial Life and Society: Philosophies and Tools for Experiencing, Interacting with and Managing Real World Complex Adaptive Systems'  
    elif key == 'conf/lifetech/IwaseGN21': 
        return 'Relationship Between Learning by Teaching with Teachable Chatbots and the Big 5.'    
    elif key == 'conf/icca/ZhangSX10': 
        return 'Stage-by-stage201D; optimization approach to optimal control for general time-delay systems.'
    elif key == 'conf/isr2/WangLQS21': 
        return 'An improved one-stage detector for vehicle and pedestrian detection on campus AGV'
    elif key == 'conf/isr2/CunhaFCORSMB21': 
        return 'From Handcrafting to a Certified and Ergonomic Collaborative Workstation: the Digital Transformation Process.'
    elif key == 'conf/cdc/ArghaSSC16': 
        return 'A partial eigenstructure assignment method'   
    elif key == 'conf/medinfo/PiretFCD98': 
        return 'THE ELECTRONIC WARDEN Management of the Data Security Access in a Heterogeneous University Hospital Environment in Belgium.' 
    elif key == 'conf/chi/Shandilya0T22': 
        return 'Perspectives on Using Non-Textual Communication in Virtual Workspaces.'    
    elif key == 'conf/cdceo/SongAY22': 
        return 'Disaster Detection from SAR Images with Different Off-Nadir Angles Using Unsupervised Image Translation'
    elif key == 'conf/taln/CerfF20': 
        return 'Acoustic and phonetic parameters in parkinsonian speech before and after LSVT LOUD'    
    elif key == 'conf/icis/KaranasiosCHA19a': 
        return 'the embodiment of the platform logic in the emergency sector.'  
    elif key == 'conf/icis/KaranasiosCHA19': 
        return 'the embodiment of the platform logic in the emergency sector.'   
    elif key == 'conf/aiml/GuW16': 
        return 'logic as a normal modal logic.'
    elif key == 'conf/larc/CarvalhoSFP20': 
        return 'Performance Analysis of Code-based Relative GPS Positioning as Function of Baseline Separation'   
    elif key == 'conf/euspn/ChbaikKBO21': 
        return 'The Application of Smart Supply Chain Technologies in The Moroccan Logistics.' 
    elif key == 'conf/saso/AndersonHJ14': 
        return 'Reflection, Collectives and Adaptation: the Role of Models in the Design of Collective Adaptive Systems.'    
    elif key == 'conf/icccnt/JadhavP20': 
        return 'Algorithmic Comparative Classification Approach for Defect Detection of Pyrus Malus.'
    elif key == 'conf/dfg/BjornerGHMHP04': 
        return 'Formal Techniques.'
    elif key == 'conf/hri/XieP23': 
        return 'Can You Guess My Moves?: Playing Charades with a Humanoid Robot Employing Mutual Learning with Emotional Intelligence.'  
    elif key == 'conf/icml/Mitchell00': 
        return 'Boosting\'\' a Positive-Data-Only Learner.' 
    elif key == 'conf/amia/ChengDK13': 
        return 'Out of the Box\' - Directed, Actionable Decision Support for Drugs with Boxed Warnings.'
    elif key == 'data/10/AlamlehAE23': 
        return 'ChatGPT vs. Student: A Dataset for Source Classification of Computer Science Answers.'
    elif key == 'homepages/316/9217': 
        return 'Nabil Chbaik'   
    elif key == 'conf/safecomp/2020w': 
        return 'SAFECOMP Workshops' 
    elif key == 'conf/depcos/2014': 
        return 'DepCoS-RELCOMEX'    
    elif key == 'conf/depcos/2015': 
        return 'DepCoS-RELCOMEX'
    elif key == 'conf/mm/2017musa2': 
        return 'MUSA2@MM'
    elif key == 'conf/caise/2017radar': 
        return 'Joint Proceedings of the Radar tracks at the 18th International Working Conference on Business Process Modeling, Development and Support (BPMDS)'  
    elif key == 'conf/crowncom/2021': 
        return 'CrownCom/WiCON' 
    elif key == 'conf/aciids/2017p': 
        return 'ACIIDS (Extended Posters)'    
    elif key == 'conf/lwa/2006': 
        return 'LWA 2006: Lernen - Wissensentdeckung - Adaptivitt, Hildesheim, Deutschland, October 9th-11th 2006' 
    elif key == 'conf/staf/2021w': 
        return 'STAF 2021 Workshop Proceedings: 9th International Workshop on Bidirectional Transformations'    
    elif key == 'conf/atal/2014coin': 
        return 'COIN@AAMAS/PRICAI'
    elif key == 'conf/eusipco/1996': 
        return 'EUSIPCO'    
    elif key == 'conf/netsci/2016x': 
        return 'NetSci-X'  
    elif key == 'phd/us/McCarthy09': 
        return 'Static Analyses of Cryptographic Protocols.'   
    elif key == 'phd/us/Xin18': 
        return 'Towards Improving the Effectiveness of Automated Program Repair.'
    elif key == 'phd/dnb/Turke87': 
        return 'Untersuchung zur Realisierung von Zielen der Persnlichkeitsentwicklung auf dem Gebiet der Informationsverarbeitung und Prozessautomatisierung beim Einsatz von Schlern an numerisch gesteuerten Werkzeugmaschinen:'   
    elif key == 'phd/hal/Lisnyak18': 
        return 'Theoretical, numerical and experimental study of DC and AC electric arcs: Modeling and experimental investigations of default arcs propagating along the electric bus-bars for aeronautical applications' 
    elif key == 'phd/hal/Pittet14': 
        return 'OntoVersionGraph : a change management methodology dedicated to formal ontologies and their user views in a collaborative context : Application to SHOIN (D) ontologies.'    
    elif key == 'phd/hal/Hery14': 
        return 'learning the PubMed literature search tool. A pilot study, randomized, single-center, cluster'
    elif key == 'phd/hal/Prieto15': 
        return 'The effects of the Introduction of Information and Communication Technologies on interaction dynamics between citizens, service providers and rulers in precarious health care settings: insights from an mHealth pilot experiment in rural Guatemala.'
    elif key == 'phd/hal/Bernonville08': 
        return 'Use of Software Engineering and Human-Computer Interaction modeling techniques to create common supports between Interactive System development project partners and to support complex work situation modeling, application case: ordering-dispensing-administration medication process in hospital'  
    elif key == 'phd/hal/Cerqueus15': 
        return 'Bi-objective branch-and-cut algorithms applied to the binary knapsack problem : surrogate upper bound sets, dynamic branching strategies, generation and exploitation of cover inequalities.' 
    elif key == 'phd/hal/Santorineos06': 
        return 'An approach in research and art in the digital age through the study of artificial systems of organisation of human memory : necessity and proposal for a complex tool in reporting and synthesising data which may potentially contribute to the development of a new form of doctorate (doctorate-machine)'
    elif key == 'phd/hal/Kourtesis20': 
        return 'Immersive Virtual Reality Methods in Cognitive Neuroscience and Neuropsychology: The Virtual Reality Everyday Assessment Lab (VR-EAL)'   
    elif key == 'phd/hal/Faham18': 
        return 'The instrumentation of "Entrepreneurial Discovery" processes of "Research and Innovation Strategies for the Smart Specialisation" (RIS3)' 
    elif key == 'phd/hal/Guillot17': 
        return 'The Intermediate and Abstract Representation of Space as a Tool for Sound Spatialization'    
    elif key == 'phd/hu/Piroska15': 
        return 'Az infokommunikcis technolgia hatsnak elemzse az oktatsban'
    elif key == 'phd/hu/Norbert22': 
        return 'Designing Secure Authentication Schemes for Distributed Systems'


In [18]:
df_merged_toclean['Entry'] = df_merged_toclean.apply(lambda x: cleaned_string(x['PubKey']), axis=1)
df_merged_toclean

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
813914,journals/jasis/Shifiett85,1,title,"Beyond Beyond ""1984"": The future of library te...",article
2177011,journals/giq/McMahon01,1,title,Evaluation of Presidential Candidates,article
4499064,journals/ker/Parsons94c,1,title,Textbooks for artificial intelligence. Essenti...,article
4566807,journals/sigpro/Gabbouj91,1,title,Speech production and speech modelling,article
8465441,journals/pacmhci/SlovakTCCBDCI18,8,title,I just let him cry...: Designing Socio-Technic...,article
...,...,...,...,...,...
79284299,phd/hal/Kourtesis20,1,title,Immersive Virtual Reality Methods in Cognitive...,phdthesis
79300993,phd/hal/Faham18,1,title,"The instrumentation of ""Entrepreneurial Discov...",phdthesis
79302176,phd/hal/Guillot17,1,title,The Intermediate and Abstract Representation o...,phdthesis
79488936,phd/hu/Piroska15,1,title,Az infokommunikcis technolgia hatsnak elemzse ...,phdthesis


In [19]:
df_merged_remaining = df_merged_selected[df_merged_selected['Entry'].str.len() <= 500]
df_merged_remaining

Unnamed: 0,PubKey,Field#,Field,Entry,PubType
1,tr/meltdown/s18,0,author,Paul Kocher,article
2,tr/meltdown/s18,1,author,Daniel Genkin,article
3,tr/meltdown/s18,2,author,Daniel Gruss,article
4,tr/meltdown/s18,3,author,Werner Haas 0004,article
5,tr/meltdown/s18,4,author,Mike Hamburg,article
...,...,...,...,...,...
79523509,phd/sk/Frisch2009,0,author,Guido Frisch,phdthesis
79523510,phd/sk/Frisch2009,1,title,Using open source software to develop E-busine...,phdthesis
79523511,phd/sk/Frisch2009,2,year,2009,phdthesis
79523513,phd/sk/Frisch2009,4,pages,1-151,phdthesis


In [20]:
df_merged_cleaned = pd.concat([df_merged_remaining, df_merged_toclean], axis=0)
df_merged_cleaned.sort_values(by = ['PubKey', 'Field#', 'PubType'], inplace = True)
df_merged_cleaned = df_merged_cleaned.reset_index()
df_merged_cleaned.drop(columns=['index', 'Field#'], inplace=True)
df_merged_cleaned

Unnamed: 0,PubKey,Field,Entry,PubType
0,books/acm/0082477,author,Marc Rettig,book
1,books/acm/0082477,title,The no-nonsense guide to computing careers.,book
2,books/acm/0082477,publisher,ACM,book
3,books/acm/0082477,year,1992,book
4,books/acm/0082477,isbn,0-89791-463-5,book
...,...,...,...,...
64021099,www/org/w3/http1-1,year,1999,www
64021100,www/org/w3/ql98,title,"W3C: The w3c Query Language Workshop, December...",www
64021101,www/org/w3/ql98,year,1998,www
64021102,www/org/w3/style-xsl,title,W3C: Extensible Stylesheet Language (XSL),www


In [21]:
# Raw data to use

# df_merged_cleaned.to_csv('df_merged_cleaned.csv')

In [22]:
# Read in the df_merged_cleaned.csv

# df_merged_cleaned = pd.read_csv('df_merged_cleaned.csv')
# df_merged_cleaned = df_merged_cleaned.loc[:, ~df_merged_cleaned.columns.str.contains('^Unnamed')]
# df_merged_cleaned

Unnamed: 0,PubKey,Field,Entry,PubType
0,books/acm/0082477,author,Marc Rettig,book
1,books/acm/0082477,title,The no-nonsense guide to computing careers.,book
2,books/acm/0082477,publisher,ACM,book
3,books/acm/0082477,year,1992,book
4,books/acm/0082477,isbn,0-89791-463-5,book
...,...,...,...,...
64021099,www/org/w3/http1-1,year,1999,www
64021100,www/org/w3/ql98,title,"W3C: The w3c Query Language Workshop, December...",www
64021101,www/org/w3/ql98,year,1998,www
64021102,www/org/w3/style-xsl,title,W3C: Extensible Stylesheet Language (XSL),www


In [23]:
# Partition the dataset into chunks of 5 million rows

df_merged_cleaned1 = df_merged_cleaned.loc[:5000000]
df_merged_cleaned2 = df_merged_cleaned.loc[5000001:10000000]
df_merged_cleaned3 = df_merged_cleaned.loc[10000001:15000000]
df_merged_cleaned4 = df_merged_cleaned.loc[15000001:20000000]
df_merged_cleaned5 = df_merged_cleaned.loc[20000001:25000000]
df_merged_cleaned6 = df_merged_cleaned.loc[25000001:30000000]
df_merged_cleaned7 = df_merged_cleaned.loc[30000001:35000000]
df_merged_cleaned8 = df_merged_cleaned.loc[35000001:40000000]
df_merged_cleaned9 = df_merged_cleaned.loc[40000001:45000000]
df_merged_cleaned10 = df_merged_cleaned.loc[45000001:50000000]
df_merged_cleaned11 = df_merged_cleaned.loc[50000001:55000000]
df_merged_cleaned12 = df_merged_cleaned.loc[55000001:60000000]
df_merged_cleaned13 = df_merged_cleaned.loc[60000001:65000000]

In [24]:
# Save the chunks of dataset into .csv files

df_merged_cleaned1.to_csv('df_merged_cleaned1.csv', index=False)
df_merged_cleaned2.to_csv('df_merged_cleaned2.csv', index=False)
df_merged_cleaned3.to_csv('df_merged_cleaned3.csv', index=False)
df_merged_cleaned4.to_csv('df_merged_cleaned4.csv', index=False)
df_merged_cleaned5.to_csv('df_merged_cleaned5.csv', index=False)
df_merged_cleaned6.to_csv('df_merged_cleaned6.csv', index=False)
df_merged_cleaned7.to_csv('df_merged_cleaned7.csv', index=False)
df_merged_cleaned8.to_csv('df_merged_cleaned8.csv', index=False)
df_merged_cleaned9.to_csv('df_merged_cleaned9.csv', index=False)
df_merged_cleaned10.to_csv('df_merged_cleaned10.csv', index=False)
df_merged_cleaned11.to_csv('df_merged_cleaned11.csv', index=False)
df_merged_cleaned12.to_csv('df_merged_cleaned12.csv', index=False)
df_merged_cleaned13.to_csv('df_merged_cleaned13.csv', index=False)

## End of Script