In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

import logging
import json
import re


# Emit INFO-level log messages; `logger` is used below to report row counts.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Maximum number of rows passed to display() when previewing a DataFrame.
DISPLAY_LIMIT = 20

Link to the repo: https://github.com/taidopurason/bdm-project-1

### Structure of this notebook
1. Read data from zipped json files into one dataframe.
2. Apply necessary (cleaning) transformations to the dataframe.
3. Create the new DataFrames corresponding to our Warehouse Schema.  
4. Save the DataFrames as Delta tables.
5. Demonstrate adding new entries to the warehouse.
6. Demonstrate queries on the data.

### 1. Extract the data
To check how we downloaded the data from the source, see https://github.com/taidopurason/bdm-project-1/blob/main/Loading%20Data.ipynb. We split the downloaded data into files where each file contains 250,000 json objects.

In [0]:
# Choose which input to load by flipping a flag instead of (un)commenting code.
# False (default): read a single split for a faster setup.
# True: read ALL splits into one DataFrame.
READ_ALL_SPLITS = False

DBLP_ALL_SPLITS_PATH = 'dbfs:/user/dblpv13/dblpv13.*.json.gz'
DBLP_SINGLE_SPLIT_PATH = 'dbfs:/user/dblpv13/dblpv13.0.json.gz'

_df = (
    spark.read
    .option("multiline", True)
    .json(DBLP_ALL_SPLITS_PATH if READ_ALL_SPLITS else DBLP_SINGLE_SPLIT_PATH)
)

In [0]:
# Drop the abstract column right away: its long texts make the rendered notebook hard to read on GitHub.
_df = _df.drop('abstract')

_df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- fos: array (nullable = 

### 2. Transform the data
TODO: delete all entries where any reference is null.

TODO: delete all entries where any author id is null.

We don't want any null values in the FK columns.

In [0]:
# Count once, then report the number of rows before any cleaning is applied.
initial_row_count = _df.count()
logger.info(f"Initially, there were {initial_row_count} rows of data")

INFO:__main__:Initially, there were 250000 rows of data


In [0]:
# Keep only rows that have at least one author, a title of more than one word,
# a non-empty, non-null _id, and no empty-string entry among the references.
has_authors = F.size(F.col('authors')) > 0  # By default F.size() returns -1 if the value is null.
has_multiword_title = F.size(F.split(F.col('title'), ' ')) > 1
has_id = (F.col('_id') != '') & F.col('_id').isNotNull()
# NOTE: array_contains evaluates to null when `references` itself is null,
# so rows without a references array are filtered out by this predicate as well.
no_blank_reference = ~F.array_contains(F.col('references'), '')

_df = _df.filter(has_authors & has_multiword_title & has_id & no_blank_reference)

In [0]:
# Remove forewords: drop every row whose title contains "foreword", ignoring case.
title_lowercase = F.lower(F.col("title"))
_df = _df.filter(~title_lowercase.contains("foreword"))

In [0]:
# Store the citation count as an integer column.
_df = _df.withColumn('n_citation', F.col('n_citation').cast(T.IntegerType()))

In [0]:
# Keep `lang` only where it is not the empty string; empty values become null.
# (F.when without otherwise() yields null for every row that does not match.)
lang_col = F.col('lang')
_df = _df.withColumn('lang', F.when(lang_col != '', lang_col))

In [0]:
# Turn empty 'keywords' and 'fos' arrays into nulls; non-empty arrays are kept as they are.
for array_column in ('keywords', 'fos'):
    is_empty_array = F.size(F.col(array_column)) == 0
    _df = _df.withColumn(array_column, F.when(is_empty_array, None).otherwise(F.col(array_column)))

In [0]:
# Convert the page columns to int and replace unusable values with null.
# Casting to int already maps non-numeric values to null (Spark's default, non-ANSI cast
# semantics, which the previous explicit isNotNull() pre-check relied on too), so one pass
# per column is enough: keep the integer value unless it is 0; everything else
# (0, null, non-numeric) falls through to null because no otherwise() is given.
for page_column in ('page_start', 'page_end'):
    page_as_int = F.col(page_column).cast('int')
    _df = _df.withColumn(page_column, F.when(page_as_int != 0, page_as_int))

In [0]:
# Keep `doi` only where it is not the empty string; empty values become null.
# (F.when without otherwise() yields null for every row that does not match.)
doi_col = F.col('doi')
_df = _df.withColumn('doi', F.when(doi_col != '', doi_col))

In [0]:
# Treat a year of 0 as missing (null) and store the column as an integer.
year_col = F.col('year')
year_or_null = F.when(year_col == 0, None).otherwise(year_col)
_df = _df.withColumn('year', year_or_null.cast('int'))

In [0]:
# Convert the volume and issue columns to int and replace unusable values with null.
# Casting to int already maps non-numeric values to null (Spark's default, non-ANSI cast
# semantics, which the previous explicit isNotNull() pre-check relied on too), so one pass
# per column is enough: keep the integer value unless it is 0; everything else
# (0, null, non-numeric) falls through to null because no otherwise() is given.
for number_column in ('volume', 'issue'):
    number_as_int = F.col(number_column).cast('int')
    _df = _df.withColumn(number_column, F.when(number_as_int != 0, number_as_int))

In [0]:
def replace_empty_string(col):
    """Return a Column expression that is null where `col` equals "" and `col` itself otherwise."""
    return F.when(col == "", None).otherwise(col)


# String fields of the venue struct in which an empty string is replaced by null.
venue_string_fields = ["_id", "issn", "name", "name_d", "name_s", "online_issn", "publisher", "raw", "raw_zh", "t"]

# Rebuild the venue struct field by field with the cleaned values.
venue = F.col("venue")
for field_name in venue_string_fields:
    venue = venue.withField(field_name, replace_empty_string(F.col(f"venue.{field_name}")))

_df = _df.withColumn("venue", venue)

# In the top-level issn / isbn columns, an empty string and the literal placeholder
# "issn" / "isbn" (the column's own name stored as a value) are both replaced by null.
for id_column in ("issn", "isbn"):
    without_empty = replace_empty_string(F.col(id_column))
    # F.lit: compare against the placeholder string, not against the column.
    is_placeholder = without_empty == F.lit(id_column)
    _df = _df.withColumn(id_column, F.when(is_placeholder, None).otherwise(without_empty))

In [0]:
# Normalise the top-level issn column:
#   * 9 characters               -> kept unchanged
#   * 8 characters               -> a "-" is inserted after the 4th character
#   * value contains "E-ISBN"    -> only the first 9 characters are kept
#   * anything else (incl. null) -> null
issn_col = F.col("issn")
normalized_issn = (
    F.when(F.length(issn_col) == 9, issn_col)
    .when(F.length(issn_col) == 8, F.concat(issn_col.substr(1, 4), F.lit("-"), issn_col.substr(5, 4)))
    .when(issn_col.contains("E-ISBN"), issn_col.substr(1, 9))
    .otherwise(None)
)

# venue.issn takes precedence; the normalised top-level issn only fills the gaps.
venue_with_issn = F.col("venue").withField("issn", F.coalesce(F.col("venue.issn"), F.col("issn")))

_df = (
    _df
    .withColumn("issn", normalized_issn)
    .withColumn("venue", venue_with_issn)
    .drop("issn")  # the value now lives in venue.issn
)

In [0]:
# A venue counts as empty when all of the fields listed below are null
# (venue._id and venue.t are not part of this check). Empty venues are replaced by null.
venue_fields_to_check = ("issn", "name", "name_d", "name_s", "online_issn", "publisher", "raw", "raw_zh")

field_is_null = [F.col(f"venue.{field_name}").isNull() for field_name in venue_fields_to_check]

# AND all per-field null checks together.
venue_is_empty = field_is_null[0]
for null_check in field_is_null[1:]:
    venue_is_empty = venue_is_empty & null_check

_df = _df.withColumn("venue", F.when(venue_is_empty, None).otherwise(F.col("venue")))

In [0]:
# Keep only the rows that still have a venue.
venue_present = F.col("venue").isNotNull()
_df = _df.where(venue_present)

In [0]:
# Make up for missing venue ids: fall back to venue.issn when venue._id is null.
venue_id_or_issn = F.coalesce(F.col("venue._id"), F.col("venue.issn"))
_df = _df.withColumn("venue", F.col("venue").withField("_id", venue_id_or_issn))

# Drop the rows whose venue id is still null after the fallback.
_df = _df.where(F.col("venue._id").isNotNull())

In [0]:
# Count once, then report how many rows are left at this point and show the current schema.
remaining_row_count = _df.count()
logger.info(f"Now, there are {remaining_row_count} rows of data")

_df.printSchema()

INFO:__main__:Now, there are 163401 rows of data
root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: stri

In [0]:
# Display the end result: preview at most DISPLAY_LIMIT rows of _df.
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium 
on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)","List(53a72e9920f7420be8c93fac, null, null, Computer Software and Applications Conference, null, null, null, COMPSAC, null, null, null, null, 0)",,2002
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)","List(572de199d39c4f49934b3d5c, 1673-7350, null, null, null, null, null, Frontiers of Computer Science in China, null, null, null, null, 0)",5.0,2011
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)","List(53a72bad20f7420be8c2d5af, null, null, IEEE International Conference on Cognitive Informatics, null, null, null, IEEE ICCI, null, null, null, null, 0)",,2010
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression 
analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)","List(53a72cfa20f7420be8c554b2, null, null, null, null, null, null, FSKD (5), null, null, null, null, 0)",,2008
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. 
Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)","List(54825226582fc50b5e05610e, 0164-1212, null, null, null, null, null, Journal of Systems and Software, null, null, null, null, 0)",79.0,2006
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)","List(53a7271520f7420be8b8b5ba, 0302-9743, null, null, null, null, null, EWCBR, null, null, null, null, 0)",1488.0,1998
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)","List(53a72cf620f7420be8c548e2, null, null, null, null, null, null, MobiSys, null, null, null, null, 0)",,2014
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)","List(53a72ac520f7420be8c0cd21, null, null, null, null, null, null, IWDM, null, null, null, null, 0)",,1987


### 3. Create the new DFs
#### 3.1. Venue DF

In [0]:
# Flag that is True when the paper has a volume or an issue number, and null otherwise
# (F.when without otherwise() yields null for rows that do not match).
has_volume_or_issue = F.when(F.col("volume").isNotNull() | F.col("issue").isNotNull(), True)

# Create the venues DataFrame: the distinct combinations of the venue struct's fields
# (minus src, sid and type) together with the flag above, for venues that have an id.
venues_df = (
    _df
    .withColumn("has_volume_or_issue", has_volume_or_issue)
    .select("venue.*", "has_volume_or_issue")
    .where(F.col("_id").isNotNull())
    .drop("src", "sid", "type")
    .distinct()
)

# On the original DataFrame keep only the venue id and remove the venue struct.
_df = _df.withColumn("venue_id", F.col("venue._id")).drop("venue")

display(venues_df.limit(DISPLAY_LIMIT))

_id,issn,name,name_d,name_s,online_issn,publisher,raw,raw_zh,t,has_volume_or_issue
53a72bad20f7420be8c2d5af,,,IEEE International Conference on Cognitive Informatics,,,,IEEE ICCI,,,
555036db7cea80f9541603d7,,,,,,,J. Multivariate Analysis,,,True
555036ba7cea80f95414db52,,,Advances in Operations Research,,,,Adv. Operations Research,,,True
53a72e2020f7420be8c80142,,,International Symposium on Circuits and Systems,,,,ISCAS (3),,,
53a724b320f7420be8b37f4c,,,Hawaii International Conference on System Sciences,,,,HICSS,,,
53a72cf620f7420be8c548e2,,,,,,,MobiSys,,,
555036c77cea80f954155203,,,IEEE Transactions on Circuits and Systems,,,,IEEE Trans. on Circuits and Systems,,,True
0377-2217,0377-2217,,,,,North-Holland,European Journal of Operational Research,,J,True
555036b77cea80f95414b7de,1869-1919,,,,,,SCIENCE CHINA Information Sciences,,,True
53a72a4920f7420be8bfa51b,,,International Conference on Document Analysis and Recognition,,,,ICDAR-1,,,


In [0]:
# Rows can share the same venue id while differing in their other column values.
# Collapse them into one row per id, taking the first non-null value of each column.
# NOTE: F.first() depends on row order, which is not guaranteed after a shuffle; when an id
# has several different non-null values for a column, the value kept may vary between runs.

venue_columns = (
    "issn",
    "name",
    "name_d",
    "name_s",
    "raw",
    "raw_zh",
    "online_issn",
    "publisher",
    "t",
    "has_volume_or_issue"
)

first_non_null_per_column = [
    F.first(F.col(column_name), ignorenulls=True).alias(column_name)
    for column_name in venue_columns
]

venues_df = venues_df.groupBy(F.col("_id")).agg(*first_non_null_per_column)

In [0]:
# Lower-cased text columns used for keyword matching. They are resolved lazily, i.e. inside
# the "type" expression below they refer to `raw` / `name` AFTER the coalescing steps.
raw_lower = F.lower(F.col("raw"))
name_lower = F.lower(F.col("name"))


def _venue_mentions(term):
    """Return a boolean Column that is true where lower-cased `raw` or `name` contains `term`."""
    return raw_lower.contains(term) | name_lower.contains(term)


venues_df = (
    venues_df
    # coalescing the name and raw columns
    .withColumn("raw", F.coalesce(F.col("raw"), F.col("raw_zh")))
    .withColumn("name", F.coalesce(F.col("name"), F.col("name_d")))
    .drop("name_d", "name_s", "raw_zh")
    # creating the type field; the rules are evaluated in order and the first match wins,
    # venues matching none of them get a null type
    .withColumn(
        "type",
        # 1. "@" in raw, or "workshop" in raw/name
        F.when(F.col("raw").contains("@") | _venue_mentions("workshop"), "Workshop")
        # 2. explicit journal marker
        .when(F.col("t") == "J", "Journal")
        # 3. explicit conference marker or conference-like keywords
        .when(
            (F.col("t") == "C")
            | _venue_mentions("conference")
            | _venue_mentions("symposium")
            | _venue_mentions("proceedings"),
            "Conference",
        )
        # 4. "journal" keyword, or any paper of the venue has a volume/issue number
        .when(_venue_mentions("journal") | F.col("has_volume_or_issue"), "Journal")
        .otherwise(None)
    )
    .drop("t", "has_volume_or_issue")
)

display(venues_df.limit(DISPLAY_LIMIT))

_id,issn,name,raw,online_issn,publisher,type
0001-0782,0001-0782,,COMMUNICATIONS OF THE ACM,1557-7317,,Journal
0001-253X,0001-253X,,ASLIB PROCEEDINGS,1758-3748,,Journal
0001-2815,0001-2815,,TISSUE ANTIGENS,,,Journal
0001-4966,0001-4966,,JOURNAL OF THE ACOUSTICAL SOCIETY OF AMERICA,1520-8524,,Journal
0001-5903,0001-5903,,Acta Informatica,,,Journal
0001-8708,0001-8708,,Advances in Mathematics,,Academic Press,Journal
0002-8231,0002-8231,,JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE,,,Journal
0002-9149,0002-9149,,AMERICAN JOURNAL OF CARDIOLOGY,,,Journal
0002-9343,0002-9343,,AMERICAN JOURNAL OF MEDICINE,,,Journal
0002-9378,0002-9378,,AMERICAN JOURNAL OF OBSTETRICS AND GYNECOLOGY,,,Journal


In [0]:
# Preview at most DISPLAY_LIMIT rows of _df.
display(_df.limit(DISPLAY_LIMIT))

_id,abstract,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,venue_id
53e99784b7602d9701f3e151,,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,53a72a4920f7420be8bfa51b
53e99784b7602d9701f3e15d,"As process variations become a significant problem in deep sub-micron technology, a shift from deterministic static timing analysis to statistical static timing analysis for high-performance circuit designs could reduce the excessive conservatism that is built into current timing design methods. We address the timing yield problem for sequential circuits and propose a statistical approach to handle it. We consider the spatial and path reconvergence correlations between path delays, set-up time and hold time constraints, and clock skew due to process variations. We propose a method to get the timing yield based on the delay distributions of register-to-register paths in the circuit On average, the timing yield results obtained by our approach have average errors of less than 1.0% in comparison with Monte Carlo simulation. Experimental results show that shortest path variations and clock skew due to process variations have considerable impact on circuit timing, which could bias the timing yield results. In addition, the correlation between longest and shortest path delays is not significant.","List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. 
Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,53a72e2020f7420be8c80142
53e99784b7602d9701f3f411,"The eXtensible Markup Language 驴 XML 驴 is not only a language for communication between humans and the web, it is also a language for communication between programs. Rather than passing parameters, programs can pass documents from one to another, containing not only pure data, but control information as well. Even legacy programs written in ancient languages such as COBOL and PL/I can be adapted by means ofinterface reengineering to process and to generate XML documents.","List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,53a72e9920f7420be8c93fac
53e99784b7602d9701f3f5fe,"Resource allocation for multi-tier web applications in virtualization environments is one of the most important problems in autonomous computing. On one hand, the more resources that are provisioned to a multitier web application, the easier it is to meet service level objectives (SLO). On the other hand, the virtual machine which hosts the multi-tier web application needs to be consolidated as much as possible in order to maintain high resource utilization. This paper presents an adaptive resource controller which consists of a feedback utilization controller and an auto-regressive and moving average model (ARMA)-based model estimator. It can meet application-level quality of service (QoS) goals while achieving high resource utilization. To evaluate the proposed controllers, simulations are performed on a testbed simulating a virtual data center using Xen virtual machines. Experimental results indicate that the controllers can improve CPU utilization and make the best tradeoff between resource utilization and performance for multi-tier web applications.","List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization 
environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,572de199d39c4f49934b3d5c
53e99792b7602d9701f5af1a,,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,53a72bad20f7420be8c2d5af
53e99792b7602d9701f5af27,"A promising traffic flow forecasting model based on Multivariate Adaptive Regression Splines (MARS) is developed in this paper. First, the historical traffic flow data is obtained from the loop detectors installed on the road network of Beijing. Then, part of the data is selected for training the MARS model while the rest is used to test the method. The results based on MARS method are compared with those of other methods such as the Neural Networks. The proposed MARS method is proved to have a considerable accuracy. Moreover, the model constructed with MARS can be described with analytical functions, which helps a lot in the further research on traffic flow forecasting.","List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline 
(mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,53a72cfa20f7420be8c554b2
53e99792b7602d9701f5af35,"This paper describes an approach to the feature location problem for distributed systems, that is, to the problem of locating which code components are important in providing a particular feature for an end user. A feature is located by observing system execution and noting time intervals in which it is active. Traces of execution in intervals with and without the feature are compared. Earlier experience has shown that this analysis is difficult because distributed systems often exhibit stochastic behavior and because time intervals are hard to identify with precision. To get around these difficulties, the paper proposes a definition of time interval based on the causality analysis introduced by Lamport and others. A strict causal interval may be defined, but it must often be extended to capture latent events and to represent the inherent imprecision in time measurement. This extension is modeled using a weighting function which may be customized to the specific circumstances of each study. The end result of the analysis is a component relevance index, denoted p""c, which can be used to measure the relevance of a software component to a particular feature. Software engineers may focus their analysis efforts on the top components as ranked according to p""c. Two case studies are presented. The first study demonstrates the feasibility of p""c by applying our method to a well-defined distributed system. The second study demonstrates the versatility of p""c by applying our method to message logs obtained from a large military system. 
Both studies indicate that the suggested approach could be an effective guide for a software engineer who is maintaining or enhancing a distributed system.","List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. 
Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,54825226582fc50b5e05610e
53e99792b7602d9701f5b06f,"This paper describes an approach to representing cases as nested graph-structures, i.e., as hierarchically, spatially, temporally and causally interconnected nodes (case nodes), which may be themselves recursively described by other sets of interconnected nodes. Each case node represents a case piece (sub-case). An adjacency matrix may represent these nested graph-structured cases. Within our approach, new cases are constructed using an iterative context-guided retrieval of case nodes from multiple cases. In order to illustrate the expressiveness of this case representation approach, we discuss its application to the diagnosis and therapeutics of neurological diseases, to architectural design and to storytelling. Some issues that come out of this approach, like its contribution to the representation of cases of CBR and to integrate ordinary and creative reasoning, are discussed.","List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured 
Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,53a7271520f7420be8b8b5ba
53e99792b7602d9701f5b074,"Yalut is a novel user-centric hybrid content sharing overlay for social networking. Yalut enables the users to retain control over their own data and preserve their privacy, whilst still using the popular centralized services. In this demonstration, we show the feasibility of Yalut by integrating the service with the popular social networking apps on Android devices, Mac and Windows desktop platforms. We show that it is possible to provide the benefits of distributed content sharing on top of the existing centralized services with minimal changes to the content sharing process.","List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, 
http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,53a72cf620f7420be8c548e2
53e99792b7602d9701f5b085,"The traffic characteristics of various distributed join algorithms on the Hypercube are analyzed. It is shown that, regardless of which join strategy is employed, the network bandwidth requirements of the computation and collection phases are radically different. This imbalance prevents these two phases from being pipelined (overlapped). To alleviate this problem, the HyperKYKLOS Network is proposed. The topology of this network is defined and a brief description of the I/O nodes presently under construction is included.","List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. 
Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,53a72ac520f7420be8c0cd21


#### 3.2. Author DF

In [0]:
# Create the Authors DF: one row per author, with columns `ID` and `Name`.
#
# `ID` is the key that the papers' author foreign-key column refers to, so it has to
# be unique and non-null. Taking distinct (id, name) pairs is not enough for that:
# an id that occurs with more than one name spelling would yield several rows, and
# entries without an id would end up in the table too. We therefore drop entries
# that have no id and keep exactly one name per id.
#
# Notes from an earlier run on distinct (id, name) pairs: only 92 rows had both the
# author id and name null; altogether there were about 400k unique authors.

authors_df = (
    _df.select(F.explode(F.col('authors')).alias('auth_expl'))  # one row per (paper, author)
       .select(F.col('auth_expl._id').alias('ID'),              # separate the author's id and name
               F.col('auth_expl.name').alias('Name'))
       .filter(F.col('ID').isNotNull())                         # a key cannot be null
       .groupBy('ID')
       .agg(F.min('Name').alias('Name'))                        # deterministic choice; min() skips null names
)

display(authors_df.limit(DISPLAY_LIMIT))

ID,Name
53f46a22dabfaee0d9c3d5e5,Shuguo Yang
53f45ee9dabfaee43ecda842,Chris C. N. Chu
548a2e3ddabfae9b40134fbc,Harry M. Sneed
53f45e2adabfaeb22f51d645,Luís Macedo
53f42e8cdabfaee1c0a4274e,Hai Zhou
53f43b03dabfaedce555bf2a,Min Pan
54328883dabfaeb4c6a8a699,Theo Pavlidis
53f4775edabfaee4dc891b69,Kenji Sugawara
53f4333fdabfaeb22f451979,Norman Wilde
53f46e66dabfaee02adb48fd,Shengqi Ye


In [0]:
# Add the authors foreign key to the papers DF: pulling `_id` out of the `authors`
# array of structs gives, for every paper, an array holding the ids of its authors.
_df = _df.withColumn('Author_ID', F.expr('authors._id'))
display(_df.limit(DISPLAY_LIMIT))

_id,abstract,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,venue_id,Author_ID
53e99784b7602d9701f3e151,,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,53a72a4920f7420be8bfa51b,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)"
53e99784b7602d9701f3e15d,"As process variations become a significant problem in deep sub-micron technology, a shift from deterministic static timing analysis to statistical static timing analysis for high-performance circuit designs could reduce the excessive conservatism that is built into current timing design methods. We address the timing yield problem for sequential circuits and propose a statistical approach to handle it. We consider the spatial and path reconvergence correlations between path delays, set-up time and hold time constraints, and clock skew due to process variations. We propose a method to get the timing yield based on the delay distributions of register-to-register paths in the circuit On average, the timing yield results obtained by our approach have average errors of less than 1.0% in comparison with Monte Carlo simulation. Experimental results show that shortest path variations and clock skew due to process variations have considerable impact on circuit timing, which could bias the timing yield results. In addition, the correlation between longest and shortest path delays is not significant.","List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. 
Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,53a72e2020f7420be8c80142,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)"
53e99784b7602d9701f3f411,"The eXtensible Markup Language 驴 XML 驴 is not only a language for communication between humans and the web, it is also a language for communication between programs. Rather than passing parameters, programs can pass documents from one to another, containing not only pure data, but control information as well. Even legacy programs written in ancient languages such as COBOL and PL/I can be adapted by means ofinterface reengineering to process and to generate XML documents.","List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,53a72e9920f7420be8c93fac,List(548a2e3ddabfae9b40134fbc)
53e99784b7602d9701f3f5fe,"Resource allocation for multi-tier web applications in virtualization environments is one of the most important problems in autonomous computing. On one hand, the more resources that are provisioned to a multitier web application, the easier it is to meet service level objectives (SLO). On the other hand, the virtual machine which hosts the multi-tier web application needs to be consolidated as much as possible in order to maintain high resource utilization. This paper presents an adaptive resource controller which consists of a feedback utilization controller and an auto-regressive and moving average model (ARMA)-based model estimator. It can meet application-level quality of service (QoS) goals while achieving high resource utilization. To evaluate the proposed controllers, simulations are performed on a testbed simulating a virtual data center using Xen virtual machines. Experimental results indicate that the controllers can improve CPU utilization and make the best tradeoff between resource utilization and performance for multi-tier web applications.","List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization 
environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,572de199d39c4f49934b3d5c,List(53f46a22dabfaee0d9c3d5e5)
53e99792b7602d9701f5af1a,,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,53a72bad20f7420be8c2d5af,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)"
53e99792b7602d9701f5af27,"A promising traffic flow forecasting model based on Multivariate Adaptive Regression Splines (MARS) is developed in this paper. First, the historical traffic flow data is obtained from the loop detectors installed on the road network of Beijing. Then, part of the data is selected for training the MARS model while the rest is used to test the method. The results based on MARS method are compared with those of other methods such as the Neural Networks. The proposed MARS method is proved to have a considerable accuracy. Moreover, the model constructed with MARS can be described with analytical functions, which helps a lot in the further research on traffic flow forecasting.","List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline 
(mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,53a72cfa20f7420be8c554b2,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)"
53e99792b7602d9701f5af35,"This paper describes an approach to the feature location problem for distributed systems, that is, to the problem of locating which code components are important in providing a particular feature for an end user. A feature is located by observing system execution and noting time intervals in which it is active. Traces of execution in intervals with and without the feature are compared. Earlier experience has shown that this analysis is difficult because distributed systems often exhibit stochastic behavior and because time intervals are hard to identify with precision. To get around these difficulties, the paper proposes a definition of time interval based on the causality analysis introduced by Lamport and others. A strict causal interval may be defined, but it must often be extended to capture latent events and to represent the inherent imprecision in time measurement. This extension is modeled using a weighting function which may be customized to the specific circumstances of each study. The end result of the analysis is a component relevance index, denoted p""c, which can be used to measure the relevance of a software component to a particular feature. Software engineers may focus their analysis efforts on the top components as ranked according to p""c. Two case studies are presented. The first study demonstrates the feasibility of p""c by applying our method to a well-defined distributed system. The second study demonstrates the versatility of p""c by applying our method to message logs obtained from a large military system. 
Both studies indicate that the suggested approach could be an effective guide for a software engineer who is maintaining or enhancing a distributed system.","List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. 
Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,54825226582fc50b5e05610e,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)"
53e99792b7602d9701f5b06f,"This paper describes an approach to representing cases as nested graph-structures, i.e., as hierarchically, spatially, temporally and causally interconnected nodes (case nodes), which may be themselves recursively described by other sets of interconnected nodes. Each case node represents a case piece (sub-case). An adjacency matrix may represent these nested graph-structured cases. Within our approach, new cases are constructed using an iterative context-guided retrieval of case nodes from multiple cases. In order to illustrate the expressiveness of this case representation approach, we discuss its application to the diagnosis and therapeutics of neurological diseases, to architectural design and to storytelling. Some issues that come out of this approach, like its contribution to the representation of cases of CBR and to integrate ordinary and creative reasoning, are discussed.","List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured 
Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,53a7271520f7420be8b8b5ba,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)"
53e99792b7602d9701f5b074,"Yalut is a novel user-centric hybrid content sharing overlay for social networking. Yalut enables the users to retain control over their own data and preserve their privacy, whilst still using the popular centralized services. In this demonstration, we show the feasibility of Yalut by integrating the service with the popular social networking apps on Android devices, Mac and Windows desktop platforms. We show that it is possible to provide the benefits of distributed content sharing on top of the existing centralized services with minimal changes to the content sharing process.","List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, 
http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,53a72cf620f7420be8c548e2,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)"
53e99792b7602d9701f5b085,"The traffic characteristics of various distributed join algorithms on the Hypercube are analyzed. It is shown that, regardless of which join strategy is employed, the network bandwidth requirements of the computation and collection phases are radically different. This imbalance prevents these two phases from being pipelined (overlapped). To alleviate this problem, the HyperKYKLOS Network is proposed. The topology of this network is defined and a brief description of the I/O nodes presently under construction is included.","List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. 
Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,53a72ac520f7420be8c0cd21,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)"


#### 3.3. Organization DF

TODO: Make sure that \_df contains an 'Org' column holding the organization id.

In [0]:
# Create the new df

# finds the country names in a list of strings
# modified to only use the first element of the list
# uses regex to remove punctuation from the string and to match the given names of the countries and some abbreviations

def getCountry(s):
    if s is None:
        return None
    arr = []
    countries = ["Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua & Deps", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon", "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Congo Democratic Republic", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "South Korea", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Burma", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "Norway", "Romania", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Oman", "Russia", "Rwanda", "St Kitts & Nevis", "St Lucia", "Saint Vincent & The Grenadines", "Samoa", "San Marino", "Sao Tome & Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", 
"Spain", "Sri Lanka", "Sudan", "Suriname", "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Togo", "Tonga", "Trinidad & Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"]
    state_names = ["alaska", "alabama", "arkansas", "american samoa", "arizona", "california", "colorado", "connecticut", "district ", "of columbia", "delaware", "florida", "georgia", "guam", "hawaii", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada", "new york", "ohio", "oklahoma", "oregon", "pennsylvania", "puerto rico", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "virginia", "virgin islands", "vermont", "washington", "wisconsin", "west virginia", "wyoming"]
    states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
    
    for i in s:
        if i["org"] is None:
            arr.append(None)
            break
        sent = re.sub("[^a-zA-Z -]", "", i["org"])
        x = None
        for j in countries:
            x = re.search(j.lower(), sent.lower())
            if x is not None:
                if j.lower() == 'india':
                    x = re.search('indiana', sent.lower())
                    if x is not None:
                        arr.append("United States")
                elif j.lower() == 'georgia':
                    x = re.search('USA', sent)
                    if x is not None:
                        arr.append("United States")
                else:
                    arr.append(j)
                break
        if x is None:
            x = re.search("USA", sent)
            if x is not None:
                arr.append("United States")
                break
        if x is None:
            x = re.search("UK", sent)
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("england", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("scotland", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("wales", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            for j in states:
                x = re.search(j, sent)
                if x is not None:
                    arr.append("United States")
                    break
        if x is None:
            for j in state_names:
                x = re.search(j, sent.lower())
                if x is not None:
                    arr.append("United States")
                    break
        break
                    
    if len(arr) > 0:
        return arr[0]
    else:
        return None

getCountryUDF = udf(getCountry)

In [0]:
# Organization (affiliation of the first author)
# ID - authors.orgid
# Name - authors.org
# Country - getCountryUDF(F.arrays_zip("authors.org"))
def organization(df):
    new_df = df.select(F.col("authors.orgid").getItem(0).alias("ID"),
                       F.col("authors.org").getItem(0).alias("Name"),
                       (getCountryUDF(F.arrays_zip("authors.org"))).alias("Country"))
    new_df = new_df.na.drop("all")
    return new_df

# Derive the organization rows (ID, Name, Country) from the papers currently held in _df.
org_df = organization(_df)

In [0]:
# create the new df with new IDs
new_df = org_df.select('ID', 'Name', 'Country').distinct()
new_df = new_df.withColumn('Org', F.monotonically_increasing_id())
new_df = new_df.withColumn('Org', new_df.Org.cast(T.StringType()))
new_df = new_df.fillna('missing_org_id', 'ID')

In [0]:
# add separate org id and name columns to the original df
df2 = _df.withColumn("ID", F.col("authors.orgid").getItem(0)).withColumn("Name", F.col("authors.org").getItem(0))
df2 = df2.fillna('missing_org_id', 'ID')

In [0]:
# Join ID to original dataframe (_df)

# join newly created IDs to the original df and replace missing org ids with generated IDs
_df = df2.join(new_df.select('Org', 'ID', 'Name'), on=['ID', 'Name'])
_df = _df.replace('missing_org_id', None)
_df = _df.withColumn('Org', F.coalesce(_df.ID, _df.Org))
_df = _df.drop('ID', 'Name')

display(_df.limit(DISPLAY_LIMIT))

_id,abstract,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,venue_id,Author_ID,Org
53e997d7b7602d9701fd0004,"An ad hoc team setting is one in which teammates must work together to obtain a common goal, but without any prior agreement regarding how to work together. In this work we introduce a role-based approach for ad hoc teamwork, in which each teammate is inferred to be following a specialized role that accomplishes a specific task or exhibits a particular behavior. In such cases, the role an ad hoc agent should select depends both on its own capabilities and on the roles currently selected by other team members. We present methods for evaluating the influence of the ad hoc agent's role selection on the team's utility and we examine empirically how to choose the best suited method for role assignment in a complex environment. Finally, we show that an appropriate assignment method can be determined from a limited amount of data and used successfully in new tasks that the team has not encountered before.","List(List(53f43819dabfaee02acd9ff6, null, null, 5b86b6bfe1cd8e14a34c9598, Katie Genter, null, null, null, null, The University of Texas at Austin, null, 5f71b2841c455f439fe3c6bf, null, null, null), List(53f434b5dabfaeecd694fcdf, null, null, 5b86b6bfe1cd8e14a34c9598, Noa Agmon, null, null, null, null, The University of Texas at Austin, null, 5f71b2841c455f439fe3c6bf, null, null, null), List(53f42d28dabfaeb2acfe712a, null, null, 5b86b6bfe1cd8e14a34c9598, Peter Stone, null, null, null, null, The University of Texas at Austin, null, 5f71b2841c455f439fe3c6bf, null, null, null))",,"List(Teamwork, Computer science, Knowledge management, Human–computer interaction)",0-9817381-3-3,,"List(role assignment, appropriate assignment method, specialized role, common goal, role selection, complex environment, present method, team member, team setting, limited amount)",en,1.0,1252.0,1251,https://static.aminer.cn/upload/pdf/program/53e997d7b7602d9701fd0004_0.pdf,"List(53e9a309b7602d9702c14b8b, 53e99858b7602d970208f24f, 53e9aa61b7602d97033d4b67, 
53e9a1e1b7602d9702ae0e7e, 53e9a6cab7602d9702fff64d)",Role selection in ad hoc teamwork,List(http://dl.acm.org/citation.cfm?id=2343948),,2012,555037187cea80f954172d7a,"List(53f43819dabfaee02acd9ff6, 53f434b5dabfaeecd694fcdf, 53f42d28dabfaeb2acfe712a)",5f71b2841c455f439fe3c6bf
53e997ddb7602d9701fd2cce,"A social e-book provides not only the original text but also other readers&' comments, and it enables social interactions inside the book. We posited that a social e-book could be a useful tool for collaborative learning, and it could provide new opportunities for classic humanities texts. The research objective is to find the tendencies of reader generated annotations during two social reading projects. For theoretical background, ""the significance of the text - social interaction model"" was used for the analysis conducted in this study, and we classified user generated annotations into three different types. As a result, participants had a tendency to make more annotations about their understanding and appreciation than regarding text interpretation. In addition, the result shows that the social e-book can promote fine-grained interactions. Regarding the comparison of the genres of the contents, the group of people who read the classic and humanities genre is more active than those who read the popular literature genre. 
For future study, more specific ways to improve interest and understanding will be examined for effective collaborative reading experiences through the social e-book.","List(List(53f466afdabfaeee22a54847, null, null, 5b868af8e1cd8e14a32924f3, Seyeon Lee, null, null, null, null, Graduate School of Culture Technology, KAIST, Yuseong-gu, Daejeon, South Korea, null, 5f71b2831c455f439fe3c61e, null, null, null), List(53f467fadabfaedd74e6ffbd, null, null, 5b868af8e1cd8e14a32924f3, Jea In Kim, null, null, null, null, Graduate School of Culture Technology, KAIST, Yuseong-gu, Daejeon, South Korea, null, 5f71b2831c455f439fe3c61e, null, null, null), List(53f43a87dabfaee02acf230a, null, null, 5b868af8e1cd8e14a32924f3, Chung-Kon Shi, null, null, null, null, Graduate School of Culture Technology, KAIST, Yuseong-gu, Daejeon, South Korea, null, 5f71b2831c455f439fe3c61e, null, null, null))",10.1007/978-3-642-39371-6_6,"List(Social group, Social relation, World Wide Web, Social media, Collaborative learning, Future study, Psychology, Social learning, Epistemology, Social competence)",,,"List(social reading project, reader experience, original text, collaborative learning, text interpretation, social interaction model, social e-book, effective collaborative reading experience, social interaction, future study, classic humanities text)",en,0.0,57.0,50,,"List(53e9b86db7602d9704439294, 53e9b130b7602d9703bb0b59, 53e9b39db7602d9703e80f29)",Assessing the possibility of a social e-book by analyzing reader experiences,List(http://dx.doi.org/10.1007/978-3-642-39371-6_6),,2013,53a7261520f7420be8b66689,"List(53f466afdabfaeee22a54847, 53f467fadabfaedd74e6ffbd, 53f43a87dabfaee02acf230a)",5f71b2831c455f439fe3c61e
53e997e4b7602d9701fdcbe4,"This is the fourth year in which a team from University College Dublin has participated in the Multi-Agent Programming Contest. This paper describes the system that was created to participate in the contest, along with observations of the team's experiences in the contest. The system itself was built using the AF-TeleoReactive and AF-AgentSpeak agent programming languages running on the Agent Factory platform. Unlike in previous years where a hybrid control architecture was used, this year the system was implemented using only agent code and associated actions, sensors, modules and platform services.","List(List(53f46944dabfaefedbb94054, null, null, 5b86b6c3e1cd8e14a34cb1f5, Dominic Carr, null, null, null, null, University College Dublin, Ireland, null, 5f71b5d91c455f439fe53c8a, null, null, null), List(53f438a2dabfaee2a1cfba45, null, null, 5b86b6c3e1cd8e14a34cb1f5, Sean Russell, null, null, null, null, University College Dublin, Ireland, null, 5f71b5d91c455f439fe53c8a, null, null, null), List(53f4d24adabfaeedcf780064, null, null, 5b86b6c3e1cd8e14a34cb1f5, Balazs Pete, null, null, null, null, University College Dublin, Ireland, null, 5f71b5d91c455f439fe53c8a, null, null, null), List(5430219cdabfaeca69bcbdab, null, null, 5b86b6c3e1cd8e14a34cb1f5, G. M. P. O'Hare, null, null, null, null, University College Dublin, Ireland, null, 5f71b5d91c455f439fe53c8a, null, null, null), List(5484ab76dabfaed7b5fa1b09, null, null, 5b86b6c3e1cd8e14a34cb1f5, Rem W. 
Collier, null, null, null, null, University College Dublin, Ireland, null, 5f71b5d91c455f439fe53c8a, null, null, null))",10.1007/978-3-642-31915-0_12,"List(Computer software, Architecture, Software engineering, Factory, Computer science, CONTEST, Multi-agent system, Artificial intelligence)",,,"List(af-agentspeak agent programming language, hybrid control architecture, multi-agent programming, platform service, agent code, university college dublin, previous year, agent factory platform)",en,2.0,207.0,197,,"List(53e9b715b7602d97042ab658, 53e9a1dbb7602d9702adac6a, 53e9b56cb7602d97040a4772, 53e99d88b7602d97026469fb, 53e9b0a5b7602d9703b1344c, 53e9b0c2b7602d9703b32bfc, 53e9b648b7602d97041a30bb, 53e99976b7602d97021b6824, 53e9a533b7602d9702e5845d, 53e99a04b7602d970224e686)",Bogtrotters in space,List(http://dx.doi.org/10.1007/978-3-642-31915-0_12),,2011,53a72ea220f7420be8c9573e,"List(53f46944dabfaefedbb94054, 53f438a2dabfaee2a1cfba45, 53f4d24adabfaeedcf780064, 5430219cdabfaeca69bcbdab, 5484ab76dabfaed7b5fa1b09)",5f71b5d91c455f439fe53c8a
53e997e8b7602d9701fdfeec,"Given a set B of n black points in general position, we say that a set of white points W blocks B if in the Delaunay triangulation of [Formula: see text] there is no edge connecting two black points. We give the following bounds for the size of the smallest set W blocking B: (i) [Formula: see text] white points are always sufficient to block a set of n black points, (ii) if B is in convex position, [Formula: see text] white points are always sufficient to block it, and (iii) at least [Formula: see text] white points are always necessary to block a set of n black points.","List(List(53f4475bdabfaeecd69b037a, null, oaich@ist.tugraz.at, 5b869b4de1cd8e14a393f427, Oswin Aichholzer, null, null, null, null, Institute for Software Technology, University of Technology, Graz, Austria, null, null, List(Institute for Software Technology, University of Technology, Graz, Austria, Partially supported by the Austrian Science Fund (FWF): NFN ‘Industrial Geometry’ S9205-N12 and the ESF EUROCORES programme EuroGIGA – ComPoSe, Austrian Science Fund (FWF): I 648-N18.), null, null), List(53f44082dabfaee43ec6cfb0, null, ruyfabila@math.cinvestav.edu.mx, 5b86b514e1cd8e14a3406b2e, Ruy Fabila-Monroy, null, null, null, 0000-0002-2517-0298, Departamento de Matemáticas, Cinvestav, México, Mexico, null, 5f71b3301c455f439fe411b6, List(Departamento de Matemáticas, Cinvestav, México, Mexico, Partially supported by Conacyt of Mexico, grant 153984.), null, null), List(53f3af18dabfae4b34b108b0, null, thackl@ist.tugraz.at, 5b869b4de1cd8e14a393f427, Thomas Hackl, null, null, null, null, Institute for Software Technology, University of Technology, Graz, Austria, null, null, List(Institute for Software Technology, University of Technology, Graz, Austria, Funded by the Austrian Science Fund (FWF): P23629-N18.), null, null), List(5447fcdcdabfae87b7dbc44b, null, marc@cs.uu.nl, 5b86b92de1cd8e14a35e2774, Marc van Kreveld, null, null, null, null, Department of Computer Science, Utrecht 
University, Utrecht, The Netherlands, null, 5f71b2841c455f439fe3c6c2, List(Department of Computer Science, Utrecht University, Utrecht, The Netherlands), null, null), List(53f4627adabfaefedbb798dc, null, apilz@ist.tugraz.at, 5b869b4de1cd8e14a393f427, Alexander Pilz, null, null, null, 0000-0002-6059-1821, Institute for Software Technology, University of Technology, Graz, Austria, null, null, List(Institute for Software Technology, University of Technology, Graz, Austria, Recipient of a DOC-fellowship of the Austrian Academy of Sciences at the Institute for Software Technology, Graz University of Technology, Austria.), null, null), List(5601a96d45cedb3395e88e81, null, pedro.ramos@uah.es, 5b8689dbe1cd8e14a321fc7e, Pedro Ramos, null, null, null, 0000-0001-6904-5803, Departamento de Matemáticas, Universidad de Alcalá, Madrid, Spain, null, 5f71b3101c455f439fe40329, List(Departamento de Matemáticas, Universidad de Alcalá, Madrid, Spain, Corresponding author., Partially supported by MEC grants MTM2009-07242, MTM2011-22792 and by the ESF EUROCORES programme EuroGIGA – ComPoSe, grant EUI-EURC-2011-4306. 
The work was done while the author was visiting the University of Technology, Graz, supported by MICINN Programa Nacional de Movilidad de Recursos Humanos, Plan Nacional de I+D+i 2008–2011.), null, null), List(53f42ef3dabfaee43ebd8d3b, null, bvogt@ist.tugraz.at, 5b869b4de1cd8e14a393f427, Birgit Vogtenhuber, null, null, null, null, Institute for Software Technology, University of Technology, Graz, Austria, null, null, List(Institute for Software Technology, University of Technology, Graz, Austria, Partially supported by the Austrian Science Fund (FWF): NFN ‘Industrial Geometry’ S9205-N12 and the ESF EUROCORES programme EuroGIGA – ComPoSe, Austrian Science Fund (FWF): I 648-N18.), null, null))",10.1016/j.comgeo.2012.02.005,"List(Graph drawing, General position, Combinatorics, Convex position, Mathematics, Delaunay triangulation, Delaunay graph)",,2.0,"List(convex position, delaunay triangulation, n black point, delaunay graph, set b, proximity graphs, graph drawing, blocking delaunay triangulations, witness graphs, n-1 white point, following bound, general position, black point, white point, smallest set)",en,16.0,159.0,154,https://static.aminer.cn/upload/pdf/program/53e997e8b7602d9701fdfeec_0.pdf,"List(53e99adcb7602d970235febb, 53e9ade9b7602d97037f3d3f, 53e9984bb7602d9702079d8f, 53e9a2ddb7602d9702be93df, 56d833a5dabfae2eee3216f7)",Blocking Delaunay triangulations.,"List(http://dx.doi.org/10.1016/j.comgeo.2012.02.005, http://www.ncbi.nlm.nih.gov/pubmed/23483043?report=xml&format=text, https://www.sciencedirect.com/science/article/pii/S0925772112000569, http://www.webofknowledge.com/)",46.0,2013,555036b57cea80f95414abdc,"List(53f4475bdabfaeecd69b037a, 53f44082dabfaee43ec6cfb0, 53f3af18dabfae4b34b108b0, 5447fcdcdabfae87b7dbc44b, 53f4627adabfaefedbb798dc, 5601a96d45cedb3395e88e81, 53f42ef3dabfaee43ebd8d3b)",17179881764
53e997e9b7602d9701fe40fc,"While there have been plenty of applications of case-based reasoning (CBR) to different design tasks, rarely has the methodology  been used for generating new works of art. If the goal is to produce completely novel artistic styles, then perhaps other  reasoning methods offer better opportunities for producing interesting artwork. However, if the goal is to produce new artwork  that fits a previously-existing style, then it seems to us that CBR is the ideal strategy to use. In this paper we present  some ideas for integrating CBR with other artificial intelligence techniques in order to generate new artwork that imitates  a particular artistic style. As an example we show how we have successfully implemented our ideas in a system that produces  new works of art in the style of the Dutch painter Piet Mondrian. Along the way we discuss the implications that a task of  this nature has for CBR and we describe and provide the results of some experiments we performed with the system.","List(List(53f437f0dabfaee0d9b7327a, null, agomez@itam.mx, 5b86c2c5e1cd8e14a3a340d2, Andrés Gómez De Silva Garza, null, null, null, null, Computer Engineering Department, Instituto Tecnológico Autónomo de México (ITAM), Río Hondo #1, Colonia Tizapán-San Ángel, null, 5f71b2bf1c455f439fe3df7b, null, null, null), List(53f43754dabfaee0d9b6d422, null, null, 5b86c2c5e1cd8e14a3a340d2, Aram Zamora Lores, null, null, null, null, Computer Engineering Department, Instituto Tecnológico Autónomo de México (ITAM), Río Hondo #1, Colonia Tizapán-San Ángel, null, 5f71b2bf1c455f439fe3df7b, null, null, null))",10.1007/11536406_20,"List(Evolutionary algorithm, Creative design, Computer science, Painting, Artificial intelligence, Case-based reasoning, Mondrian)",3-540-28174-6,,"List(previously-existing style, new work, piet mondrian, particular artistic style, novel artistic style, case-based reasoning, new artwork, case-based art, interesting artwork, reasoning method, dutch painter, 
artificial intelligent, case base reasoning)",en,0.0,251.0,237,//static.aminer.org/pdf/PDF/000/284/165/case_based_art.pdf,"List(53e9a4fab7602d9702e1b383, 56d8d01ddabfae2eee9cdb77, 53e99f8cb7602d970285f956, 53e9ba84b7602d97046a8022)",Case-Based Art,List(http://dx.doi.org/10.1007/11536406_20),,2005,53e18bf620f7dfbc07e8feb2,"List(53f437f0dabfaee0d9b7327a, 53f43754dabfaee0d9b6d422)",5f71b2bf1c455f439fe3df7b
53e997ecb7602d9701fe7d0d,Research highlights ► Action-Graph Games (AGGs) are a fully-expressive game representation. ► AGGs can compactly express utility functions with a wide variety of structure. ► We give a polynomial-time algorithm for computing expected utilities for AGGs. ► We leverage this to achieve exponential speedups of algorithms for Nash equilibria. ► Computational experiments show large improvements over previous techniques.,"List(List(53f4728bdabfaedd74e99819, null, jiang@cs.ubc.ca, 5b86b737e1cd8e14a34fea08, Albert Xin Jiang, null, null, null, null, university of british columbia, null, 5f71b28e1c455f439fe3cad2, null, null, null), List(53f49b5ddabfaebbd777bc95, null, kevinlb@cs.ubc.ca, 5b86c1e5e1cd8e14a39d110f, Kevin Leyton-Brown, null, null, null, null, Department of Computer Science, University of British Columbia, Canada, null, 5f71b28e1c455f439fe3cad2, null, null, null), List(53f453c3dabfaee02ad4ecd1, null, nbhat@physics.utoronto.ca, 5b86c306e1cd8e14a3a50fc8, Navin A. R. 
Bhat, null, null, null, null, university of toronto, null, 5f71b3b11c455f439fe44a0a, null, null, null))",10.1016/j.geb.2010.10.012,"List(Combinatorial game theory, Mathematical optimization, Expected utility hypothesis, Best response, Theoretical computer science, Repeated game, Game theory, Normal-form game, Nash equilibrium, Sequential game, Mathematics)",,1.0,"List(mixed strategy, expected utility, graphical models, nash equilibria, graphical model)",en,101.0,173.0,141,https://static.aminer.cn/upload/pdf/program/53e997ecb7602d9701fe7d0d_0.pdf,"List(53e99b04b7602d970239507b, 53e9ba4ab7602d970465cde6, 53e9b08ab7602d9703af5eb0, 53e9aa79b7602d97033edfb6, 53e999d8b7602d970221b56d, 53e99a85b7602d97022fa537, 53e9a091b7602d9702978a7f, 53e9b228b7602d9703cbf183, 53e9ab20b7602d97034a6f9c, 599c7815601a182cd259d0f8, 53e99991b7602d97021d6b65, 599c793e601a182cd2626e8e, 53e9a9a2b7602d97033003e7, 53e9be6cb7602d9704b2f0b6, 53e9a0fbb7602d97029e4518, 53e9af61b7602d97039a2de8, 53e9ac39b7602d9703601c34, 53e9a7f1b7602d970312ec88, 53e99813b7602d970202b13f, 53e9a8d4b7602d9703226945, 53e9a4b8b7602d9702dd5faf, 53e99a0eb7602d97022612e1, 53e99d96b7602d9702653bcf, 53e9a154b7602d9702a43e1e, 53e99db1b7602d970266db38, 53e9b010b7602d9703a68c07, 53e99945b7602d9702181b84, 53e9a33db7602d9702c49413, 53e99fb5b7602d97028907b2, 53e9a1a2b7602d9702a9949e, 53e9a540b7602d9702e63d8a, 558a4b07e4b0b32fcb35f57e, 53e9abf6b7602d97035b499d, 53e9aaf3b7602d97034758dd, 53e9ad63b7602d970374afb5, 53e9b866b7602d9704430f01, 53e9a7f8b7602d9703139e0d, 53e9bad7b7602d9704704bee, 53e99e61b7602d9702728009, 53e9983db7602d970206a08d, 53e9aa79b7602d97033edfb5, 53e99e4db7602d97027115ea)",Action-Graph Games,List(http://dx.doi.org/10.1016/j.geb.2010.10.012),71.0,2011,539ff0be831432abcb3f285d,"List(53f4728bdabfaedd74e99819, 53f49b5ddabfaebbd777bc95, 53f453c3dabfaee02ad4ecd1)",5f71b28e1c455f439fe3cad2
53e997f1b7602d9701fee626,"The aim of duplicate detection is to group records in a relation which refer to the same entity in the real world such as  a person or business. Most existing works require user specified parameters such as similarity threshold in order to conduct  duplicate detection. These methods are called user-first in this paper. However, in many scenarios, pre-specification from  the user is very hard and often unreliable, thus limiting applicability of user-first methods. In this paper, we propose a  user-last method, called Active Duplicate Detection (ADD), where an initial solution is returned without forcing user to specify such parameters and then user is involved to refine  the initial solution. Different from user-first methods where user makes decision before any processing, ADD allows user to  make decision based on an initial solution. The identified initial solution in ADD enjoys comparatively high quality and is  easy to be refined in a systematic way (at almost zero cost).","List(List(53f7c5d2dabfae9060ae6fb8, null, dengke@itee.uq.edu.au, 5b86a53ee1cd8e14a3d5bfd7, Ke Deng, null, null, null, null, The University of Queensland Australia, null, 5f71b2901c455f439fe3cb8f, null, null, null), List(54487b38dabfae87b7e2ec9a, null, liwei.wang@whu.edu.cn, 5b869e55e1cd8e14a3a8fde0, Liwei Wang, null, null, null, null, Wuhan University China, null, 5f71b2cf1c455f439fe3e6ee, null, null, null), List(56cb18c0c35f4f3c65660565, null, zxf@itee.uq.edu.au, 5b86a53ee1cd8e14a3d5bfd7, Xiaofang Zhou, null, null, null, null, The University of Queensland Australia, null, 5f71b2901c455f439fe3cb8f, null, null, null), List(562d605b45cedb3398ddf797, null, shazia@itee.uq.edu.au, 5b86a53ee1cd8e14a3d5bfd7, Shazia Wasim Sadiq, null, null, null, null, The University of Queensland Australia, null, 5f71b2901c455f439fe3cb8f, null, null, null), List(53f45f02dabfaedd74e4d127, null, null, 5b86a53ee1cd8e14a3d5bfd7, Gabriel Pui Cheong Fung, null, null, null, null, The University 
of Queensland Australia, null, 5f71b2901c455f439fe3cb8f, null, null, null))",10.1007/978-3-642-12026-8_43,"List(Edit distance, Data mining, Duplicate detection, Computer science, Tree (data structure), Limiting)",3-642-12025-3,,"List(active duplicate detection, user-first method, user-last method, similarity threshold, real world, group record, initial solution, existing work, duplicate detection, high quality, records, linkage)",en,1.0,579.0,565,,"List(53e9af26b7602d9703961bb4, 53e9a480b7602d9702d9e8f7, 53e9acbcb7602d9703699a45, 53e9aaf3b7602d97034721bf, 53e9a877b7602d97031c413b, 53e9a8b1b7602d97032021a6, 53e9a8ffb7602d970324ec41, 53e9a298b7602d9702b9fade, 53e9a877b7602d97031c4813, 53e9b102b7602d9703b7d3ea, 558acb3de4b031bae1f9b5a8, 53e9adbdb7602d97037bdf8b, 53e99c91b7602d9702541d10)",Active Duplicate Detection,List(http://dx.doi.org/10.1007/978-3-642-12026-8_43),,2010,53a72e9d20f7420be8c94b72,"List(53f7c5d2dabfae9060ae6fb8, 54487b38dabfae87b7e2ec9a, 56cb18c0c35f4f3c65660565, 562d605b45cedb3398ddf797, 53f45f02dabfaedd74e4d127)",5f71b2901c455f439fe3cb8f
53e997f1b7602d9701ff150a,"To date, the primary idea for organizing software systems has been to break the system down into modular units such as subroutines, procedures, objects, clients and servers etc. We note that all of these correspond relatively directly to blocks of executable code. But many issues of concern to programmers don't cleanly follow these modularity boundaries—they don't ""fit"" naturally into these abstractions. We propose a new programming paradigm, called Aspect-Oriented Programming, that allows programmers to express each of the different issues they want to program in an appropriately natural form. A special kind of compiler called an Aspect Weaver™ then automatically combins those separate descriptions into a final executable form. By enabling engineers to reason and program using the natural aspects of concern for a system, even when those cross-cut both each other and the resulting executable code, we believe that Aspect-Oriented Programming will make it possible to program future extremely complex systems, as well as making it easier to program a number of more near term (even present and past) systems.","List(List(53f36889dabfae4b349a2ea8, null, null, 5b86b737e1cd8e14a34fea08, gregor kiczales, null, null, null, null, university of british columbia, null, 5f71b28e1c455f439fe3cad2, null, null, null), List(53f4302bdabfaee4dc73c49f, null, null, 5b86b6e9e1cd8e14a34dcda7, erik hilsdale, null, null, null, null, parc, null, 5f71b2841c455f439fe3c6e9, null, null, null))",10.1145/503271.503260,"List(Functional reactive programming, Procedural programming, Programming language, Software design, Computer science, AspectJ, Concurrent object-oriented programming, Programming domain, Functional logic programming, Programming paradigm, Aspect-oriented programming, Software engineering, Inductive programming, Separation of concerns, Reactive programming, Modular design, Java, Modularity)",,5.0,"List(aspect-oriented programming, agent-oriented 
programming, software design, separation of concern, modularity, programming language, aspect oriented programming, aspect oriented)",en,9952.0,,313,//static.aminer.org/pdf/20170130/pdfs/sigsoft/9ql40mmuc1ietqp2pa7rsa3vdgbhezfo.pdf,List(53e99a4eb7602d97022b29be),Aspect-oriented programming,"List(http://dx.doi.org/10.1145/503271.503260, https://dl.acm.org/doi/abs/10.1145/503271.503260)",26.0,2001,0163-5948,"List(53f36889dabfae4b349a2ea8, 53f4302bdabfaee4dc73c49f)",5f71b28e1c455f439fe3cad2
53e997f1b7602d9701ff2ea5,Techniques for automatically generating optimal vision programs from high-level task descriptions are presented. Vision programs are the object models that describe strategies to recognize and locate objects in an image. The effectiveness of the program depends on the features used for recognition and the order in which the features are evaluated. We describe three probabilistic feature utility measures and a cost function based on program execution time that serve as the basis of our technique. Techniques for computing the utility measures and combining them to generate the best vision program are described and are demonstrated on a realistic vision application.,"List(List(53f4620cdabfaee02ad82dc4, null, null, null, Chien-Huei Chen, null, null, null, null, Information, Telecommunications, and Automation Division, SRI International, Menlo Park, California 94025 USA, null, 5f71b2aa1c455f439fe3d5f1, List(Information, Telecommunications, and Automation Division, SRI International, Menlo Park, California 94025 USA), null, null), List(53f46784dabfaedd74e6e399, null, null, null, Prasanna G. 
Mulgaonkar, null, null, null, null, Information, Telecommunications, and Automation Division, SRI International, Menlo Park, California 94025 USA, null, 5f71b2aa1c455f439fe3d5f1, List(Information, Telecommunications, and Automation Division, SRI International, Menlo Park, California 94025 USA), null, null))",10.1016/1049-9660(92)90015-U,"List(Computer science, Execution time, Artificial intelligence, Probabilistic logic, Machine learning)",,2.0,List(automatic vision programming),en,41.0,183.0,170,,"List(53e9a6d1b7602d970300896d, 53e9bbf0b7602d97048494da, 557d093df667eeed56196989, 53e9acb5b7602d9703694314, 557ef95e6fee0fe990cadcee, 53e9b206b7602d9703c9f63f, 557efda06fee0fe990cadefb, 53e9b587b7602d97040c8fa2)",Automatic vision programming.,"List(http://dx.doi.org/10.1016/1049-9660(92)90015-U, https://www.sciencedirect.com/science/article/pii/104996609290015U, http://www.webofknowledge.com/)",55.0,1992,555036f37cea80f954169241,"List(53f4620cdabfaee02ad82dc4, 53f46784dabfaedd74e6e399)",5f71b2aa1c455f439fe3d5f1
53e997f5b7602d9701ffb195,"The impact factor is one of the most used scientometric indicators. Its proper and improper uses have been discussed extensively before. It has been criticized extensively, yet it is still here. In this paper I propose the journal report card, which is a set of measures, each with an easily comprehensible meaning that provides a fuller picture of the journals' standing. The set of measures in the report card include the impact factor, the h-index, number of citations at different points on the ranked list of citations, extent of uncitedness and coverage of the h-core. The report card is computed for two sets of journals, the top-20 journals in JCR 2010 and the top-20 journals in JCR 2010 for the category Information and Library Science.","List(List(53f42de5dabfaedf4351d6f1, null, Judit.Bar-Ilan@biu.ac.il, 5b86a5e4e1cd8e14a3d9d2c3, Judit Bar-Ilan, null, null, null, 0000-0002-8796-5248, Department of Information Science, Bar-Ilan University, Ramat Gan, Israel 52900, null, 5f71b2ad1c455f439fe3d77c, null, null, null))",10.1007/s11192-012-0671-3,"List(World Wide Web, Ranking, Computer science, Report card, Impact factor)",,2.0,"List(Journal report card, Median, 90% percentile, Synchronous h-index for journals)",en,13.0,260.0,249,,"List(53e9aafab7602d970347cf44, 53e99df0b7602d97026af7d4, 53e9b16db7602d9703bf5a4b, 53e9b0c2b7602d9703b338af, 53e9bc26b7602d970488f163, 53e99f0ab7602d97027d873d, 53e9a9d3b7602d97033335eb, 53e99e80b7602d97027481fa, 53e9b5e7b7602d970413abe7, 53e99924b7602d970215ec79, 53e9b7cdb7602d970437d65c, 53e9b9a0b7602d9704590d23)",Journal report card,"List(http://dx.doi.org/10.1007/s11192-012-0671-3, http://dx.doi.org/https://doi.org/10.1007/s11192-012-0671-3, https://link.springer.com/article/10.1007/s11192-012-0671-3, http://www.webofknowledge.com/)",92.0,2012,5390acc420f70186a0eb9ddc,List(53f42de5dabfaedf4351d6f1),5f71b2ad1c455f439fe3d77c


In [0]:
# Fill in missing organization IDs:
#   1. turn the 'missing_org_id' placeholder into a real null,
#   2. fall back to the Org column (the generated ID) wherever ID is null,
#   3. drop the now-redundant Org helper column.
new_df = (
    new_df
    .replace('missing_org_id', None)
    .withColumn('ID', F.coalesce(F.col('ID'), F.col('Org')))
    .drop('Org')
)

display(new_df.limit(DISPLAY_LIMIT))

ID,Name,Country
5f71b2971c455f439fe3cecb,"Department of Industrial and Systems Engineering, National University of Singapore, Faculty of Engineering, 10 Kent Ridge Crescent, Singapore 119260",Singapore
5f71b2971c455f439fe3cede,"Australian e-Health Research Centre, CSIRO ICT Centre, Australia",Australia
5f71b3711c455f439fe42e84,"Instituto de Física Aplicada, Consejo Superior de Investigaciones Científicas, Serrano 144, 28006 Madrid, Spain",Spain
5f71b2f61c455f439fe3f823,"Department of Electrical Engineering, Biomedical Engineering Laboratory (GPEB), Federal University of Santa Catarina, Florianópolis 88040-900, Brazil",Brazil
5f71b28e1c455f439fe3cad9,"Northwestern Univ., Evanston, IL",United States
5f71b3081c455f439fe3ff73,"Iran Univ Sci & Technol, Sch Comp Engn, Tehran, Iran",Iran
5f71b2ac1c455f439fe3d6cc,"Computer Information Systems Department, Georgia State University, Atlanta, GA 30302-4015, USA",United States
5f71b2841c455f439fe3c6b7,"Computer Science Division, 387 Soda Hall, University of California, Berkeley, CA 94720. E-mail: nir@cs.berkeley.edu",United States
5f71b2b81c455f439fe3dc45,"CACI Products Company, 3333 North Torrey Pines Court, La Jolla, CA",United States
5f71b2831c455f439fe3c634,"Microsoft Research Cambridge, Cambridge, UK",United Kingdom


#### 3.4. DBLP fact table

In [0]:
# Build the DBLP fact table: pick the needed columns from _df and give each one
# its warehouse name in a single select. Pairs are (source column, warehouse column).
dblp_df = _df.select(*[
    F.col(source_name).alias(warehouse_name)
    for source_name, warehouse_name in (
        ('_id', 'ID'),
        ('venue_id', 'Venue'),
        ('Org', 'Org'),
        ('Author_ID', 'Authors'),
        ('references', 'References'),
        ('keywords', 'Keywords'),
        ('fos', 'FOS'),
        ('title', 'Title'),
        ('n_citation', 'NoCitations'),
        ('lang', 'Lang'),
        ('page_start', 'PageStart'),
        ('page_end', 'PageEnd'),
        ('doi', 'DOI'),
        ('isbn', 'ISBN'),
        ('year', 'Year'),
        ('volume', 'Volume'),
        ('issue', 'Issue'),
    )
])

In [0]:
# Preview at most DISPLAY_LIMIT rows of the fact table to check the renamed columns.
display(dblp_df.limit(DISPLAY_LIMIT))

ID,Venue,Org,Authors,References,Keywords,FOS,Title,NoCitations,Lang,PageStart,PageEnd,DOI,ISBN,Year,Volume,Issue
53e997d7b7602d9701fd0004,555037187cea80f954172d7a,5f71b2841c455f439fe3c6bf,"List(53f43819dabfaee02acd9ff6, 53f434b5dabfaeecd694fcdf, 53f42d28dabfaeb2acfe712a)","List(53e9a309b7602d9702c14b8b, 53e99858b7602d970208f24f, 53e9aa61b7602d97033d4b67, 53e9a1e1b7602d9702ae0e7e, 53e9a6cab7602d9702fff64d)","List(role assignment, appropriate assignment method, specialized role, common goal, role selection, complex environment, present method, team member, team setting, limited amount)","List(Teamwork, Computer science, Knowledge management, Human–computer interaction)",Role selection in ad hoc teamwork,1.0,en,1251,1252.0,,0-9817381-3-3,2012,,
53e997ddb7602d9701fd2cce,53a7261520f7420be8b66689,5f71b2831c455f439fe3c61e,"List(53f466afdabfaeee22a54847, 53f467fadabfaedd74e6ffbd, 53f43a87dabfaee02acf230a)","List(53e9b86db7602d9704439294, 53e9b130b7602d9703bb0b59, 53e9b39db7602d9703e80f29)","List(social reading project, reader experience, original text, collaborative learning, text interpretation, social interaction model, social e-book, effective collaborative reading experience, social interaction, future study, classic humanities text)","List(Social group, Social relation, World Wide Web, Social media, Collaborative learning, Future study, Psychology, Social learning, Epistemology, Social competence)",Assessing the possibility of a social e-book by analyzing reader experiences,0.0,en,50,57.0,10.1007/978-3-642-39371-6_6,,2013,,
53e997e4b7602d9701fdcbe4,53a72ea220f7420be8c9573e,5f71b5d91c455f439fe53c8a,"List(53f46944dabfaefedbb94054, 53f438a2dabfaee2a1cfba45, 53f4d24adabfaeedcf780064, 5430219cdabfaeca69bcbdab, 5484ab76dabfaed7b5fa1b09)","List(53e9b715b7602d97042ab658, 53e9a1dbb7602d9702adac6a, 53e9b56cb7602d97040a4772, 53e99d88b7602d97026469fb, 53e9b0a5b7602d9703b1344c, 53e9b0c2b7602d9703b32bfc, 53e9b648b7602d97041a30bb, 53e99976b7602d97021b6824, 53e9a533b7602d9702e5845d, 53e99a04b7602d970224e686)","List(af-agentspeak agent programming language, hybrid control architecture, multi-agent programming, platform service, agent code, university college dublin, previous year, agent factory platform)","List(Computer software, Architecture, Software engineering, Factory, Computer science, CONTEST, Multi-agent system, Artificial intelligence)",Bogtrotters in space,2.0,en,197,207.0,10.1007/978-3-642-31915-0_12,,2011,,
53e997e8b7602d9701fdfeec,555036b57cea80f95414abdc,17179881764,"List(53f4475bdabfaeecd69b037a, 53f44082dabfaee43ec6cfb0, 53f3af18dabfae4b34b108b0, 5447fcdcdabfae87b7dbc44b, 53f4627adabfaefedbb798dc, 5601a96d45cedb3395e88e81, 53f42ef3dabfaee43ebd8d3b)","List(53e99adcb7602d970235febb, 53e9ade9b7602d97037f3d3f, 53e9984bb7602d9702079d8f, 53e9a2ddb7602d9702be93df, 56d833a5dabfae2eee3216f7)","List(convex position, delaunay triangulation, n black point, delaunay graph, set b, proximity graphs, graph drawing, blocking delaunay triangulations, witness graphs, n-1 white point, following bound, general position, black point, white point, smallest set)","List(Graph drawing, General position, Combinatorics, Convex position, Mathematics, Delaunay triangulation, Delaunay graph)",Blocking Delaunay triangulations.,16.0,en,154,159.0,10.1016/j.comgeo.2012.02.005,,2013,46.0,2.0
53e997e9b7602d9701fe40fc,53e18bf620f7dfbc07e8feb2,5f71b2bf1c455f439fe3df7b,"List(53f437f0dabfaee0d9b7327a, 53f43754dabfaee0d9b6d422)","List(53e9a4fab7602d9702e1b383, 56d8d01ddabfae2eee9cdb77, 53e99f8cb7602d970285f956, 53e9ba84b7602d97046a8022)","List(previously-existing style, new work, piet mondrian, particular artistic style, novel artistic style, case-based reasoning, new artwork, case-based art, interesting artwork, reasoning method, dutch painter, artificial intelligent, case base reasoning)","List(Evolutionary algorithm, Creative design, Computer science, Painting, Artificial intelligence, Case-based reasoning, Mondrian)",Case-Based Art,0.0,en,237,251.0,10.1007/11536406_20,3-540-28174-6,2005,,
53e997ecb7602d9701fe7d0d,539ff0be831432abcb3f285d,5f71b28e1c455f439fe3cad2,"List(53f4728bdabfaedd74e99819, 53f49b5ddabfaebbd777bc95, 53f453c3dabfaee02ad4ecd1)","List(53e99b04b7602d970239507b, 53e9ba4ab7602d970465cde6, 53e9b08ab7602d9703af5eb0, 53e9aa79b7602d97033edfb6, 53e999d8b7602d970221b56d, 53e99a85b7602d97022fa537, 53e9a091b7602d9702978a7f, 53e9b228b7602d9703cbf183, 53e9ab20b7602d97034a6f9c, 599c7815601a182cd259d0f8, 53e99991b7602d97021d6b65, 599c793e601a182cd2626e8e, 53e9a9a2b7602d97033003e7, 53e9be6cb7602d9704b2f0b6, 53e9a0fbb7602d97029e4518, 53e9af61b7602d97039a2de8, 53e9ac39b7602d9703601c34, 53e9a7f1b7602d970312ec88, 53e99813b7602d970202b13f, 53e9a8d4b7602d9703226945, 53e9a4b8b7602d9702dd5faf, 53e99a0eb7602d97022612e1, 53e99d96b7602d9702653bcf, 53e9a154b7602d9702a43e1e, 53e99db1b7602d970266db38, 53e9b010b7602d9703a68c07, 53e99945b7602d9702181b84, 53e9a33db7602d9702c49413, 53e99fb5b7602d97028907b2, 53e9a1a2b7602d9702a9949e, 53e9a540b7602d9702e63d8a, 558a4b07e4b0b32fcb35f57e, 53e9abf6b7602d97035b499d, 53e9aaf3b7602d97034758dd, 53e9ad63b7602d970374afb5, 53e9b866b7602d9704430f01, 53e9a7f8b7602d9703139e0d, 53e9bad7b7602d9704704bee, 53e99e61b7602d9702728009, 53e9983db7602d970206a08d, 53e9aa79b7602d97033edfb5, 53e99e4db7602d97027115ea)","List(mixed strategy, expected utility, graphical models, nash equilibria, graphical model)","List(Combinatorial game theory, Mathematical optimization, Expected utility hypothesis, Best response, Theoretical computer science, Repeated game, Game theory, Normal-form game, Nash equilibrium, Sequential game, Mathematics)",Action-Graph Games,101.0,en,141,173.0,10.1016/j.geb.2010.10.012,,2011,71.0,1.0
53e997f1b7602d9701fee626,53a72e9d20f7420be8c94b72,5f71b2901c455f439fe3cb8f,"List(53f7c5d2dabfae9060ae6fb8, 54487b38dabfae87b7e2ec9a, 56cb18c0c35f4f3c65660565, 562d605b45cedb3398ddf797, 53f45f02dabfaedd74e4d127)","List(53e9af26b7602d9703961bb4, 53e9a480b7602d9702d9e8f7, 53e9acbcb7602d9703699a45, 53e9aaf3b7602d97034721bf, 53e9a877b7602d97031c413b, 53e9a8b1b7602d97032021a6, 53e9a8ffb7602d970324ec41, 53e9a298b7602d9702b9fade, 53e9a877b7602d97031c4813, 53e9b102b7602d9703b7d3ea, 558acb3de4b031bae1f9b5a8, 53e9adbdb7602d97037bdf8b, 53e99c91b7602d9702541d10)","List(active duplicate detection, user-first method, user-last method, similarity threshold, real world, group record, initial solution, existing work, duplicate detection, high quality, records, linkage)","List(Edit distance, Data mining, Duplicate detection, Computer science, Tree (data structure), Limiting)",Active Duplicate Detection,1.0,en,565,579.0,10.1007/978-3-642-12026-8_43,3-642-12025-3,2010,,
53e997f1b7602d9701ff150a,0163-5948,5f71b28e1c455f439fe3cad2,"List(53f36889dabfae4b349a2ea8, 53f4302bdabfaee4dc73c49f)",List(53e99a4eb7602d97022b29be),"List(aspect-oriented programming, agent-oriented programming, software design, separation of concern, modularity, programming language, aspect oriented programming, aspect oriented)","List(Functional reactive programming, Procedural programming, Programming language, Software design, Computer science, AspectJ, Concurrent object-oriented programming, Programming domain, Functional logic programming, Programming paradigm, Aspect-oriented programming, Software engineering, Inductive programming, Separation of concerns, Reactive programming, Modular design, Java, Modularity)",Aspect-oriented programming,9952.0,en,313,,10.1145/503271.503260,,2001,26.0,5.0
53e997f1b7602d9701ff2ea5,555036f37cea80f954169241,5f71b2aa1c455f439fe3d5f1,"List(53f4620cdabfaee02ad82dc4, 53f46784dabfaedd74e6e399)","List(53e9a6d1b7602d970300896d, 53e9bbf0b7602d97048494da, 557d093df667eeed56196989, 53e9acb5b7602d9703694314, 557ef95e6fee0fe990cadcee, 53e9b206b7602d9703c9f63f, 557efda06fee0fe990cadefb, 53e9b587b7602d97040c8fa2)",List(automatic vision programming),"List(Computer science, Execution time, Artificial intelligence, Probabilistic logic, Machine learning)",Automatic vision programming.,41.0,en,170,183.0,10.1016/1049-9660(92)90015-U,,1992,55.0,2.0
53e997f5b7602d9701ffb195,5390acc420f70186a0eb9ddc,5f71b2ad1c455f439fe3d77c,List(53f42de5dabfaedf4351d6f1),"List(53e9aafab7602d970347cf44, 53e99df0b7602d97026af7d4, 53e9b16db7602d9703bf5a4b, 53e9b0c2b7602d9703b338af, 53e9bc26b7602d970488f163, 53e99f0ab7602d97027d873d, 53e9a9d3b7602d97033335eb, 53e99e80b7602d97027481fa, 53e9b5e7b7602d970413abe7, 53e99924b7602d970215ec79, 53e9b7cdb7602d970437d65c, 53e9b9a0b7602d9704590d23)","List(Journal report card, Median, 90% percentile, Synchronous h-index for journals)","List(World Wide Web, Ranking, Computer science, Report card, Impact factor)",Journal report card,13.0,en,249,260.0,10.1007/s11192-012-0671-3,,2012,92.0,2.0


In [0]:
# Print the fact table's column names, types and nullability.
dblp_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Org: string (nullable = true)
 |-- Authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- References: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FOS: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Title: string (nullable = true)
 |-- NoCitations: integer (nullable = true)
 |-- Lang: string (nullable = true)
 |-- PageStart: integer (nullable = true)
 |-- PageEnd: integer (nullable = true)
 |-- DOI: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Issue: integer (nullable = true)



### 4. Load DFs as Delta tables
**TODO:** create the organizations (orgs) Delta table — the `# Organization` cell below is still empty.

In [0]:
# Persist the DBLP fact table as the Delta table 'dblp_fact_table',
# replacing any existing contents.
(
    dblp_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('dblp_fact_table')
)

In [0]:
# Persist the venue dimension as the Delta table 'venues',
# replacing any existing contents.
(
    venues_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('venues')
)

In [0]:
# Persist the author dimension as the Delta table 'authors',
# replacing any existing contents.
(
    authors_df.write
    .format('delta')
    .mode('overwrite')
    .saveAsTable('authors')
)

In [0]:
# Organization table (TODO: not written to Delta yet)


### 5. Adding new entries to the warehouse

### 6. Queries on the data