In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from delta.tables import *

import logging
import json
import re


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
DISPLAY_LIMIT = 20

Link to the repo: https://github.com/taidopurason/bdm-project-1

Check out the schema for out Warehouse: TODO: Link to image in github with the schema.

TODO: Since we switched to loading data from to loading from parquet files, some phrasing in the documentation is wrong. Fix it.

### Structure of this notebook
1. Read data from parquet files into a DataFrame.
2. Apply necessary (cleaning) transformations to the dataframe.
3. Create the new DataFrames corresponding to our Warehouse Schema.  
4. Save the DataFrames as Delta tables.
5. Demonstrate adding new entries to the warehouse.
6. Demonstrate queries on the data.

### 1. Extract the data
To check how we downloaded the data from the source, see https://github.com/taidopurason/bdm-project-1/blob/main/Loading%20data%20v2.ipynb. We split the downloaded data into files where each file contains 250,000 json objects.

In [0]:
# Uncomment one or the other line.

# This reads ALL splits into one dataframe
#_df = spark.read.option("multiline", True).json('dbfs:/user/dblpv13/dblpv13.*.json.gz')
#_df = spark.read.parquet('dbfs:/user/dblpv13/dblpv13.*.parquet')


# For a faster setup, read just one split
#_df = spark.read.option("multiline", True).json('dbfs:/user/dblpv13/dblpv13.0.json.gz')
_df = spark.read.parquet('dbfs:/user/dblpv13/dblpv13.0.parquet')

In [0]:
# Immediately delete the abstract column because they look really annoying on GitHub.
_df = _df.drop(F.col('abstract'))

_df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- fos: array (nullable = 

In [0]:
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year
53e99784b7602d9701f3e3f5,,,,,,,List(),en,0.0,,,,,3GIO.,,"List(null, null, null, null, null, null, null, null, null, null, null, null, 0)",,2011
53e99784b7602d9701f3e133,"List(List(53f45728dabfaec09f209538, null, null, null, Peijuan Wang, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e59457, null, null, null, Jiahua Zhang, null, null, null, null, null, null, null, null, null, null), List(53f38438dabfae4b34a08928, null, null, null, Donghui Xie, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e5945a, null, null, null, Yanyan Xu, null, null, null, null, null, null, null, null, null, null), List(53f43d25dabfaeecd6995149, null, null, null, Yun Xu, null, null, null, null, null, null, null, null, null, null))",10.1109/IGARSS.2011.6049503,"List(Agronomy, Moisture, Hydrology, Environmental science, Dry weight, Water content, Stomatal conductance, Transpiration, Irrigation, Soil water, Canopy)",,,,"List(canopy parameters, canopy spectrum, different soil water content control, winter wheat, irrigation, hydrology, radiometry, moisture, indexes, vegetation, indexation, dry weight, soil moisture, water content, indexing terms, spectrum, natural disaster)",en,0.0,1933,1930.0,,,The relationship between canopy parameters and spectrum of winter wheat under different irrigations in Hebei Province.,List(http://dx.doi.org/10.1109/IGARSS.2011.6049503),"List(53a7297d20f7420be8bd4ae7, null, null, International Geoscience and Remote Sensing Symposium, null, null, null, IGARSS, null, null, null, null, 0)",,2011
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17.0,605,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28.0,2464Vol.3,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005
53e99784b7602d9701f3e161,"List(List(53f46946dabfaec09f24b4ed, null, null, 5b86cf1ae1cd8e14a3fc787b, Miguel Palma, null, 544bd9c245ce266baf189c4f, null, null, Miguel Palma Studio, null, null, null, null, null))",10.1145/1665137.1665166,,,,,"List(global high technology, daily short-distance flight, enormous waste, daily life)",en,,39,39.0,,,360°,,"List(5390a74a20f70186a0e8b40b, null, null, null, null, null, null, ACM SIGGRAPH ASIA 2009 Art Gallery & Emerging Technologies: Adaptation, null, null, null, null, null)",,2009
53e99784b7602d9701f3e162,"List(List(53f43d95dabfaedf435b63fa, null, null, 5b869031e1cd8e14a34a782f, Maureen Squillace, null, null, null, null, Fox Studios Australia, Moore Park, New South Wales, Australia, null, 5f71b2e41c455f439fe3efd1, null, null, null))",10.1145/1281740.1281746,,,,,List(),en,0.0,14,14.0,,,300,"List(http://dx.doi.org/10.1145/1281740.1281746, http://doi.acm.org/10.1145/1281740.1281746)","List(5736ae3ad39c4f40a7976010, null, null, null, null, null, null, SIGGRAPH Computer Animation Festival, null, null, null, null, 10)",,2007
53e99784b7602d9701f3e165,"List(List(54484654dabfae87b7dfc077, null, null, null, Jon G. Hall, null, null, null, null, null, null, null, null, null, 433474))",10.1111/j.1468-0394.2009.00532.x,,,,4.0,List(),en,0.0,306,305.0,,,34957+70764=105621,List(http://dx.doi.org/10.1111/j.1468-0394.2009.00532.x),"List(53a001b1831432abcb737ee4, null, null, null, null, null, null, Expert Systems, null, null, null, null, 0)",26.0,2009
53e99784b7602d9701f3e922,"List(List(53f39e3edabfae4b34aa8c4a, null, null, null, Jungil Park, null, null, null, null, null, null, null, null, null, 237372), List(53f431bcdabfaee2a1cb41b5, null, null, null, Sunyoung Ahn, null, null, null, null, null, null, null, null, null, 24447851), List(53f46ac3dabfaeee22a63eab, null, null, null, Youngmi Kim Pak, null, null, null, null, null, null, null, null, null, 4241287), List(53f44f6adabfaedf435efcb8, null, null, null, James Jungho Pak, null, null, null, null, null, null, null, null, null, 22875855))",10.1109/NEMS.2009.5068754,,,,,List(),en,1.0,1057,1054.0,//static.aminer.org/pdf/PDF/002/845/190/.pdf,,International Conference on Nano/Micro Engineered and Molecular Systems,List(http://doi.ieeecomputersociety.org/10.1109/NEMS.2009.5068754),"List(53a72dfb20f7420be8c7a2f3, null, null, null, null, null, null, NEMS, null, null, null, null, null)",,2009
53e99784b7602d9701f3e4f4,"List(List(53f45ad4dabfaee1c0b3e206, null, null, null, Bonnie Mitchell, null, null, null, null, null, null, null, null, null, null))",10.1145/1596685.1596687,,,,,"List(visual source material, minute sound, integrated journey temporally, abstract environment, intricate detail, particulated image, artists delve, visual experience, multi-faceted granular complexity, stylized natural element)",en,0.0,8,8.0,,,2BTextures,"List(http://dx.doi.org/10.1145/1596685.1596687, http://doi.acm.org/10.1145/1596685.1596687, db/conf/siggraph/siggraph2009festival.html#Mitchell09, https://doi.org/10.1145/1596685.1596687)","List(5736ae3ad39c4f40a7976060, null, null, null, null, null, null, SIGGRAPH Computer Animation Fesitval, null, null, null, null, 10)",,2009
53e99784b7602d9701f3eaf2,"List(List(53f438d0dabfaeee229c1f1c, null, null, null, Naotaka Tanaka, null, null, null, null, null, null, null, null, null, null), List(53f47083dabfaeee22a79321, null, null, null, Mio Yamamoto, null, null, null, null, null, null, null, null, null, null))",10.1007/3-540-45324-5_74,,3-540-42185-8,,,List(),en,0.0,514,513.0,,,11MonkeysII,List(http://dx.doi.org/10.1007/3-540-45324-5_74),"List(5390b44b20f70186a0efa5ba, null, null, null, null, null, null, RoboCup 2009, null, null, null, null, 0)",,2001


### 2. Transform the data

In [0]:
def replace_empty_string(col):
    return F.when(col == "", None).otherwise(col)

def transform(_df):
    # Create the col of author IDs
    _df = _df.withColumn('Author_ID', F.col('authors._id'))
    # Delete entries where any author ID is null
    _df = _df.where("!exists(Author_ID, x -> x is null)")
    # Drop entries with 1-word titles or empty authors or nonexistant _id or any nonexistant author id.
    # Also removes empty or missing references.
    _df = (_df.filter((F.size(F.col('authors')) > 0) & # By default F.size() returns -1 if the value is null.
                      (F.size(F.split(F.col('title'), ' ')) > 1) &  
                      (F.col('_id') != '') & 
                      (F.col('_id').isNotNull()) & 
                      ~(F.array_contains(F.col('references'), '')) & 
                      ~(F.array_contains(F.col('Author_ID'), ''))))
    # Remove all null references
    _df = _df.withColumn('references', F.expr('filter(references, x -> x is not null)'))    
    # Remove entries that are forewords
    _df = _df.filter(~F.lower(F.col("title")).contains("foreword"))
    # Convert n_citation data type to int
    _df = _df.withColumn('n_citation', F.col('n_citation').cast('int'))
    # Replace empty language values with null.
    _df = _df.withColumn('lang', F.when(F.col('lang') == '', None).otherwise(F.col('lang')))
    # Replace empty 'keyword' and 'fos' arrays with null values.
    _df = (_df.withColumn('keywords', F.when(F.size(F.col('keywords')) == 0, None).otherwise(F.col('keywords')))
              .withColumn('fos', F.when(F.size(F.col('fos')) == 0, None).otherwise(F.col('fos'))))
    # Replace non-numeric page numbers with nulls and convert column type to int. Then replace 0 page numbers with nulls as well.
    _df = (_df.withColumn('page_start', F.when(F.col('page_start').cast('int').isNotNull(), F.col('page_start')).otherwise(None)) # replace non-numeric page numbers with null
              .withColumn('page_end', F.when(F.col('page_end').cast('int').isNotNull(), F.col('page_end')).otherwise(None))
              .withColumn('page_start', F.col('page_start').cast('int')) # convert column type to int
              .withColumn('page_end', F.col('page_end').cast('int'))
              .withColumn('page_start', F.when(F.col('page_start') == 0, None).otherwise(F.col('page_start'))) # replace 0 page numbers with null as well
              .withColumn('page_end', F.when(F.col('page_end') == 0, None).otherwise(F.col('page_end'))))
    # Replace empty dois with nulls.
    _df = _df.withColumn('doi', F.when(F.col('doi') == '', None).otherwise(F.col('doi')))
    # Replace empty years with nulls and change data type to int.
    _df = (_df.withColumn('year', F.when(F.col('year') == 0, None).otherwise(F.col('year')))
              .withColumn('year', F.col('year').cast('int')))
    # Replace non-numeric volume and issue numbers with null and convert data types to int. Then repalce 0 values with null as well.
    _df = (_df.withColumn('volume', F.when(F.col('volume').cast('int').isNotNull(), F.col('volume')).otherwise(None)) # replace non-numeric values
              .withColumn('issue', F.when(F.col('issue').cast('int').isNotNull(), F.col('issue')).otherwise(None))
              .withColumn('volume', F.col('volume').cast('int')) # convert column type to int
              .withColumn('issue', F.col('issue').cast('int'))
              .withColumn('volume', F.when(F.col('volume') == 0, None).otherwise(F.col('volume'))) # replace 0 issue and volume numbers with null as well.
              .withColumn('issue', F.when(F.col('issue') == 0, None).otherwise(F.col('issue'))))
    
    # Replace empty strings in some columns with nulls
    venue = F.col("venue")
    for col in ["_id", "issn", "name", "name_d", "name_s", "online_issn", "publisher", "raw", "raw_zh", "t"]:
        venue = venue.withField(col, replace_empty_string(F.col(f"venue.{col}")))  
    _df = (
        _df
        .withColumn("venue", venue)
        .withColumn("issn", replace_empty_string(F.col("issn")))
        .withColumn("isbn", replace_empty_string(F.col("isbn")))
        .withColumn("isbn", F.when(F.col("isbn") == "isbn", None).otherwise(F.col("isbn")))
        .withColumn("issn", F.when(F.col("issn") == "issn", None).otherwise(F.col("issn")))
    )
    # fix incorrect issn
    _df = (_df
               .withColumn("issn",
                           F.when(F.length(F.col("issn")) == 9, F.col("issn"))
                           .when(F.length(F.col("issn")) == 8, F.concat(F.col("issn").substr(1, 4), F.lit("-"), F.col("issn").substr(5, 4)))
                           .when(F.col("issn").contains("E-ISBN"), F.col("issn").substr(1, 9))
                           .otherwise(None)
                          )
               .withColumn("venue", 
                           F.col("venue")
                           .withField("issn", F.coalesce(F.col("venue.issn"), F.col("issn")))
                           )
               .drop("issn")
              )
    # replace venue with null fields with null
    venue_is_empty = (
        F.col("venue.issn").isNull() &
        F.col("venue.name").isNull() &
        F.col("venue.name_d").isNull() &
        F.col("venue.name_s").isNull() &
        F.col("venue.online_issn").isNull() &
        F.col("venue.publisher").isNull() &
        F.col("venue.raw").isNull() &
        F.col("venue.raw_zh").isNull()
    )
    _df = _df.withColumn("venue", F.when(venue_is_empty, None).otherwise(F.col("venue")))
    # remove rows with null venues
    _df = _df.filter(F.col("venue").isNotNull())
    # coalescing venue._id and venue.issn to make up for missing ids
    _df = _df.withColumn("venue", F.col("venue").withField("_id", F.coalesce(F.col("venue._id"), F.col("venue.issn"))))
    # removing rows with venue id null
    _df = _df.filter(F.col("venue._id").isNotNull())
    return _df

In [0]:
logger.info(f"Initially, there were {_df.count()} rows of data")

_df = transform(_df)

logger.info(f"After the transformations, there are {_df.count()} rows of data")

_df.printSchema()

INFO:__main__:Initially, there were 250000 rows of data
INFO:__main__:After the transformations, there are 156949 rows of data
root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (co

In [0]:
# Display cleaned data
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year,Author_ID
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)"
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)"
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)","List(53a72e9920f7420be8c93fac, null, null, Computer Software and Applications Conference, null, null, null, COMPSAC, null, null, null, null, 0)",,2002,List(548a2e3ddabfae9b40134fbc)
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)","List(572de199d39c4f49934b3d5c, 1673-7350, null, null, null, null, null, Frontiers of Computer Science in China, null, null, null, null, 0)",5.0,2011,List(53f46a22dabfaee0d9c3d5e5)
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)","List(53a72bad20f7420be8c2d5af, null, null, IEEE International Conference on Cognitive Informatics, null, null, null, IEEE ICCI, null, null, null, null, 0)",,2010,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)"
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)","List(53a72cfa20f7420be8c554b2, null, null, null, null, null, null, FSKD (5), null, null, null, null, 0)",,2008,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)"
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)","List(54825226582fc50b5e05610e, 0164-1212, null, null, null, null, null, Journal of Systems and Software, null, null, null, null, 0)",79.0,2006,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)"
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)","List(53a7271520f7420be8b8b5ba, 0302-9743, null, null, null, null, null, EWCBR, null, null, null, null, 0)",1488.0,1998,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)"
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)","List(53a72cf620f7420be8c548e2, null, null, null, null, null, null, MobiSys, null, null, null, null, 0)",,2014,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)"
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)","List(53a72ac520f7420be8c0cd21, null, null, null, null, null, null, IWDM, null, null, null, null, 0)",,1987,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)"


### 3. Create the new DFs
#### 3.1. Venue DF

In [0]:
# creating the venues df
def create_venues_df(_df):
    venues_df = (_df
                 .withColumn("has_volume_or_issue", F.when(F.col("volume").isNotNull() | F.col("issue").isNotNull(), True).otherwise(None))
                 .select("venue.*", "has_volume_or_issue")
                 .filter(F.col("_id").isNotNull())
                 .drop("src", "sid", "type"))

    # removing the columns from the original df
    _df = _df.withColumn("venue_id", F.col("venue._id")).drop("venue")
    
    # combining rows with the same id, but different column values
    # taking the first non-null value for the id as the column value
    venue_columns = (
        "issn",
        "name",
        "name_d",
        "name_s",
        "raw",
        "raw_zh",
        "online_issn",
        "publisher",
        "t",
        "has_volume_or_issue")
    venues_df = venues_df.groupBy(F.col("_id")).agg(*(F.first(F.col(col), ignorenulls=True).alias(col) for col in venue_columns))
    
    venues_df = (
        venues_df
        # coalescing the name and raw columns
        .withColumn("raw", F.coalesce(
                F.col("raw"), 
                F.col("raw_zh"),
            ))
        .withColumn("name", F.coalesce(
                F.col("name"), 
                F.col("name_d"),
                F.col("raw")
            ))
        .drop("name_d", "name_s", "raw_zh") 
        # creating the type field
        .withColumn("type",            
                   F.when(
                       (
                           F.col("raw").contains("@") | 
                           F.lower(F.col("raw")).contains("workshop") |
                           F.lower(F.col("name")).contains("workshop")
                       ), 
                       "Workshop"
                   ).when(
                       (F.col("t") == "J"),
                       "Journal"
                   ).when(
                       (
                           (F.col("t") == "C") |
                           F.lower(F.col("raw")).contains("conference") |
                           F.lower(("name")).contains("conference") |
                           F.lower(F.col("raw")).contains("symposium") |
                           F.lower(("name")).contains("symposium") |
                           F.lower(F.col("raw")).contains("proceedings") |
                           F.lower(("name")).contains("proceedings")
                       ),
                       "Conference"
                   ).when(
                       (
                           F.lower(F.col("raw")).contains("journal") |
                           F.lower(("name")).contains("journal") |
                           F.col("has_volume_or_issue")
                       ),
                       "Journal"
                   ).otherwise(None)
          )
        .drop("t", "has_volume_or_issue")
        .withColumnRenamed("_id", "ID")
    )
    return _df, venues_df

_df, venues_df = create_venues_df(_df)

display(venues_df.limit(DISPLAY_LIMIT))

ID,issn,name,raw,online_issn,publisher,type
0001-0782,0001-0782,COMMUNICATIONS OF THE ACM,COMMUNICATIONS OF THE ACM,1557-7317,,Journal
0001-253X,0001-253X,ASLIB PROCEEDINGS,ASLIB PROCEEDINGS,1758-3748,,Journal
0001-2815,0001-2815,TISSUE ANTIGENS,TISSUE ANTIGENS,,,Journal
0001-4966,0001-4966,JOURNAL OF THE ACOUSTICAL SOCIETY OF AMERICA,JOURNAL OF THE ACOUSTICAL SOCIETY OF AMERICA,1520-8524,,Journal
0001-5903,0001-5903,Acta Informatica,Acta Informatica,,,Journal
0001-8708,0001-8708,Advances in Mathematics,Advances in Mathematics,,Academic Press,Journal
0002-8231,0002-8231,JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE,JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE,,,Journal
0002-9149,0002-9149,AMERICAN JOURNAL OF CARDIOLOGY,AMERICAN JOURNAL OF CARDIOLOGY,,,Journal
0002-9343,0002-9343,AMERICAN JOURNAL OF MEDICINE,AMERICAN JOURNAL OF MEDICINE,,,Journal
0002-9378,0002-9378,AMERICAN JOURNAL OF OBSTETRICS AND GYNECOLOGY,AMERICAN JOURNAL OF OBSTETRICS AND GYNECOLOGY,,,Journal


In [0]:
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,Author_ID,venue_id
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)",53a72a4920f7420be8bfa51b
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)",53a72e2020f7420be8c80142
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,List(548a2e3ddabfae9b40134fbc),53a72e9920f7420be8c93fac
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,List(53f46a22dabfaee0d9c3d5e5),572de199d39c4f49934b3d5c
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)",53a72bad20f7420be8c2d5af
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)",53a72cfa20f7420be8c554b2
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)",54825226582fc50b5e05610e
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)",53a7271520f7420be8b8b5ba
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)",53a72cf620f7420be8c548e2
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)",53a72ac520f7420be8c0cd21


#### 3.2. Author DF

In [0]:
# Create the Authors DF
def create_authors_df(_df):
    df2 = _df.withColumn('auth_expl', F.explode(F.col("authors"))) # explode the authors array
    df2 = (df2.withColumn('auth_id', F.col('auth_expl._id')) # separate the authors id and name
              .withColumn('auth_name', F.col('auth_expl.name')))

    # Make the authors df of distinct auth_id and auth_name pairs. Also keeps only the first instance of duplicate ID entries.
    authors_df = df2.select('auth_id', 'auth_name').groupBy(F.col("auth_id")).agg(F.first(F.col("auth_name"), ignorenulls=True).alias("auth_name"))
    # there were only 92 rows where both auth id and name were null.
    # altogether, there are 400k unique authors.

    authors_df = (authors_df.withColumnRenamed('auth_id', 'ID')
                            .withColumnRenamed('auth_name', 'Name'))
    return _df, authors_df

_df, authors_df = create_authors_df(_df)
display(authors_df.limit(DISPLAY_LIMIT))

ID,Name
53f3186ddabfae9a84425c58,Hakan Ancin
53f3186fdabfae9a84425cde,A. M. A. Hariri
53f3186fdabfae9a84425cfb,Matthew Prowse
53f31870dabfae9a84425d19,Sui-ping Qi
53f31871dabfae9a84425db7,Renato Fabbri
53f31873dabfae9a84425e8a,Joachim Schimpf
53f31874dabfae9a84425eee,A. Faruq
53f31874dabfae9a84425f10,E. Di Bernardo
53f31875dabfae9a84425f46,Steven F. Roth
53f31876dabfae9a84425fb2,Jarkko Oksala


In [0]:
# The authors ID column (named Author_ID) is already in the original DF from the Transform function.
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,Author_ID,venue_id
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)",53a72a4920f7420be8bfa51b
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)",53a72e2020f7420be8c80142
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,List(548a2e3ddabfae9b40134fbc),53a72e9920f7420be8c93fac
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,List(53f46a22dabfaee0d9c3d5e5),572de199d39c4f49934b3d5c
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)",53a72bad20f7420be8c2d5af
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)",53a72cfa20f7420be8c554b2
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)",54825226582fc50b5e05610e
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)",53a7271520f7420be8b8b5ba
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)",53a72cf620f7420be8c548e2
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)",53a72ac520f7420be8c0cd21


#### 3.3. Organization DF

In [0]:
# Create the new df

# finds the country names in a list of strings
# modified to only use the first element of the list
# uses regex to remove punctuation from the string and to match the given names of the countries and some abbreviations

def getCountry(s):
    if s is None:
        return None
    arr = []
    countries = ["Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua & Deps", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon", "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Congo Democratic Republic", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "South Korea", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Burma", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "Norway", "Romania", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Oman", "Russia", "Rwanda", "St Kitts & Nevis", "St Lucia", "Saint Vincent & The Grenadines", "Samoa", "San Marino", "Sao Tome & Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Togo", "Tonga", "Trinidad & Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"]
    state_names = ["alaska", "alabama", "arkansas", "american samoa", "arizona", "california", "colorado", "connecticut", "district ", "of columbia", "delaware", "florida", "georgia", "guam", "hawaii", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada", "new york", "ohio", "oklahoma", "oregon", "pennsylvania", "puerto rico", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "virginia", "virgin islands", "vermont", "washington", "wisconsin", "west virginia", "wyoming"]
    states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
    
    for i in s:
        if i["org"] is None:
            arr.append(None)
            break
        sent = re.sub("[^a-zA-Z -]", "", i["org"])
        x = None
        for j in countries:
            x = re.search(j.lower(), sent.lower())
            if x is not None:
                if j.lower() == 'india':
                    x = re.search('indiana', sent.lower())
                    if x is not None:
                        arr.append("United States")
                elif j.lower() == 'georgia':
                    x = re.search('USA', sent)
                    if x is not None:
                        arr.append("United States")
                else:
                    arr.append(j)
                break
        if x is None:
            x = re.search("USA", sent)
            if x is not None:
                arr.append("United States")
                break
        if x is None:
            x = re.search("UK", sent)
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("england", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("scotland", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("wales", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            for j in states:
                x = re.search(j, sent)
                if x is not None:
                    arr.append("United States")
                    break
        if x is None:
            for j in state_names:
                x = re.search(j, sent.lower())
                if x is not None:
                    arr.append("United States")
                    break
        break
                    
    if len(arr) > 0:
        return arr[0]
    else:
        return None

getCountryUDF = udf(getCountry)

In [0]:
# Organization (affiliation of the first author)
# ID - authors.orgid
# Name - authors.org
# Country - getCountryUDF(F.arrays_zip("authors.org"))
def organization(df):
    new_df = (df
              .withColumn("ID", F.col("authors.orgid").getItem(0))
              .withColumn("Name", F.col("authors.org").getItem(0))
              .filter(F.col("ID").isNotNull())
              #.groupBy(F.col("ID")).agg(F.first(F.col("Name"), ignorenulls=True).alias("Name"))
              .dropDuplicates(["ID"])
              .withColumn("Country", getCountryUDF(F.arrays_zip("authors.org")))
              .select("ID", "Name", "Country"))
    
    new_df = new_df.na.drop("all")
    return new_df

def create_orgs_df(_df):
    org_df = organization(_df)
    
    org_cols = ('Name', 'Country')
    new_df = org_df.select('ID', 'Name', 'Country')
    
    _df = _df.withColumn("Org", F.col("authors.orgid").getItem(0))
    
    # create the new df with new IDs
    #new_df = org_df.select('ID', 'Name', 'Country').distinct()
    #new_df = new_df.withColumn('Org', F.monotonically_increasing_id())
    #new_df = new_df.withColumn('Org', new_df.Org.cast(T.StringType()))
    #new_df = new_df.fillna('missing_org_id', 'ID')
    
    # add separate org id and name columns to the original df
    #df2 = _df.withColumn("ID", F.col("authors.orgid").getItem(0)).withColumn("Name", F.col("authors.org").getItem(0))
    #df2 = df2.fillna('missing_org_id', 'ID')
    
    # join newly created IDs to the original df and replace missing org ids with generated IDs
    #_df = df2.join(new_df.select('Org', 'ID', 'Name'), on=['ID', 'Name'])
    #_df = _df.replace('missing_org_id', None)
    #_df = _df.withColumn('Org', F.coalesce(_df.ID, _df.Org))
    #_df = _df.drop('ID', 'Name')
    
    # replace missing org ids in the new df with generated IDs
    #new_df = new_df.replace('missing_org_id', None)
    #new_df = new_df.withColumn('ID', F.coalesce(new_df.ID, new_df.Org))
    #new_df = new_df.drop('Org')

    return _df, new_df

In [0]:
_df, orgs_df = create_orgs_df(_df)
display(orgs_df.limit(DISPLAY_LIMIT))

ID,Name,Country
5f71b2801c455f439fe3c575,"Chair ANSI X3L1.2, GIS Extensions to SQL",
5f71b2811c455f439fe3c57c,Arizona State University,United States
5f71b2811c455f439fe3c57e,"Adobe Research, Adobe Systems Incorporated, San Francisco, CA",United States
5f71b2811c455f439fe3c58a,ACM,
5f71b2811c455f439fe3c592,"Department of Computer Science, Brown University, Providence, RI",United States
5f71b2811c455f439fe3c599,"The Burroughs, Hendon, Middlsex University, London, UK",United Kingdom
5f71b2811c455f439fe3c59c,"Future Technologies Group, British Telecom Laboratories, MLB1 PP12, Adastral Park, Martlesham Heath, Ipswich, IP5 3RE Suffolk, UK",United Kingdom
5f71b2811c455f439fe3c5a3,"Eshraghian Labs. Pty Ltd, Bentley, WA, Australia",Australia
5f71b2811c455f439fe3c5a5,Chalmers University of Technology (e-mail: koen@cs.chalmers.se),
5f71b2811c455f439fe3c5ab,"Department of Mechanical Engineering, Columbia University New York, NY",United States


In [0]:
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,Author_ID,venue_id,Org
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)",53a72a4920f7420be8bfa51b,
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)",53a72e2020f7420be8c80142,
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,List(548a2e3ddabfae9b40134fbc),53a72e9920f7420be8c93fac,
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,List(53f46a22dabfaee0d9c3d5e5),572de199d39c4f49934b3d5c,5f71b2e91c455f439fe3f23f
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)",53a72bad20f7420be8c2d5af,
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)",53a72cfa20f7420be8c554b2,
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)",54825226582fc50b5e05610e,5f71b2bd1c455f439fe3dea6
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)",53a7271520f7420be8b8b5ba,
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)",53a72cf620f7420be8c548e2,
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)",53a72ac520f7420be8c0cd21,


#### 3.4. DBLP fact table

In [0]:
def create_dblp_df(_df):
    dblp_df = _df.select('_id','venue_id','Org','Author_ID','references','keywords','fos','title','n_citation','lang','page_start','page_end','doi','isbn','year','volume','issue')
    return _df, dblp_df.toDF('ID','Venue','Org','Authors','References','Keywords','FOS','Title','NoCitations','Lang','PageStart','PageEnd','DOI','ISBN','Year','Volume','Issue')
_df, dblp_df = create_dblp_df(_df)

In [0]:
display(dblp_df.limit(DISPLAY_LIMIT))

ID,Venue,Org,Authors,References,Keywords,FOS,Title,NoCitations,Lang,PageStart,PageEnd,DOI,ISBN,Year,Volume,Issue
53e99784b7602d9701f3e151,53a72a4920f7420be8bfa51b,,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)","List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)","List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)","List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",A solution to the problem of touching and broken characters.,17,en,602.0,605.0,10.1109/ICDAR.1993.395663,,1993,,
53e99784b7602d9701f3e15d,53a72e2020f7420be8c80142,,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)","List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)","List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)","List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",Timing yield estimation using statistical static timing analysis,28,en,2461.0,,10.1109/ISCAS.2005.1465124,0-7803-8834-8,2005,,
53e99784b7602d9701f3f411,53a72e9920f7420be8c93fac,,List(548a2e3ddabfae9b40134fbc),"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)","List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)","List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",Using XML to Integrate Existing Software Systems into the Web,28,en,167.0,172.0,10.1109/CMPSAC.2002.1044548,0-7695-1727-7,2002,,
53e99784b7602d9701f3f5fe,572de199d39c4f49934b3d5c,5f71b2e91c455f439fe3f23f,List(53f46a22dabfaee0d9c3d5e5),"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)","List(resource allocation, cpu utilization, quality of service)","List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",Research on resource allocation for multi-tier web applications in a virtualization environment,2,en,506.0,512.0,10.1007/s11704-011-0127-6,,2011,5.0,4.0
53e99792b7602d9701f5af1a,53a72bad20f7420be8c2d5af,,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)","List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)","List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)","List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",The design of awareness and operation module for the symbiotic applications.,4,en,625.0,630.0,10.1109/COGINF.2010.5599834,,2010,,
53e99792b7602d9701f5af27,53a72cfa20f7420be8c554b2,,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)","List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)","List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)","List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",Short-Term Traffic Flow Forecasting Based on MARS,11,en,669.0,675.0,10.1109/FSKD.2008.678,,2008,,
53e99792b7602d9701f5af35,54825226582fc50b5e05610e,5f71b2bd1c455f439fe3dea6,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)","List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)","List(Feature location, Distributed systems, Software reconnaissance)","List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",An approach to feature location in distributed systems,62,en,57.0,68.0,10.1016/j.jss.2004.12.018,,2006,79.0,1.0
53e99792b7602d9701f5b06f,53a7271520f7420be8b8b5ba,,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)","List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)","List(nested graph-structured representations, adjacency matrix)","List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",Nested Graph-Structured Representations for Cases,20,en,1.0,12.0,10.1007/BFb0056317,3-540-64990-5,1998,1488.0,
53e99792b7602d9701f5b074,53a72cf620f7420be8c548e2,,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)","List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)","List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)","List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",Demo: Yalut -- user-centric social networking overlay,2,en,360.0,361.0,10.1145/2594368.2601465,,2014,,
53e99792b7602d9701f5b085,53a72ac520f7420be8c0cd21,,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)","List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,9,en,75.0,88.0,10.1007/978-1-4613-1679-4_6,,1987,,


In [0]:
dblp_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Venue: string (nullable = true)
 |-- Org: string (nullable = true)
 |-- Authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- References: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FOS: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Title: string (nullable = true)
 |-- NoCitations: integer (nullable = true)
 |-- Lang: string (nullable = true)
 |-- PageStart: integer (nullable = true)
 |-- PageEnd: integer (nullable = true)
 |-- DOI: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Issue: integer (nullable = true)



### 4. Load DFs as Delta tables

In [0]:
# DBLP fact table
dblp_df.write.format('delta').mode('overwrite').saveAsTable('dblp_fact_table')
dblp_table = DeltaTable.forName(spark, 'dblp_fact_table')

In [0]:
# Venue table
venues_df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable("venues")
venue_table = DeltaTable.forName(spark, 'venues')

In [0]:
# Author table
authors_df.write.format('delta').mode('overwrite').saveAsTable('authors')
author_table = DeltaTable.forName(spark, 'authors')

In [0]:
# Organization
orgs_df.write.format('delta').mode('overwrite').saveAsTable('orgs')
org_table = DeltaTable.forName(spark, 'orgs')

### 5. Incremental updates
For simulating updates, we will read another json file into Delta tables as we did above. Then, incrementally add (stream) new entries from the new tables to the old ones.

In [0]:
logger.info(f'Row counts before streaming:\n\tDBLP fact table: {dblp_table.toDF().count()}\n\tAuthor table: {author_table.toDF().count()}\n\tVenue table: {venue_table.toDF().count()}\n\tOrganization table: {org_table.toDF().count()}')

INFO:__main__:Row counts before streaming:
	DBLP fact table: 156949
	Author table: 265555
	Venue table: 12580
	Organization table: 5370


In [0]:
from pyspark.sql.types import StructType,StructField, StringType, LongType, ArrayType

# Define the schema for incoming data
schema = StructType([
    StructField("_id", StringType(), True),
    StructField("abstract", StringType(), True),
    StructField("authors",
                ArrayType(
                    StructType([
                        StructField("_id", StringType(), True),
                        StructField("bio", StringType(), True),
                        StructField("email", StringType(), True),
                        StructField("gid", StringType(), True),
                        StructField("name", StringType(), True),
                        StructField("name_zh", StringType(), True),
                        StructField("oid", StringType(), True),
                        StructField("oid_zh", StringType(), True),
                        StructField("orcid", StringType(), True),
                        StructField("org", StringType(), True),
                        StructField("org_zh", StringType(), True),
                        StructField("orgid", StringType(), True),
                        StructField("orgs", ArrayType(StringType(), True), True),
                        StructField("orgs_zh", ArrayType(StringType(), True), True),
                        StructField("sid", StringType(), True)
                    ]),
                    True
                ),
                True
                ),
    StructField("doi", StringType(), True),
    StructField("fos", ArrayType(StringType(), True), True),
    StructField("isbn", StringType(), True),
    StructField("issn", StringType(), True),
    StructField("issue", StringType(), True),
    StructField("keywords", ArrayType(StringType(), True), True),
    StructField("lang", StringType(), True),
    StructField("n_citation", LongType(), True),
    StructField("page_end", StringType(), True),
    StructField("page_start", StringType(), True),
    StructField("pdf", StringType(), True),
    StructField("references", ArrayType(StringType(), True), True),
    StructField("title", StringType(), True),
    StructField("url", ArrayType(StringType(), True), True),
    StructField("venue",
                StructType([
                    StructField("_id", StringType(), True),
                    StructField("issn", StringType(), True),
                    StructField("name", StringType(), True),
                    StructField("name_d", StringType(), True),
                    StructField("name_s", StringType(), True),
                    StructField("online_issn", StringType(), True),
                    StructField("publisher", StringType(), True),
                    StructField("raw", StringType(), True),
                    StructField("raw_zh", StringType(), True),
                    StructField("sid", StringType(), True),
                    StructField("src", StringType(), True),
                    StructField("t", StringType(), True),
                    StructField("type", LongType(), True)
                ]),
                True
                ),
    StructField("volume", StringType(), True),
    StructField("year", LongType(), True)
])

# DF of incoming data
# Read an uncleaned split
streaming_df = (spark.readStream
                .schema(schema)
                .option("maxFilesPerTrigger", 1)
                .parquet('dbfs:/user/dblpv13/dblpv13.5.parquet')                         
)

# Clean the incoming data
streaming_df = transform(streaming_df)

# Perform this on each batch of incoming data
def update_tables(batch_df, batch_id):
    
    # From incoming data, make the same DFs that are in our warehouse
    _df, venues_df = create_venues_df(batch_df)
    _df, authors_df = create_authors_df(_df)
    _df, orgs_df = create_orgs_df(_df)
    _df, dblp_df = create_dblp_df(_df)
    
    # For each existing Delta table and new DF
    for table, df in (
        (venue_table, venues_df), # The *_table variables refer to previously created Delta tables in our warehouse
        (author_table, authors_df), 
        (org_table, orgs_df),
        (dblp_table, dblp_df)
    ):
        # Upsert each Delta table with data from the DF
        (table
         .alias("t")
         .merge(
             df.alias("s"),
             "s.ID == t.ID"
         ).whenMatchedUpdateAll() # update when id exists
        .whenNotMatchedInsertAll() # if id does not exist: insert
        .execute())


(streaming_df.writeStream
    .format("delta")
    .foreachBatch(update_tables)
    .option("checkpointLocation", "/tmp/checkpoints/proj1")
    .outputMode("update")
    .start()
)

Out[164]: <pyspark.sql.streaming.StreamingQuery at 0x7f7e859620d0>

In [0]:
# When satisfied, stop all streams
for stream in spark.streams.active:
    stream.stop()

In [0]:
logger.info(f'Row counts after streaming:\n\tDBLP fact table: {dblp_table.toDF().count()}\n\tAuthor table: {author_table.toDF().count()}\n\tVenue table: {venue_table.toDF().count()}\n\tOrganization table: {org_table.toDF().count()}')

INFO:__main__:Row counts after streaming:
	DBLP fact table: 156949
	Author table: 265555
	Venue table: 12580
	Organization table: 5370


#### 6. Queries
TODO: Add explanations (in Markdown cells) to each query about what it does.

Finds top-10 venues by number of publications.

In [0]:
%sql
SELECT Venue, Publications, Name, Raw, Type FROM 
  (SELECT Venue, COUNT(ID) AS Publications
  FROM dblp_fact_table
  GROUP BY Venue)
INNER JOIN Venues ON Venues.ID=Venue 
ORDER BY Publications DESC 
LIMIT 10;

Venue,publication_count,issn,name,raw,online_issn,publisher,type
0302-9743,1692,0302-9743,Lecture Notes in Computer Science,Lecture Notes in Computer Science,1611-3349,,Journal
555036d37cea80f95415b0ba,1111,1573-6709,Nature Communications,arXiv: Computational Geometry,,,Journal
53e180d020f7dfbc07e8b56e,984,2081-4836,arXiv: Computational Geometry,Clinical Orthopaedics and Related Research,,,Journal
539ffbf8831432abcb6033b5,896,0012-365X,Discrete Mathematics,Discrete Mathematics,,,Journal
53a72dad20f7420be8c6e63a,865,1522-4880,International Conference on Image Processing,ICIP,,,Conference
555036f57cea80f954169e28,857,0001-0782,Communications of The ACM,Commun. ACM,,,Journal
555036b87cea80f95414c3bf,833,0304-3975,Colloquium on trees in Algebra and Programming,Theor. Comput. Sci.,,,Journal
53a726bb20f7420be8b7f846,817,1050-4729,International Conference on Robotics and Automation,ICRA,,,Conference
555037547cea80f9541805e0,766,1520-6149,IEEE Transactions on Signal Processing,ICASSP,,,Journal
54824fa4582fc50b5e02e699,641,0018-9448,IEEE Transactions on Information Theory,IEEE Transactions on Information Theory,,,Journal


Finds authors who had the most publications at the single venue.

In [0]:
%sql
SELECT Venue, Author, Publications, Name AS AuthorName, VenueName, VenueRaw, VenueType FROM
  (SELECT Venue, Author, Publications, name AS VenueName, raw AS VenueRaw, type AS VenueType FROM
    (SELECT * FROM
      (SELECT Venue, Author, COUNT(ID) AS Publications
      FROM (SELECT ID, Venue, EXPLODE(Authors) AS Author FROM dblp_fact_table) 
      GROUP BY Venue, Author)
    INNER JOIN Venues ON Venues.ID=Venue)
    )
INNER JOIN Authors ON Authors.ID=Author
ORDER BY Publications DESC
LIMIT 10;

INFO:py4j.java_gateway:Callback Connection ready to receive messages
INFO:py4j.java_gateway:Received command c on object id p1


Venue,Author,publication_count,venue_name,type,author_name
555036f57cea80f954169e28,53f58b15dabfaece00f8046d,15,Communications of The ACM,Journal,Peter J. Denning
0022-4812,5484a268dabfaed7b5fa1a8b,14,JOURNAL OF SYMBOLIC LOGIC,Journal,Saharon Shelah
53a726bb20f7420be8b7f846,5440c393dabfae805a6f34a5,12,International Conference on Robotics and Automation,Conference,Vijay Kumar
53a728e520f7420be8bbc4bb,53f366a7dabfae4b3499c6fe,11,Neural Information Processing Systems,Journal,Geoffrey E. Hinton
555036f67cea80f95416a9f4,54488419dabfae87b7e38d20,11,International Conference on Computer Graphics and Interactive Techniques,Conference,Daniel Cohen-Or
53a7256420f7420be8b4e0aa,53f48082dabfae963d25a84d,11,Computer Vision and Pattern Recognition,Journal,Shree K. Nayar
54824fa4582fc50b5e02e699,54055927dabfae8faa5c5dfa,11,IEEE Transactions on Information Theory,Journal,H. Vincent Poor
555036c27cea80f954152280,5444f64edabfae87074eb9c0,11,I. J. Bifurcation and Chaos,Journal,Guanrong Chen
53a7256420f7420be8b4e0aa,53f4ec7fdabfae0354f8045b,10,Computer Vision and Pattern Recognition,Journal,Pascal Fua
555036c27cea80f954152280,54093a47dabfae8faa677518,10,I. J. Bifurcation and Chaos,Journal,Leon O. Chua
