In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

import logging
import json 


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
DISPLAY_LIMIT = 20

### The structure
1. Read data from zipped json files into one dataframe.
2. Apply necessary (cleaning) transformations to the dataframe.
3. Create the new DataFrames corresponding to our Warehouse Schema.  
4. Save the DataFrames as Delta tables.
5. Demonstrate adding new entries to the warehouse.
6. Demonstrate queries on the data.

### 1. Extract the data

In [0]:
# Uncomment one or the other line.

# This reads ALL splits into one dataframe
#_df = spark.read.option("multiline", True).json('dbfs:/user/dblpv13/dblpv13.*.json.gz')

# For testing, read just one split
_df = spark.read.option("multiline", True).json('dbfs:/user/dblpv13/dblpv13.0.json.gz')

In [0]:
# Immediately delete the abstract column because they look really annoying on GitHub.
_df = _df.drop(F.col('abstract'))

_df.printSchema()

root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- fos: array (nullable = 

### 2. Transform the data
TODO: delete all authors where author id is null.

TODO: delete all entries where org id is null.

We don't want any null values in the FK columns.

In [0]:
logger.info(f"Initially, there were {_df.count()} rows of data")

INFO:__main__:Initially, there were 250000 rows of data


In [0]:
# Drop entries with 1-word titles or empty authors or nonexistant _id or 
_df = (_df.filter((F.size(F.col('authors')) > 0) & # By default F.size() returns -1 if the value is null.
                  (F.size(F.split(F.col('title'), ' ')) > 1) &  
                  (F.col('_id') != '') & 
                  (F.col('_id').isNotNull()) & 
                  ~(F.array_contains(F.col('references'), ''))))

In [0]:
# remove forewords
_df = _df.filter(~F.lower(F.col("title")).contains("foreword"))

In [0]:
# Convert n_citation data type to int
_df = _df.withColumn('n_citation', F.col('n_citation').cast('int'))

In [0]:
# Replace empty language values with null.
_df = _df.withColumn('lang', F.when(F.col('lang') == '', None).otherwise(F.col('lang')))

In [0]:
# Replace empty 'keyword' and 'fos' arrays with null values.
_df = (_df.withColumn('keywords', F.when(F.size(F.col('keywords')) == 0, None).otherwise(F.col('keywords')))
          .withColumn('fos', F.when(F.size(F.col('fos')) == 0, None).otherwise(F.col('fos'))))

In [0]:
# Replace non-numeric page numbers with nulls and convert column type to int. Then replace 0 page numbers with nulls as well.
_df = (_df.withColumn('page_start', F.when(F.col('page_start').cast('int').isNotNull(), F.col('page_start')).otherwise(None)) # replace non-numeric page numbers with null
          .withColumn('page_end', F.when(F.col('page_end').cast('int').isNotNull(), F.col('page_end')).otherwise(None))
          .withColumn('page_start', F.col('page_start').cast('int')) # convert column type to int
          .withColumn('page_end', F.col('page_end').cast('int'))
          .withColumn('page_start', F.when(F.col('page_start') == 0, None).otherwise(F.col('page_start'))) # replace 0 page numbers with null as well
          .withColumn('page_end', F.when(F.col('page_end') == 0, None).otherwise(F.col('page_end'))))

In [0]:
# Replace empty dois with nulls.
_df = _df.withColumn('doi', F.when(F.col('doi') == '', None).otherwise(F.col('doi')))

In [0]:
# Replace empty years with nulls and change data type to int.
_df = (_df.withColumn('year', F.when(F.col('year') == 0, None).otherwise(F.col('year')))
          .withColumn('year', F.col('year').cast('int')))

In [0]:
# Replace non-numeric volume and issue numbers with null and convert data types to int. Then repalce 0 values with null as well.
_df = (_df.withColumn('volume', F.when(F.col('volume').cast('int').isNotNull(), F.col('volume')).otherwise(None)) # replace non-numeric values
          .withColumn('issue', F.when(F.col('issue').cast('int').isNotNull(), F.col('issue')).otherwise(None))
          .withColumn('volume', F.col('volume').cast('int')) # convert column type to int
          .withColumn('issue', F.col('issue').cast('int'))
          .withColumn('volume', F.when(F.col('volume') == 0, None).otherwise(F.col('volume'))) # replace 0 issue and volume numbers with null as well.
          .withColumn('issue', F.when(F.col('issue') == 0, None).otherwise(F.col('issue'))))

In [0]:
# replace empty strings in some columns with nulls
def replace_empty_string(col):
    return F.when(col == "", None).otherwise(col)

venue = F.col("venue")
for col in ["_id", "issn", "name", "name_d", "name_s", "online_issn", "publisher", "raw", "raw_zh", "t"]:
    venue = venue.withField(col, replace_empty_string(F.col(f"venue.{col}")))
    
_df = (
    _df
    .withColumn("venue", venue)
    .withColumn("issn", replace_empty_string(F.col("issn")))
    .withColumn("isbn", replace_empty_string(F.col("isbn")))
    .withColumn("isbn", F.when(F.col("isbn") == "isbn", None).otherwise(F.col("isbn")))
    .withColumn("issn", F.when(F.col("issn") == "issn", None).otherwise(F.col("issn")))
)

In [0]:
# fix incorrect issn
_df = (_df
           .withColumn("issn",
                       F.when(F.length(F.col("issn")) == 9, F.col("issn"))
                       .when(F.length(F.col("issn")) == 8, F.concat(F.col("issn").substr(1, 4), F.lit("-"), F.col("issn").substr(5, 4)))
                       .when(F.col("issn").contains("E-ISBN"), F.col("issn").substr(1, 9))
                       .otherwise(None)
                      )
           .withColumn("venue", 
                       F.col("venue")
                       .withField("issn", F.coalesce(F.col("venue.issn"), F.col("issn")))
                       )
           .drop("issn")
          )

In [0]:
# replace venue with null fields with null
venue_is_empty = (
    F.col("venue.issn").isNull() &
    F.col("venue.name").isNull() &
    F.col("venue.name_d").isNull() &
    F.col("venue.name_s").isNull() &
    F.col("venue.online_issn").isNull() &
    F.col("venue.publisher").isNull() &
    F.col("venue.raw").isNull() &
    F.col("venue.raw_zh").isNull()
)
_df = _df.withColumn("venue", F.when(venue_is_empty, None).otherwise(F.col("venue")))

In [0]:
# remove rows with null venues
_df = _df.filter(F.col("venue").isNotNull())

In [0]:
# coalescing venue._id and venue.issn to make up for missing ids
_df = _df.withColumn("venue", F.col("venue").withField("_id", F.coalesce(F.col("venue._id"), F.col("venue.issn"))))

# removing rows with venue id null
_df = _df.filter(F.col("venue._id").isNotNull())

In [0]:
logger.info(f"Now, there are {_df.count()} rows of data")

_df.printSchema()

INFO:__main__:Now, there are 163401 rows of data
root
 |-- _id: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- sid: string (nullable = true)
 |-- doi: stri

In [0]:
# Display endresult
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)","List(53a72e9920f7420be8c93fac, null, null, Computer Software and Applications Conference, null, null, null, COMPSAC, null, null, null, null, 0)",,2002
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)","List(572de199d39c4f49934b3d5c, 1673-7350, null, null, null, null, null, Frontiers of Computer Science in China, null, null, null, null, 0)",5.0,2011
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)","List(53a72bad20f7420be8c2d5af, null, null, IEEE International Conference on Cognitive Informatics, null, null, null, IEEE ICCI, null, null, null, null, 0)",,2010
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)","List(53a72cfa20f7420be8c554b2, null, null, null, null, null, null, FSKD (5), null, null, null, null, 0)",,2008
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)","List(54825226582fc50b5e05610e, 0164-1212, null, null, null, null, null, Journal of Systems and Software, null, null, null, null, 0)",79.0,2006
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)","List(53a7271520f7420be8b8b5ba, 0302-9743, null, null, null, null, null, EWCBR, null, null, null, null, 0)",1488.0,1998
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)","List(53a72cf620f7420be8c548e2, null, null, null, null, null, null, MobiSys, null, null, null, null, 0)",,2014
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)","List(53a72ac520f7420be8c0cd21, null, null, null, null, null, null, IWDM, null, null, null, null, 0)",,1987


### 3. Create the new DFs
#### 3.1. Venue DF

In [0]:
# creating the venues df
venues_df = (_df
             .withColumn("has_volume_or_issue", F.when(F.col("volume").isNotNull() | F.col("issue").isNotNull(), True).otherwise(None))
             .select("venue.*", "has_volume_or_issue")
             .filter(F.col("_id").isNotNull())
             .drop("src", "sid", "type").distinct())

# removing the columns from the original df
_df = _df.withColumn("venue_id", F.col("venue._id")).drop("venue")

display(venues_df.limit(DISPLAY_LIMIT))

_id,issn,name,name_d,name_s,online_issn,publisher,raw,raw_zh,t,has_volume_or_issue
53a72bad20f7420be8c2d5af,,,IEEE International Conference on Cognitive Informatics,,,,IEEE ICCI,,,
555036db7cea80f9541603d7,,,,,,,J. Multivariate Analysis,,,True
555036ba7cea80f95414db52,,,Advances in Operations Research,,,,Adv. Operations Research,,,True
53a72e2020f7420be8c80142,,,International Symposium on Circuits and Systems,,,,ISCAS (3),,,
53a724b320f7420be8b37f4c,,,Hawaii International Conference on System Sciences,,,,HICSS,,,
53a72cf620f7420be8c548e2,,,,,,,MobiSys,,,
555036c77cea80f954155203,,,IEEE Transactions on Circuits and Systems,,,,IEEE Trans. on Circuits and Systems,,,True
0377-2217,0377-2217,,,,,North-Holland,European Journal of Operational Research,,J,True
555036b77cea80f95414b7de,1869-1919,,,,,,SCIENCE CHINA Information Sciences,,,True
53a72a4920f7420be8bfa51b,,,International Conference on Document Analysis and Recognition,,,,ICDAR-1,,,


In [0]:
# combining rows with the same id, but different column values
# taking the first non-null value for the id as the column value

venue_columns = (
    "issn",
    "name",
    "name_d",
    "name_s",
    "raw",
    "raw_zh",
    "online_issn",
    "publisher",
    "t",
    "has_volume_or_issue"
)

venues_df = venues_df.groupBy(F.col("_id")).agg(*(F.first(F.col(col), ignorenulls=True).alias(col) for col in venue_columns))

In [0]:
venues_df = (
    venues_df
    # coalescing the name and raw columns
    .withColumn("raw", F.coalesce(
            F.col("raw"), 
            F.col("raw_zh"),
        ))
    .withColumn("name", F.coalesce(
            F.col("name"), 
            F.col("name_d"),
        ))
    .drop("name_d", "name_s", "raw_zh") 
    # creating the type field
    .withColumn("type",            
               F.when(
                   (
                       F.col("raw").contains("@") | 
                       F.lower(F.col("raw")).contains("workshop") |
                       F.lower(F.col("name")).contains("workshop")
                   ), 
                   "Workshop"
               ).when(
                   (F.col("t") == "J"),
                   "Journal"
               ).when(
                   (
                       (F.col("t") == "C") |
                       F.lower(F.col("raw")).contains("conference") |
                       F.lower(("name")).contains("conference") |
                       F.lower(F.col("raw")).contains("symposium") |
                       F.lower(("name")).contains("symposium") |
                       F.lower(F.col("raw")).contains("proceedings") |
                       F.lower(("name")).contains("proceedings")
                   ),
                   "Conference"
               ).when(
                   (
                       F.lower(F.col("raw")).contains("journal") |
                       F.lower(("name")).contains("journal") |
                       F.col("has_volume_or_issue")
                   ),
                   "Journal"
               ).otherwise(None)
      )
    .drop("t", "has_volume_or_issue")
)

display(venues_df.limit(DISPLAY_LIMIT))

_id,issn,name,raw,online_issn,publisher,type
0001-0782,0001-0782,,COMMUNICATIONS OF THE ACM,1557-7317,,Journal
0001-253X,0001-253X,,ASLIB PROCEEDINGS,1758-3748,,Journal
0001-2815,0001-2815,,TISSUE ANTIGENS,,,Journal
0001-4966,0001-4966,,JOURNAL OF THE ACOUSTICAL SOCIETY OF AMERICA,1520-8524,,Journal
0001-5903,0001-5903,,Acta Informatica,,,Journal
0001-8708,0001-8708,,Advances in Mathematics,,Academic Press,Journal
0002-8231,0002-8231,,JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE,,,Journal
0002-9149,0002-9149,,AMERICAN JOURNAL OF CARDIOLOGY,,,Journal
0002-9343,0002-9343,,AMERICAN JOURNAL OF MEDICINE,,,Journal
0002-9378,0002-9378,,AMERICAN JOURNAL OF OBSTETRICS AND GYNECOLOGY,,,Journal


In [0]:
# save the venues table
venues_df.write.format("delta").mode("overwrite").saveAsTable("venues") 

In [0]:
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,venue_id
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,53a72a4920f7420be8bfa51b
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,53a72e2020f7420be8c80142
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,53a72e9920f7420be8c93fac
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,572de199d39c4f49934b3d5c
53e99792b7602d9701f5af1a,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625.0,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)",,2010,53a72bad20f7420be8c2d5af
53e99792b7602d9701f5af27,"List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669.0,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)",,2008,53a72cfa20f7420be8c554b2
53e99792b7602d9701f5af35,"List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57.0,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)",79.0,2006,54825226582fc50b5e05610e
53e99792b7602d9701f5b06f,"List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1.0,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)",1488.0,1998,53a7271520f7420be8b8b5ba
53e99792b7602d9701f5b074,"List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360.0,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)",,2014,53a72cf620f7420be8c548e2
53e99792b7602d9701f5b085,"List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75.0,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)",,1987,53a72ac520f7420be8c0cd21


#### 3.2. Author DF

In [0]:
# Create the Authors DF

df2 = _df.withColumn('auth_expl', F.explode(F.col("authors"))) # explode the authors array
df2 = (df2.withColumn('auth_id', F.col('auth_expl._id')) # separate the authors id and name
          .withColumn('auth_name', F.col('auth_expl.name')))

authors_df = df2.select('auth_id', 'auth_name').distinct() # make the authors df of distinct auth_id and auth_name pairs
# there were only 92 rows where both auth id and name were null.
# altogether, there are 400k unique authors.

authors_df = (authors_df.withColumnRenamed('auth_id', 'ID')
                        .withColumnRenamed('auth_name', 'Name'))

del df2
display(authors_df.limit(DISPLAY_LIMIT))

ID,Name
53f46a22dabfaee0d9c3d5e5,Shuguo Yang
53f431bcdabfaee2a1cb41b5,Sunyoung Ahn
53f45ee9dabfaee43ecda842,Chris C. N. Chu
548a2e3ddabfae9b40134fbc,Harry M. Sneed
53f39e3edabfae4b34aa8c4a,Jungil Park
53f42e8cdabfaee1c0a4274e,Hai Zhou
53f44f6adabfaedf435efcb8,James Jungho Pak
53f42d8bdabfaec22ba1a1e3,M. M. Gore
53f43b03dabfaedce555bf2a,Min Pan
54328883dabfaeb4c6a8a699,Theo Pavlidis


In [0]:
# Generate the authors FK column in the original DF
_df = _df.withColumn('Author_ID', F.col('authors._id'))
display(_df.limit(DISPLAY_LIMIT))

_id,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,volume,year,venue_id,Author_ID
53e99784b7602d9701f3e133,"List(List(53f45728dabfaec09f209538, null, null, null, Peijuan Wang, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e59457, null, null, null, Jiahua Zhang, null, null, null, null, null, null, null, null, null, null), List(53f38438dabfae4b34a08928, null, null, null, Donghui Xie, null, null, null, null, null, null, null, null, null, null), List(5601754345cedb3395e5945a, null, null, null, Yanyan Xu, null, null, null, null, null, null, null, null, null, null), List(53f43d25dabfaeecd6995149, null, null, null, Yun Xu, null, null, null, null, null, null, null, null, null, null))",10.1109/IGARSS.2011.6049503,"List(Agronomy, Moisture, Hydrology, Environmental science, Dry weight, Water content, Stomatal conductance, Transpiration, Irrigation, Soil water, Canopy)",,,"List(canopy parameters, canopy spectrum, different soil water content control, winter wheat, irrigation, hydrology, radiometry, moisture, indexes, vegetation, indexation, dry weight, soil moisture, water content, indexing terms, spectrum, natural disaster)",en,0,1933.0,1930.0,,,The relationship between canopy parameters and spectrum of winter wheat under different irrigations in Hebei Province.,List(http://dx.doi.org/10.1109/IGARSS.2011.6049503),,2011,8021.0,"List(53f45728dabfaec09f209538, 5601754345cedb3395e59457, 53f38438dabfae4b34a08928, 5601754345cedb3395e5945a, 53f43d25dabfaeecd6995149)"
53e99784b7602d9701f3e151,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602.0,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),,1993,3130.0,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)"
53e99784b7602d9701f3e15d,"List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461.0,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)",,2005,10395.0,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)"
53e99784b7602d9701f3e922,"List(List(53f39e3edabfae4b34aa8c4a, null, null, null, Jungil Park, null, null, null, null, null, null, null, null, null, 237372), List(53f431bcdabfaee2a1cb41b5, null, null, null, Sunyoung Ahn, null, null, null, null, null, null, null, null, null, 24447851), List(53f46ac3dabfaeee22a63eab, null, null, null, Youngmi Kim Pak, null, null, null, null, null, null, null, null, null, 4241287), List(53f44f6adabfaedf435efcb8, null, null, null, James Jungho Pak, null, null, null, null, null, null, null, null, null, 22875855))",10.1109/NEMS.2009.5068754,,,,,en,1,1057.0,1054.0,//static.aminer.org/pdf/PDF/002/845/190/.pdf,,International Conference on Nano/Micro Engineered and Molecular Systems,List(http://doi.ieeecomputersociety.org/10.1109/NEMS.2009.5068754),,2009,10582.0,"List(53f39e3edabfae4b34aa8c4a, 53f431bcdabfaee2a1cb41b5, 53f46ac3dabfaeee22a63eab, 53f44f6adabfaedf435efcb8)"
53e99784b7602d9701f3f411,"List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167.0,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)",,2002,7268.0,List(548a2e3ddabfae9b40134fbc)
53e99784b7602d9701f3f5fe,"List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506.0,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)",5.0,2011,9794.0,List(53f46a22dabfaee0d9c3d5e5)
53e99784b7602d9701f3f600,"List(List(560175ed45cedb3395e5a3a0, null, null, 5b868bb4e1cd8e14a32ddb28, Sandeep Gupta, null, null, null, null, Alcatel Development Center, Udyog Vihar, Gurgaon, India, null, 5f71b3b51c455f439fe44c34, null, null, null), List(53f42d8bdabfaec22ba1a1e3, null, null, 5b86b0bfe1cd8e14a32127bb, M. M. Gore, null, null, null, null, Department of Computer Science and Engineering, Motilal Nehru National Institute of Technology, Allahabad, India, null, 5f71b3271c455f439fe40d90, null, null, null))",10.1007/978-3-540-30536-1_64,"List(Changeover, Transport engineering, Planner, New delhi, Public transport, Schedule, Local bus, Engineering, Metropolitan area)",3-540-24076-4,,"List(city road, local bus service, exact bus route, efficient travel planner, bus travel, particular destination, city bus, bus movement, proposed system, large metropolitan city, current location, real time, public transport)",en,0,537.0,537.0,,,BUSTRAP – an efficient travel planner for metropolitans,List(http://dx.doi.org/10.1007/978-3-540-30536-1_64),,2004,16468.0,"List(560175ed45cedb3395e5a3a0, 53f42d8bdabfaec22ba1a1e3)"
53e99785b7602d9701f418d5,"List(List(53f4d0e8dabfaeedcf77e71f, null, null, null, John D. McGregor, null, null, null, null, null, null, null, null, null, 1333185))",,,,7.0,,en,15,81.0,71.0,,,Domain,List(http://dx.doi.org/10.5381/jot.2004.3.7.c6),3.0,2004,3383.0,List(53f4d0e8dabfaeedcf77e71f)
53e99785b7602d9701f42477,"List(List(null, null, null, null, &NA;, null, null, null, null, null, null, null, null, null, null))",10.1097/00002480-196204000-00076,,,1.0,,en,19,,,,,Discussion:,"List(http://dx.doi.org/10.1097/00002480-196204000-00076, http://dx.doi.org/10.1198/004017004000000059)",8.0,1962,14642.0,List(null)
53e99785b7602d9701f4247a,"List(List(53f42f45dabfaeb22f420420, null, cbbc@dennischammond.com, null, Dennis C. Hammond, null, null, null, null, null, null, null, null, null, null))",10.1097/PRS.0b013e3181b1799e,,,3.0,,en,4,705.0,704.0,//static.aminer.org/pdf/PDF/003/103/434/discussion.pdf,,Discussion:,"List(http://dx.doi.org/10.1097/PRS.0b013e3181b1799e, http://dx.doi.org/10.1080/00401706.2000.10485712, http://www.webofknowledge.com/)",124.0,2009,13746.0,List(53f42f45dabfaeb22f420420)


#### 3.3. Organization DF

In [0]:
# Create the new df
import re

def getCountry(s):
    if s is None:
        return None
    arr = []
    countries = ["Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua & Deps", "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain", "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan", "Bolivia", "Bosnia", "Botswana", "Brazil", "Brunei", "Bulgaria", "Burkina", "Burundi", "Cambodia", "Cameroon", "Canada", "Cape Verde", "Central African Republic", "Chad", "Chile", "China", "Colombia", "Comoros", "Congo", "Congo Democratic Republic", "Costa Rica", "Croatia", "Cuba", "Cyprus", "Czech Republic", "Denmark", "Djibouti", "Dominica", "Dominican Republic", "East Timor", "Ecuador", "Egypt", "El Salvador", "Equatorial Guinea", "Eritrea", "Estonia", "Ethiopia", "Fiji", "Finland", "France", "Gabon", "Gambia", "Georgia", "Germany", "Ghana", "Greece", "Grenada", "Guatemala", "Guinea", "Guinea-bissau", "Guyana", "Haiti", "Honduras", "Hungary", "Iceland", "India", "Indonesia", "Iran", "Iraq", "Ireland", "Israel", "Italy", "Ivory Coast", "Jamaica", "Japan", "Jordan", "Kazakhstan", "Kenya", "Kiribati", "South Korea", "Kosovo", "Kuwait", "Kyrgyzstan", "Laos", "Latvia", "Lebanon", "Lesotho", "Liberia", "Libya", "Liechtenstein", "Lithuania", "Luxembourg", "Macedonia", "Madagascar", "Malawi", "Malaysia", "Maldives", "Mali", "Malta", "Marshall Islands", "Mauritania", "Mauritius", "Mexico", "Micronesia", "Moldova", "Monaco", "Mongolia", "Montenegro", "Morocco", "Mozambique", "Myanmar", "Burma", "Namibia", "Nauru", "Nepal", "Netherlands", "New Zealand", "Nicaragua", "Niger", "Nigeria", "Norway", "Romania", "Pakistan", "Palau", "Panama", "Papua New Guinea", "Paraguay", "Peru", "Philippines", "Poland", "Portugal", "Qatar", "Oman", "Russia", "Rwanda", "St Kitts & Nevis", "St Lucia", "Saint Vincent & The Grenadines", "Samoa", "San Marino", "Sao Tome & Principe", "Saudi Arabia", "Senegal", "Serbia", "Seychelles", "Sierra Leone", "Singapore", "Slovakia", "Slovenia", "Solomon Islands", "Somalia", "South Africa", "South Sudan", "Spain", "Sri Lanka", "Sudan", "Suriname", "Swaziland", "Sweden", "Switzerland", "Syria", "Taiwan", "Tajikistan", "Tanzania", "Thailand", "Togo", "Tonga", "Trinidad & Tobago", "Tunisia", "Turkey", "Turkmenistan", "Tuvalu", "Uganda", "Ukraine", "United Arab Emirates", "United Kingdom", "United States", "Uruguay", "Uzbekistan", "Vanuatu", "Vatican City", "Venezuela", "Vietnam", "Yemen", "Zambia", "Zimbabwe"]
    state_names = ["alaska", "alabama", "arkansas", "american samoa", "arizona", "california", "colorado", "connecticut", "district ", "of columbia", "delaware", "florida", "georgia", "guam", "hawaii", "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", "louisiana", "massachusetts", "maryland", "maine", "michigan", "minnesota", "missouri", "mississippi", "montana", "north carolina", "north dakota", "nebraska", "new hampshire", "new jersey", "new mexico", "nevada", "new york", "ohio", "oklahoma", "oregon", "pennsylvania", "puerto rico", "rhode island", "south carolina", "south dakota", "tennessee", "texas", "utah", "virginia", "virgin islands", "vermont", "washington", "wisconsin", "west virginia", "wyoming"]
    states = ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY']
    
    for i in s:
        if i["org"] is None:
            arr.append(None)
            break
        sent = re.sub("[^a-zA-Z -]", "", i["org"])
        x = None
        for j in countries:
            x = re.search(j.lower(), sent.lower())
            if x is not None:
                if j.lower() == 'india':
                    x = re.search('indiana', sent.lower())
                    if x is not None:
                        arr.append("United States")
                elif j.lower() == 'georgia':
                    x = re.search('USA', sent)
                    if x is not None:
                        arr.append("United States")
                else:
                    arr.append(j)
                break
        if x is None:
            x = re.search("USA", sent)
            if x is not None:
                arr.append("United States")
                break
        if x is None:
            x = re.search("UK", sent)
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("england", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("scotland", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            x = re.search("wales", sent.lower())
            if x is not None:
                arr.append("United Kingdom")
                break
        if x is None:
            for j in states:
                x = re.search(j, sent)
                if x is not None:
                    arr.append("United States")
                    break
        if x is None:
            for j in state_names:
                x = re.search(j, sent.lower())
                if x is not None:
                    arr.append("United States")
                    break
        break
                    
    if len(arr) > 0:
        return arr[0]
    else:
        return None

getCountryUDF = udf(getCountry)

# Organization (affiliation of the first author)
# ID - authors.orgid
# Name - authors.org
# Country - getCountryUDF(F.arrays_zip("authors.org"))
def organization(df):
    new_df = df.select(F.col("authors.orgid").getItem(0).alias("org_id"),
                       F.col("authors.org").getItem(0).alias("org_name"),
                       (getCountryUDF(F.arrays_zip("authors.org"))).alias("org_country"))
    new_df = new_df.na.drop("all")
    return new_df

org_df = organization(_df)

new_df = org_df.select('org_id', 'org_name', 'org_country').distinct()
new_df = new_df.withColumn('ID', F.monotonically_increasing_id())

In [0]:
# Join ID to original dataframe (_df)

# TODO: make sure that at the end, _df has the column 'Org' with the FK of the organization.
# TODO: Fix the error

df_temp = _df.withColumn("org_id", F.col("authors.orgid").getItem(0)).withColumn("org_name", F.col("authors.org").getItem(0))
df_with_id = df_temp.join(new_df.select('ID', 'org_id', 'org_name'), on=['org_id', 'org_name'])
df_with_id = final_df.drop('org_id', 'org_name')

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-390784252746146>[0m in [0;36m<module>[0;34m[0m
[1;32m      2[0m [0mdf_temp[0m [0;34m=[0m [0m_df[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"org_id"[0m[0;34m,[0m [0mF[0m[0;34m.[0m[0mcol[0m[0;34m([0m[0;34m"authors.orgid"[0m[0;34m)[0m[0;34m.[0m[0mgetItem[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m)[0m[0;34m.[0m[0mwithColumn[0m[0;34m([0m[0;34m"org_name"[0m[0;34m,[0m [0mF[0m[0;34m.[0m[0mcol[0m[0;34m([0m[0;34m"authors.org"[0m[0;34m)[0m[0;34m.[0m[0mgetItem[0m[0;34m([0m[0;36m0[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0mdf_with_id[0m [0;34m=[0m [0mdf_temp[0m[0;34m.[0m[0mjoin[0m[0;34m([0m[0mnew_df[0m[0;34m.[0m[0mselect[0m[0;34m([0m[0;34m'ID'[0m[0;34m,[0m [0;34m'org_id'[0m[0;34m,[

#### 3.4. DBLP fact table

In [0]:
dblp_df = _df.select('_id','venue_id','Org','Author_ID','references','keywords','fos','title','n_citation','lang','page_start','page_end','doi','isbn','year','volume','issue')

dblp_df = dblp_df.toDF('ID','Venue','Org','Authors','References','Keywords','FOS','Title','NoCitations','Lang','PageStart','PageEnd','DOI','ISBN','Year','Volume','Issue')

In [0]:
display(dblp_df.limit(DISPLAY_LIMIT))

ID,Venue,Authors,References,Keywords,FOS,Title,NoCitations,Lang,PageStart,PageEnd,DOI,ISBN,Year,Volume,Issue
53e99784b7602d9701f3e133,8021.0,"List(53f45728dabfaec09f209538, 5601754345cedb3395e59457, 53f38438dabfae4b34a08928, 5601754345cedb3395e5945a, 53f43d25dabfaeecd6995149)",,"List(canopy parameters, canopy spectrum, different soil water content control, winter wheat, irrigation, hydrology, radiometry, moisture, indexes, vegetation, indexation, dry weight, soil moisture, water content, indexing terms, spectrum, natural disaster)","List(Agronomy, Moisture, Hydrology, Environmental science, Dry weight, Water content, Stomatal conductance, Transpiration, Irrigation, Soil water, Canopy)",The relationship between canopy parameters and spectrum of winter wheat under different irrigations in Hebei Province.,0,en,1930.0,1933.0,10.1109/IGARSS.2011.6049503,,2011,,
53e99784b7602d9701f3e151,3130.0,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)","List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)","List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)","List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",A solution to the problem of touching and broken characters.,17,en,602.0,605.0,10.1109/ICDAR.1993.395663,,1993,,
53e99784b7602d9701f3e15d,10395.0,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)","List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)","List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)","List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",Timing yield estimation using statistical static timing analysis,28,en,2461.0,,10.1109/ISCAS.2005.1465124,0-7803-8834-8,2005,,
53e99784b7602d9701f3e922,10582.0,"List(53f39e3edabfae4b34aa8c4a, 53f431bcdabfaee2a1cb41b5, 53f46ac3dabfaeee22a63eab, 53f44f6adabfaedf435efcb8)",,,,International Conference on Nano/Micro Engineered and Molecular Systems,1,en,1054.0,1057.0,10.1109/NEMS.2009.5068754,,2009,,
53e99784b7602d9701f3f411,7268.0,List(548a2e3ddabfae9b40134fbc),"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)","List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)","List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",Using XML to Integrate Existing Software Systems into the Web,28,en,167.0,172.0,10.1109/CMPSAC.2002.1044548,0-7695-1727-7,2002,,
53e99784b7602d9701f3f5fe,9794.0,List(53f46a22dabfaee0d9c3d5e5),"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)","List(resource allocation, cpu utilization, quality of service)","List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",Research on resource allocation for multi-tier web applications in a virtualization environment,2,en,506.0,512.0,10.1007/s11704-011-0127-6,,2011,5.0,4.0
53e99784b7602d9701f3f600,16468.0,"List(560175ed45cedb3395e5a3a0, 53f42d8bdabfaec22ba1a1e3)",,"List(city road, local bus service, exact bus route, efficient travel planner, bus travel, particular destination, city bus, bus movement, proposed system, large metropolitan city, current location, real time, public transport)","List(Changeover, Transport engineering, Planner, New delhi, Public transport, Schedule, Local bus, Engineering, Metropolitan area)",BUSTRAP – an efficient travel planner for metropolitans,0,en,537.0,537.0,10.1007/978-3-540-30536-1_64,3-540-24076-4,2004,,
53e99785b7602d9701f418d5,3383.0,List(53f4d0e8dabfaeedcf77e71f),,,,Domain,15,en,71.0,81.0,,,2004,3.0,7.0
53e99785b7602d9701f42477,14642.0,List(null),,,,Discussion:,19,en,,,10.1097/00002480-196204000-00076,,1962,8.0,1.0
53e99785b7602d9701f4247a,13746.0,List(53f42f45dabfaeb22f420420),,,,Discussion:,4,en,704.0,705.0,10.1097/PRS.0b013e3181b1799e,,2009,124.0,3.0


In [0]:
dblp_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- Venue: long (nullable = true)
 |-- Authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- References: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Keywords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FOS: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Title: string (nullable = true)
 |-- NoCitations: integer (nullable = true)
 |-- Lang: string (nullable = true)
 |-- PageStart: integer (nullable = true)
 |-- PageEnd: integer (nullable = true)
 |-- DOI: string (nullable = true)
 |-- ISBN: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Issue: integer (nullable = true)



### 4. Load DFs as Delta tables

In [0]:
# DBLP fact table


In [0]:
# Venue table

In [0]:
# Author table


In [0]:
# Organization
