In [0]:
# Define the list (or any other type of iterable) of splits to save
saved_splits = range(0, 5) # Saves the first 5 splits

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

import logging
import json
import re

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [0]:
def replace_empty_string(col):
    return F.when(col == "", None).otherwise(col)

def transform(_df):
    # Create the col of author IDs
    _df = _df.withColumn('Author_ID', F.col('authors._id'))
    # Delete entries where any author ID is null
    _df = _df.where("!exists(Author_ID, x -> x is null)")
    # Drop entries with 1-word titles or empty authors or nonexistant _id or any nonexistant author id.
    # Also removes empty or missing references.
    _df = (_df.filter((F.size(F.col('authors')) > 0) & # By default F.size() returns -1 if the value is null.
                      (F.size(F.split(F.col('title'), ' ')) > 1) &  
                      (F.col('_id') != '') & 
                      (F.col('_id').isNotNull()) & 
                      ~(F.array_contains(F.col('references'), '')) & 
                      ~(F.array_contains(F.col('Author_ID'), ''))))
    # Remove all null references
    _df = _df.withColumn('references', F.expr('filter(references, x -> x is not null)'))    
    # Remove entries that are forewords
    _df = _df.filter(~F.lower(F.col("title")).contains("foreword"))
    # Convert n_citation data type to int
    _df = _df.withColumn('n_citation', F.col('n_citation').cast('int'))
    # Replace empty language values with null.
    _df = _df.withColumn('lang', F.when(F.col('lang') == '', None).otherwise(F.col('lang')))
    # Replace empty 'keyword' and 'fos' arrays with null values.
    _df = (_df.withColumn('keywords', F.when(F.size(F.col('keywords')) == 0, None).otherwise(F.col('keywords')))
              .withColumn('fos', F.when(F.size(F.col('fos')) == 0, None).otherwise(F.col('fos'))))
    # Replace non-numeric page numbers with nulls and convert column type to int. Then replace 0 page numbers with nulls as well.
    _df = (_df.withColumn('page_start', F.when(F.col('page_start').cast('int').isNotNull(), F.col('page_start')).otherwise(None)) # replace non-numeric page numbers with null
              .withColumn('page_end', F.when(F.col('page_end').cast('int').isNotNull(), F.col('page_end')).otherwise(None))
              .withColumn('page_start', F.col('page_start').cast('int')) # convert column type to int
              .withColumn('page_end', F.col('page_end').cast('int'))
              .withColumn('page_start', F.when(F.col('page_start') == 0, None).otherwise(F.col('page_start'))) # replace 0 page numbers with null as well
              .withColumn('page_end', F.when(F.col('page_end') == 0, None).otherwise(F.col('page_end'))))
    # Replace empty dois with nulls.
    _df = _df.withColumn('doi', F.when(F.col('doi') == '', None).otherwise(F.col('doi')))
    # Replace empty years with nulls and change data type to int.
    _df = (_df.withColumn('year', F.when(F.col('year') == 0, None).otherwise(F.col('year')))
              .withColumn('year', F.col('year').cast('int')))
    # Replace non-numeric volume and issue numbers with null and convert data types to int. Then repalce 0 values with null as well.
    _df = (_df.withColumn('volume', F.when(F.col('volume').cast('int').isNotNull(), F.col('volume')).otherwise(None)) # replace non-numeric values
              .withColumn('issue', F.when(F.col('issue').cast('int').isNotNull(), F.col('issue')).otherwise(None))
              .withColumn('volume', F.col('volume').cast('int')) # convert column type to int
              .withColumn('issue', F.col('issue').cast('int'))
              .withColumn('volume', F.when(F.col('volume') == 0, None).otherwise(F.col('volume'))) # replace 0 issue and volume numbers with null as well.
              .withColumn('issue', F.when(F.col('issue') == 0, None).otherwise(F.col('issue'))))
    
    # Replace empty strings in some columns with nulls
    venue = F.col("venue")
    for col in ["_id", "issn", "name", "name_d", "name_s", "online_issn", "publisher", "raw", "raw_zh", "t"]:
        venue = venue.withField(col, replace_empty_string(F.col(f"venue.{col}")))  
    _df = (
        _df
        .withColumn("venue", venue)
        .withColumn("issn", replace_empty_string(F.col("issn")))
        .withColumn("isbn", replace_empty_string(F.col("isbn")))
        .withColumn("isbn", F.when(F.col("isbn") == "isbn", None).otherwise(F.col("isbn")))
        .withColumn("issn", F.when(F.col("issn") == "issn", None).otherwise(F.col("issn")))
    )
    # fix incorrect issn
    _df = (_df
               .withColumn("issn",
                           F.when(F.length(F.col("issn")) == 9, F.col("issn"))
                           .when(F.length(F.col("issn")) == 8, F.concat(F.col("issn").substr(1, 4), F.lit("-"), F.col("issn").substr(5, 4)))
                           .when(F.col("issn").contains("E-ISBN"), F.col("issn").substr(1, 9))
                           .otherwise(None)
                          )
               .withColumn("venue", 
                           F.col("venue")
                           .withField("issn", F.coalesce(F.col("venue.issn"), F.col("issn")))
                           )
               .drop("issn")
              )
    # replace venue with null fields with null
    venue_is_empty = (
        F.col("venue.issn").isNull() &
        F.col("venue.name").isNull() &
        F.col("venue.name_d").isNull() &
        F.col("venue.name_s").isNull() &
        F.col("venue.online_issn").isNull() &
        F.col("venue.publisher").isNull() &
        F.col("venue.raw").isNull() &
        F.col("venue.raw_zh").isNull()
    )
    _df = _df.withColumn("venue", F.when(venue_is_empty, None).otherwise(F.col("venue")))
    # remove rows with null venues
    _df = _df.filter(F.col("venue").isNotNull())
    # coalescing venue._id and venue.issn to make up for missing ids
    _df = _df.withColumn("venue", F.col("venue").withField("_id", F.coalesce(F.col("venue._id"), F.col("venue.issn"))))
    # removing rows with venue id null
    _df = _df.filter(F.col("venue._id").isNotNull())
    return _df

In [0]:
for n in saved_splits:
    df = spark.read.option("multiline", True).json(f'dbfs:/user/dblpv13/dblpv13.{n}.json.gz')
    
    df = transform(df)
    df.write.parquet(f'dbfs:/user/dblpv13/dblpv13.{n}.parquet')
    logger.info(f'Wrote split {n} to parquet. ({df.count()}) rows')

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-2821859725189327>[0m in [0;36m<module>[0;34m[0m
[1;32m      3[0m [0;34m[0m[0m
[1;32m      4[0m     [0mdf[0m [0;34m=[0m [0mtransform[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;32m----> 5[0;31m     [0mdf[0m[0;34m.[0m[0mwrite[0m[0;34m.[0m[0mparquet[0m[0;34m([0m[0;34mf'dbfs:/user/dblpv13/dblpv13.{n}.parquet'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      6[0m     [0mlogger[0m[0;34m.[0m[0minfo[0m[0;34m([0m[0;34mf'Wrote split {n} to parquet. ({df.count()}) rows'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/sql/readwriter.py[0m in [0;36mparquet[0;34m(self, path, mode, partitionBy, compression)[0m
[1;32m    883[0m             [0mself[0m[0;34m.[0m[0mpartitionBy[0m[0;34m([0m[0mpart

In [0]:
# Test reading a parquet file
split = 0
df_parq = spark.read.parquet(f'dbfs:/user/dblpv13/dblpv13.{split}.parquet')
logger.info(f'Processed split {split} contains {df_parq.count()} rows')
df_parq.printSchema()

INFO:__main__:Processed split 0 contains 156949 rows
root
 |-- _id: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _id: string (nullable = true)
 |    |    |-- bio: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- gid: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |    |-- name_zh: string (nullable = true)
 |    |    |-- oid: string (nullable = true)
 |    |    |-- oid_zh: string (nullable = true)
 |    |    |-- orcid: string (nullable = true)
 |    |    |-- org: string (nullable = true)
 |    |    |-- org_zh: string (nullable = true)
 |    |    |-- orgid: string (nullable = true)
 |    |    |-- orgs: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- orgs_zh: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- 

In [0]:
display(df_parq.limit(10))

_id,abstract,authors,doi,fos,isbn,issue,keywords,lang,n_citation,page_end,page_start,pdf,references,title,url,venue,volume,year,Author_ID
53e99784b7602d9701f3e151,,"List(List(53f46797dabfaeb22f542630, null, null, null, Jairo Rocha, null, null, null, null, null, null, null, null, null, null), List(54328883dabfaeb4c6a8a699, null, null, null, Theo Pavlidis, null, null, null, null, null, null, null, null, null, null))",10.1109/ICDAR.1993.395663,"List(Intelligent character recognition, Pattern recognition, Computer science, Feature (computer vision), Document processing, Handwriting recognition, Optical character recognition, Feature extraction, Feature (machine learning), Artificial intelligence, Intelligent word recognition)",,,"List(handwriting recognition, prototypes, image segmentation, computer science, expert systems, knowledge base, pattern recognition, usability, optical character recognition, shape, feature extraction)",en,17,605.0,602,,"List(53e99cf5b7602d97025ace63, 557e8a7a6fee0fe990caa63d, 53e9a96cb7602d97032c459a, 53e9b929b7602d9704515791, 557e59ebf6678c77ea222447)",A solution to the problem of touching and broken characters.,List(http://dx.doi.org/10.1109/ICDAR.1993.395663),"List(53a72a4920f7420be8bfa51b, null, null, International Conference on Document Analysis and Recognition, null, null, null, ICDAR-1, null, null, null, null, 0)",,1993,"List(53f46797dabfaeb22f542630, 54328883dabfaeb4c6a8a699)"
53e99784b7602d9701f3e15d,"As process variations become a significant problem in deep sub-micron technology, a shift from deterministic static timing analysis to statistical static timing analysis for high-performance circuit designs could reduce the excessive conservatism that is built into current timing design methods. We address the timing yield problem for sequential circuits and propose a statistical approach to handle it. We consider the spatial and path reconvergence correlations between path delays, set-up time and hold time constraints, and clock skew due to process variations. We propose a method to get the timing yield based on the delay distributions of register-to-register paths in the circuit On average, the timing yield results obtained by our approach have average errors of less than 1.0% in comparison with Monte Carlo simulation. Experimental results show that shortest path variations and clock skew due to process variations have considerable impact on circuit timing, which could bias the timing yield results. In addition, the correlation between longest and shortest path delays is not significant.","List(List(53f43b03dabfaedce555bf2a, null, null, null, Min Pan, null, null, null, null, null, null, null, null, null, null), List(53f45ee9dabfaee43ecda842, null, null, null, Chris C. N. Chu, null, null, null, null, null, null, null, null, null, null), List(53f42e8cdabfaee1c0a4274e, null, null, null, Hai Zhou, null, null, null, null, null, null, null, null, null, null))",10.1109/ISCAS.2005.1465124,"List(Delay calculation, Timing failure, Monte Carlo method, Sequential logic, Statistical static timing analysis, Shortest path problem, Computer science, Algorithm, Clock skew, Static timing analysis, Statistics)",0-7803-8834-8,,"List(sequential circuits, statistical distributions, set-up time constraints, register-to-register paths, statistical static timing analysis, integrated circuit modelling, parameter estimation, statistical analysis, circuit model, path delays, deep sub-micron technology, timing, delay distributions, delays, circuit timing, shortest path variations, hold time constraints, integrated circuit yield, process variations, integrated circuit layout, high-performance circuit designs, clock skew, timing yield estimation, deterministic static timing analysis, monte carlo simulation, design method, static timing analysis, design methodology, process variation, shortest path, registers, circuit design, circuit analysis)",en,28,,2461,//static.aminer.org/pdf/PDF/000/423/329/timing_yield_estimation_using_statistical_static_timing_analysis.pdf,"List(53e9a8a9b7602d97031f6bb9, 599c7b6b601a182cd27360da, 53e9b443b7602d9703f3e52b, 53e9a6a6b7602d9702fdc57e, 599c7b6a601a182cd2735703, 53e9aad9b7602d970345afea, 5582821f0cf2bf7bae57ac18, 5e8911859fced0a24bb9a2ba, 53e9b002b7602d9703a5c932)",Timing yield estimation using statistical static timing analysis,"List(http://dx.doi.org/10.1109/ISCAS.2005.1465124, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1465124)","List(53a72e2020f7420be8c80142, null, null, International Symposium on Circuits and Systems, null, null, null, ISCAS (3), null, null, null, null, 0)",,2005,"List(53f43b03dabfaedce555bf2a, 53f45ee9dabfaee43ecda842, 53f42e8cdabfaee1c0a4274e)"
53e99784b7602d9701f3f411,"The eXtensible Markup Language 驴 XML 驴 is not only a language for communication between humans and the web, it is also a language for communication between programs. Rather than passing parameters, programs can pass documents from one to another, containing not only pure data, but control information as well. Even legacy programs written in ancient languages such as COBOL and PL/I can be adapted by means ofinterface reengineering to process and to generate XML documents.","List(List(548a2e3ddabfae9b40134fbc, null, null, null, Harry M. Sneed, null, null, null, null, null, null, null, null, null, null))",10.1109/CMPSAC.2002.1044548,"List(XML Base, World Wide Web, XML framework, XML Encryption, Efficient XML Interchange, SGML, Programming language, Software engineering, XML, XML validation, Computer science, cXML)",0-7695-1727-7,,"List(Internet, hypermedia markup languages, information resources, systems re-engineering, COBOL, PL/I, World Wide Web, XML, batch programs, data conversion, e-commerce, eXtensible Markup Language, enterprise application integration, interface reengineering, legacy programs, online programs, software reengineering, subprograms, systems integration)",en,28,172.0,167,,"List(53e9adbdb7602d97037be8a2, 53e9bb53b7602d9704792f33, 558aa425e4b0b32fcb37fff4, 558abd44e4b031bae1f9653a, 53e9a326b7602d9702c32229, 53e9b1d7b7602d9703c6ce7c, 558a7de784ae84d265bdee99, 53e9ae17b7602d9703828d13, 53e9aa4fb7602d97033bf9ad)",Using XML to Integrate Existing Software Systems into the Web,"List(http://dx.doi.org/10.1109/CMPSAC.2002.1044548, http://doi.ieeecomputersociety.org/10.1109/CMPSAC.2002.1044548, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=1044548)","List(53a72e9920f7420be8c93fac, null, null, Computer Software and Applications Conference, null, null, null, COMPSAC, null, null, null, null, 0)",,2002,List(548a2e3ddabfae9b40134fbc)
53e99784b7602d9701f3f5fe,"Resource allocation for multi-tier web applications in virtualization environments is one of the most important problems in autonomous computing. On one hand, the more resources that are provisioned to a multitier web application, the easier it is to meet service level objectives (SLO). On the other hand, the virtual machine which hosts the multi-tier web application needs to be consolidated as much as possible in order to maintain high resource utilization. This paper presents an adaptive resource controller which consists of a feedback utilization controller and an auto-regressive and moving average model (ARMA)-based model estimator. It can meet application-level quality of service (QoS) goals while achieving high resource utilization. To evaluate the proposed controllers, simulations are performed on a testbed simulating a virtual data center using Xen virtual machines. Experimental results indicate that the controllers can improve CPU utilization and make the best tradeoff between resource utilization and performance for multi-tier web applications.","List(List(53f46a22dabfaee0d9c3d5e5, null, ysg_2005@hotmail.com, 5b8698cce1cd8e14a3826671, Shuguo Yang, null, null, null, null, School of Mathematics and Physics, Qingdao University of Science and Technology, Qingdao, China 266061, null, 5f71b2e91c455f439fe3f23f, null, null, null))",10.1007/s11704-011-0127-6,"List(Virtualization, Service level objective, Virtual machine, Computer science, Testbed, Quality of service, Provisioning, Resource allocation, Web application, Operating system, Distributed computing)",,4.0,"List(resource allocation, cpu utilization, quality of service)",en,2,512.0,506,,"List(53e9a073b7602d9702957efa, 53e9ad87b7602d970377bfb5, 53e9be51b7602d9704b11381, 53e9be04b7602d9704abb31d, 53e9992bb7602d9702169236, 53e998cdb7602d97021044db, 53e9afa6b7602d97039f6054, 53e99822b7602d9702044e60)",Research on resource allocation for multi-tier web applications in a virtualization environment,"List(http://dx.doi.org/10.1007/s11704-011-0127-6, http://link.springer.com/article/10.1007/s11704-011-0127-6, http://www.webofknowledge.com/)","List(572de199d39c4f49934b3d5c, 1673-7350, null, null, null, null, null, Frontiers of Computer Science in China, null, null, null, null, 0)",5.0,2011,List(53f46a22dabfaee0d9c3d5e5)
53e99792b7602d9701f5af1a,,"List(List(5631df8845cedb3399f3e752, null, null, null, Shigeru Fujita, null, null, null, null, null, null, null, null, null, null), List(53f4775edabfaee4dc891b69, null, null, null, Kenji Sugawara, null, null, null, null, null, null, null, null, null, null), List(54096ca7dabfae450f483585, null, null, null, Claude Moulin, null, null, null, null, null, null, null, null, null, null), List(5448b55bdabfae87b7e68206, null, null, null, Jean-Paul A. Barthès, null, null, null, null, null, null, null, null, null, null))",10.1109/COGINF.2010.5599834,"List(Syma, Computer science, Symbiotic computing, Multi-agent system, Human–computer interaction, Schedule, Artificial intelligence, Ubiquitous computing, Cognition)",,,"List(cognition, multi-agent systems, ubiquitous computing, ADIPS-DASH, OMAS, SYMA, actuators, awareness and operation module, cognition functions, decision functions, intelligent multiagent system, multiparadigm-multiagent framework, perceptual interaction, social interaction, symbiotic base mechanism, symbiotic multiagent system, Awareness, Cognition Layer model, Multi-agent system, Social-ware, Symbiotic Computing)",en,4,630.0,625,,"List(53e9b3dab7602d9703ec7ddf, 53e9a3edb7602d9702d03525, 53e9b9fbb7602d97045f67ae, 53e9b4c3b7602d9703fdfe37, 53e9a310b7602d9702c1a36e, 53e9abfeb7602d97035c19c5)",The design of awareness and operation module for the symbiotic applications.,"List(http://dx.doi.org/10.1109/COGINF.2010.5599834, http://doi.ieeecomputersociety.org/10.1109/COGINF.2010.5599834, http://ieeexplore.ieee.org/xpl/abstractAuthors.jsp?tp=&arnumber=5599834)","List(53a72bad20f7420be8c2d5af, null, null, IEEE International Conference on Cognitive Informatics, null, null, null, IEEE ICCI, null, null, null, null, 0)",,2010,"List(5631df8845cedb3399f3e752, 53f4775edabfaee4dc891b69, 54096ca7dabfae450f483585, 5448b55bdabfae87b7e68206)"
53e99792b7602d9701f5af27,"A promising traffic flow forecasting model based on Multivariate Adaptive Regression Splines (MARS) is developed in this paper. First, the historical traffic flow data is obtained from the loop detectors installed on the road network of Beijing. Then, part of the data is selected for training the MARS model while the rest is used to test the method. The results based on MARS method are compared with those of other methods such as the Neural Networks. The proposed MARS method is proved to have a considerable accuracy. Moreover, the model constructed with MARS can be described with analytical functions, which helps a lot in the further research on traffic flow forecasting.","List(List(53f46e66dabfaee02adb48fd, null, ysq05@mails.tsinghua.edu.cn, null, Shengqi Ye, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(53f362d7dabfae4b3498de6a, null, null, null, Yingjia He, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(54488a23dabfae87b7e3f16a, null, null, null, Jianming Hu, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null), List(561cba1b45ce11c523ca3441, null, null, null, Zuo Zhang, null, null, null, null, Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China, null, null, List(Tsinghua Univ, Natl Lab Informat Sci & Technol, Dept Automat, Beijing 100084, Peoples R China), null, null))",10.1109/FSKD.2008.678,"List(Spline (mathematics), Mars Exploration Program, Data modeling, Multivariate adaptive regression splines, Traffic flow, Computer science, Regression analysis, Artificial intelligence, Artificial neural network, Machine learning, Beijing)",,,"List(considerable accuracy, promising traffic flow forecasting, proposed mars method, neural networks, short-term traffic flow forecasting, historical traffic flow data, traffic flow forecasting, multivariate adaptive regression splines, mars model, analytical function, mars method, analytic function, forecasting, predictive models, spline, mars, traffic flow, neural network, detectors, data models, regression analysis)",en,11,675.0,669,,"List(53e9b95bb7602d9704549008, 53e9ba11b7602d97046117d8, 53e9b8f6b7602d97044dc1a6, 53e99d51b7602d970260acca, 53e9a751b7602d9703088787)",Short-Term Traffic Flow Forecasting Based on MARS,"List(http://dx.doi.org/10.1109/FSKD.2008.678, http://www.webofknowledge.com/)","List(53a72cfa20f7420be8c554b2, null, null, null, null, null, null, FSKD (5), null, null, null, null, 0)",,2008,"List(53f46e66dabfaee02adb48fd, 53f362d7dabfae4b3498de6a, 54488a23dabfae87b7e3f16a, 561cba1b45ce11c523ca3441)"
53e99792b7602d9701f5af35,"This paper describes an approach to the feature location problem for distributed systems, that is, to the problem of locating which code components are important in providing a particular feature for an end user. A feature is located by observing system execution and noting time intervals in which it is active. Traces of execution in intervals with and without the feature are compared. Earlier experience has shown that this analysis is difficult because distributed systems often exhibit stochastic behavior and because time intervals are hard to identify with precision. To get around these difficulties, the paper proposes a definition of time interval based on the causality analysis introduced by Lamport and others. A strict causal interval may be defined, but it must often be extended to capture latent events and to represent the inherent imprecision in time measurement. This extension is modeled using a weighting function which may be customized to the specific circumstances of each study. The end result of the analysis is a component relevance index, denoted p""c, which can be used to measure the relevance of a software component to a particular feature. Software engineers may focus their analysis efforts on the top components as ranked according to p""c. Two case studies are presented. The first study demonstrates the feasibility of p""c by applying our method to a well-defined distributed system. The second study demonstrates the versatility of p""c by applying our method to message logs obtained from a large military system. Both studies indicate that the suggested approach could be an effective guide for a software engineer who is maintaining or enhancing a distributed system.","List(List(53f43a51dabfaec22baa659b, null, dedwards@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Dennis Edwards, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f3b3ffdabfae4b34b2dae9, null, ssimmons@cs.uwf.edu, 5b8695e5e1cd8e14a36f684d, Sharon Simmons, null, null, null, null, Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA, null, 5f71b2bd1c455f439fe3dea6, List(Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null), List(53f4333fdabfaeb22f451979, null, nwilde@uwf.edu, null, Norman Wilde, null, null, null, null, Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., null, null, List(Corresponding author. Tel.: +1 850 474 2542; fax: +1 850 857 6056., Department of Computer Science, University of West Florida, 11000 University Parkway, Pensacola, FL 32514, USA), null, null))",10.1016/j.jss.2004.12.018,"List(Data mining, Causality, End user, Ranking, Computer science, Military systems, Software, Feature model, Component-based software engineering, A-weighting, Distributed computing)",,1.0,"List(Feature location, Distributed systems, Software reconnaissance)",en,62,68.0,57,//static.aminer.org/pdf/PDF/000/996/035/an_approach_to_feature_location_in_distributed_systems.pdf,"List(53e9b6eeb7602d970427df40, 53e9b6eeb7602d9704283b9f, 53e9b40eb7602d9703f01b25, 53e9a3c0b7602d9702ccdfc9, 53e99818b7602d97020347a2, 53e9a2acb7602d9702bb4d7e, 558aa7ea84ae84d265bee194, 558a5258e4b037c08756714c, 53e9b946b7602d97045336a9, 53e9b1d6b7602d9703c67695, 53e9a516b7602d9702e3bcea, 53e9ac33b7602d97035f892c, 53e9ba22b7602d9704628817, 53e9af3ab7602d97039769c8, 53e9b1a3b7602d9703c2c6f7, 53e9ac89b7602d9703660f90, 53e9ad2db7602d970370e8a2, 53e9a735b7602d970306db2b, 53e99960b7602d97021a17da)",An approach to feature location in distributed systems,"List(http://dx.doi.org/10.1016/j.jss.2004.12.018, https://www.sciencedirect.com/science/article/pii/S016412120500004X, http://www.webofknowledge.com/)","List(54825226582fc50b5e05610e, 0164-1212, null, null, null, null, null, Journal of Systems and Software, null, null, null, null, 0)",79.0,2006,"List(53f43a51dabfaec22baa659b, 53f3b3ffdabfae4b34b2dae9, 53f4333fdabfaeb22f451979)"
53e99792b7602d9701f5b06f,"This paper describes an approach to representing cases as nested graph-structures, i.e., as hierarchically, spatially, temporally and causally interconnected nodes (case nodes), which may be themselves recursively described by other sets of interconnected nodes. Each case node represents a case piece (sub-case). An adjacency matrix may represent these nested graph-structured cases. Within our approach, new cases are constructed using an iterative context-guided retrieval of case nodes from multiple cases. In order to illustrate the expressiveness of this case representation approach, we discuss its application to the diagnosis and therapeutics of neurological diseases, to architectural design and to storytelling. Some issues that come out of this approach, like its contribution to the representation of cases of CBR and to integrate ordinary and creative reasoning, are discussed.","List(List(53f45e2adabfaeb22f51d645, null, null, null, Luís Macedo, null, null, null, 0000-0002-3144-0362, null, null, null, null, null, null), List(53f45576dabfaeee22a30c3d, null, null, null, Amílcar Cardoso, null, null, null, null, null, null, null, null, null, null))",10.1007/BFb0056317,"List(Adjacency matrix, Graph, Knowledge representation and reasoning, Storytelling, Architectural design, Computer science, Artificial intelligence, Case-based reasoning, Recursion, Subgraph isomorphism problem)",3-540-64990-5,,"List(nested graph-structured representations, adjacency matrix)",en,20,12.0,1,,"List(53e9b049b7602d9703aadc37, 53e99df1b7602d97026b4d0e, 53e9a6fdb7602d97030331ef, 53e9b39db7602d9703e81ff1, 53e9b6fab7602d970428ee72, 53e9b109b7602d9703b875ef, 53e9a5e9b7602d9702f136da, 53e99e28b7602d97026f0125, 53e9ba17b7602d970461b707, 53e99ad1b7602d970235524d, 53e9b35ab7602d9703e35fe3, 53e9a70bb7602d9703040c52, 53e99ccab7602d970258074d, 558a73e3e4b0b32fcb36e62f, 53e99d0cb7602d97025c15c4, 53e9b1aab7602d9703c36404, 5c790e6c4895d9cbc61790aa)",Nested Graph-Structured Representations for Cases,"List(http://dx.doi.org/10.1007/BFb0056317, http://www.webofknowledge.com/)","List(53a7271520f7420be8b8b5ba, 0302-9743, null, null, null, null, null, EWCBR, null, null, null, null, 0)",1488.0,1998,"List(53f45e2adabfaeb22f51d645, 53f45576dabfaeee22a30c3d)"
53e99792b7602d9701f5b074,"Yalut is a novel user-centric hybrid content sharing overlay for social networking. Yalut enables the users to retain control over their own data and preserve their privacy, whilst still using the popular centralized services. In this demonstration, we show the feasibility of Yalut by integrating the service with the popular social networking apps on Android devices, Mac and Windows desktop platforms. We show that it is possible to provide the benefits of distributed content sharing on top of the existing centralized services with minimal changes to the content sharing process.","List(List(53f4357bdabfaee4dc77b09a, null, kanchana.thilakarathna@nicta.com.au, null, Kanchana Thilakarathna, null, null, null, 0000-0003-4332-0082, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f4662fdabfaee2a1dadc95, null, null, null, Xinlong Guan, null, null, null, null, Natl ICT Australia, Sydney, NSW, Australia, null, null, List(Natl ICT Australia, Sydney, NSW, Australia), null, null), List(53f484c5dabfaee4dc8b0b1e, null, null, null, Aruna Seneviratne, null, null, null, 0000-0001-6894-7987, UNSW, Sch EE&T, Sydney, NSW, Australia, null, null, List(UNSW, Sch EE&T, Sydney, NSW, Australia, Natl ICT Australia, Sydney, NSW, Australia), null, null))",10.1145/2594368.2601465,"List(World Wide Web, Content sharing, Android (operating system), Social network, Computer science, Active networking, Overlay, User-centered design)",,,"List(cellular data traffic offloading, mobile social networking, store and forward networks, user generated content sharing)",en,2,361.0,360,,"List(53e9b04eb7602d9703ab29b9, 557c6f6a08b02739a5ca7106, 53e9be79b7602d9704b38a13, 53e9b360b7602d9703e3d236)",Demo: Yalut -- user-centric social networking overlay,"List(http://dx.doi.org/10.1145/2594368.2601465, http://doi.acm.org/10.1145/2594368.2601465, http://dl.acm.org/citation.cfm?id=2594368.2601465&coll=DL&dl=GUIDE&CFID=521580964&CFTOKEN=96511501&preflayout=flat, http://www.webofknowledge.com/)","List(53a72cf620f7420be8c548e2, null, null, null, null, null, null, MobiSys, null, null, null, null, 0)",,2014,"List(53f4357bdabfaee4dc77b09a, 53f4662fdabfaee2a1dadc95, 53f484c5dabfaee4dc8b0b1e)"
53e99792b7602d9701f5b085,"The traffic characteristics of various distributed join algorithms on the Hypercube are analyzed. It is shown that, regardless of which join strategy is employed, the network bandwidth requirements of the computation and collection phases are radically different. This imbalance prevents these two phases from being pipelined (overlapped). To alleviate this problem, the HyperKYKLOS Network is proposed. The topology of this network is defined and a brief description of the I/O nodes presently under construction is included.","List(List(53f43415dabfaee43ec18eea, null, null, null, Bernard L. Menezes, null, null, null, null, Dept. of Electrical and Computer Eng., null, null, List(Dept. of Electrical and Computer Eng.), null, null), List(53f47f2cdabfaee43ed52fa2, null, null, null, K. Thadani, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f42f14dabfaee02ac76859, null, null, null, Alfred G. Dale, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null), List(53f46382dabfaee02ad88cb3, null, null, null, Roy M. Jenevein, null, null, null, null, University of Texas at Austin,#TAB#, null, 5f71b2841c455f439fe3c6bf, List(University of Texas at Austin,#TAB#), null, null))",10.1007/978-1-4613-1679-4_6,"List(Multiprocessor architecture, Space-based architecture, Computer architecture, Computer science, Parallel computing, Symmetric multiprocessor system, Bandwidth (signal processing), Database machine, Host processor, Hypercube, Computation)",,,,en,9,88.0,75,//static.aminer.org/pdf/PDF/000/463/867/design_of_a_hyperkyklos_based_multiprocessor_architecture_for_high_performance.pdf,"List(53e9aacab7602d9703449777, 53e9b153b7602d9703bda549, 53e9a82bb7602d970317135f, 53e9bb7ab7602d97047bc792)",Design of a HyperKYKLOS-based Multiprocessor Architecture for High-Performance Join Operations,"List(http://dx.doi.org/10.1007/978-1-4613-1679-4_6, https://link.springer.com/chapter/10.1007%2F978-1-4613-1679-4_6, http://dblp.uni-trier.de/db/conf/iwdm/iwdm87.html#MenezesTDJ87, https://rd.springer.com/chapter/10.1007/978-1-4613-1679-4_6)","List(53a72ac520f7420be8c0cd21, null, null, null, null, null, null, IWDM, null, null, null, null, 0)",,1987,"List(53f43415dabfaee43ec18eea, 53f47f2cdabfaee43ed52fa2, 53f42f14dabfaee02ac76859, 53f46382dabfaee02ad88cb3)"
