In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('read data through spark').getOrCreate()

In [3]:
spark

In [4]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType

# Load and clean Paper DF

In [6]:
### load paper into schema
dtypes = pd.read_csv('./schemas/paper.csv').to_records(index=False).tolist()
print(dtypes)
fields = [T.StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
paper_df = spark.read.option('header', 'true').csv('./assets/parsedData/papers.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('title', 'String'), ('year', 'Integer')]


In [7]:
paper_df.show()

+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|                null|null|
|     130|                null|null|
|     195|317424;317425;317573|null|
|     260|                null|null|
|     325|                null|null|
|     390|                null|null|
|     455|                null|null|
|     520|       318368;323493|null|
|     585|                null|null|
|     650|                null|null|
|     715|                null|null|
|     780|318420;319233;319...|null|
|     845|                null|null|
|     910|                null|null|
|     975|67604;318882;3718...|null|
|    1040|                null|null|
|    1105|289087;318014;318...|null|
|    1170|                null|null|
|    1235|                null|null|
|    1300|                null|null|
+--------+--------------------+----+
only showing top 20 rows



22/01/16 15:22:20 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 2, schema size: 3
CSV file: file:///Users/yarycka/Desktop/WS2021/DIA/citation-networks-dia/assets/parsedData/papers.csv


In [8]:
### data cleaning for paper schema

### remove spaces from values of the columns
paper_df = paper_df.withColumn("paper_id", trim(paper_df.paper_id))
paper_df = paper_df.withColumn("title", trim(paper_df.title))
paper_df = paper_df.withColumn("year", trim(paper_df.year))

In [9]:
### check for the data types
paper_df.printSchema()
### change the data type of year to Integer
paper_df = paper_df.withColumn("year",paper_df["year"].cast(IntegerType()))

root
 |-- paper_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: string (nullable = true)



In [10]:
### check for nonsense null data
null_values_paper_df = paper_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_df.columns]
   )
### save the ids of papers whose title is missing to clean up the other dataframes
null_paper_ids = paper_df.filter(paper_df['title'].isNull())
null_paper_ids_list=null_paper_ids.select('paper_id').rdd.flatMap(lambda x: x).collect()
null_paper_ids_list = [int(item) for item in null_paper_ids_list]

22/01/16 15:22:29 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: paper_id, ref_ids
 Schema: paper_id, title
Expected: title but found: ref_ids
CSV file: file:///Users/yarycka/Desktop/WS2021/DIA/citation-networks-dia/assets/parsedData/papers.csv
                                                                                

In [11]:
### after checking the below dataframes, all papers whose title is missing have the authors besides paper_id = 748056
### decision: fill missing titles with : Missing Title

paper_df=paper_df.na.fill('Missing Title', ['title'])

In [12]:
paper_df.columns

['paper_id', 'title', 'year']

In [13]:
### check if there are duplicate rows
paper_df.join(paper_df.groupBy(paper_df.columns).agg((F.count("*")>1).cast("int").alias("Duplicate_indicator")),
on=paper_df.columns,how="inner").show()
###there are no duplicates
paper_df.groupby(['paper_id']).count().where('count > 1').sort('count', ascending=False).show()

22/01/16 15:22:47 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 2, schema size: 3
CSV file: file:///Users/yarycka/Desktop/WS2021/DIA/citation-networks-dia/assets/parsedData/papers.csv
22/01/16 15:22:49 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 2, schema size: 3
CSV file: file:///Users/yarycka/Desktop/WS2021/DIA/citation-networks-dia/assets/parsedData/papers.csv
[Stage 3:>                                                          (0 + 8) / 8]

+--------+-----+----+-------------------+
|paper_id|title|year|Duplicate_indicator|
+--------+-----+----+-------------------+
+--------+-----+----+-------------------+



[Stage 6:>                                                          (0 + 8) / 8]

+--------+-----+
|paper_id|count|
+--------+-----+
+--------+-----+





# Load and clean Affiliations df

In [14]:
### load affiliation into schema
dtypes = pd.read_csv('./schemas/affiliation.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
affiliation_df = spark.read.option('header', 'true').csv('./assets/parsedData/affiliations.csv', header=True, schema=schema)

[('affiliations', 'String'), ('paper_id', 'Integer')]


In [15]:
affiliation_df = affiliation_df.withColumn("affiliations", trim(affiliation_df.affiliations))
affiliation_df = affiliation_df.withColumn("paper_id", trim(affiliation_df.paper_id))
affiliation_df = affiliation_df.withColumn("paper_id",affiliation_df["paper_id"].cast(IntegerType()))

affiliation_df.printSchema()
affiliation_df.show()

root
 |-- affiliations: string (nullable = true)
 |-- paper_id: integer (nullable = true)

+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows



In [16]:
### check for null values in the affiliations column
null_values_affiliations=affiliation_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in affiliation_df.columns]
   )

In [17]:
### This df is used to count papers per unique affiliation, so if the affiliation is missing, it doesnt make sense
### drop all rows where affiliation is null

affiliation_df=affiliation_df.na.drop(how="any", subset=['affiliations'])

In [18]:
affiliation_df.filter(affiliation_df.affiliations.contains('-')).collect()

                                                                                

[Row(affiliations='-;-;-', paper_id=585),
 Row(affiliations='-', paper_id=715),
 Row(affiliations='-;-', paper_id=1300),
 Row(affiliations='-', paper_id=1430),
 Row(affiliations='-', paper_id=1495),
 Row(affiliations='-', paper_id=1625),
 Row(affiliations='-;-;-', paper_id=1690),
 Row(affiliations='Univ. of Illinois at Urbana-Champaign, Urbana;Massachusetts Institute of Technology, Cambridge', paper_id=2795),
 Row(affiliations='-', paper_id=2860),
 Row(affiliations='Mihailo Pupin Institute, Belgrade, Yugoslvia;Carnegie-Mellon Univ., Pittsburgh, PA;Intel Corp., Aloha, OR', paper_id=2990),
 Row(affiliations='Boston Univ., Boston, MA;-', paper_id=3185),
 Row(affiliations='NCR Corporation;-', paper_id=3315),
 Row(affiliations='-', paper_id=3575),
 Row(affiliations='-', paper_id=3705),
 Row(affiliations='-;-', paper_id=4095),
 Row(affiliations='-', paper_id=4160),
 Row(affiliations='Univ. of Alabama, Birmingham;-;-', paper_id=4290),
 Row(affiliations='-', paper_id=4485),
 Row(affiliations='

In [19]:
### check if affiliations are missing as well for the ids whose title was missing in paper_df
for rows in affiliation_df.select("affiliations","paper_id").collect():
    if rows[1] in null_paper_ids_list:
        print(rows[0], rows[1])

                                                                                

The Queen's University of Belfast, Belfast, UK;The Queen's University of Belfast, Belfast, UK 65
Univ. of Karlsruhe, Karlsruhe, West Germany 130
University of Michigan, Ann Arbor, MI;University of Michigan, Ann Arbor, MI 260
Oslo politikammer, Oslo, Norway 325
Harvard Univ., Cambridge, MA;Boston Univ., Boston, MA 390
Cornell Univ., Ithaca, NY;Cornell Univ., Ithaca, NY 455
-;-;- 585
New York Univ., New York, NY 650
- 715
Univ. of California, Berkeley 845
University of Bologna, Bologna, Italy;University of Bologna, Bologna, Italy;University of Bologna, Bologna, Italy 910
Cornell Univ., Ithaca, NY;Cornell Univ., Ithaca, NY;Virginia Polytechnic Institute and State Univ., Blacksburg, VA 1040
Laboratoire de Psychologie de l'Apprentissage, Marseille, France;Laboratoire de Psychologie de l'Apprentissage, Marseille, France;Socie´te´ Opeform, Malakoff, France;Laboratoire de Psychologie de l'Apprentissage, Marseille, France 1170
Yale Univ., New Haven, CT 1235
-;- 1300
Manatee Community College, B

- 32305
-;- 32370
- 32435
General Motors Research Labs., Warren, MI;Univ. of Notre Dame, Notre Dame, IN 32500
Delft Univ. of Technology, Delft, The Netherlands 32565
Univ. of Colorado, Boulder, CO 32630
Softwords, Victoria, B.C., Canada;Softwords, Victoria, B.C., Canada;Softwords, Victoria, B.C., Canada;Softwords, Victoria, B.C., Canada;Softwords, Victorias, B.C., Canada 32695
Kyoto Univ., Kyoto, Japan 32760
Purdue Univ., West Lafayette, IN 32825
Ghent State Univ., Ghent, Belgium;Ghent State Univ., Ghent, Belgium 32955
Univ. of Puget Sound, Tacoma, WA 33020
The William Paterson College of New Jersey, Wayne;Irvington High School, Irvington, NJ 33085
- 33150
-;- 33215
New Mexico State Univ., Las Cruces;New Mexico State Univ., Las Cruces 33280
City of London Polytechnic and Computerised Management Aids, UK;Operational Research Service, UK 33345
State University of New York, Binghamton, NY 33410
Univ. of Crete, Heraklio, Crete, Greece;Univ. de Dijon, Dijon, France;Univ. de Dijon, Dijon, Fr

-;-;- 50570
-;-;- 50635
-;- 50700
- 50765
Texas A&MM Univ., College Station, TX;Texas Univ., College Station, TX;Texas Univ., College Station, TX 50830
-;-;- 50895
- 50960
- 51025
- 51090
-;-;- 51155
- 51220
Univ. Osnabru¨ck, Osnabru¨ck, W. Germany 51285
-;- 51350
- 51415
Wump Research and Company 51545
- 51610
- 51675
- 51740
- 51805
- 51870
Associated Market Research, Austin, TX;MicroCAD Managers, Syracuse, NY;CCS Systems, Wayne, PA 51935
Sowerby Research Centre, Bristol, UK;Univ. of Aston, Birmingham, UK;Cheadle Hulme School, Cheshire, UK 52000
- 52065
-;-;- 52130
-;- 52195
- 52260
- 52325
Univ. of Southern California, Los Angeles, CA;Univ. of Southern California, Los Angeles, CA;Univ. of Southern California, Los Angeles, CA 52390
Cornell Univ., Ithaca, NY;Cornell Univ., Ithaca, NY 52455
-;- 52520
-;- 52585
-;- 52650
Brown Univ.;Brown Univ. 52715
Univ. Dortmund, W. Germany;Univ. Dortmund, W. Germany;Univ. Dortmund, W. Germany 52780
- 52845
-;- 52910
- 52975
- 53040
-;- 53105
- 53170

- 85410
-;- 85475
- 85605
MRC Applied Psychology Unit, 15 Chaucer Road, Cambridge, CB2 2EF, England 85735
- 85865
- 85930
- 85995
- 86060
- 86450
- 86515
- 86580
-;- 86645
- 86775
- 86840
-;- 86905
-;-;- 87165
- 87230
Documentation Specialist, University of Alaska at Anchorage, Computing & Technology Services, 3211 Providence Drive, Anchorage, Alaska;Training Specialist, University of Alaska at Anchorage, Computing & Technology Services, 3211 Providence Drive, Anchorage, Alaska 87360
Department of Computing Services, University of Saskatchewan, Saskatoon, Saskatchewan, CANADA S7N 0W0 87425
-;- 87555
-;-;-;- 87685
-;-;-;-;-;- 87815
-;-;-;-;- 87880
- 87945
- 88010
Univ. of South Carolina, Columbia, SC 88075
Eastern Michigan Univ., Psilanti 88140
Univ. of Maryland, College Park 88205
Virginia Commonwealth Univ., Richmond 88270
-;-;- 88335
Univ. of California, Los Angeles 88530
Dept. of Computer and Information Science, University of Arkansas at Little Rock;Dept. of Computer and Informatio

Univ. of Maryland, Baltimore County 127530
Stanford Univ., Stanford, CA 127660
-;- 127725
Univ. of Minnesota, Minneapolis 127855
- 128050
- 128115
Arizona State Univ., Tempe 128245
- 128375
Univ. of California, Los Angeles 128440
Harvard Univ., Boston, MA 128570
The Univ. of Oklahoma, Norman 128635
Carleton Univ., Ottawa, Ont., Canada 128700
Univ. of Toronto, Toronto, Ont., Canada 128765
- 128895
- 128960
- 129025
- 129155
Rensselaer Polytechnic Institute, Troy, NY 129220
- 129285
-;- 129350
- 129415
Mississippi State Univ., Mississippi State, MS 129480
-;- 129610
- 129675
-;-;- 129805
-;- 129870
-;-;- 129935
-;-;-;- 130000
-;-;- 130065
-;- 130130
The George Washington Univ., Washington, DC 130195
-;- 130260
Duke Univ., Durham, NC 130325
Old Dominion Univ., Norfolk, VA 130390
Univ. of California, Berkeley 130455
Temple Univ., Philadelphia, PA 130585
West Virginia Univ., Morgantown 130650
-;-;- 130715
-;-;- 130780
-;-;- 130845
-;- 130910
-;-;- 130975
- 131040
-;- 131105
- 131170
-;- 131

-;-;-;- 169845
- 169910
- 169975
-;- 170040
-;-;- 170105
-;-;- 170170
-;- 170235
-;-;- 170300
-;-;- 170365
- 170430
-;- 170495
- 170560
- 170625
-;-;-;- 170690
-;- 170755
- 170820
- 170885
-;- 170950
- 171015
-;- 171080
- 171145
-;-;-;- 171210
-;- 171275
-;- 171340
Obninsk State Study and Conference Centre, 'Algorithm' Researeh & Methodology Bureau, 21 Kurchatov St., Obninsk 249020, Kaluga Region, Russia;Obninsk State Study and Conference Centre, 'Algorithm' Researeh & Methodology Bureau, 21 Kurchatov St., Obninsk 249020, Kaluga Region, Russia 171405
- 171600
-;- 171665
University of Arizona 171795
- 171860
-;-;-;-;- 171925
Fairleigh Dickinson Univ., Rutherford, NJ;Manhattan College, Bronx, NY 171990
-;- 172055
Computer Science Department, St. Bonaventure University, St. Bonaventure, NY 172120
Georgetown Univ.;Bell Atlantic;Bell Atlantic;George Mason Univ. 172250
-;-;-;- 172315
- 172445
-;-;- 172510
Univ Illinois, Dept Math Stat & Comp Sci, Chicago, IL 60607, USA 172835
AT&T Gibbal Inf

- 201890
IBM Toronto Laboratory 202085
- 202150
Uniforum, Santa Clara, California 202215
AT&T Istel, Inc., 60 Mall Road, Burlington, MA 202670
IBM Corp., 11400 Burnet Road, Austin, TX;Dept. of Computer and Electrical Engineering, The University of Texas at Austin, Austin, TX and Carnegie Mellon University, Dept. of ECE, Pittsburgh, PA 202865
-;- 202930
-;- 203255
-;-;- 203450
-;-;-;- 203515
- 203580
- 203645
-;-;- 203710
-;- 203840
- 203905
-;- 203970
University of California, Berkeley, CA 204035
CRIN-INRIA/ and LIFIA-IMAG, Grenoble Cedex, France 204100
- 204165
- 204230
-;-;-;- 204295
-;- 204360
-;- 204490
-;- 204555
- 204620
- 204685
- 204750
-;-;-;-;- 204815
-;- 204880
-;- 204945
- 205010
AT&T Bell Labs;AT&T Bell Labs 205075
- 205140
- 205205
-;- 205270
The Atlantic Systems Guild 205335
- 205465
- 205530
-;-;- 205595
- 205660
-;- 205725
-;-;-;-;- 205790
-;- 205920
-;- 206050
- 206115
- 206180
-;- 206245
-;-;- 206310
-;-;-;- 206375
-;- 206440
- 206505
- 206570
Rabobank Nederland, The

Animation Research Ltd., Level 2, 4-50 Moray Place, Dunedin, New Zealand 233415
Microsoft, One Microsoft Way, Redwest E, Redmond, Washington 233545
- 233675
-;- 233740
- 233805
Lockheed Missiles & Space Company, 1111 Lockheed Way, 0/78-20, B/564, Sunnyvale, California 234065
Industrieel Ontwerpen, Technische Universiteit Delft, Jaffalaan 9, 2628 BX Delft, The Netherlands;Industrieel Ontwerpen, Technische Universiteit Delft, Jaffalaan 9, 2628 BX Delft, The Netherlands;Rank Xerox Cambridge EuroPARC, 61 Regent Street, Cambridge CB2 1AB, UK 234260
Electronic Data Systems, 300 East Big Beaver, Troy, Michigan 234325
Universität, GH, Paderborn, D-33095 Paderborn, Germany 234390
-;- 234520
-;- 234650
- 234780
-;- 234845
- 234910
Univ. of Reading 234975
-;- 235105
- 235170
Department of Computer Systems, Uppsala University, P.O. Box 325, S--751 05 Uppsala, Sweden;Department of Computer Systems, Uppsala University, P.O. Box 325, S--751 05 Uppsala, Sweden 235235
-;-;- 235300
-;- 235365
- 235560
-

-;- 258830
- 258895
- 258960
-;-;-;- 259025
- 259155
- 259220
- 259285
-;- 259350
-;- 259415
Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Oregon Graduate Institute, Portland, OR;Univ. of California, Santa Cruz;Univ. of California, Santa Cruz;Univ. of California, Santa Cruz;Carnegie Mellon Univ., Pittsburgh, PA;Carnegie Mellon Univ., Pittsburgh, PA;Carnegie Mellon Univ., Pittsburgh, PA;Tucker-Maxon Oral School, Portland, OR;Tucker-Maxon Oral School, Portland, OR;Tucker-Maxon Oral School, Portland, OR;Tucker-Maxon Oral School, Portland, OR 259480
Univ. of Tennessee, Knoxville 259545
- 259740
-;- 259870
-;- 260130
- 260195
-;- 260260
- 260325
-;- 260455
- 260520
- 260585
Boston Univ., Boston, MA;Univ. of Groningen, Groningen, The Netherlands 260650
The Boeing Co.;Univ. of California, Irvine 260715

Department of Statistics, Texas A&M University, College Station, TX 285805
Industrial Engineering Department, Southern Illinois University, Edwardsville, IL;School of Industrial Engineering, Oklahoma State University, Stillwater, OK 285870
Computer Systems Documentation, Albany, California 286065
-;-;-;-;- 286130
Computer Science Division, Department of Electrical Engineering and Computer Sciences, University of California at Berkeley, Berkeley, California 286195
Univ. di Firenze, Florence, Italy;Massachusetts Institute of Technology, Cambridge 286455
TRW;AverStar;Ada Core Technologies & ACT-Europe 286585
Department of Art, 155S 1452E, Rm. 405 CHPC, University of Utah, Salt Lake City, UT;Department of Biology,  155S 1452E, Rm. 405 CHPC, University of Utah, Salt Lake City, UT;Center for High Performance Computing and Department of Computer Science,  155S 1452E, Rm. 405 CHPC, University of Utah, Salt Lake City, UT 286715
- 286780
-;- 287560
-;- 287885
National Advisory Committee for Aero

AT&T Labs, Middletown, NJ;AT&T Labs, Middletown, NJ;AT&T Labs, Middletown, NJ;AT&T Labs, Middletown, NJ 309530
- 309595
-;-;-;- 309660
Department of Health Administration, PO Box 980203, Virginia Commonwealth University, Richmond, VA 23298-0203. Phone: 804-828-5224. Fax: 804-828-1894. e-mail: <rfc822>OZCAN@HSC.VCU.EDU</rfc822>;Department of Health Administration, PO Box 980203, Virginia Commonwealth University, Richmond, VA 23298-0203. Phone: 804-828-5224. Fax: 804-828-1894. e-mail: <rfc822>OZCAN@HSC.VCU.EDU</rfc822>;Department of Health Administration, PO Box 980203, Virginia Commonwealth University, Richmond, VA 23298-0203. Phone: 804-828-5224. Fax: 804-828-1894. e-mail: <rfc822>OZCAN@HSC.VCU.EDU</rfc822> 309725
- 309790
-;-;- 309855
-;-;- 309920
-;- 309985
-;- 310050
-;-;- 310115
-;- 310180
-;-;-;-;-;- 310245
-;- 310310
-;-;- 310375
-;-;- 310440
- 310635
-;- 310700
- 310830
- 310895
Language Technologies Institute, Carnegie Mellon University, Pittsburgh, PA 15213;Language Technologi

-;- 331955
- 332020
University of Ulster, Northern Ireland  (E-mail: <rfc822>p.mckevitt@qub.ac.uk</rfc822>);National University of Ireland, Galway (NUI Galway),  Ireland;Nous Research & CSSI, Ireland 332085
-;-;-;-;- 332150
Department of Mathematics, Colgate University, Hamilton, New York, 11346, f1aaron@math.colgate.eduf1;Department of Mathematics and Statistics, South Dakota State University, Brookings, South Dakota, 57007, f2daniel_schaal@sdstate.eduf2 332215
-;-;- 332280
-;- 332345
LLP-CESALP, ESIA Engineering School, University of Savoie, France;LLP-CESALP, ESIA Engineering School, University of Savoie, France 332410
- 332475
- 332605
-;- 332670
Rutgers Univ., New Brunswick, NJ 332735
-;-;-;- 332800
- 332865
Institute of Scientific Computing, ETH Zurich, Zurich, Switzerland 332930
-;- 332995
Computer Science Department, The University of Texas at Tyler, 3900 University Boulevard, Tyler, Texas 333060
-;-;-;- 333125
-;- 333190
- 333255
-;-;- 333320
-;-;- 333385
- 333515
-;-;- 333645

Nara Institute of Science and Technology, Nara, Japan and ATR Media Information Science Laboratories, Kyoto, Japan;National Institute of Informatics, Tokyo, Japan and Nara Institute of Science and Technology, Nara, Japan 379210
-;-;-;- 379340
- 379600
-;-;- 379665
-;-;- 379730
-;-;- 379795
-;- 379860
-;-;- 379925
-;- 379990
-;-;-;-;- 380055
-;- 380120
-;- 380185
-;- 380250
-;- 380315
-;- 380380
-;- 380445
-;-;- 380510
-;-;- 380575
-;- 380640
-;- 380705
-;- 380770
-;-;- 380835
-;- 380900
-;- 380965
-;- 381030
-;- 381095
-;- 381160
-;- 381225
-;- 381290
-;- 381355
-;-;- 381420
-;- 381485
-;- 381550
-;- 381615
-;- 381680
-;- 381745
-;- 381810
-;- 381875
-;- 381940
-;- 382005
-;-;-;- 382070
-;- 382135
-;- 382200
-;-;-;- 382265
- 382330
-;-;- 382395
-;-;-;-;-;- 382460
-;-;-;- 382525
-;-;- 382590
-;-;-;-;-;- 382655
- 382720
-;- 382785
-;- 382850
-;- 382915
-;- 382980
-;-;- 383045
- 383110
-;-;-;- 383175
-;- 383240
-;- 383305
-;- 383370
-;- 383435
-;- 383500
- 383565
-;-;-;-;-;-;-;-;- 383630


UMBC, Department of Information Systems, Academic IV Building, B Wing, 1000 Hilltop Circle, Baltimore, MD 21250 USA <rfc822>cseaman@umbc.edu</rfc822> 420875
Tilburg University, Faculty of Law /  Eindhoven University of Technology, Faculty of  Technology Management, The Netherlands.  E-mail: <rfc822>anton.vedder@kub.nl</rfc822> 420940
Department of Languages and Philosophy, Utah State University, Logan, UT 84322-0720, USA E-mail: <rfc822>dmichel@cc.usu.edu</rfc822> 421005
Department of Computer Science, Keele University, UK 421070
Centrum voor Wiskunde en Informatica, P.O. Box 94079, 1090 GB Amsterdam, The Netherlands  (E-mail: Email: nivelle@cwi.nl) 421980
- 422500
Department of Philosophy, University of Alabama at Birmingham,  Birmingham, AL 35294, U.S.A. E-mail: litch@uab.edu 422695
Corresponding address: Visual Communication Lab., Human Interface Labs., Nippon Telegraph and Telephone Corporation, 1-1 Hikarinooka, Yokosuka, Kanagawa 239,  Japan. E-mail: okuda@nttvdt.hil.ntt.c ...;Cor

- 448955
- 449995
Department of Mathematics, The George Washington University, Washington, DC 450385
Interdepartmental Group of Biostatistics, University of California, 367 Evans Hall, 94720-3860 Berkeley, CA;Department of Statistics, Pennsylvania State University, PA 450515
Equipe d'analyse numerique member de l'UMR 5585 Lyon-st-Etienne, 23, rue Paul Michelon, 42023 Saint-Etienne Cedex 2, France;Equipe d'analyse numerique member de l'UMR 5585 Lyon-st-Etienne, 23, rue Paul Michelon, 42023 Saint-Etienne Cedex 2, France. <rfc822>panasenko@anumsum1.univ-st-etienne.fr</rfc822> 450970
<i>Department of Computer Science, Oregon State University, Corvallis, OR 97331, U.S.A.</i> <rfc822>DIETTERICH%OREGON-STATE@CSNET-RELAY</rfc822> 451035
Chair, ACM Queue Advisory Board 451165
- 451295
Department of Microelectronics and Information Technology, Royal Institute of Technology, Stockholm, Sweden;Department of Microelectronics and Information Technology, Royal Institute of Technology, Stockholm, Swed

-;-;- 504595
-;-;- 504660
-;-;- 504725
-;-;-;-;-;-;-;-;-;- 504790
-;-;-;-;- 504855
- 505180
- 505245
- 505310
- 505375
- 505505
-;- 505635
- 505700
- 505765
- 505895
-;-;- 505960
- 506025
- 506090
- 506155
- 506220
-;- 506285
- 506350
- 506545
- 506675
-;-;- 506805
-;-;-;-;-;- 506870
-;-;-;- 506935
- 507000
- 507130
- 507195
- 507260
-;-;-;- 507390
-;-;-;- 507455
- 507520
-;- 507585
- 507715
-;- 507910
-;- 507975
- 508040
-;- 508170
- 508300
-;- 508365
-;- 508430
-;- 508495
-;-;-;- 508560
-;-;- 508625
-;- 508690
-;- 508755
- 508820
- 508885
-;-;-;-;- 509015
-;-;- 509145
- 509210
-;- 509275
- 509340
- 509470
- 509535
- 509600
-;-;-;- 509730
-;- 509795
-;- 509925
-;- 510055
-;- 510315
- 510445
-;- 510705
-;-;- 510770
- 510835
-;- 510900
-;-;- 510965
- 511030
-;-;-;- 511095
-;-;- 511160
-;-;-;- 511225
-;- 511355
- 511420
- 511485
-;- 511550
-;- 511615
- 511680
-;-;-;-;-;- 511745
-;-;-;-;- 511810
-;-;- 511875
-;- 512005
-;-;-;-;- 512200
-;-;- 512395
- 512525
-;- 512590
-;-;-;-;-;-;- 512655

Nanyang Technological University, Blk N4, 2A-13 Nanyang Avenue, Singapore 639798. <rfc822>asahtan@ntu.edu.sg</rfc822>;IBM T.J. Watson Research Center, Yorktown Heights, NY 10598, USA. <rfc822>psyu@us.ibm.com</rfc822> 569855
- 570050
Institute of Natural Sciences, College of Dunaújváros;Department of Mathematics and Computing, University of Veszprém;Department of Process Engineering, University of Veszprém 570180
University of Urbino, Italy;University of Urbino, Italy 570570
Board of Governors of Federal Reserve System, Washington, D. C. 570830
Rockwell Scientific Company, Thousand Oaks, CA;Rockwell Scientific Company, Thousand Oaks, CA;Rockwell Scientific Company, Thousand Oaks, CA 570960
- 571545
Department of Medical Informatics, College of Physicians and Surgeons, Columbia University, New York, NY;Department of Microbiology, College of Physicians and Surgeons, Columbia University, New York, NY;Department of Internal Medicine, Columbia-Presbyterian Medical Center, New York, NY;Genais

-;-;-;-;- 592475
-;- 592540
-;-;- 592605
-;-;- 592735
-;- 592800
-;- 592865
-;-;- 592930
-;-;- 593255
- 593450
-;- 593515
-;-;- 593580
-;-;- 593645
- 593710
-;-;- 593775
-;-;-;-;- 593840
-;- 593970
- 594035
-;-;- 594165
-;-;- 594230
- 594295
- 594360
-;- 594425
-;-;- 594490
- 594555
-;- 594620
-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;- 594685
-;-;- 594750
-;-;- 594815
-;-;- 594880
-;- 594945
-;-;-;- 595010
-;-;- 595075
-;-;- 595140
-;-;- 595205
-;-;-;- 595270
- 595335
-;-;- 595400
-;- 595465
- 595530
- 595595
-;- 595660
-;- 595725
-;-;- 595790
-;-;- 595855
-;- 595920
-;-;- 595985
- 596050
-;-;- 596115
-;- 596180
-;- 596245
- 596310
-;- 596375
-;- 596440
-;-;-;-;-;-;-;-;-;-;- 596505
-;-;-;-;- 596570
-;-;-;- 596635
BGS Systems, Inc., Waltham, Mass. 596700
-;-;-;-;-;-;- 596765
Cambridge University Computer Laboratory, Corn Exchange Street, Cambridge CB2 3QG England;Cambridge University Computer Laboratory, Corn Exchange Street, Cambridge CB2 3QG England 596830
CNR - National Research Council - CNU

-;- 641810
- 641940
- 642005
Department of Civil Engineering, IMAR, University of Coimbra, Pólo II--Pinhai de Marrcos, 3030-290 Coimbra, Portugal;Department of Civil Engineering, IMAR, University of Coimbra, Pólo II--Pinhai de Marrcos, 3030-290 Coimbra, Portugal;Department of Civil Engineering, IMAR, University of Coimbra, Pólo II--Pinhai de Marrcos, 3030-290 Coimbra, Portugal 642070
- 642460
- 642525
-;-;-;-;- 642590
University of Copenhagen, DIKU, Universitetsparken 1, 2100 Copenhagen, Denmark;Kansas State University, Department of Computing and Information Sciences, 234 Nichols Hall, Manhattan, KS;University of Texas at Dallas, Department of Computer Science, P.O. Box 830688, Richardson, TX 642850
- 643630
-;- 643695
- 643825
-;- 643890
-;- 643955
-;- 644020
-;-;-;- 644085
- 644150
- 644215
- 644280
-;- 644345
-;- 644410
- 644475
- 644540
- 644605
-;-;-;- 644670
-;- 644735
-;- 644800
-;- 644865
-;- 644930
-;- 644995
- 645190
-;- 645255
- 645320
- 645385
-;- 645450
- 645515
-;-;-;- 6

-;- 679380
- 679445
- 679510
- 679640
-;- 679705
- 679770
-;- 679835
-;- 679900
- 679965
-;- 680030
-;- 680095
-;-;- 680160
-;- 680225
-;-;- 680290
-;-;- 680355
- 680420
-;-;- 680485
- 680550
-;-;- 680615
-;- 680680
- 680745
-;-;- 680810
-;- 680875
- 680940
-;- 681005
- 681070
- 681135
- 681200
-;- 681265
-;-;-;- 681330
-;-;-;- 681395
-;- 681460
-;-;-;- 681525
- 681590
-;-;-;- 681655
- 681720
-;-;- 681785
- 681850
-;- 682045
- 682110
-;- 682175
- 682240
- 682305
-;-;- 682370
-;-;-;-;- 682435
-;- 682500
-;- 682565
-;- 682630
- 682695
-;- 682760
-;- 682825
-;- 682890
-;- 682955
-;-;- 683020
- 683085
-;-;- 683150
-;-;- 683215
-;- 683280
-;- 683345
- 683410
-;-;- 683475
Indiana University Purdue University Indianapolis (IUPUI);Montana State University 683605
- 683800
- 683865
- 683930
- 683995
- 684060
- 684125
- 684190
- 684255
- 684320
- 684385
- 684450
- 684515
- 684580
- 684645
- 684710
- 684775
- 684840
- 684905
- 684970
- 685035
- 685100
- 685165
- 685230
- 685295
- 685360
- 685425
-

INMOS Limited, Whitefriars, Lewins Mead, Bristol 725335
Department of Mathematical Sciences, University of Northern Colorado, Ross Hall of Sciences, Greeley, CO 725660
Temple University;Cleveland State University 725790
- 725855
National Center for Supercomputing Applications (NCSA), Champaign, Illinois 726050
Department of Computer Science and Engineering, University of Bridgeport, Bridgeport, CT 06601, USA;Department of Computer Science and Engineering, University of Bridgeport, Bridgeport, CT 06601, USA/ e-mail: <rfc822>beiwang@bridgeport.edu</rfc822>;Department of Music, University of Bridgeport, Bridgeport, CT 06601, USA 726180
SGS-Thomson Microelectronics, Agrate Brianza, Italy;University of Colorado Boulder, CO 726375
- 726700
-;-;-;- 726765
-;-;-;- 726830
-;-;- 726960
-;- 727025
-;-;- 727155
- 727220
-;- 727285
- 727350
Bell Communications Research, Morristown, New Jersey 727480
-;-;- 727740
- 727935
-;-;- 728000
-;-;- 728065
-;-;- 728130
-;-;- 728195
Opalon, 4-Siromyatnichesky

Redwood City, CA;Adobe Systems Inc., San Jose, CA 761345
IBM Tokyo Research Laboratory, 1623-14 Shimotsuruma, Yamato-shi, Kanagawa 242-8502, Japan 761410
-;- 761475
- 761540
-;-;- 761605
-;- 761670
-;- 761735
-;- 761800
-;- 761865
-;- 761930
-;- 761995
Computer Sciences Corporation, San Diego, CA 762515
University of Southern California;-;-;-;-;- 762580
Electronics and Communication Engineering Department, Cairo University, Egypt;Electronics and Communication Engineering Department, Cairo University, Egypt;Electronics and Communication Engineering Department, Cairo University, Egypt. <rfc822>asoliman@idsc.net.eg</rfc822> 763490
University of Memphis, Department of Mathematics, USA and Trinity College, Cambridge CB2 1TQ, UK;University College London, Department of Mathematics, UK 763555
Laboratory of Semiconductor Physics, Department of Physics, University of Leuven, Celestijnenlaan 200D, 3001 Leuven, Belgium;Laboratory of Semiconductor Physics, Department of Physics, University of Leuv

Future Applications Lab, Viktoria Institute, Hörselgången 4, 417 56, Göteborg, Sweden;Embedded Interaction Research Group, University of Munich, Amalienstrasse 17, 80333, München, Germany;Visualization Department, Zuse Institute Berlin (ZIB), Takustrasse 7, 14195, Berlin, Germany 777855
- 777920
- 777985
-;- 778050
-;- 778115
Graz University of Technology, Department of Mathematics C, Steyrergasse 30 / III, A-8010, Graz, Austria 778180
Institut fur Geometrie, TU Wien, A-1040 Wien, Austria (e-mail: michael.drmota@tuwien.ac.at);Department of Computer Science, Purdue University, W. Lafayette, IN 47907, USA (e-mail: spa@cs.purdue.edu) 778440
Chinese Academy of Engineering 778895
University of Aizu;University of Aizu;University of Aizu;University of Aizu 778960
Dalian University of Technology;Dalian University of Technology;Dalian University of Technology 779025
Universität Karlsruhe 779220
ABB Switzerland Ltd., Corporate Research;ABB Switzerland Ltd., Corporate Research;ABB Switzerland Ltd

Department of Mechanics and Mathematics, Moscow Lomonosov State University, Moscow 119992, Russia 789945
-;- 790075
Computer Science Division, University of California, Berkeley, CA 94720-1776, USA;Department of Computer Science, The University of Texas at San Antonio, 6900 N. Loop 1604 West, San Antonio, TX 78249-0667, USA 790140
Augusta State University 790205
Valparaiso University, Valparaiso, IN;Butler University, Indianapolis, IN 790270
Institute of Automation, Chinese Academy of Sciences, Beijing (China) 100080 (E-mail: jzyu@compsys.ia.ac.cn);Institute of Automation, Chinese Academy of Sciences, Beijing (China) 100080 (E-mail: jzyu@compsys.ia.ac.cn);Institute of Automation, Chinese Academy of Sciences, Beijing (China) 100080 (E-mail: jzyu@compsys.ia.ac.cn) 790790
- 790855
Mississippi Center for Supercomputing Research 790920
Atlas Computing Division, Rutherford Laboratory, Chilton Didcot Oxon, England;Atlas Computing Division, Rutherford Laboratory, Chilton Didcot Oxon, England 7

Department of Computational Intelligence and Systems Science, Interdisciplinary Graduate School of Science and Engineering, Tokyo Institute of Technology, Yokohama, 226-8502 Japan;Department of Computational Intelligence and Systems Science, Interdisciplinary Graduate School of Science and Engineering, Tokyo Institute of Technology, Yokohama, 226-8502 Japan 805545
- 805610
Graduate School of Information Science, Nara Institute of Science and Technology, Ikoma City, 630-0101 Japan (Presently with Internet Initiative Japan, Inc.);Graduate School of Information Science, Nara Institute of Science and Technology, Ikoma City, 630-0101 Japan (Presently with Sharp Corp.);Graduate School of Information Science, Nara Institute of Science and Technology, Ikoma City, 630-0101 Japan (Presently with Red Hat, Inc.);Graduate School of Information Science, Nara Institute of Science and Technology, Ikoma City, 630-0101 Japan;Graduate School of Information Science, Nara Institute of Science and Technolog

- 814710
Science Applications International Corporation, San Diego, CA 814905
PRC Inc., McLean, VA;PRC Inc., McLean, VA 814970
Distribution Control Systems, Inc., Hazelwood, MO, U.S.A.;University of California at San Diego, San Diego, CA, U.S.A.;Department of Electrical and Computer Engineering, Brigham Young University, Provo, UT 84602, U.S.A. 815360
Texas Instruments, Avenue Jack Kilby, BP 5 06270 Villeneuve, Loubet, France (Formerly with STMicroelectronics Via Tolomeo, 1 20010 Cornaredo (MI), Italy);STMicroelectronics, Via Tolomeo, 1 20010 Cornaredo (MI), Italy;Department of Innovation Engineering, University of Lecce, Via per Monteroni, 73100 Lecce, Italy 815555
Department of Electrical Engineering, Indian Institute of Technology, Hauz Khas, New Delhi 110016, India;Department of Electrical Engineering, Indian Institute of Technology, Hauz Khas, New Delhi 110016, India 815620
Statistical Laboratory, University of Cambridge, Wilberforce Road, Cambridge CB3 0WB, United Kingdom;Statist

University of New Mexico, Albuquerque, NM 829530
University of Pittsburgh, Pittsburgh, Pennsylvania 829595
Center for Visual Computing, Department of Computer Science, State University of New York at Stony Brook, Stony Brook, NY 11794, USA.;Center for Visual Computing, Department of Computer Science, State University of New York at Stony Brook, Stony Brook, NY 11794, USA. 830050
Kupala State University, Grodno, Belarus 830180
Department of Computer Science, University College London, Gower Street, London WC1E 6BT, UK;Department of Computer Science, University College London, Gower Street, London WC1E 6BT, UK;Department of Computer Science, University College London, Gower Street, London WC1E 6BT, UK;Department of Computer Science, University College London, Gower Street, London WC1E 6BT, UK 830375
Department of Electronic Engineering, Northern Taiwan Institute of Science and Technology, Peito, Taipei, Taiwan, Republic of China 830440
Berkeley Phylogenomics Group, Department of Bioengin

- 833365
Institute of Immunology, University of Rostock  Schillingallee 70, D-18057 Rostock, Germany;Institute of Immunology, University of Rostock  Schillingallee 70, D-18057 Rostock, Germany;Institute of Immunology, University of Rostock  Schillingallee 70, D-18057 Rostock, Germany;Institute of Neurology, University of Rostock  Schillingallee 70, D-18057 Rostock, Germany;Institute of Immunology, University of Rostock  Schillingallee 70, D-18057 Rostock, Germany 833495
Department of Biotechnology, School of Engineering, Nagoya University                Furo-cho, Chikusa-ku, Nagoya 464-8603, Japan;Department of Biotechnology, School of Engineering, Nagoya University                Furo-cho, Chikusa-ku, Nagoya 464-8603, Japan;Department of Biotechnology, School of Engineering, Nagoya University                Furo-cho, Chikusa-ku, Nagoya 464-8603, Japan 833560
Departments of Medicine, Center for Bioinformatics and Molecular Biostatistics, University of California  San Francisco, CA 9414

- 839735
North Carolina State University, Department of Physics, Raleigh, NC;North Carolina State University, Department of Physics, Raleigh, NC;Stanford Synchrotron Radiation Laboratories;University of Leuven, Belgium;University of Leuven, Belgium;Freescale Corporation;Freescale Corporation;Vanderbilt University;North Carolina State University, Department of Physics, Raleigh, NC 840320
Ecole Polytechnique Fédérale de Lausanne (EPFL), Institute of Theoretical Physics, Lausanne, Switzerland and Institut Romand de Recherche Numérique en Physique des Materiaux (IRRMA) ...;Ecole Polytechnique Fédérale de Lausanne (EPFL), Institute of Theoretical Physics, Lausanne, Switzerland and Institut Romand de Recherche Numérique en Physique des Materiaux (IRRMA) ... 840385
- 840580
Rutgers University;Università degli Studi di Milano, Italy;The Free Haven Project 840710
University of Pennsylvania, Philadelphia, Pa;University of Pennsylvania, Philadelphia, Pa 841230
Bioinformatics Research Centre (BIRC

Ralston Purina, St. Louis, MO 848185
Department of Computer Engineering, Sharif University of Technology, Tehran, Iran and School of Computer Science, IPM, Tehran, Iran;Department of Computing Science, University of Glasgow, Glasgow G12 8RZ, UK;School of Information Technologies, University of Sydney, Sydney NSW 2006, Australia 848250
- 848770
Sandia Nat. Labs., Sandia Corp., Albuquerque, NM, USA;Sandia Nat. Labs., Sandia Corp., Albuquerque, NM, USA;Sandia Nat. Labs., Sandia Corp., Albuquerque, NM, USA 848835
CSE Dept., Univ. at Buffalo, SUNY, Amherst, MA, USA 848900
Univ. of Tokyo, Japan;Univ. of Tokyo, Japan;Univ. of Tokyo, Japan;Dept. of Comput. Sci., Virginia Univ., Charlottesville, VA, USA 848965
Tsinghua University, Beijing, China;Tsinghua University, Beijing, China;Tsinghua University, Beijing, China;Tsinghua University, Beijing, China 849030
RMIT;RMIT;RMIT 849095
Universidad Rey Juan Carlos Campus de Mostoles, Madrid, Spain;Universidad Rey Juan Carlos Campus de Mostoles, Madrid

Jet Propulsion Laboratory, Pasadena, California;Jet Propulsion Laboratory, Pasadena, California;Jet Propulsion Laboratory, Pasadena, California 860210
Long Island, NY 860925
McMaster University, Canada;McMaster University, Canada 860990
CEDAR, University at Buffalo, Amherst, NY;CEDAR, University at Buffalo, Amherst, NY;CEDAR, University at Buffalo, Amherst, NY 861055
Arcticus Systems Sweden;Malardalen Research and Technology Centre (MRTC) Sweden;Malardalen Research and Technology Centre (MRTC) Sweden 861120
Carnegie Mellon University;Carnegie Mellon University 861185
University of Basel;University of Basel;University of Basel 861250
University of Central Florida;University of Central Florida 861315
Zhejiang University Hangzhou, China;Zhejiang University Hangzhou, China 861380
Osaka Prefecture University, Japan 861445
Dept. of Inf. & Commun. Eng., Sunmoon Univ., Chungnam, South Korea;Dept. of Comput. Sci., Boston Univ., MA, USA 861510
IBM Deutschland GmbH 861575
Mathematics Department, 

BBC New Media, Broadcast Centre, Media Village, White City, London 871650
-;- 871715
University of Houston-Clear Lake, Houston, Texas 871780
University of British Columbia, Canada;Infosys, Bangalore, India;QALabs, Vancouver, Canada;Alcatel, Paris, France 871845
DARPA/ISTO 871910
Department of Software Engineering, University of Applied Sciences Upper Austria at Hagenberg, Austria;Department of Dermatology, Medical University of Vienna, Austria;Department of Dermatology, Medical University of Vienna, Austria;Department of Dermatology, Medical University of Vienna, Austria 871975
Canada Research Chair in Adaptive Information Infrastructures for the e-Society, Faculty of Computer Science, University of New Brunswick, Fredericton, NB, Canada 872040
Department of Mechanical and Electro-Mechanical Engineering, Tamkang University, Tamsui, Taipei Hsien, Taiwan, ROC;Department of Mechanical and Electro-Mechanical Engineering, Tamkang University, Tamsui, Taipei Hsien, Taiwan, ROC 872170
San Pedr

Department of Computer Science, University of Illinois at Urbana Champaign;Department of Electrical and Computer Engineering, University of Illinois at Urbana Champaign 889460
- 889590
-;-;-;-;- 889785
Universiteit van Amsterdam- Advanced Internet Research group, Amsterdam, The Netherlands;SARA, Amsterdam, The Netherlands;ANL - Argonne National Laboratory, Argonne, IL 889915
The Smith-Kettlewell Eye Research Institute;Stella Maris Institute, Pisa;Weizmann Institute of Science, Rehovot, Israel;The Smith-Kettlewell Eye Research Institute;The Smith-Kettlewell Eye Research Institute;The Smith-Kettlewell Eye Research Institute 890110
Osaka Univ., Osaka, Japan;NTT Network Innovation Labs., Japan;National Institute of Informatics, Japan;NTT Communication Science Labs., Japan 890305
University of California—Davis;University of California—Davis;University of California—Davis;University of California—Davis;University of California—Davis 890500
ETH Zurich;Sony Computer Science Laboratories, Inc 8

Simon Fraser University, Surrey, BC 903760
Simon Fraser University, Central City, Surrey, BC Canada 903825
Marshfield Hills, Massachusetts 903890
ENSAD, Paris, France;ENSAD, Paris, France 903955
Department of Manufacturing Engineering & Management, Technical University of Denmark, Building 423 and 424, Produktionstorvet, 2800 Kgs, Lyngby, Denmark;Department of Manufacturing Engineering & Management, Technical University of Denmark, Building 423 and 424, Produktionstorvet, 2800 Kgs, Lyngby, Denmark 904020
Hamburg, Germany 904085
ZoomLab;Waag Society / for old and new media 904150
Dalhousie University, Halifax, NS, Canada 904215
National Taiwan University;National Taiwan University;National Taiwan University;National Taiwan University 904475
Weta Digital 904605
- 904735
College of Business, Eastern Michigan University, Ypsilanti, USA 48197 904865
Toyohashi University of Technology, Toyohashi, Aichi;Toyohashi University of Technology, Toyohashi, Aichi;Toyohashi University of Technology, T

Columbia University;Columbia University;Columbia University 910130
Western Washington University;Western Washington University 910195
The University of Tokyo, Hongo, Bunkyo, Tokyo, Japan;The University of Tokyo, Hongo, Bunkyo, Tokyo, Japan;The University of Tokyo, Hongo, Bunkyo, Tokyo, Japan 910260
University of California, Irvine;University of California, Irvine 910325
Semantic Studios 910520
- 910650
Department of Mathematical Sciences, University of North Carolina at Greensboro, Greensboro, North Carolina 27402-6170, USA 910910
School of Mathematics, University of Manchester, Manchester, UK M13 9PL;Department of Engineering Management and Systems Engineering, George Washington University, Washington, USA 20052 910975
- 911040
The author is with the Moran Soft Inc., Republic of Korea.,;The author is with the Faculty of Dept. of Information & Communication Engineering, ChungNam National Univ., Republic of Korea. (Corresponding author) E-mail: kjoolee@cnu.ac.kr 911235
The authors are w

Istituto per le Applicazioni del Calcolo "M. Picone", Consiglio Nazionale delle Ricerche, Viale del Policlinico, Roma, Italy 917865
- 917930
-;- 917995
- 918060
- 918190
- 918255
-;- 918320
- 918385
- 918450
-;-;- 918580
- 918645
- 918775
- 918840
-;- 918905
-;- 919165
- 919230
-;- 919295
- 919360
- 919425
- 919490
-;- 919555
- 919620
Learning Algorithms and Systems Laboratory (LASA), Ecole Polytechnique Fédérale de Lausanne (EPFL), 1015, Lausanne, Switzerland;Learning Algorithms and Systems Laboratory (LASA), Ecole Polytechnique Fédérale de Lausanne (EPFL), 1015, Lausanne, Switzerland 919685
- 919750
Argonne National Labs 919815
NVIDIA 920010
- 920075
-;-;- 920140
-;- 920205
- 920270
- 920335
- 920400
-;- 920465
-;-;- 920530
-;-;- 920595
-;- 920660
-;- 920725
- 920790
-;- 920855
-;-;-;- 920920
-;- 920985
- 921115
- 921180
- 921245
-;- 921310
-;-;-;- 921375
-;-;- 921440
-;- 921505
-;- 921635
-;- 921700
- 921765
-;-;- 921830
- 921895
-;-;-;- 921960
-;-;- 922025
- 922090
- 922155
-;-;- 9

Universität Bielefeld, Technische Fakultät, AG Genominformatik, Germany;Universität Bielefeld, Technische Fakultät, AG Genominformatik, Germany 946140
El Centro College Library, Dallas County Community College District, Dallas, TX 75202 946400
INRIA Sophia Antipolis - LIRMM, 161 rue Ada, 34092, Montpellier cedex 5, France;INRIA Rhône-Alpes, Zirst 655 avenue de l’Europe Montbonnot, 38334, Saint Ismier Cedex, France;DPA P3M CNRS, 31 chemin Joseph Aiguier, 13402, Marseille Cedex 20, France;DPA P3M CNRS, 31 chemin Joseph Aiguier, 13402, Marseille Cedex 20, France 946660
Yale University, Department of Computer Science, P. O. Box 208285, 06520-8285, New Haven, CT, USA 946790
Colby College, Department of Psychology, 5550 Mayflower Hill, 04901, Waterville, ME, USA and University of Connecticut, Center for the Ecological Study of Perception & Action, 5550 Mayflower Hill, ...;College of the Holy Cross, 5550 Mayflower Hill, 04901, Worcester, MA, USA;University of Connecticut, Center for the Ecolo

- 953615
University of Missouri, Kansas City, Conservatory of Music, 4949 Cherry, Kansas City, MO 64110-2229, USA email: rudyp@umkc.edu 953875
School of Chemistry, University of Manchester, Oxford Road, Manchester M13 9PL, England and CCLRC Daresbury Laboratory, Daresbury, Warrington, WA4 4AD, England;International Union of Crystallography, 5 Abbey Square, Chester CH1 2HU, England;International Union of Crystallography, 5 Abbey Square, Chester CH1 2HU, England 954135
University of South Alabama, Mobile, AL, USA 954200
National Institute of Information and Communications Technology, Kyoto, 619-0289 Japan;Faculty of Engineering, University of the Ryukyus, Okinawa, 903-0213 Japan 954330
University of Tsukuba;University of Tsukuba;University of Tsukuba;University of Tsukuba;University of Tsukuba;University of Tsukuba 956150
Florida International University 956345
- 956410
Motorola India Research Labs, Bangalore, India;Motorola India Research Labs, Bangalore, India 956605
Canon Research Cen

Oregon Health & Science University, USA 974480
Carnegie Mellon University, USA;Carnegie Mellon University, USA 974545
University of Oulu, 90570 Oulu, Finland;University of Oulu, 90570 Oulu, Finland 974610
Feng Chia University, Taichung, Taiwan, 40724, R.O.C.;National Chung Cheng University, Chiayi, Taiwan, 621, R.O.C.;National Chung Cheng University, Chiayi, Taiwan, 621, R.O.C. 974675
National University of Defense Technology, China;National University of Defense Technology, China 974740
Sch. of Electr. Eng. & Comput. Sci., Univ. of Central Florida, Orlando, FL, USA;- 975260
Technische Universitat Darmstadt, Germany;Technische Universitat Darmstadt, Germany;Technische Universitat Darmstadt, Germany 975325
Federal University of Pernambuco, Brazil;Federal University of Pernambuco, Brazil;Federal University of Pernambuco, Brazil;Federal University of Pernambuco, Brazil 975390
ETH Zurich, Switzerland 975455
Duke University, USA;L3S Research Center, Germany;Argonne National Laboratory, USA 

Moscow State University of Railway Transport, Moscow, Russia;Moscow State University of Railway Transport, Moscow, Russia;Moscow State University of Railway Transport, Moscow, Russia;Wayne State University, Detroit, USA 985010
University of Michigan, Ann Arbor, MI 985075
- 985725
Department of Management Engineering, Osaka Electro-Communication University, Neyagawa, Osaka, Japan 985790
Weber State University 985855
Intelligent Autonomous Systems Laboratory, Faculty of Computing, Engineering and Mathematical Sciences, University of the West of England, Bristol BS16 1QY, United Kingdom;Intelligent Autonomous Systems Laboratory, Faculty of Computing, Engineering and Mathematical Sciences, University of the West of England, Bristol BS16 1QY, United Kingdom 986440
Concordia University, Montreal, PQ, Canada 986570
Doshisha University, Kyoto, Japan;Doshisha University, Kyoto, Japan;Doshisha University, Kyoto, Japan;Doshisha University, Kyoto, Japan;Doshisha University, Kyoto, Japan 986830
Pan

University of California, Riverside 995995
University of São Paulo at São Carlos, Mechanical Engineering Department, São Carlos, SP, 13566-590, Brazil;University of São Paulo at São Carlos, Electrical Engineering Department, São Carlos, SP, 13566-590, Brazil;University of São Paulo at São Carlos, Electrical Engineering Department, São Carlos, SP, 13566-590, Brazil 996320
Columbia University, New York, NY 10027, USA;Princeton University, Princeton, NJ 08544, USA 996710
Bengal Engineering and Science University, Howrah, India 711103;Institute of Physics, Bhubaneswar, India 751005;Heritage Institute of Technology, Kolkata, India 107;Bengal Engineering and Science University, Howrah, India 711103 996840
Radboud University, Nijmegen, The Netherlands;Radboud University, Nijmegen, The Netherlands;Radboud University, Nijmegen, The Netherlands 996905
School of Information Engineering, Beijing University of Posts and Telecommunications, Beijing, China;School of Information Engineering, Beijing U

-;- 1011075


KeyboardInterrupt: 

In [20]:
### split affiliations so we can have clean data and seperate records {paper_id; affiliations}
unique_affiliations_df = affiliation_df.select(F.col("paper_id"), F.explode(F.split(F.col("affiliations"),";")).alias("affiliation"))
unique_affiliations_df.show(20, False)
affiliation_df.show(20, False)

+--------+------------------------------------------------------+
|paper_id|affiliation                                           |
+--------+------------------------------------------------------+
|65      |The Queen's University of Belfast, Belfast, UK        |
|65      |The Queen's University of Belfast, Belfast, UK        |
|130     |Univ. of Karlsruhe, Karlsruhe, West Germany           |
|195     |AERE Harwell Laboratory, Oxon, UK                     |
|195     |Queen's Univ., Belfast, Northern Ireland              |
|260     |University of Michigan, Ann Arbor, MI                 |
|260     |University of Michigan, Ann Arbor, MI                 |
|325     |Oslo politikammer, Oslo, Norway                       |
|390     |Harvard Univ., Cambridge, MA                          |
|390     |Boston Univ., Boston, MA                              |
|455     |Cornell Univ., Ithaca, NY                             |
|455     |Cornell Univ., Ithaca, NY                             |
|520     |

In [21]:
### check for special nonsense characters "-", If the affiliation is missing, there is no point of keeping the rows
###unique_affiliations_df.filter(unique_affiliations_df.affiliations=='-').collect()
unique_affiliations_df=unique_affiliations_df.where(unique_affiliations_df.affiliation!='-')

In [22]:
unique_affiliations_df.show()

+--------+--------------------+
|paper_id|         affiliation|
+--------+--------------------+
|      65|The Queen's Unive...|
|      65|The Queen's Unive...|
|     130|Univ. of Karlsruh...|
|     195|AERE Harwell Labo...|
|     195|Queen's Univ., Be...|
|     260|University of Mic...|
|     260|University of Mic...|
|     325|Oslo politikammer...|
|     390|Harvard Univ., Ca...|
|     390|Boston Univ., Bos...|
|     455|Cornell Univ., It...|
|     455|Cornell Univ., It...|
|     520|IBM General Techn...|
|     520|IBM Research Divi...|
|     520|IBM Research Divi...|
|     520|IBM Research Divi...|
|     650|New York Univ., N...|
|     780|Xerox Palo Alto R...|
|     780|Xerox Palo Alto R...|
|     780|Xerox Palo Alto R...|
+--------+--------------------+
only showing top 20 rows



In [41]:
### check for duplicate rows:
unique_affiliations_df.groupby(['paper_id', 'affiliation']).count().where('count > 1').sort('count', ascending=False).show()

[Stage 45:>                                                         (0 + 4) / 4]

+--------+--------------------+-----+
|paper_id|        affiliations|count|
+--------+--------------------+-----+
|  569905|IBM and Universit...|   91|
| 1202294|Open Grid Forum—G...|   88|
| 1542970|University of Ten...|   65|
|  418817|Humanoid Robotics...|   62|
| 1731577|IBM Semiconductor...|   59|
|  772121|IBM Research Divi...|   52|
| 1038111|IBM Thomas J. Wat...|   46|
| 1241693|INFN-CNAF V.le Be...|   44|
| 1633898|NASA Goddard Spac...|   31|
| 1077644|Carnegie Mellon U...|   31|
| 1210078|IMEC, Kapeldreef ...|   29|
|  994444|Lehrstuhl fur Ope...|   29|
|  864278|Dept. of Electr. ...|   29|
| 1229219|    No Affiliations,|   29|
|  771289|IBM Research Divi...|   28|
| 1423217|Shanghai Astronom...|   28|
|  827034|The Artist Educat...|   28|
| 1308603|Digital Hollywood...|   27|
| 1312394|Atheros Communica...|   27|
| 1972771|LinkedIn, Inc, Mo...|   27|
+--------+--------------------+-----+
only showing top 20 rows



                                                                                

In [23]:
### drop duplicate rows since here we need unique affiliations
unique_affiliations_df=unique_affiliations_df.dropDuplicates()


In [24]:
unique_affiliations_df.count()

                                                                                

1876283

# Load and clean paper_authors

In [26]:
### load paper_authors into schema
dtypes = pd.read_csv('./schemas/paper_authors.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
paper_author_df = spark.read.option('header', 'true').csv('./assets/parsedData/paper_authors.csv', header=True, schema=schema)

[('authors', 'String'), ('paper_id', 'Integer')]


In [27]:
### remove leadind and trailing spaces
paper_author_df = paper_author_df.withColumn("authors", trim(paper_author_df.authors))
paper_author_df = paper_author_df.withColumn("paper_id", trim(paper_author_df.paper_id))

### change data type for paper_id to Integer
paper_author_df = paper_author_df.withColumn("paper_id",paper_author_df["paper_id"].cast(IntegerType()))

paper_author_df.show()
paper_author_df.printSchema()

+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows

root
 |-- authors: string (nullable = true)
 |-- paper_id: integer (nullable = true)



In [28]:
null_values_paper_authors=paper_author_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in paper_author_df.columns]
   )

In [29]:
### check if authors are missing as well for the ids whose title was missing in paper_df
for rows in paper_author_df.select("authors","paper_id").collect():
    if rows[1] in null_paper_ids_list:
        print(rows[0], rows[1])

Exception ignored in: <function JavaObject.__init__.<locals>.<lambda> at 0x1117f4f70>
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.0/libexec/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1345, in <lambda>
KeyboardInterrupt: 


KeyboardInterrupt: 

In [51]:
### split authors so we can have clean data and seperate records {paper_id; author}
unique_paper_author_df = paper_author_df.select(F.col("paper_id"), F.explode(F.split(F.col("authors"),";")).alias("author"))
unique_paper_author_df.show(20, False)
paper_author_df.show(20, False)

+--------+---------------------+
|paper_id|author               |
+--------+---------------------+
|65      |K Devine             |
|65      |F J. Smith           |
|130     |J Wolff von Gudenberg|
|195     |J. K. Reid           |
|195     |A. Jennings          |
|260     |William G. Golson    |
|260     |William C. Rounds    |
|325     |Stein Schjolberg     |
|390     |W Ian Gasarch        |
|390     |Steven Homer         |
|455     |Sam Toueg            |
|455     |Özalp Babaoğlu       |
|520     |Frederick H. Dill    |
|520     |Satish Gupta         |
|520     |Daniel T. Ling       |
|520     |Richard E. Matick    |
|585     |A. R. Calderbank     |
|585     |E. G. Coffman, Jr.   |
|585     |L. Flatto            |
|650     |Uzi Vishkin          |
+--------+---------------------+
only showing top 20 rows

+---------------------------------------------------------------+--------+
|authors                                                        |paper_id|
+-------------------------------

In [52]:
### check for duplicate rows:
unique_paper_author_df.groupby(['paper_id', 'author']).count().where('count > 1').sort('count', ascending=False).show()

22/01/16 13:35:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:35:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 13:36:01 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+------------------+-----+
|paper_id|            author|count|
+--------+------------------+-----+
| 1523221|      Dongkun Shin|    4|
| 2042230|           Lu Leng|    3|
| 2040206|      Mingshu Wang|    3|
| 2040206|      Anchun Cheng|    3|
| 2059316|     Han Chuanfeng|    3|
| 1202294|         N. Sharma|    3|
| 1586139|      Hector Zenil|    2|
| 1982378|       Yuejuan Han|    2|
| 1167730|   Max A. Woodbury|    2|
| 1864850|  Pauline C. Reich|    2|
| 1612982|          A. Klemm|    2|
| 1765286|     Debashis Saha|    2|
| 1577045|Michael Bartolacci|    2|
| 1966201|            Bo Liu|    2|
| 1180350|        R. Schafer|    2|
| 1434302|         Meir Russ|    2|
| 1297077|            Bei Yu|    2|
| 1443917|       Lingli Zhao|    2|
| 1947014|  Steven Warburton|    2|
| 1198485|   Nedeljko Cvejic|    2|
+--------+------------------+-----+
only showing top 20 rows





In [55]:
### drop duplicate rows since here we need unique paper-author relation
unique_paper_author_df=unique_paper_author_df.dropDuplicates()

In [57]:
unique_paper_author_df.show(truncate=False)



+--------+-------------------+
|paper_id|author             |
+--------+-------------------+
|1117    |Benjamin Kuipers   |
|1574    |J. G. Brookshear   |
|1707    |C. Ghezzi          |
|1829    |Peter M. Stephan   |
|2080    |Matthew L. Ginsberg|
|2222    |Nissim Francez     |
|2615    |Dan Benanav        |
|2872    |Trevor J. Bentley  |
|2996    |Martin T. Sullivan |
|3185    |William B. Robinson|
|3261    |Guy Lapalme        |
|3584    |L. Egghe           |
|4362    |C-T Liou           |
|4369    |Y-C Chen           |
|4424    |Ron M Roth         |
|5735    |D Eyre             |
|5860    |S Makridakis       |
|6181    |W J Baggaley       |
|6247    |Ravi B Boppana     |
|6700    |Tomas Hirschfeld   |
+--------+-------------------+
only showing top 20 rows



[Stage 76:>                                                         (0 + 1) / 1]                                                                                

# Load and clean Publication_venues df

In [31]:
### load publication_venues into schema
dtypes = pd.read_csv('./schemas/publication_venues.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
publication_venue_df = spark.read.option('header', 'true').csv('./assets/parsedData/publication_venues.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('publication_venue', 'String')]


In [32]:
publication_venue_df = publication_venue_df.withColumn("publication_venue", trim(publication_venue_df.publication_venue))
publication_venue_df = publication_venue_df.withColumn("paper_id", trim(publication_venue_df.paper_id))
publication_venue_df = publication_venue_df.withColumn("paper_id",publication_venue_df["paper_id"].cast(IntegerType()))
publication_venue_df.show()

+--------+--------------------+
|paper_id|   publication_venue|
+--------+--------------------+
|      65|Information Techn...|
|     130|Proc. of the symp...|
|     195|ACM Transactions ...|
|     260|Information and C...|
|     325|Computers and pen...|
|     390|Information and C...|
|     455|SIAM Journal on C...|
|     520|IBM Journal of Re...|
|     585|Journal of the AC...|
|     650|Theoretical Compu...|
|     715|            Computer|
|     780|ACM Transactions ...|
|     845|Methods and tools...|
|     910|Information Proce...|
|     975|ACM Transactions ...|
|    1040|Information Proce...|
|    1105|ACM Transactions ...|
|    1170|Proc. of the 2nd ...|
|    1235|Proc. of the inte...|
|    1300|Foundations of co...|
+--------+--------------------+
only showing top 20 rows



In [33]:
null_values_publication_venue=publication_venue_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in publication_venue_df.columns]
   )

In [34]:
publication_venue_df.filter(publication_venue_df['publication_venue'].isNull()).show()

[Stage 22:>                                                         (0 + 1) / 1]                                                                                

+--------+-----------------+
|paper_id|publication_venue|
+--------+-----------------+
|  109525|             null|
|  987870|             null|
| 1065415|             null|
| 1555515|             null|
|  144171|             null|
|  804766|             null|
| 1032591|             null|
| 1986272|             null|
| 1013548|             null|
| 1054433|             null|
| 1785618|             null|
|  993855|             null|
| 1444435|             null|
|  893041|             null|
|  947251|             null|
|  638112|             null|
| 1074067|             null|
| 1375797|             null|
| 1376577|             null|
|  861258|             null|
+--------+-----------------+
only showing top 20 rows



# Load and clean Citations df

In [36]:
### load affiliation into schema
dtypes = pd.read_csv('./schemas/citations.csv').to_records(index=False).tolist()
print(dtypes)
fields = [StructField(dtype[0], globals()[f'{dtype[1]}Type']()) for dtype in dtypes]
schema = StructType(fields)
citation_df = spark.read.option('header', 'true').csv('./assets/parsedData/citations.csv', header=True, schema=schema)

[('paper_id', 'Integer'), ('ref_ids', 'String')]


In [37]:
citation_df.show()

+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [38]:
### remove leading and trailing spaces
citation_df = citation_df.withColumn("ref_ids", trim(citation_df.ref_ids))
citation_df = citation_df.withColumn("paper_id", trim(citation_df.paper_id))
### change data type of paper_id to Integer
citation_df = citation_df.withColumn("paper_id",citation_df["paper_id"].cast(IntegerType()))
citation_df.show()

+--------+--------------------+
|paper_id|             ref_ids|
+--------+--------------------+
|      65|                null|
|     130|                null|
|     195|317424;317425;317573|
|     260|                null|
|     325|                null|
|     390|                null|
|     455|                null|
|     520|       318368;323493|
|     585|                null|
|     650|                null|
|     715|                null|
|     780|318420;319233;319...|
|     845|                null|
|     910|                null|
|     975|67604;318882;3718...|
|    1040|                null|
|    1105|289087;318014;318...|
|    1170|                null|
|    1235|                null|
|    1300|                null|
+--------+--------------------+
only showing top 20 rows



In [39]:
### check for duplicate rows
citation_df.groupby(['paper_id', 'ref_ids']).count().where('count > 1').sort('count', ascending=False).show()



+--------+-------+-----+
|paper_id|ref_ids|count|
+--------+-------+-----+
+--------+-------+-----+



                                                                                

In [40]:
### split citations so we can have clean data and seperate records {paper_id; ref_id}
unique_citation_df = citation_df.select(F.col("paper_id"), F.explode(F.split(F.col("ref_ids"),";")).alias("ref_id"))
unique_citation_df.show(20, False)
citation_df.show(20, False)

+--------+------+
|paper_id|ref_id|
+--------+------+
|195     |317424|
|195     |317425|
|195     |317573|
|520     |318368|
|520     |323493|
|780     |318420|
|780     |319233|
|780     |319290|
|780     |319579|
|780     |320813|
|975     |67604 |
|975     |318882|
|975     |371812|
|975     |404772|
|975     |604232|
|975     |834995|
|975     |836008|
|1105    |289087|
|1105    |318014|
|1105    |318186|
+--------+------+
only showing top 20 rows

+--------+------------------------------------------------+
|paper_id|ref_ids                                         |
+--------+------------------------------------------------+
|65      |null                                            |
|130     |null                                            |
|195     |317424;317425;317573                            |
|260     |null                                            |
|325     |null                                            |
|390     |null                                            |
|4

In [41]:
### change datat type of ref_id to Integer
unique_citation_df = unique_citation_df.withColumn("ref_id",unique_citation_df["ref_id"].cast(IntegerType()))

In [42]:
unique_citation_df.printSchema()

root
 |-- paper_id: integer (nullable = true)
 |-- ref_id: integer (nullable = true)



In [43]:
### check for duplicate rows
unique_citation_df.groupby(['paper_id', 'ref_id']).count().where('count > 1').sort('count', ascending=False).show()

22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
22/01/16 15:37:53 WARN RowBasedKeyValueBatch: Calling spill() on



22/01/16 15:37:55 ERROR Executor: Exception in task 2.0 in stage 33.0 (TID 122)
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:50)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:523)
	at org.apache.spark.sql.execution.UnsafeKVExternalSorter.sortedIterator(UnsafeKVExternalSorter.java:206)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.finishAggregate(HashAggregateExec.scala:493)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterat

Py4JJavaError: An error occurred while calling o459.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 33.0 failed 1 times, most recent failure: Lost task 2.0 in stage 33.0 (TID 122) (172.20.10.2 executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:50)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:523)
	at org.apache.spark.sql.execution.UnsafeKVExternalSorter.sortedIterator(UnsafeKVExternalSorter.java:206)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.finishAggregate(HashAggregateExec.scala:493)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:32)
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:628)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1518)
	at org.apache.spark.rdd.RDD$$Lambda$3681/0x0000000801586840.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.RDD$$Lambda$2871/0x0000000801300040.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2441/0x0000000801142c40.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2309)
	at org.apache.spark.rdd.RDD.$anonfun$reduce$1(RDD.scala:1120)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.reduce(RDD.scala:1102)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$1(RDD.scala:1524)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.takeOrdered(RDD.scala:1512)
	at org.apache.spark.sql.execution.TakeOrderedAndProjectExec.executeCollect(limit.scala:204)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:338)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:366)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:338)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillReader.<init>(UnsafeSorterSpillReader.java:50)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeSorterSpillWriter.getReader(UnsafeSorterSpillWriter.java:159)
	at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.getSortedIterator(UnsafeExternalSorter.java:523)
	at org.apache.spark.sql.execution.UnsafeKVExternalSorter.sortedIterator(UnsafeKVExternalSorter.java:206)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.finishAggregate(HashAggregateExec.scala:493)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.agg_doAggregateWithKeys_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage2.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.convert.Wrappers$IteratorWrapper.hasNext(Wrappers.scala:32)
	at org.sparkproject.guava.collect.Ordering.leastOf(Ordering.java:628)
	at org.apache.spark.util.collection.Utils$.takeOrdered(Utils.scala:37)
	at org.apache.spark.rdd.RDD.$anonfun$takeOrdered$2(RDD.scala:1518)
	at org.apache.spark.rdd.RDD$$Lambda$3681/0x0000000801586840.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.RDD$$Lambda$2871/0x0000000801300040.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2441/0x0000000801142c40.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 62715)
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.8/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/local/Cellar/python@3.9/3.9.8/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/local/Cellar/python@3.9/3.9.8/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/local/Cellar/python@3.9/3.9.8/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/Cellar/apache-spark/3.2.0/libexec/python/pyspark/accumulators.py", line 26

### Compute paper count per unique affiliation

In [44]:
paper_count_per_affiliation_df = unique_affiliations_df.groupBy('affiliations').count()
print(paper_count_per_affiliation_df.show())

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.0/libexec/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/Cellar/python@3.9/3.9.8/Frameworks/Python.framework/Versions/3.9/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/Cellar/apache-spark/3.2.0/libexec/python/lib/py4j-0.10.9.2-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/Cellar/apache-spark/3.2.0/libexec/python/lib/py4j-0.10.9.2-src.zip/py4j/clientserver.py", line 503, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or

Py4JError: functions does not exist in the JVM

### Validate precomputed paper counts (per author)

In [45]:
papers_per_author_df = paper_auhors.groupBy("author")
papers_per_author_count_df = papers_per_authors_df.count()

NameError: name 'paper_auhors' is not defined

### Validate precomputed ref counts (per author) - ?

### Validate precomputed h indexes (per author)

#### Important links
1. Join https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.join.html
2. OrderBy https://sparkbyexamples.com/pyspark/pyspark-orderby-and-sort-explained/#sort


1. Retrieve all publications of the author
2. Calculate the number of references per publication
3. Sort the results in descending order
4. Find a threshold N, where N top publications have at least N references each. N is the h-index of the author.


In [48]:
# Retrieve all publications per author
print(papers_per_author_df)

NameError: name 'papers_per_author_df' is not defined

In [49]:
# Calculate the number of references per publication
refs_per_paper_count_df = ref_ids.groupBy("paper_id").count()

NameError: name 'ref_ids' is not defined

In [50]:
# Join and sort the results in descending order
author_papers_with_ref_count = papers_per_author_df.join(refs_per_paper_count_df, 'paper_id')
sorted_papers_with_ref_count = author_papers_with_ref_count.sort(author_papers_with_ref_count.ref_ids.desc())


NameError: name 'papers_per_author_df' is not defined