In [1]:
import findspark
findspark.init()
import pandas as pd;
import csv;
import numpy as np;
from pyspark import SparkContext
from pyspark import SparkConf
import re
from pyspark.sql.types import StringType, ArrayType,StructType,StructField

In [2]:
# Build the Spark configuration for this notebook.
conf = SparkConf()
conf.setAppName('Read & parse text data file in pyspark')

# Reuse an already-running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Maps each line prefix found in the paper dataset to the field name used in
# the dictionaries built by this notebook.
PAPER_KEYS_MAP = {
    "#index": "paper_id",
    "#*": "title",
    "#@": "authors",
    "#o": "affiliations",
    "#t": "year",
    "#c": "publication_venue",
    "#%": "ref_ids",
    "#!": "abstract",
}
PAPER_DATASET_PATH = './assets/AMiner-Paper.txt'

# Same idea for the author dataset: line prefix -> field name.
AUTHOR_KEYS_MAP = {
    "#index": "id",
    "#n": "name",
    "#a": "affiliations",
    "#pc": "paper_count",
    "#cn": "citation_count",
    "#t": "research_interests",
    "#hi": "h_index",
    "#upi": "u_p_index",
    "#pi": "p_index",
}
AUTHOR_DATASET_PATH = './assets/AMiner-Author.txt'

In [17]:
def zipDatasetWithIndex(datasetPath):
    """Read a text dataset and pair every line with its position in the file.

    Args:
        datasetPath: path of the text file to load through the global
            SparkContext ``sc``.

    Returns:
        The RDD produced by ``textFile(...).zipWithIndex()``, i.e. elements of
        the form ``(line, index)``.
    """
    return sc.textFile(datasetPath).zipWithIndex()
# Load the paper dataset as (line, index) pairs; every later step starts from `lines`.
lines = zipDatasetWithIndex(PAPER_DATASET_PATH)

                                                                                

[('#index 1', 0)]


In [5]:
def exchangeElementAtIndexToOne(index, zeroArray):
    """Set ``zeroArray[index]`` to 1 in place; nothing is returned."""
    zeroArray[index] = 1

def doCumSumForIndexRows(lines):
    """Assign every dataset line the ordinal number of the entry it belongs to.

    An entry starts on a line that begins with the ``#index`` key. The result
    has one slot per line; slot ``i`` holds how many entry-start lines occur at
    or before line ``i`` (1 for the first entry, 2 for the second, ...). Lines
    that precede the first ``#index`` line get 0.

    Args:
        lines: RDD-like object of ``(line_text, line_number)`` pairs supporting
            ``filter``, ``map``, ``collect`` and ``count``.

    Returns:
        numpy integer array of length ``lines.count()``.
    """
    # Match the key only at the start of the line: a title or abstract that
    # merely mentions "#index" must not open a new entry.
    startRows = (
        lines
        .filter(lambda row: row[0].startswith("#index"))
        .map(lambda row: row[1])
        .collect()
    )

    # Flag each entry-start row with 1; explicit intp dtype keeps the fancy
    # index valid even when no start row was found.
    startFlags = np.zeros(lines.count(), dtype=int)
    startFlags[np.asarray(startRows, dtype=np.intp)] = 1

    # Running total of the flags == entry number for every row.
    return np.cumsum(startFlags)

In [6]:
# Per-line entry numbers for the paper dataset (one array slot per line of `lines`).
summedArray = doCumSumForIndexRows(lines)

                                                                                

In [7]:
def createBroarcastedTuple(x, arrayOfStartRowIndexes):
    """Re-key one ``(line, row_number)`` pair by the entry it belongs to.

    Args:
        x: pair whose first item is the line text and second item the row
            number used to index the lookup array.
        arrayOfStartRowIndexes: object exposing the lookup array as ``.value``.

    Returns:
        ``(arrayOfStartRowIndexes.value[row_number], line)``.
    """
    lineText, rowNumber = x[0], x[1]
    entryNumber = arrayOfStartRowIndexes.value[rowNumber]
    return (entryNumber, lineText)

def convertDataIntoIndexedTuples(datasetLines, cumSummedIndexRowsArray):
    """Turn ``(line, row_number)`` pairs into ``(entry_number, line)`` pairs.

    Args:
        datasetLines: RDD of ``(line, row_number)`` pairs.
        cumSummedIndexRowsArray: array mapping a row number to its entry
            number; it is broadcast through the global ``sc``.

    Returns:
        The mapped RDD of ``(entry_number, line)`` pairs.
    """
    sharedEntryNumbers = sc.broadcast(cumSummedIndexRowsArray)

    def tagWithEntryNumber(dataLine):
        return createBroarcastedTuple(dataLine, sharedEntryNumbers)

    return datasetLines.map(tagWithEntryNumber)

In [32]:
def list_get(l, x, d=None):  # safe getter of list values
    """Return ``l[x]``, or ``d`` when that position is missing or holds a falsy value.

    Args:
        l: sequence to read from.
        x: index to look up.
        d: fallback returned when ``x`` is out of range or ``l[x]`` is falsy
            (``''``, ``None``, ``0``, ...).
    """
    try:
        value = l[x]
    except IndexError:
        # Out-of-range index: this is the case a "safe" getter has to absorb.
        return d
    return value if value else d

def splitIntoKeyValue(stringToSplit, keyMap):
    """Parse one raw dataset line into a single-entry ``{field_name: value}`` dict.

    The text before the first space is the raw key; it is translated through
    ``keyMap`` and everything after that first space becomes the value.

    Args:
        stringToSplit: a raw line such as ``"#t 1998"``. Anything that is not
            a plain ``str`` (e.g. an already-built dict) is returned unchanged.
        keyMap: mapping from raw key (``"#t"``) to field name (``"year"``).

    Returns:
        ``{mapped_key: value}`` when the raw key is known, ``{}`` for an empty
        line or an unknown key, or the input itself when it is not a ``str``.
        A line holding only a key (no space, e.g. ``"#index"``) yields an
        empty-string value instead of raising.
    """
    if type(stringToSplit) is not str:
        return stringToSplit

    # partition never raises: a missing separator simply gives an empty value.
    rawKey, _, value = stringToSplit.partition(" ")
    mappedKey = keyMap.get(rawKey)
    if not mappedKey:
        return {}
    return {mappedKey: value}

def appendStringOrListIntoList(lst, elToAppend):
    """Add a string, or every item of an iterable, to ``lst`` in place.

    Args:
        lst: list to grow; it is mutated.
        elToAppend: ``None`` (ignored), a ``str`` (appended as one item) or an
            iterable of items (each one appended).

    Returns:
        The same ``lst`` object, so callers may either use the return value or
        rely on the in-place update.
    """
    if elToAppend is None:
        return lst
    if isinstance(elToAppend, str):
        lst.append(elToAppend)
    else:
        # extend (not `lst + ...`) so the caller's list is updated too, exactly
        # like the str branch.
        lst.extend(elToAppend)
    return lst

# Paper related
def convertFeatures(dct, affiliations_data=None, papers_data=None, paper_authors_data=None, publication_venues_data=None):
    """Route one parsed paper field into the output dict it belongs to.

    ``dct`` is expected to carry a single field. Only the first non-empty field
    found, in the order checked below, is handled; empty values and unhandled
    fields are ignored. The target dicts are updated in place and nothing is
    returned.

    Args:
        dct: parsed field, e.g. ``{"year": "1998"}``.
        affiliations_data: receives ``paper_id`` and ``affiliations``.
        papers_data: receives ``paper_id``, ``title``, ``year`` and the
            accumulated ``ref_ids`` list.
        paper_authors_data: receives ``paper_id`` and ``authors``.
        publication_venues_data: receives ``paper_id`` and ``publication_venue``.
    """
    # Fresh dicts per call instead of shared mutable defaults.
    if affiliations_data is None:
        affiliations_data = {}
    if papers_data is None:
        papers_data = {}
    if paper_authors_data is None:
        paper_authors_data = {}
    if publication_venues_data is None:
        publication_venues_data = {}

    if dct.get("paper_id"):
        # The paper id is the join key, so it is copied into every output dict.
        paper_id = dct.get("paper_id")
        for target in (affiliations_data, papers_data, paper_authors_data, publication_venues_data):
            target["paper_id"] = paper_id
    elif dct.get("affiliations"):
        affiliations_data["affiliations"] = dct.get("affiliations")
    elif dct.get("ref_ids"):
        # A paper can have several reference lines; collect them in a list and
        # create that list on demand so a bare dict does not raise KeyError.
        refs = papers_data.setdefault("ref_ids", [])
        newRefs = dct.get("ref_ids")
        if isinstance(newRefs, str):
            refs.append(newRefs)
        else:
            refs.extend(newRefs)
    elif dct.get("authors"):
        paper_authors_data["authors"] = dct.get("authors")
    elif dct.get("title"):
        papers_data["title"] = dct.get("title")
    elif dct.get("year"):
        papers_data["year"] = dct.get("year")
    elif dct.get("publication_venue"):
        publication_venues_data["publication_venue"] = dct.get("publication_venue")
        
# Paper related
def _foldReducedFeatures(reduced, affiliations_data, papers_data, paper_authors_data, publication_venues_data):
    """Merge an already-reduced record (the dict shape returned by reduceFeaturesToDict) into the four working dicts."""
    for field, value in (reduced.get("papers_data") or {}).items():
        if field == "ref_ids":
            # References accumulate; every other field is a plain overwrite.
            papers_data.setdefault("ref_ids", []).extend(value)
        else:
            papers_data[field] = value
    affiliations_data.update(reduced.get("affiliations_data") or {})
    paper_authors_data.update(reduced.get("paper_authors_data") or {})
    publication_venues_data.update(reduced.get("publication_venues_data") or {})


def reduceFeaturesToDict(featureA='', featureB='', keyMap={}):
    """Combine two values of the same paper into one aggregated record.

    Written to be used as the function of ``reduceByKey``. Each argument is
    either a raw dataset line (``str``) or the dict returned by an earlier call
    of this function. Both kinds are accepted on both sides, because partial
    results built in different partitions are combined with this same function.

    Args:
        featureA: raw line or previously reduced record. A reduced record's
            inner dicts are reused and updated in place.
        featureB: raw line or previously reduced record.
        keyMap: raw key -> field name mapping passed to ``splitIntoKeyValue``.

    Returns:
        dict with the keys ``papers_data``, ``affiliations_data``,
        ``paper_authors_data`` and ``publication_venues_data``.

    Raises:
        Whatever the parsing/merging raises; the offending inputs are printed
        first and the exception is re-raised.
    """
    papers_data = {"ref_ids": []}
    affiliations_data = {}
    paper_authors_data = {}
    publication_venues_data = {}
    try:
        splittedA = splitIntoKeyValue(featureA, keyMap)
        splittedB = splitIntoKeyValue(featureB, keyMap)

        if splittedA.get('papers_data'):
            # A is already an aggregate: keep building on its dicts.
            papers_data = splittedA.get('papers_data')
            affiliations_data = splittedA.get('affiliations_data')
            paper_authors_data = splittedA.get('paper_authors_data')
            publication_venues_data = splittedA.get('publication_venues_data')
        else:
            convertFeatures(splittedA, affiliations_data, papers_data, paper_authors_data, publication_venues_data)

        if splittedB.get('papers_data'):
            # B can be an aggregate too (e.g. a record whose lines were split
            # across partitions); merge it instead of silently dropping it.
            _foldReducedFeatures(splittedB, affiliations_data, papers_data, paper_authors_data, publication_venues_data)
        else:
            convertFeatures(splittedB, affiliations_data, papers_data, paper_authors_data, publication_venues_data)

        return {
            "papers_data": papers_data,
            "affiliations_data": affiliations_data,
            "paper_authors_data": paper_authors_data,
            "publication_venues_data": publication_venues_data,
        }
    except Exception as error:
        print("ERROR: ", featureA, featureB, error)
        raise

def convertDataIntoDicts(objects, dataKeyMap):
    """Collapse the ``(entry_number, line)`` pairs of each entry into one record dict.

    Args:
        objects: RDD of ``(entry_number, line)`` pairs.
        dataKeyMap: raw key -> field name mapping handed to
            ``reduceFeaturesToDict``.

    Returns:
        RDD holding only the reduced values (the entry-number keys are dropped).
    """
    reducedObjects = objects.reduceByKey(lambda a, b: reduceFeaturesToDict(a, b, dataKeyMap))
    # Debug peek at the record with the largest key. max() finds it in a single
    # pass; sorting the whole RDD just to read its first element is not needed.
    print(reducedObjects.max(key=lambda keyedRecord: keyedRecord[0]))
    mappedDicts = reducedObjects.map(lambda x: x[1])  # retrieve dicts from the tuples
    return mappedDicts

In [33]:
def convertDictsArrayIntoCSVFile(dictsArray, fileName):
    """Build a DataFrame from an RDD of dicts, preview it, and save it as CSV.

    Args:
        dictsArray: RDD whose elements are flat dicts (one dict per row).
        fileName: output location handed to the DataFrame writer.
    """
    # Obtain the SQL entry point explicitly: no cell of this notebook defines
    # `sqlContext`, so relying on it breaks on a fresh kernel.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()

    df = spark.createDataFrame(dictsArray)
    # show() prints the preview itself and returns None, so it is not wrapped
    # in print() (that only added a stray "None" to the output).
    df.show()
    # Save data to csv file; coalesce(1) funnels all rows into one partition first.
    df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save(fileName)

In [34]:
# Key every line of the paper dataset by the entry number it belongs to.
papers = convertDataIntoIndexedTuples(lines, summedArray)


In [35]:
# Sanity check: the first 20 (entry_number, line) pairs.
print(papers.take(20))

[(1, '#index 1'), (1, '#* Book Review: Discover Linux'), (1, '#@ Marjorie Richardson'), (1, '#o -'), (1, '#t 1998'), (1, '#c Linux Journal'), (1, ''), (2, '#index 2'), (2, '#* MOSFET table look-up models for circuit simulation'), (2, '#@ '), (2, '#o '), (2, '#t 1984'), (2, '#c Integration, the VLSI Journal'), (2, ''), (3, '#index 3'), (3, '#* The verification of the protection mechanisms of high-level language machines'), (3, '#@ Virgil D. Gligor'), (3, '#o Univ. of Maryland, College Park'), (3, '#t 1984'), (3, '#c International Journal of Parallel Programming')]


In [36]:
# Reduce the paper lines into one record per paper.
# NOTE: despite its name, `authorDictsArray` holds paper records (it is built
# from `papers` with PAPER_KEYS_MAP); the name is kept because later cells use it.
authorDictsArray = convertDataIntoDicts(papers, PAPER_KEYS_MAP)



(2092356, {'papers_data': {'ref_ids': ['215579', '333683', '511383', '594375', '641666', '763878', '966860', '1056157'], 'paper_id': '2092356', 'title': 'Reliability prediction through system modeling', 'year': '2013'}, 'affiliations_data': {'paper_id': '2092356', 'affiliations': 'Dept of Computer Engg IIT(BHU) Varanasi, India;Reactor Safety Division Bhabha Atomic Research Centre Dept of Atomic Energy, Govt of India;Dept of Computer Engg IIT(BHU) Varanasi, India'}, 'paper_authors_data': {'paper_id': '2092356', 'authors': 'Lalit Kumar Singh;Gopika Vinod;A. K. Tripathi'}, 'publication_venues_data': {'paper_id': '2092356', 'publication_venue': 'ACM SIGSOFT Software Engineering Notes'}})


                                                                                

In [39]:
def mapPapersData(data):
    """Return the paper fields of one reduced record with ``ref_ids`` flattened.

    Args:
        data: reduced record containing a ``papers_data`` dict whose
            ``ref_ids`` entry is a list of strings.

    Returns:
        A new dict with the same fields as ``data["papers_data"]`` except that
        ``ref_ids`` is a single ``';'``-joined string. The input record is left
        untouched, so evaluating the mapping again gives the same result.
    """
    # Shallow copy: only `ref_ids` is replaced, and it is replaced on the copy.
    papers_data = dict(data["papers_data"])
    papers_data['ref_ids'] = ';'.join(papers_data['ref_ids'])
    return papers_data

# Split every reduced record into its four per-table dicts.
papers_d = authorDictsArray.map(mapPapersData)
affiliations_d = authorDictsArray.map(lambda record: record["affiliations_data"])
paper_authors_d = authorDictsArray.map(lambda record: record["paper_authors_data"])
publication_venues_d = authorDictsArray.map(lambda record: record["publication_venues_data"])

In [41]:
# Export the paper table (paper_id, ref_ids, title, year).
convertDictsArrayIntoCSVFile(papers_d, "./assets/papers_d6.csv")



+--------+--------------------+--------------------+----+
|paper_id|             ref_ids|               title|year|
+--------+--------------------+--------------------+----+
|      65|                    |Direct file organ...|1984|
|     130|                    |An introduction t...|1983|
|     195|317424;317425;317573|On solving almost...|1984|
|     260|                    |Connections betwe...|1984|
|     325|                    |Computers and pen...|1984|
|     390|                    |Relativizations c...|1984|
|     455|                    |On the optimum ch...|1984|
|     520|       318368;323493|All points addres...|1984|
|     585|                    |Optimum Head Sepa...|1984|
|     650|                    |A parallel-design...|1984|
|     715|                    |Computer - IEEE C...|1984|
|     780|318420;319233;319...|Experience with G...|1984|
|     845|                    |Code generation a...|1984|
|     910|                    |On estimating acc...|1984|
|     975|6760

                                                                                

In [42]:
# Export the affiliations table (affiliations, paper_id).
convertDictsArrayIntoCSVFile(affiliations_d, "./assets/affiliations_d6.csv")

+--------------------+--------+
|        affiliations|paper_id|
+--------------------+--------+
|The Queen's Unive...|      65|
|Univ. of Karlsruh...|     130|
|AERE Harwell Labo...|     195|
|University of Mic...|     260|
|Oslo politikammer...|     325|
|Harvard Univ., Ca...|     390|
|Cornell Univ., It...|     455|
|IBM General Techn...|     520|
|               -;-;-|     585|
|New York Univ., N...|     650|
|                   -|     715|
|Xerox Palo Alto R...|     780|
|Univ. of Californ...|     845|
|University of Bol...|     910|
|AT & T Bell Labor...|     975|
|Cornell Univ., It...|    1040|
|University of Mar...|    1105|
|Laboratoire de Ps...|    1170|
|Yale Univ., New H...|    1235|
|                 -;-|    1300|
+--------------------+--------+
only showing top 20 rows

None


                                                                                

In [43]:
# Export the paper-authors table (authors, paper_id).
convertDictsArrayIntoCSVFile(paper_authors_d, "./assets/paper_authors_d6.csv")

+--------------------+--------+
|             authors|paper_id|
+--------------------+--------+
| K Devine;F J. Smith|      65|
|J Wolff von Guden...|     130|
|J. K. Reid;A. Jen...|     195|
|William G. Golson...|     260|
|    Stein Schjolberg|     325|
|W Ian Gasarch;Ste...|     390|
|Sam Toueg;Özalp B...|     455|
|Frederick H. Dill...|     520|
|A. R. Calderbank;...|     585|
|         Uzi Vishkin|     650|
|      Stephen S. Yau|     715|
|Michael D. Schroe...|     780|
|         S L. Graham|     845|
|D Maio;M R. Scala...|     910|
|         Pamela Zave|     975|
|G. Salton;E. Voor...|    1040|
|Douglas D. Dunlop...|    1105|
|Patrick Peruch;Vi...|    1170|
| Robert J. Sternberg|    1235|
|Curtis Roads;John...|    1300|
+--------------------+--------+
only showing top 20 rows

None


                                                                                

In [44]:
# Export the publication-venues table (paper_id, publication_venue).
convertDictsArrayIntoCSVFile(publication_venues_d, "./assets/publication_venues_d6.csv")

+--------+--------------------+
|paper_id|   publication_venue|
+--------+--------------------+
|      65|Information Techn...|
|     130|Proc. of the symp...|
|     195|ACM Transactions ...|
|     260|Information and C...|
|     325|Computers and pen...|
|     390|Information and C...|
|     455|SIAM Journal on C...|
|     520|IBM Journal of Re...|
|     585|Journal of the AC...|
|     650|Theoretical Compu...|
|     715|            Computer|
|     780|ACM Transactions ...|
|     845|Methods and tools...|
|     910|Information Proce...|
|     975|ACM Transactions ...|
|    1040|Information Proce...|
|    1105|ACM Transactions ...|
|    1170|Proc. of the 2nd ...|
|    1235|Proc. of the inte...|
|    1300|Foundations of co...|
+--------+--------------------+
only showing top 20 rows

None


                                                                                