In [1]:
import findspark
findspark.init()
import pandas as pd;
import csv;
import numpy as np;
from pyspark import SparkContext
from pyspark import SparkConf
import re

In [2]:
# Raw line tag -> output field name for records in the paper dataset.
PAPER_KEYS_MAP = {
    "#index": "id",
    "#*": "title",
    "#@": "authors",
    "#o": "affiliations",
    "#t": "year",
    "#c": "publication_venue",
    "#%": "ref_ids",
    "#!": "abstract",
}
PAPER_DATASET_PATH = './assets/AMiner-Paper.txt'

# Raw line tag -> output field name for records in the author dataset.
# Note that "#t" maps to a different field here than in PAPER_KEYS_MAP.
AUTHOR_KEYS_MAP = {
    "#index": "id",
    "#n": "name",
    "#a": "affiliations",
    "#pc": "paper_count",
    "#cn": "citation_count",
    "#t": "research_interests",
    "#hi": "h_index",
    "#upi": "u_p_index",
    "#pi": "p_index",
}
AUTHOR_DATASET_PATH = './assets/AMiner-Author.txt'

In [4]:
# Spark entry point shared by every cell below through the global `sc`.
# getOrCreate is used so that re-running this cell does not try to start a second context.
conf = SparkConf().setAppName('Read & parse text data file in pyspark')
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
def zipDatasetWithIndex(datasetPath):
    """Read a text file with the global ``sc`` and pair each line with its index.

    Args:
        datasetPath: path of the text file to read.

    Returns:
        The result of ``sc.textFile(datasetPath).zipWithIndex()``; the rest of
        the notebook reads each element as ``(line, index)``.
    """
    return sc.textFile(datasetPath).zipWithIndex()
# Indexed lines of the author dataset; every later cell works from this value.
lines = zipDatasetWithIndex(AUTHOR_DATASET_PATH)

                                                                                

In [6]:
def exchangeElementAtIndexToOne(index, zeroArray):
    """Set ``zeroArray[index]`` to 1 in place.

    Args:
        index: position to flag.
        zeroArray: mutable sequence or array that is modified directly.

    Returns:
        None; the change is visible through ``zeroArray``.
    """
    zeroArray[index] = 1

def doCumSumForIndexRows(lines):
    """Assign a running record number to every line of the dataset.

    A record starts on a line that begins with the ``#index`` tag. The result
    has one entry per line: the number of record starts seen up to and
    including that line, so all lines of the same record share one number.
    Lines before the first ``#index`` line get 0.

    Args:
        lines: RDD of ``(line, index)`` pairs where ``index`` is the line's
            position in the file.

    Returns:
        1-D numpy integer array of length ``lines.count()``.
    """
    # Prefix match, not substring match: a field value that merely contains
    # the text "#index" must not be mistaken for the start of a new record.
    pos = (
        lines
        .filter(lambda x: x[0].startswith("#index"))
        .map(lambda x: x[1])
        .collect()
    )
    zeroArray = np.zeros(lines.count(), dtype=int)
    # Flag all record-start rows in one vectorised assignment; the explicit
    # intp dtype keeps the indexing valid when no start row was found.
    zeroArray[np.asarray(pos, dtype=np.intp)] = 1
    # Cumulative sum turns the start flags into per-line record numbers.
    return np.cumsum(zeroArray)

In [7]:
# Per-line record numbers for the whole file, held on the driver as a numpy
# array (one entry per line), so it is inspected with slicing rather than .collect().
summedArray = doCumSumForIndexRows(lines)

                                                                                

In [8]:
def createBroarcastedTuple(x, arrayOfStartRowIndexes):
    """Re-key one indexed line by the record number stored for its row.

    Args:
        x: pair whose first item is the line and whose second item is the
            line's row index.
        arrayOfStartRowIndexes: object whose ``.value`` can be indexed by row
            index (the caller passes a Spark broadcast variable).

    Returns:
        ``(record_number, line)``.
    """
    recordNumber = arrayOfStartRowIndexes.value[x[1]]
    lineText = x[0]
    return (recordNumber, lineText)

def convertDataIntoIndexedTuples(datasetLines, cumSummedIndexRowsArray):
    """Key every line of the dataset by the record it belongs to.

    The per-line record numbers are broadcast through the global ``sc`` and
    each ``(line, index)`` element is mapped with ``createBroarcastedTuple``.

    Args:
        datasetLines: RDD of ``(line, index)`` pairs.
        cumSummedIndexRowsArray: record number for every line index.

    Returns:
        RDD of ``(record_number, line)`` pairs.
    """
    sharedRecordNumbers = sc.broadcast(cumSummedIndexRowsArray)

    def toKeyedLine(dataLine):
        return createBroarcastedTuple(dataLine, sharedRecordNumbers)

    return datasetLines.map(toKeyedLine)

In [9]:
def list_get(l, x, d=None):
    """Safe getter of list values.

    Args:
        l: list (or other indexable sequence) to read from.
        x: index to look up.
        d: default returned when there is no usable value.

    Returns:
        ``l[x]`` when that element exists and is truthy; otherwise ``d``. A
        falsy element (for example ``''``) is treated the same as a missing one.
    """
    try:
        value = l[x]
    except IndexError:
        # An out-of-range index is the case a "safe" getter exists for.
        return d
    return value if value else d

def splitIntoKeyValue(stringToSplit, keyMap):
    """Turn one raw ``"<tag> <value>"`` line into a single-entry dict.

    Args:
        stringToSplit: a raw text line. Any non-string value (for example a
            dict produced by an earlier merge) is returned unchanged.
        keyMap: mapping from raw tags such as ``"#index"`` to output field names.

    Returns:
        ``{field_name: value}`` when the line's tag is in ``keyMap``; the value
        is ``''`` when nothing follows the tag. ``{}`` for an empty line or a
        tag that ``keyMap`` does not map. The input itself when it is not a
        string.
    """
    if not isinstance(stringToSplit, str):
        return stringToSplit
    if not stringToSplit:
        return {}

    # partition always yields three parts, so a line that holds only a tag
    # (no space, no value) produces an empty value instead of an IndexError.
    tag, _, value = stringToSplit.partition(" ")

    # Lines whose tag is not in keyMap are dropped on purpose.
    mappedKey = keyMap.get(tag)
    if not mappedKey:
        return {}
    return {mappedKey: value}

def reduceFeaturesToDict(featureA='', featureB='', keyMap={}):
    """Merge two features of the same record into one dict.

    Each feature is passed through ``splitIntoKeyValue`` first, so it may be a
    raw line or an already-merged dict. On a shared field name the value from
    ``featureB`` wins.

    Args:
        featureA: first raw line or partial dict.
        featureB: second raw line or partial dict.
        keyMap: tag-to-field-name mapping handed to ``splitIntoKeyValue``.

    Returns:
        A new dict holding the fields of both features.

    Raises:
        Whatever the split or merge raises; the two inputs are printed first
        and the exception is then re-raised unchanged.
    """
    try:
        return {
            **splitIntoKeyValue(featureA, keyMap),
            **splitIntoKeyValue(featureB, keyMap),
        }
    except:
        print("ERROR: ", featureA, featureB)
        raise

def convertDataIntoDicts(objects, dataKeyMap):
    """Collapse the keyed lines of each record into one dict per record.

    Args:
        objects: RDD of ``(record_number, raw_line)`` pairs.
        dataKeyMap: mapping from raw tags to output field names.

    Returns:
        RDD with one dict per record number, mapping field name to value.
    """
    reducedObjects = (
        objects
        .reduceByKey(lambda a, b: reduceFeaturesToDict(a, b, dataKeyMap))
        # reduceByKey never calls the merge function for a key that has only
        # one value, so a one-line record would stay a raw string. Parsing the
        # result here turns it into a dict and leaves merged dicts unchanged.
        .mapValues(lambda value: splitIntoKeyValue(value, dataKeyMap))
    )
    # Debug aid: show the record with the highest record number.
    print(reducedObjects.sortByKey(ascending=False).first())
    mappedDicts = reducedObjects.map(lambda x: x[1])  # retrieve dicts from the tuples
    return mappedDicts

In [10]:
def convertDictsArrayIntoCSVFile(dictsArray, fileName, mode="error"):
    """Write an RDD of dicts as CSV with a header row.

    Args:
        dictsArray: RDD of dicts, one per output row.
        fileName: output path. Spark's writer creates a directory at this path;
            ``coalesce(1)`` is used so the data is written from one partition.
        mode: save mode passed to the DataFrame writer. The default ``"error"``
            keeps the previous behaviour of failing when the path already
            exists; pass ``"overwrite"`` to make re-runs possible.
    """
    # Imported locally so the function does not depend on a `sqlContext`
    # global, which no cell in this notebook defines.
    from pyspark.sql import SparkSession

    # getOrCreate reuses the session/context that is already running.
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(dictsArray)
    # Save data to csv file using the built-in csv source.
    (
        df.coalesce(1)
        .write
        .format("csv")
        .mode(mode)
        .option("header", "true")
        .save(fileName)
    )

In [12]:
# NOTE: despite the name, `papers` holds author lines here, because `lines` was
# read from AUTHOR_DATASET_PATH. The name is kept because the next cell reads it.
papers = convertDataIntoIndexedTuples(lines, summedArray)

In [13]:
# One dict per author record; the call also prints the record with the highest record number.
authorDictsArray = convertDataIntoDicts(papers, AUTHOR_KEYS_MAP)



(1712433, {'id': '1712433', 'name': 'Andrea Gantchev', 'affiliations': '', 'paper_count': '2', 'citation_count': '3', 'h_index': '1', 'p_index': '1.0000', 'u_p_index': '0.8333', 'research_interests': 'subsumption architecture;Subsumption ArchitectureThe subsumption architecture;software architecture;subsumption architectureReusable Strategies;Object-oriented design;object-oriented software design;Rodney Brooks;Software Agents;behaviour-based control;different micro-strategies'})


                                                                                

In [14]:
# Export the parsed author records as CSV under ./assets/authors.csv.
convertDictsArrayIntoCSVFile(authorDictsArray, "./assets/authors.csv")

                                                                                