In [1]:
import findspark
findspark.init()
import pandas as pd;
import csv;
import numpy as np;
from pyspark import SparkContext
from pyspark import SparkConf
import re
from pyspark.sql.types import StringType, ArrayType,StructType,StructField

In [2]:
# Name the Spark application, then reuse an already-running SparkContext if
# there is one (getOrCreate) instead of failing on a second initialisation.
conf = SparkConf().setAppName('Read & parse text data file in pyspark')
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Line prefix used in the paper dataset -> field name used in this notebook.
PAPER_KEYS_MAP = dict([
    ("#index", "paper_id"),
    ("#*", "title"),
    ("#@", "authors"),
    ("#o", "affiliations"),
    ("#t", "year"),
    ("#c", "publication_venue"),
    ("#%", "ref_ids"),
    ("#!", "abstract"),
])
PAPER_DATASET_PATH = './assets/AMiner-Paper.txt'

# Line prefix used in the author dataset -> field name used in this notebook.
AUTHOR_KEYS_MAP = dict([
    ("#index", "id"),
    ("#n", "name"),
    ("#a", "affiliations"),
    ("#pc", "paper_count"),
    ("#cn", "citation_count"),
    ("#t", "research_interests"),
    ("#hi", "h_index"),
    ("#upi", "u_p_index"),
    ("#pi", "p_index"),
])
AUTHOR_DATASET_PATH = './assets/AMiner-Author.txt'

In [4]:
def zipDatasetWithIndex(datasetPath):
    """Read a text file through the module-level SparkContext `sc` and pair
    every line with its index.

    Args:
        datasetPath: path handed to `sc.textFile`.

    Returns:
        The RDD produced by `textFile(...).zipWithIndex()`; its elements are
        used downstream as `(line, row_number)` pairs.
    """
    return sc.textFile(datasetPath).zipWithIndex()
# Build the indexed line RDD for the paper dataset. Printing the RDD shows only
# its handle (e.g. "PythonRDD[3] at RDD ..."), not the rows themselves.
lines = zipDatasetWithIndex(PAPER_DATASET_PATH)
print(lines)

                                                                                

PythonRDD[3] at RDD at PythonRDD.scala:53


In [5]:
def exchangeElementAtIndexToOne(index, zeroArray):
    """Set `zeroArray[index]` to 1 in place.

    Args:
        index: position to flag.
        zeroArray: mutable, indexable container (a NumPy array in this
            notebook) that is modified directly.

    Returns:
        None; the change is visible through `zeroArray`.
    """
    zeroArray[index] = 1

def doCumSumForIndexRows(lines):
    """Compute, for every row, the running count of "#index" rows seen so far.

    A row whose text contains "#index" opens a new data entry, so the running
    count is the same for all rows of one entry and can be used as a grouping
    key.

    Args:
        lines: RDD-like object of `(text, row_number)` pairs supporting
            `filter`, `map`, `collect` and `count`.

    Returns:
        1-D integer NumPy array with one element per row; element `i` is the
        number of "#index" rows at positions `<= i` (0 for rows that precede
        the first "#index" row).
    """
    # Row numbers of the rows that start a data entry.
    startRows = lines.filter(lambda x: "#index" in x[0]).map(lambda x: x[1]).collect()
    startFlags = np.zeros(lines.count(), dtype=int)
    # Flag all start rows in one vectorized assignment instead of a Python
    # loop; the explicit integer dtype keeps an empty list a valid index.
    startFlags[np.asarray(startRows, dtype=int)] = 1
    # The cumulative sum turns the start flags into a per-row entry number.
    return np.cumsum(startFlags)

In [6]:
# Per-row entry number for the paper dataset. The result is a NumPy array held
# on the driver (not an RDD), so inspect it with slicing, e.g. summedArray[:10].
summedArray = doCumSumForIndexRows(lines)

                                                                                

In [7]:
def createBroarcastedTuple(x, arrayOfStartRowIndexes):
    """Re-key one indexed line by the value stored for its row.

    Args:
        x: pair whose element 0 is the line and element 1 is its row number.
        arrayOfStartRowIndexes: object exposing `.value`, an indexable
            sequence addressed by row number (a Spark broadcast in this
            notebook).

    Returns:
        `(arrayOfStartRowIndexes.value[row_number], line)`.
    """
    entryNumber = arrayOfStartRowIndexes.value[x[1]]
    return (entryNumber, x[0])

def convertDataIntoIndexedTuples(datasetLines, cumSummedIndexRowsArray):
    """Key every dataset line by its entry in `cumSummedIndexRowsArray`.

    The array is broadcast through the module-level SparkContext `sc`, and
    each element of `datasetLines` is passed to `createBroarcastedTuple`
    together with that broadcast.

    Args:
        datasetLines: RDD of `(line, row_number)` pairs.
        cumSummedIndexRowsArray: sequence indexed by row number.

    Returns:
        RDD of the tuples returned by `createBroarcastedTuple`.
    """
    entryNumbers = sc.broadcast(cumSummedIndexRowsArray)

    def keyByEntry(dataLine):
        return createBroarcastedTuple(dataLine, entryNumbers)

    return datasetLines.map(keyByEntry)

In [25]:
def list_get(l, x, d=None):
    """Safe getter for list values.

    Args:
        l: list (or other indexable sequence) to read from.
        x: index to read.
        d: value returned when nothing usable is found.

    Returns:
        `l[x]` when that element exists and is truthy; otherwise `d`. An
        out-of-range index yields `d` instead of raising `IndexError`.
    """
    try:
        value = l[x]
    except IndexError:
        # Without this guard a one-element list (e.g. a line that holds only
        # a key) would crash the "safe" getter.
        return d
    return value if value else d

def splitIntoKeyValue(stringToSplit, keyMap):
    """Parse one raw dataset line into a single-entry dict.

    The text before the first space is the raw key and is translated through
    `keyMap`; everything after that space is the value.

    Args:
        stringToSplit: raw line. Anything that is not a string (for example
            a dict produced by an earlier reduce step) is returned unchanged.
        keyMap: mapping from raw line prefixes to field names.

    Returns:
        `{mapped_key: value}` for a line whose prefix is in `keyMap` (value is
        `''` when the line holds only the key), `{}` for empty lines and
        unknown prefixes, or the input itself when it is not a string.
    """
    if not isinstance(stringToSplit, str):
        return stringToSplit
    # partition never raises on a line without a space, so a key-only line
    # such as "#!" yields an empty value instead of an IndexError.
    rawKey, _, value = stringToSplit.partition(" ")
    mappedKey = keyMap.get(rawKey) if rawKey else None
    if not mappedKey:
        return {}
    return {mappedKey: value}

def appendStringOrListIntoList(lst, elToAppend):
    """Add a string, or every item of an iterable, to `lst` in place.

    Args:
        lst: list to grow; it is modified directly in every case.
        elToAppend: `None` (nothing is added), a string (added as one
            element), or an iterable of elements (each one is added).

    Returns:
        `lst` itself, so the result is the same whether the caller uses the
        return value or keeps its own reference.
    """
    if elToAppend is None:
        return lst
    if isinstance(elToAppend, str):
        lst.append(elToAppend)
    else:
        # extend (rather than `lst + elToAppend`) mutates the caller's list,
        # matching what the string branch does.
        lst.extend(elToAppend)
    return lst

# Paper related
def convertFeatures(dct, affiliations_data=None, papers_data=None, paper_authors_data=None, publication_venues_data=None):
    """Route one parsed paper field into the per-table dicts, in place.

    Only the first field found in `dct` (checked in the order paper_id,
    affiliations, authors, title, year, publication_venue) is used, and only
    when its value is truthy. `paper_id` is copied into all four dicts; every
    other field goes to exactly one of them.

    Args:
        dct: parsed line, e.g. `{"title": "..."}`.
        affiliations_data: dict receiving `paper_id` and `affiliations`.
        papers_data: dict receiving `paper_id`, `title` and `year`.
        paper_authors_data: dict receiving `paper_id` and `authors`.
        publication_venues_data: dict receiving `paper_id` and
            `publication_venue`.

    Returns:
        None; results are written into the dicts that were passed in. A dict
        left out by the caller is replaced by a throwaway one, so nothing is
        shared between calls.
    """
    # Fresh dict per call: a `{}` default would be created once and then
    # mutated by every call that omits the argument.
    if affiliations_data is None:
        affiliations_data = {}
    if papers_data is None:
        papers_data = {}
    if paper_authors_data is None:
        paper_authors_data = {}
    if publication_venues_data is None:
        publication_venues_data = {}

    if dct.get("paper_id"):
        paper_id = dct.get("paper_id")
        # The id is the join key, so every output table gets a copy.
        for target in (affiliations_data, papers_data, paper_authors_data, publication_venues_data):
            target["paper_id"] = paper_id
    elif dct.get("affiliations"):
        affiliations_data["affiliations"] = dct.get("affiliations")
    elif dct.get("authors"):
        paper_authors_data["authors"] = dct.get("authors")
    elif dct.get("title"):
        papers_data["title"] = dct.get("title")
    elif dct.get("year"):
        papers_data["year"] = dct.get("year")
    elif dct.get("publication_venue"):
        publication_venues_data["publication_venue"] = dct.get("publication_venue")
        
# Paper related
def reduceFeaturesToDict(featureA='', featureB='', keyMap={}):
    """Combine two values belonging to one paper into the four per-table dicts.

    Each argument may be a raw dataset line (string) or the dict returned by
    an earlier call of this function; the two kinds can arrive in either
    position. Raw lines are parsed with `splitIntoKeyValue` and routed with
    `convertFeatures`; earlier results are merged bucket by bucket.

    Args:
        featureA: raw line or earlier result.
        featureB: raw line or earlier result.
        keyMap: raw-prefix -> field-name mapping passed to
            `splitIntoKeyValue` (only read, never modified here).

    Returns:
        Dict with the keys "papers_data", "affiliations_data",
        "paper_authors_data" and "publication_venues_data", each holding a
        newly created dict.

    Raises:
        Re-raises any exception after printing the two offending inputs.
    """
    groupNames = ("papers_data", "affiliations_data", "paper_authors_data", "publication_venues_data")
    merged = {groupName: {} for groupName in groupNames}
    try:
        for feature in (featureA, featureB):
            parsed = splitIntoKeyValue(feature, keyMap)
            if any(groupName in parsed for groupName in groupNames):
                # Earlier result: carry over all four buckets, whichever
                # side it arrived on and even if some buckets are empty.
                for groupName in groupNames:
                    merged[groupName].update(parsed.get(groupName) or {})
            else:
                convertFeatures(
                    parsed,
                    merged["affiliations_data"],
                    merged["papers_data"],
                    merged["paper_authors_data"],
                    merged["publication_venues_data"],
                )
        return merged
    except Exception as error:
        print("ERROR: ", featureA, featureB, error)
        raise

def convertDataIntoDicts(objects, dataKeyMap):
    """Collapse the keyed dataset lines into one result dict per key.

    Args:
        objects: pair RDD of `(entry_number, raw_line)`.
        dataKeyMap: raw-prefix -> field-name mapping forwarded to
            `reduceFeaturesToDict`.

    Returns:
        RDD of the dicts produced by `reduceFeaturesToDict`, one per key.

    Side effects:
        Prints the reduced value of the highest key, which runs a Spark job.
    """
    reducedObjects = objects.reduceByKey(lambda a, b: reduceFeaturesToDict(a, b, dataKeyMap))
    print(reducedObjects.sortByKey(ascending=False).first())
    # The reduce function is only called when a key has two or more values, so
    # a key with a single line is still a raw string here. Run those through
    # the reducer once so every element downstream is a dict.
    normalizedObjects = reducedObjects.mapValues(
        lambda value: value if isinstance(value, dict) else reduceFeaturesToDict(value, '', dataKeyMap)
    )
    mappedDicts = normalizedObjects.map(lambda x: x[1]) # retrieve dicts from the tuples
    return mappedDicts

In [30]:
def convertDictsArrayIntoCSVFile(dictsArray, fileName):
    """Turn an RDD of dicts into a DataFrame and save it as CSV with a header.

    Args:
        dictsArray: RDD (or other input accepted by `createDataFrame`) of
            dicts, one per output row.
        fileName: target path handed to the DataFrame writer's `save`.

    Returns:
        None.
    """
    # `sqlContext` is never created in this notebook, so obtain the session
    # explicitly; getOrCreate reuses the session/context that already exists.
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(dictsArray)
    # Save data to csv file (coalesce(1) first, so the data is written from a
    # single partition).
    df.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save(fileName)

In [31]:
# Key every line of the paper dataset by its entry number from `summedArray`.
papers = convertDataIntoIndexedTuples(lines, summedArray)


Py4JError: PythonRDD does not exist in the JVM

In [29]:
# NOTE(review): despite its name, this holds *paper* records (it is built from
# `papers` with PAPER_KEYS_MAP); later cells read it under this name.
authorDictsArray = convertDataIntoDicts(papers, PAPER_KEYS_MAP)

                                                                                

TypeError: 'NoneType' object cannot be interpreted as an integer

In [None]:
# Split the reduced records into one RDD per output table. These stay RDDs:
# nothing is collected to the driver here.
papers_d = authorDictsArray.map(lambda x: x["papers_data"])
affiliations_d = authorDictsArray.map(lambda x: x["affiliations_data"])

Exception in thread "serve RDD 57" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:458)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


In [None]:
# RDDs cannot be indexed with []; fetch just enough rows to the driver with
# take(n) and index the resulting list instead.
print(affiliations_d.take(2)[1])
print(papers_d.take(5)[4])

In [14]:
# print(authorDictsArray.toDF(['papers_data', 'affiliations_data'])["papers_data"].show())
# df1 = sqlContext.createDataFrame(papers_d)
# df2 = sqlContext.createDataFrame(affiliations_d)

KeyboardInterrupt: 

In [28]:
# show() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line.
# NOTE(review): the only assignment to `df1` in this notebook is commented out
# in the previous cell - confirm where it should come from.
df1.show()

+--------------------+--------------+-------+---+--------------------+-------+-----------+-------+--------------------+---------+
|        affiliations|citation_count|h_index| id|                name|p_index|paper_count|ref_ids|  research_interests|u_p_index|
+--------------------+--------------+-------+---+--------------------+-------+-----------+-------+--------------------+---------+
|                    |             0|      0| 17|     J. Michael Howe| 0.0000|          1|       |HIV disease;Inter...|   0.0000|
|University of Flo...|             0|      0| 34|        Haitham Gabr| 0.0000|          2|       |associate polynom...|   0.0000|
|University of Bat...|             4|      1| 51|         Emma Tonkin| 3.0000|          8|       |metadata element;...|   3.5000|
|School of Compute...|             1|      1| 68|        Woochul Shin| 0.5000|          4|       |Web Service;conte...|   0.7500|
|                    |             0|      0| 85|           S Improta| 0.0000|          1|

In [None]:
# Export the paper table (paper_id, title, year) as CSV.
convertDictsArrayIntoCSVFile(papers_d, "./assets/papers_d2.csv")

In [14]:
# Export the affiliation table (paper_id, affiliations) as CSV.
convertDictsArrayIntoCSVFile(affiliations_d, "./assets/affiliations_d2.csv")

                                                                                