# Data parsing

In [69]:
# /**
# *                   _oo0oo_
# *                  o8888888o
# *                  88" . "88
# *                  (| -_- |)
# *                  0\  =  /0
# *                ___/`---'\___
# *              .' \\|     |// '.
# *             / \\|||  :  |||// \
# *            / _||||| -:- |||||- \
# *           |   | \\\  -  /// |   |
# *           | \_|  ''\---/''  |_/ |
# *           \  .-\__  '-'  ___/-. /
# *         ___'. .'  /--.--\  `. .'___
# *      ."" '<  `.___\_<|>_/___.' >' "".
# *     | | :  `- \`.;`\ _ /`;.`/ - ` : | |
# *     \  \ `_.   \_ __\ /__ _/   .-` /  /
# * =====`-.____`.___ \_____/___.-`___.-'=====
# *                   `=---='
# *
# *
# * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# *
# *   Buddha blesses your code to be bug free
# */

This notebook parses the raw data from the [AMiner dataset](https://www.aminer.org/aminernetwork). The text files are parsed by applying a cumulative sum over the file's lines to group the lines of each entry together; the result is then split into several CSV files which are later used to create the dataframes.

In [70]:
import findspark
findspark.init()

import csv
import re

import numpy as np
import pandas as pd
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *

from helpers import createDFFromFileAndSchema, saveDFIntoCSVFolder, moveFileToCorrectFolder

In [71]:
# Build (or reuse) the Spark entry points shared by every cell below.
conf = SparkConf().setAppName('Read& parse text data file in pyspark')
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext.getOrCreate(sc)
# `spark` is passed to createDFFromFileAndSchema further down but was never
# defined anywhere in this notebook, so a fresh kernel failed there with a
# NameError. getOrCreate() reuses the SparkContext created above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [72]:
# Folder prefix under which the CSV output folders are written.
PATH_TO_ASSETS = './assets/parsedData/'

# Maps the first token of a paper dataset line to the output field name
# (looked up by splitIntoKeyValue).
# NOTE(review): "abstract" is mapped here, but convertPaperFeatures has no
# branch that stores it, so abstracts do not reach any CSV.
PAPER_KEYS_MAP = {
    "#index": "paper_id",
    "#*": "title",
    "#@": "authors",
    "#o": "affiliations",
    "#t": "year",
    "#c": "publication_venue",
    "#%": "ref_ids",
    "#!": "abstract",
}
PAPER_DATASET_PATH = './assets/AMiner-Paper.txt'

# Maps the first token of an author dataset line to the output field name.
# Note that "#t" means "research_interests" here but "year" for papers.
AUTHOR_KEYS_MAP = {
    "#index": "author_id",
    "#n": "name",
    "#pc": "paper_count",
    "#cn": "citation_count",
    "#t": "research_interests",
    "#hi": "h_index", 
}
AUTHOR_DATASET_PATH = './assets/AMiner-Author.txt'

# Input file and schema description for the author-to-paper relation.
AUTHOR_2_PAPER_DATASET_PATH = './assets/AMiner-Author2Paper.txt'
AUTHOR_2_PAPER_SCHEMA_PATH = './schemas/paper_author_id.csv'

In [73]:
### Read the file and zip each line into a tuple together with its index
def zipDatasetWithIndex(datasetPath):
    """Load a text file through the shared SparkContext and index its lines.

    Args:
        datasetPath: path of the text file to read.

    Returns:
        The RDD produced by `textFile(...).zipWithIndex()`, i.e. pairs of
        (line text, line index).
    """
    return sc.textFile(datasetPath).zipWithIndex()

### Here we apply a cumulative sum to assign every line of the file to its data entry
1. Zero array: create an array of 0s whose length equals the number of lines in the file.
2. Find the indexes of the lines starting with `#index`.
3. Set the zero array to 1 at those indexes.
4. Apply a cumulative sum to this array: every line then carries the number of the entry it belongs to.

In [74]:
def exchangeElementAtIndexToOne(index, zeroArray):
    """Mark position `index` of `zeroArray` with a 1, in place.

    Args:
        index: position to mark.
        zeroArray: mutable sequence/array that is modified.

    Returns:
        None; the change is visible through `zeroArray`.
    """
    zeroArray[index] = 1

def doCumSumForIndexRows(lines):
    """Compute, for every line of the file, the number of the entry it belongs to.

    A line that starts with "#index" opens a new entry. The result is the
    cumulative count of such lines, so all lines of the first entry get 1,
    all lines of the second entry get 2, and so on (lines before the first
    "#index" line get 0).

    Args:
        lines: RDD of (line text, line index) pairs.

    Returns:
        numpy integer array with one element per line.
    """
    # Only lines that *start* with the marker open an entry. The previous
    # substring test (`"#index" in text`) also fired on lines that merely
    # mention "#index" in their value, which shifted every later entry number.
    startRows = (
        lines.filter(lambda x: x[0].startswith("#index"))
        .map(lambda x: x[1])
        .collect()
    )
    entryStartFlags = np.zeros(lines.count(), dtype=int)
    # Explicit integer dtype keeps the index array valid when no start rows exist.
    entryStartFlags[np.asarray(startRows, dtype=np.intp)] = 1
    # Running total of entry starts == entry number of each line.
    return np.cumsum(entryStartFlags)

In [40]:
### These functions are used to gather the text lines for each data entry
def createBroarcastedTuple(x, arrayOfStartRowIndexes):
    """Re-key one indexed line by the entry number stored for its line index.

    Args:
        x: pair whose first item is the line text and second item the line index.
        arrayOfStartRowIndexes: object exposing `.value`, an indexable
            collection holding the entry number of every line.

    Returns:
        Tuple of (entry number, line text).
    """
    lineText = x[0]
    lineIndex = x[1]
    entryNumber = arrayOfStartRowIndexes.value[lineIndex]
    return (entryNumber, lineText)

def convertDataIntoIndexedTuples(datasetLines, cumSummedIndexRowsArray):
    """Turn (line text, line index) pairs into (entry number, line text) pairs.

    Args:
        datasetLines: RDD of (line text, line index) pairs.
        cumSummedIndexRowsArray: entry number of every line, indexed by line index.

    Returns:
        RDD of (entry number, line text) pairs.
    """
    # Broadcast the lookup array once instead of capturing it in the closure.
    entryNumbers = sc.broadcast(cumSummedIndexRowsArray)

    def tagWithEntryNumber(dataLine):
        return createBroarcastedTuple(dataLine, entryNumbers)

    return datasetLines.map(tagWithEntryNumber)

In [41]:
### This function is used to split string into key and value
def list_get(l, x, d=None):
    """Safe getter of list values.

    Returns `l[x]`, or the default `d` when the index does not exist or the
    stored value is falsy (e.g. an empty string).
    """
    try:
        value = l[x]
    except IndexError:
        # The previous lambda indexed unconditionally, so a line holding only
        # a key (no value after it) raised IndexError instead of using `d`.
        return d
    return value if value else d


def splitIntoKeyValue(stringToSplit, keyMap):
    """Split a raw dataset line into a one-item dict {mapped key: value}.

    Args:
        stringToSplit: raw line such as "#index 42". Anything that is not a
            `str` (for example an already reduced dict) is returned unchanged.
        keyMap: dict mapping the line's first token to the output field name.

    Returns:
        `{mappedKey: value}` where value is the text after the first space
        ('' when there is none); `{}` for an empty line or a first token that
        is not in `keyMap`; the input itself when it is not a string.
    """
    if type(stringToSplit) is not str:
        return stringToSplit
    if len(stringToSplit) == 0:
        return {}
    # Split only on the first space: the value itself may contain spaces.
    parts = stringToSplit.split(" ", 1)
    mappedKey = keyMap.get(list_get(parts, 0))
    if not mappedKey:
        return {}
    return {mappedKey: list_get(parts, 1, '')}

def appendStringOrListIntoList(lst, elToAppend):
    """Add a string, or every item of a list, to `lst` in place.

    Args:
        lst: list that receives the new element(s); it is modified in place.
        elToAppend: a `str` (appended as one item), an iterable of items
            (each item is added), or None (nothing happens).

    Returns:
        `lst` itself, for convenience.
    """
    if elToAppend is None:
        return lst
    if type(elToAppend) == str:
        lst.append(elToAppend)
    else:
        # extend() mutates `lst` like the str branch does. The previous
        # `lst = lst + elToAppend` only rebound the local name, so callers
        # that ignore the return value never saw the added items.
        lst.extend(elToAppend)
    return lst

In [42]:
### These functions are used to convert the strings with paper features for a data entry
### into a dictionary with mapped key and value

### Paper related
def convertPaperFeatures(dct, affiliations_data=None, papers_data=None, paper_authors_data=None, publication_venues_data=None, refs_data=None):
    """Store one parsed paper feature into the per-table dicts, in place.

    Args:
        dct: parsed feature, normally a one-item dict such as {"title": "..."}.
        affiliations_data, papers_data, paper_authors_data,
        publication_venues_data, refs_data: dicts collecting the fields of
            the respective output table; they are modified in place. An
            omitted dict is replaced by a throw-away one.

    Behaviour by key (first truthy key wins):
        paper_id          -> copied into all five dicts
        affiliations      -> affiliations_data
        ref_ids           -> added to the list refs_data["ref_ids"]
        authors           -> paper_authors_data
        title, year       -> papers_data
        publication_venue -> publication_venues_data
    Any other key is ignored.
    """
    # Fresh dicts per call: the former `={}` defaults were shared between
    # calls and got mutated below, leaking one paper's fields into the next.
    affiliations_data = {} if affiliations_data is None else affiliations_data
    papers_data = {} if papers_data is None else papers_data
    paper_authors_data = {} if paper_authors_data is None else paper_authors_data
    publication_venues_data = {} if publication_venues_data is None else publication_venues_data
    refs_data = {} if refs_data is None else refs_data

    if dct.get("paper_id"):
        paper_id = dct.get("paper_id")
        # Every output table is keyed by the paper id.
        for table in (affiliations_data, papers_data, paper_authors_data, publication_venues_data, refs_data):
            table["paper_id"] = paper_id
    elif dct.get("affiliations"):
        affiliations_data["affiliations"] = dct.get("affiliations")
    elif dct.get("ref_ids"):
        refIds = dct.get("ref_ids")
        # setdefault avoids a KeyError when the caller's dict has no list yet.
        collectedRefs = refs_data.setdefault("ref_ids", [])
        if type(refIds) == str:
            collectedRefs.append(refIds)
        else:
            collectedRefs.extend(refIds)
    elif dct.get("authors"):
        paper_authors_data["authors"] = dct.get("authors")
    elif dct.get("title"):
        papers_data["title"] = dct.get("title")
    elif dct.get("year"):
        papers_data["year"] = dct.get("year")
    elif dct.get("publication_venue"):
        publication_venues_data["publication_venue"] = dct.get("publication_venue")
        
# Paper related
def _isPaperAccumulator(feature):
    """Tell whether `feature` is a dict already produced by reducePaperFeaturesToDict."""
    # Test for the key, not its truthiness: an accumulator whose papers_data
    # is still empty is an accumulator all the same.
    return isinstance(feature, dict) and "papers_data" in feature


def _mergePaperAccumulators(target, other):
    """Fold the accumulator `other` into the accumulator `target`, in place."""
    for section in ("papers_data", "affiliations_data", "paper_authors_data", "publication_venues_data"):
        target[section].update(other.get(section) or {})
    otherRefs = other.get("refs_data") or {}
    targetRefs = target["refs_data"]
    targetRefs.setdefault("ref_ids", []).extend(otherRefs.get("ref_ids") or [])
    if otherRefs.get("paper_id"):
        targetRefs["paper_id"] = otherRefs["paper_id"]


def reducePaperFeaturesToDict(featureA='', featureB='', keyMap={}):
    """Reducer that merges the lines of one paper entry into per-table dicts.

    Each argument may be a raw dataset line (str) or a dict previously
    returned by this function. reduceByKey merges partial results of
    different partitions with the same function, so an accumulator can
    arrive on either side. The previous version only recognised one as
    `featureA` (and only when its papers_data was non-empty) and silently
    dropped it otherwise.

    Args:
        featureA: raw line or accumulator.
        featureB: raw line or accumulator.
        keyMap: dict mapping line prefixes to field names (read only).

    Returns:
        Dict with the keys "papers_data", "affiliations_data",
        "paper_authors_data", "publication_venues_data" and "refs_data".
        When `featureA` is an accumulator it is updated in place and returned.
        NOTE(review): when partial results are merged, the order of
        refs_data["ref_ids"] follows the merge order, which may differ from
        the file order.

    Raises:
        Re-raises any exception after printing both inputs.
    """
    try:
        splittedA = splitIntoKeyValue(featureA, keyMap)
        splittedB = splitIntoKeyValue(featureB, keyMap)

        if _isPaperAccumulator(splittedA):
            accumulator = splittedA
        else:
            # First reduce step for this key: start an accumulator from line A.
            accumulator = {
                "papers_data": {},
                "affiliations_data": {},
                "paper_authors_data": {},
                "publication_venues_data": {},
                "refs_data": { "ref_ids": [] },
            }
            convertPaperFeatures(
                splittedA,
                accumulator["affiliations_data"],
                accumulator["papers_data"],
                accumulator["paper_authors_data"],
                accumulator["publication_venues_data"],
                accumulator["refs_data"],
            )

        if _isPaperAccumulator(splittedB):
            _mergePaperAccumulators(accumulator, splittedB)
        else:
            convertPaperFeatures(
                splittedB,
                accumulator["affiliations_data"],
                accumulator["papers_data"],
                accumulator["paper_authors_data"],
                accumulator["publication_venues_data"],
                accumulator["refs_data"],
            )
        return accumulator
    except Exception as error:
        print("ERROR: ", featureA, featureB, error)
        raise

In [43]:
### These functions are used to convert the strings with author features for a data entry
### into a dictionary with mapped key and value

# Author related
def convertAuthorFeatures(dct, authors_data=None, research_interests_data=None):
    """Store one parsed author feature into the per-table dicts, in place.

    Args:
        dct: parsed feature, normally a one-item dict such as {"name": "..."}.
        authors_data: dict collecting the author fields; modified in place.
        research_interests_data: dict collecting the research interests;
            modified in place.
        An omitted dict is replaced by a throw-away one.

    Behaviour by key (first truthy key wins):
        author_id          -> copied into both dicts
        research_interests -> research_interests_data
        name, paper_count, citation_count, h_index, publication_venue
                           -> authors_data
    Any other key is ignored.
    """
    # Fresh dicts per call: the former `={}` defaults were shared between
    # calls and got mutated below.
    authors_data = {} if authors_data is None else authors_data
    research_interests_data = {} if research_interests_data is None else research_interests_data

    if dct.get("author_id"):
        author_id = dct.get("author_id")
        authors_data["author_id"] = author_id
        research_interests_data["author_id"] = author_id
    elif dct.get("name"):
        authors_data["name"] = dct.get("name")
    elif dct.get("paper_count"):
        authors_data["paper_count"] = dct.get("paper_count")
    elif dct.get("citation_count"):
        authors_data["citation_count"] = dct.get("citation_count")
    elif dct.get("research_interests"):
        research_interests_data["research_interests"] = dct.get("research_interests")
    elif dct.get("h_index"):
        authors_data["h_index"] = dct.get("h_index")
    elif dct.get("publication_venue"):
        # NOTE(review): AUTHOR_KEYS_MAP maps nothing to "publication_venue",
        # so this branch is not reached with that key map.
        authors_data["publication_venue"] = dct.get("publication_venue")
        
# Author related
def _isAuthorAccumulator(feature):
    """Tell whether `feature` is a dict already produced by reduceAuthorFeaturesToDict."""
    # Test for the key, not its truthiness: an accumulator whose authors_data
    # is still empty is an accumulator all the same.
    return isinstance(feature, dict) and "authors_data" in feature


def reduceAuthorFeaturesToDict(featureA='', featureB='', keyMap={}):
    """Reducer that merges the lines of one author entry into per-table dicts.

    Each argument may be a raw dataset line (str) or a dict previously
    returned by this function. reduceByKey merges partial results of
    different partitions with the same function, so an accumulator can
    arrive on either side. The previous version only recognised one as
    `featureA` and silently dropped it otherwise.

    Args:
        featureA: raw line or accumulator.
        featureB: raw line or accumulator.
        keyMap: dict mapping line prefixes to field names (read only).

    Returns:
        Dict with the keys "authors_data" and "research_interests_data".
        When `featureA` is an accumulator it is updated in place and returned.

    Raises:
        Re-raises any exception after printing both inputs.
    """
    try:
        splittedA = splitIntoKeyValue(featureA, keyMap)
        splittedB = splitIntoKeyValue(featureB, keyMap)

        if _isAuthorAccumulator(splittedA):
            accumulator = splittedA
        else:
            # First reduce step for this key: start an accumulator from line A.
            accumulator = {
                "authors_data": {},
                "research_interests_data": {},
            }
            convertAuthorFeatures(splittedA, accumulator["authors_data"], accumulator["research_interests_data"])

        if _isAuthorAccumulator(splittedB):
            # Both sides are partial results: fold B's fields into A.
            accumulator["authors_data"].update(splittedB.get("authors_data") or {})
            accumulator["research_interests_data"].update(splittedB.get("research_interests_data") or {})
        else:
            convertAuthorFeatures(splittedB, accumulator["authors_data"], accumulator["research_interests_data"])
        return accumulator
    except Exception as error:
        print("ERROR: ", featureA, featureB, error)
        raise

In [44]:
def convertDataIntoDicts(objects, dataKeyMap, reducer):
    """Reduce all values of each key into one object and drop the keys.

    Args:
        objects: pair RDD of (key, value).
        dataKeyMap: passed to `reducer` as its third argument.
        reducer: callable `(a, b, dataKeyMap)` merging two values of one key.

    Returns:
        RDD holding only the reduced values.

    Side effect:
        Prints the reduced pair with the highest key.
    """
    def mergeTwo(left, right):
        return reducer(left, right, dataKeyMap)

    perKey = objects.reduceByKey(mergeTwo)
    highestKeyPair = perKey.sortByKey(ascending=False).first()
    print(highestKeyPair)
    # Keep only the reduced value of every (key, value) pair.
    return perKey.map(lambda pair: pair[1])

def convertDictsArrayIntoCSVFile(dictsArray, folderName, pathToFolder):
    """Turn an RDD of dicts into a DataFrame, preview it and save it as CSV.

    Args:
        dictsArray: RDD of dicts, one dict per output row.
        folderName: name of the output folder.
        pathToFolder: prefix prepended as-is to `folderName`.

    The data is coalesced to a single partition and written with a header
    row to `pathToFolder + folderName`.
    NOTE(review): the schema is inferred by Spark from the dicts; rows that
    lack a key other rows have may need an explicit schema - confirm.
    """
    df = sqlContext.createDataFrame(dictsArray)
    # show() prints the preview itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the cell output.
    df.show()
    # Save data to csv file
    df.coalesce(1).write \
        .format("com.databricks.spark.csv") \
        .option("header", "true") \
        .save(f'{pathToFolder}{folderName}')

## Convert the papers dataset using above functions

In [45]:
# Read the paper dataset as (line text, line index) pairs.
paperTextLines = zipDatasetWithIndex(PAPER_DATASET_PATH)

                                                                                

In [46]:
# Entry number of every line: cumulative count of the "#index" lines seen so far.
summedArrayOfPaperIndexRows = doCumSumForIndexRows(paperTextLines)

                                                                                

In [47]:
# Pair every line's text with the number of the entry it belongs to.
paperItemTuples = convertDataIntoIndexedTuples(paperTextLines, summedArrayOfPaperIndexRows)

In [48]:
# Merge the lines of each paper entry into one dict of per-table records
# (the call also prints the entry with the highest key).
paperDictsArray = convertDataIntoDicts(paperItemTuples, PAPER_KEYS_MAP, reducePaperFeaturesToDict)



(2092356, {'papers_data': {'paper_id': '2092356', 'title': 'Reliability prediction through system modeling', 'year': '2013'}, 'affiliations_data': {'paper_id': '2092356', 'affiliations': 'Dept of Computer Engg IIT(BHU) Varanasi, India;Reactor Safety Division Bhabha Atomic Research Centre Dept of Atomic Energy, Govt of India;Dept of Computer Engg IIT(BHU) Varanasi, India'}, 'paper_authors_data': {'paper_id': '2092356', 'authors': 'Lalit Kumar Singh;Gopika Vinod;A. K. Tripathi'}, 'publication_venues_data': {'paper_id': '2092356', 'publication_venue': 'ACM SIGSOFT Software Engineering Notes'}, 'refs_data': {'ref_ids': ['215579', '333683', '511383', '594375', '641666', '763878', '966860', '1056157'], 'paper_id': '2092356'}})


                                                                                

In [49]:
def mapPapersData(data):
    """Build the paper_refs row of one reduced paper entry.

    Args:
        data: reduced paper dict containing a "refs_data" dict whose
            "ref_ids" value is a list of id strings.

    Returns:
        A new dict equal to `data["refs_data"]` except that "ref_ids" is the
        ids joined with ';' ('' when there are none).
    """
    refs_data = data["refs_data"]
    # Return a copy instead of overwriting refs_data['ref_ids'] in place: the
    # in-place version changed the source record, so evaluating the same
    # record a second time joined the characters of the already joined string.
    return {**refs_data, 'ref_ids': ';'.join(refs_data.get('ref_ids') or [])}

# Project each reduced paper entry onto the record of one output table.
papers_d = paperDictsArray.map(lambda x: x["papers_data"])
affiliations_d = paperDictsArray.map(lambda x: x["affiliations_data"])
paper_authors_d = paperDictsArray.map(lambda x: x["paper_authors_data"])
publication_venues_d = paperDictsArray.map(lambda x: x["publication_venues_data"])
# References additionally get their id list joined into one ';'-separated string.
paper_refs_d = paperDictsArray.map(lambda x:mapPapersData(x))

In [None]:
# Write every paper-derived RDD into its own CSV folder under PATH_TO_ASSETS.
for csvFolder, csvRows in (
    ("papers", papers_d),
    ("affiliations", affiliations_d),
    ("paper_authors", paper_authors_d),
    ("publication_venues", publication_venues_d),
    ("paper_refs", paper_refs_d),
):
    convertDictsArrayIntoCSVFile(csvRows, csvFolder, PATH_TO_ASSETS)

                                                                                

+--------+--------------------+----+
|paper_id|               title|year|
+--------+--------------------+----+
|      65|Direct file organ...|1984|
|     130|An introduction t...|1983|
|     195|On solving almost...|1984|
|     260|Connections betwe...|1984|
|     325|Computers and pen...|1984|
|     390|Relativizations c...|1984|
|     455|On the optimum ch...|1984|
|     520|All points addres...|1984|
|     585|Optimum Head Sepa...|1984|
|     650|A parallel-design...|1984|
|     715|Computer - IEEE C...|1984|
|     780|Experience with G...|1984|
|     845|Code generation a...|1984|
|     910|On estimating acc...|1984|
|     975|A distributed alt...|1985|
|    1040|A comparison of t...|1984|
|    1105|Generalizing spec...|1985|
|    1170|Real time graphic...|1984|
|    1235|Common and uncomm...|1984|
|    1300|Foundations of co...|1985|
+--------+--------------------+----+
only showing top 20 rows

None




In [None]:
### Retrieve the files into a correct folder
# Same helper call for each of the paper output folders written above.
for csvFolder in ('papers', 'affiliations', 'paper_authors', 'publication_venues', 'paper_refs'):
    moveFileToCorrectFolder(csvFolder, PATH_TO_ASSETS)

## Convert the authors dataset using above functions

In [53]:
## Authors
# Read the author dataset as (line text, line index) pairs.
authorTextLines = zipDatasetWithIndex(AUTHOR_DATASET_PATH)

                                                                                

In [54]:
# Entry number of every line: cumulative count of the "#index" lines seen so far.
summedArrayOfAuthorIndexRows = doCumSumForIndexRows(authorTextLines)

                                                                                

In [55]:
# Pair every line's text with the number of the entry it belongs to.
authorItemTuples = convertDataIntoIndexedTuples(authorTextLines, summedArrayOfAuthorIndexRows)

In [57]:
# Merge the lines of each author entry into one dict of per-table records
# (the call also prints the entry with the highest key).
authorDictsArray = convertDataIntoDicts(authorItemTuples, AUTHOR_KEYS_MAP, reduceAuthorFeaturesToDict)

                                                                                

(1712433, {'authors_data': {'author_id': '1712433', 'name': 'Andrea Gantchev', 'paper_count': '2', 'citation_count': '3', 'h_index': '1'}, 'research_interests_data': {'author_id': '1712433', 'research_interests': 'subsumption architecture;Subsumption ArchitectureThe subsumption architecture;software architecture;subsumption architectureReusable Strategies;Object-oriented design;object-oriented software design;Rodney Brooks;Software Agents;behaviour-based control;different micro-strategies'}})


In [59]:
# Project each reduced author entry onto its two output tables:
# the author fields and the research interests.
authors_d = authorDictsArray.map(lambda x: x["authors_data"])
research_interests_d = authorDictsArray.map(lambda x: x["research_interests_data"])

In [None]:
# Write both author-derived RDDs into their own CSV folder under PATH_TO_ASSETS.
for csvFolder, csvRows in (
    ("authors", authors_d),
    ("research_interests", research_interests_d),
):
    convertDictsArrayIntoCSVFile(csvRows, csvFolder, PATH_TO_ASSETS)

In [None]:
### Retrieve the files into a correct folder
# Same helper call for each of the author output folders written above.
for csvFolder in ('authors', 'research_interests'):
    moveFileToCorrectFolder(csvFolder, PATH_TO_ASSETS)

## Convert the author-id-2-paper-id dataset

In [62]:
# Column layout of the author-to-paper file; every column is nullable.
# NOTE(review): this `schema` is not passed to createDFFromFileAndSchema
# below, which is given AUTHOR_2_PAPER_SCHEMA_PATH instead; the printed
# schema there also shows author_position as integer, not string.
schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("author_id", IntegerType(), True),
    StructField("paper_id", IntegerType(), True),
    StructField("author_position", StringType(), True),
])


In [63]:
# readFileA2P = spark.read.options(delimiter='\t').csv(AUTHOR_2_PAPER_DATASET_PATH, header=False,schema=schema)

# Load the tab-separated author-to-paper file using the schema description file.
# NOTE(review): the trailing False is presumably the "has header" flag
# (cf. the commented-out reader above) - confirm against helpers.
author_id_2_paper_id_df = createDFFromFileAndSchema(
    spark,
    AUTHOR_2_PAPER_DATASET_PATH,
    AUTHOR_2_PAPER_SCHEMA_PATH,
    '\t',
    False,
)

File path: ./assets/AMiner-Author2Paper.txt, schema path: ./schemas/paper_author_id.csv
Types from schema: [('index', 'Integer'), ('author_id', 'Integer'), ('paper_id', 'Integer'), ('author_position', 'Integer')]


In [64]:
# Check the column names and types the DataFrame was loaded with.
author_id_2_paper_id_df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- author_id: integer (nullable = true)
 |-- paper_id: integer (nullable = true)
 |-- author_position: integer (nullable = true)



In [65]:
# Preview the first rows of the author-to-paper relation.
author_id_2_paper_id_df.show()

[Stage 77:>                                                         (0 + 1) / 1]

+-----+---------+--------+---------------+
|index|author_id|paper_id|author_position|
+-----+---------+--------+---------------+
|    1|   381617|       1|              1|
|    2|   630546|       3|              1|
|    3|   112127|       4|              1|
|    4|    96116|       4|              2|
|    5|   578328|       5|              1|
|    6|   865779|       5|              2|
|    7|   669143|       5|              3|
|    8|   533344|       6|              1|
|    9|   621167|       7|              1|
|   10|   522333|       7|              2|
|   11|   597188|       7|              3|
|   12|  1396373|       8|              1|
|   13|  1644597|       8|              2|
|   14|   798283|       8|              3|
|   15|   371951|       9|              1|
|   16|   378500|      10|              1|
|   17|   117256|      11|              1|
|   18|   562284|      12|              1|
|   19|     1224|      13|              1|
|   20|  1223056|      14|              1|
+-----+----

                                                                                

In [67]:
### Change the delimiter to comma, add headers and save the csv
# Write the relation to the 'paper_author_id' folder under PATH_TO_ASSETS ...
saveDFIntoCSVFolder(author_id_2_paper_id_df, 'paper_author_id', PATH_TO_ASSETS)
# ... then apply the same post-processing helper used for the other outputs.
moveFileToCorrectFolder('paper_author_id', PATH_TO_ASSETS)

# readFileA2P.coalesce(1).write.option("header",True).option("delimiter",",").csv("paper-author.csv")


                                                                                