# PART 4: Prepare Author and Institution Data For Admin Import

This notebook formats the author and institution nodes, together with the `writes` and `affiliated_with` relationships, into CSV files for Neo4j admin import.

Unlike the paper nodes prepared in Part 3, these elements do not have any properties. As a result, this notebook runs much faster and does not require a chunking methodology (at least on my machine).

In [1]:
from ogb.lsc import MAG240MDataset
import numpy as np
import pandas as pd
import dask.dataframe as dd

In [2]:
# Directory handed to MAG240MDataset as its root; the CSV files written by
# the cells below are placed underneath it as well.
ROOT_DATA_DIR = "/data"
dataset = MAG240MDataset(root=ROOT_DATA_DIR)

### ID Offsets

To create a universal long id for nodes, we will offset the ogb index values for authors and institutions. This avoids id collisions across node labels.  

Note that Neo4j also allows for the partitioning of unique id by label: https://neo4j.com/docs/operations-manual/current/tools/neo4j-admin/neo4j-admin-import/#import-tool-id-spaces. My decision to offset ids this way to achieve a global node index is merely personal preference. 

In [3]:
# Author ids start at 10B; institution ids start a further 10B above that,
# i.e. at 20B.
AUTHOR_ID_OFFSET = 10 ** 10
INSTITUTION_ID_OFFSET = 2 * AUTHOR_ID_OFFSET

## Prepare Author Data

In [4]:
# Load the author -> paper edge index as a two-column frame (one row per
# edge) and split it into 100 dask partitions.
writes_edges = pd.DataFrame(
    data=dataset.edge_index('author', 'paper').T,
    columns=[":START_ID", ":END_ID"],
)
writes_ddf = dd.from_pandas(writes_edges, npartitions=100)
# Drop the temporary name so the notebook namespace only keeps the dask frame.
del writes_edges

In [5]:
# Shift the author index (start id) into the author id range.
# NOTE: this updates writes_ddf in place, so re-running this cell adds the
# offset a second time; rebuild writes_ddf first before re-running it.
writes_ddf[':START_ID'] += AUTHOR_ID_OFFSET
# Materialise the whole frame to display it.
writes_ddf.compute()

Unnamed: 0,:START_ID,:END_ID
0,10000000000,17776550
1,10000000000,22232787
2,10000000000,22359844
3,10000000000,34644458
4,10000000000,59079951
...,...,...
386022715,10122383107,83933134
386022716,10122383108,99252845
386022717,10122383109,99252845
386022718,10122383110,99252845


In [6]:
# Write the `writes` edge list (to test area), one CSV file per partition.
# Only the first file carries the header row and the frame index is omitted.
# The scheduler is passed through `compute_kwargs` because the bare
# `scheduler=` keyword of dask's `to_csv` is deprecated and triggers a warning.
writes_ddf.to_csv(
    f'{ROOT_DATA_DIR}/demo-load/writes-*.csv',
    header_first_partition_only=True,
    index=False,
    compute_kwargs={"scheduler": "processes"},
)

  warn(


['/data/demo-load/writes-00.csv',
 '/data/demo-load/writes-01.csv',
 '/data/demo-load/writes-02.csv',
 '/data/demo-load/writes-03.csv',
 '/data/demo-load/writes-04.csv',
 '/data/demo-load/writes-05.csv',
 '/data/demo-load/writes-06.csv',
 '/data/demo-load/writes-07.csv',
 '/data/demo-load/writes-08.csv',
 '/data/demo-load/writes-09.csv',
 '/data/demo-load/writes-10.csv',
 '/data/demo-load/writes-11.csv',
 '/data/demo-load/writes-12.csv',
 '/data/demo-load/writes-13.csv',
 '/data/demo-load/writes-14.csv',
 '/data/demo-load/writes-15.csv',
 '/data/demo-load/writes-16.csv',
 '/data/demo-load/writes-17.csv',
 '/data/demo-load/writes-18.csv',
 '/data/demo-load/writes-19.csv',
 '/data/demo-load/writes-20.csv',
 '/data/demo-load/writes-21.csv',
 '/data/demo-load/writes-22.csv',
 '/data/demo-load/writes-23.csv',
 '/data/demo-load/writes-24.csv',
 '/data/demo-load/writes-25.csv',
 '/data/demo-load/writes-26.csv',
 '/data/demo-load/writes-27.csv',
 '/data/demo-load/writes-28.csv',
 '/data/demo-l

In [7]:
# Derive the author node list from the `writes` edges: drop the paper id
# column, keep one row per distinct author id, spread the result over 100
# partitions, and rename the column to the node id header.
authors_ddf = (
    writes_ddf
    .drop([":END_ID"], axis=1)
    .drop_duplicates(subset=[":START_ID"])
    .repartition(npartitions=100)
    .rename(columns={":START_ID": "ogbIndex:ID"})
)
# Materialise the deduplicated frame to display it.
authors_ddf.compute()

Unnamed: 0,ogbIndex:ID
0,10000000000
13,10000000001
27,10000000002
190,10000000003
212,10000000004
...,...
386022715,10122383107
386022716,10122383108
386022717,10122383109
386022718,10122383110


In [8]:
# Write the author nodes, one CSV file per partition. Only the first file
# carries the header row and the frame index is omitted.
# The scheduler is passed through `compute_kwargs` because the bare
# `scheduler=` keyword of dask's `to_csv` is deprecated and triggers a warning.
authors_ddf.to_csv(
    f'{ROOT_DATA_DIR}/demo-load/authors-*.csv',
    header_first_partition_only=True,
    index=False,
    compute_kwargs={"scheduler": "processes"},
)

['/data/demo-load/authors-00.csv',
 '/data/demo-load/authors-01.csv',
 '/data/demo-load/authors-02.csv',
 '/data/demo-load/authors-03.csv',
 '/data/demo-load/authors-04.csv',
 '/data/demo-load/authors-05.csv',
 '/data/demo-load/authors-06.csv',
 '/data/demo-load/authors-07.csv',
 '/data/demo-load/authors-08.csv',
 '/data/demo-load/authors-09.csv',
 '/data/demo-load/authors-10.csv',
 '/data/demo-load/authors-11.csv',
 '/data/demo-load/authors-12.csv',
 '/data/demo-load/authors-13.csv',
 '/data/demo-load/authors-14.csv',
 '/data/demo-load/authors-15.csv',
 '/data/demo-load/authors-16.csv',
 '/data/demo-load/authors-17.csv',
 '/data/demo-load/authors-18.csv',
 '/data/demo-load/authors-19.csv',
 '/data/demo-load/authors-20.csv',
 '/data/demo-load/authors-21.csv',
 '/data/demo-load/authors-22.csv',
 '/data/demo-load/authors-23.csv',
 '/data/demo-load/authors-24.csv',
 '/data/demo-load/authors-25.csv',
 '/data/demo-load/authors-26.csv',
 '/data/demo-load/authors-27.csv',
 '/data/demo-load/au

## Prepare Institution Data

In [9]:
# Load the author -> institution edge index as a two-column frame (one row
# per edge) and split it into 100 dask partitions.
affiliation_edges = pd.DataFrame(
    data=dataset.edge_index('author', 'institution').T,
    columns=[":START_ID", ":END_ID"],
)
affiliated_with_ddf = dd.from_pandas(affiliation_edges, npartitions=100)
# Drop the temporary name so the notebook namespace only keeps the dask frame.
del affiliation_edges

In [10]:
# Shift the author index (start id) and the institution index (end id) into
# their respective id ranges.
# NOTE: this updates affiliated_with_ddf in place, so re-running this cell
# adds both offsets a second time; rebuild affiliated_with_ddf first.
affiliated_with_ddf[':START_ID'] += AUTHOR_ID_OFFSET
affiliated_with_ddf[':END_ID'] += INSTITUTION_ID_OFFSET
# Materialise the whole frame to display it.
affiliated_with_ddf.compute()

Unnamed: 0,:START_ID,:END_ID
0,10000000002,20000000000
1,10000000003,20000000000
2,10000000004,20000000245
3,10000000009,20000000001
4,10000000010,20000000002
...,...,...
44592581,10122383075,20000000641
44592582,10122383082,20000000720
44592583,10122383086,20000000577
44592584,10122383094,20000004054


In [11]:
# Write the `affiliated_with` edge list (to test area), one CSV file per
# partition. Only the first file carries the header row and the frame index
# is omitted.
# The scheduler is passed through `compute_kwargs` because the bare
# `scheduler=` keyword of dask's `to_csv` is deprecated and triggers a warning.
affiliated_with_ddf.to_csv(
    f'{ROOT_DATA_DIR}/demo-load/affiliated_with-*.csv',
    header_first_partition_only=True,
    index=False,
    compute_kwargs={"scheduler": "processes"},
)

['/data/demo-load/affiliated_with-00.csv',
 '/data/demo-load/affiliated_with-01.csv',
 '/data/demo-load/affiliated_with-02.csv',
 '/data/demo-load/affiliated_with-03.csv',
 '/data/demo-load/affiliated_with-04.csv',
 '/data/demo-load/affiliated_with-05.csv',
 '/data/demo-load/affiliated_with-06.csv',
 '/data/demo-load/affiliated_with-07.csv',
 '/data/demo-load/affiliated_with-08.csv',
 '/data/demo-load/affiliated_with-09.csv',
 '/data/demo-load/affiliated_with-10.csv',
 '/data/demo-load/affiliated_with-11.csv',
 '/data/demo-load/affiliated_with-12.csv',
 '/data/demo-load/affiliated_with-13.csv',
 '/data/demo-load/affiliated_with-14.csv',
 '/data/demo-load/affiliated_with-15.csv',
 '/data/demo-load/affiliated_with-16.csv',
 '/data/demo-load/affiliated_with-17.csv',
 '/data/demo-load/affiliated_with-18.csv',
 '/data/demo-load/affiliated_with-19.csv',
 '/data/demo-load/affiliated_with-20.csv',
 '/data/demo-load/affiliated_with-21.csv',
 '/data/demo-load/affiliated_with-22.csv',
 '/data/dem

In [12]:
# Derive the institution node list from the `affiliated_with` edges: drop the
# author id column, keep one row per distinct institution id, and rename the
# column to the node id header.
institutions_ddf = (
    affiliated_with_ddf
    .drop([":START_ID"], axis=1)
    .drop_duplicates(subset=[":END_ID"])
    .rename(columns={":END_ID": "ogbIndex:ID"})
)
# Materialise the deduplicated frame to display it.
institutions_ddf.compute()

Unnamed: 0,ogbIndex:ID
0,20000000000
2,20000000245
3,20000000001
4,20000000002
7,20000000557
...,...
44405762,20000025715
44493495,20000025716
44502203,20000025717
44508070,20000025718


In [13]:
# Write the institution nodes, one CSV file per partition. Only the first
# file carries the header row and the frame index is omitted.
# The scheduler is passed through `compute_kwargs` because the bare
# `scheduler=` keyword of dask's `to_csv` is deprecated and triggers a warning.
institutions_ddf.to_csv(
    f'{ROOT_DATA_DIR}/demo-load/institution-*.csv',
    header_first_partition_only=True,
    index=False,
    compute_kwargs={"scheduler": "processes"},
)

['/data/demo-load/institution-0.csv']