# Part 1: Prepare Data For Non-Graph ("Flat") Modeling

In [1]:
from ogb.lsc import MAG240MDataset
import numpy as np
import os
import pandas as pd

## Notebook Setup

Root directory for data storage. It will be used in the following parts as well.

In [2]:
# Single place to configure where the dataset and derived files are stored.
ROOT_DATA_DIR: str = "/data"

In [3]:
# Make sure the data root exists before anything tries to write into it.
# - os.path.isdir (rather than os.path.exists) so that a regular file sitting at
#   this path is not mistaken for a usable directory; os.makedirs will then raise
#   FileExistsError instead of the problem surfacing later.
# - os.makedirs (rather than os.mkdir) so that missing parent directories are
#   created too; exist_ok=True tolerates the directory appearing in the meantime.
if os.path.isdir(ROOT_DATA_DIR):
    print(f'Directory {ROOT_DATA_DIR} already exists')
else:
    os.makedirs(ROOT_DATA_DIR, exist_ok=True)
    print(f'Created new directory: {ROOT_DATA_DIR}')

Directory /data already exists


### Get the Dataset Object
The dataset object handles downloading and easy access to the data and its features. The dataset object leverages [numpy memmap](https://numpy.org/doc/stable/reference/generated/numpy.memmap.html) functionality to reference large pieces of the dataset on disk, so it does not need to load all of the features into memory at once. For more information, please see the [OGB MAG240M Page](https://ogb.stanford.edu/kddcup2021/mag240m/).

__Note: This command takes a while in the *first* run (several hours to a day)__ as the source data needs to be downloaded from OGB. Subsequent runs should be near-instantaneous, though.


In [4]:
# Build the dataset handle, with all of its files kept under ROOT_DATA_DIR.
dataset = MAG240MDataset(root=ROOT_DATA_DIR)

## Examine Data Splitting and Labels

Only a fraction of the papers (the arXiv papers) are labeled. An `idx_split` object is provided with indexes mapping the labeled papers to training, validation, and test sets. As we will see below, the test sets have their labels hidden for the purposes of the earlier competition. More information on the data and labeling process can be found at the [OGB MAG240M Page](https://ogb.stanford.edu/kddcup2021/mag240m/).

In [5]:
# Get the index arrays for the arXiv paper data splits.
# split_dict maps a split name ('train', 'valid', 'test-whole', 'test-dev',
# 'test-challenge') to the paper indexes that belong to that split.
split_dict = dataset.get_idx_split()

In [6]:
# Report how many paper indexes each split contains.
for split_name, split_indexes in split_dict.items():
    print('------------------')
    print(f'{split_name} index size = {len(split_indexes)}')

------------------
train index size = 1112392
------------------
valid index size = 138949
------------------
test-whole index size = 146818
------------------
test-dev index size = 88092
------------------
test-challenge index size = 58726


In [7]:
# Note that we only have known labels in the train and validate sets.
# A value of -1 implies a hidden label.
for split_name, split_indexes in split_dict.items():
    # Pull the labels for this split once and reuse them for both the sample
    # and the count, instead of indexing dataset.paper_label a second time.
    paper_labels = dataset.paper_label[split_indexes]
    print(f'Paper labels for the "{split_name}" set:')
    print('--------')
    print(f'Sample values = {paper_labels[:5]}')
    # np.count_nonzero counts the True entries in one vectorized pass; the
    # builtin sum() would walk the array one element at a time in Python.
    print(f'Number non-missing = {np.count_nonzero(paper_labels > -1)}')
    print('============================\n')

Paper labels for the "train" set:
--------
Sample values = [17. 29. 38.  5.  1.]
Number non-missing = 1112392

Paper labels for the "valid" set:
--------
Sample values = [140. 129.  33.  59.  24.]
Number non-missing = 138949

Paper labels for the "test-whole" set:
--------
Sample values = [-1. -1. -1. -1. -1.]
Number non-missing = 0

Paper labels for the "test-dev" set:
--------
Sample values = [-1. -1. -1. -1. -1.]
Number non-missing = 0

Paper labels for the "test-challenge" set:
--------
Sample values = [-1. -1. -1. -1. -1.]
Number non-missing = 0



## Building a DataFrame for Supervised Model Testing

We will use the 'train' and 'valid' sets for pre-graph supervised model analysis,
since they are the only ones with labels.

In [8]:
# Training split: one row per paper, one column per encoding dimension,
# followed by the split tag, the subject label and the publication year.
feat_cols = [f'paper_encoding_{dim}' for dim in range(768)]
train_idx = split_dict['train']
paper_df_train = pd.DataFrame(data=dataset.paper_feat[train_idx], columns=feat_cols)
paper_df_train['split_segment'] = 'TRAIN'
paper_df_train['paper_subject'] = dataset.paper_label[train_idx]
paper_df_train['paper_year'] = dataset.paper_year[train_idx]

In [9]:
# Validation split: same column layout as the training frame
# (reuses feat_cols defined alongside the training set).
valid_idx = split_dict['valid']
paper_df_validate = pd.DataFrame(data=dataset.paper_feat[valid_idx], columns=feat_cols)
paper_df_validate['split_segment'] = 'VALIDATE'
paper_df_validate['paper_subject'] = dataset.paper_label[valid_idx]
paper_df_validate['paper_year'] = dataset.paper_year[valid_idx]

In [10]:
# Stack the training rows on top of the validation rows.
# Each part keeps its own 0-based row index, so index values repeat across the
# two segments; use 'split_segment' to tell the rows apart.
paper_df = pd.concat(objs=[paper_df_train, paper_df_validate], axis=0)
paper_df

Unnamed: 0,paper_encoding_0,paper_encoding_1,paper_encoding_2,paper_encoding_3,paper_encoding_4,paper_encoding_5,paper_encoding_6,paper_encoding_7,paper_encoding_8,paper_encoding_9,...,paper_encoding_761,paper_encoding_762,paper_encoding_763,paper_encoding_764,paper_encoding_765,paper_encoding_766,paper_encoding_767,split_segment,paper_subject,paper_year
0,0.438477,0.211060,0.393311,0.055969,-0.078003,-0.017807,0.553223,-0.319824,0.394043,0.502930,...,-0.052490,1.092773,0.157227,-1.467773,-1.590820,0.328613,0.332275,TRAIN,17.0,2014
1,0.468994,-0.202637,0.023331,0.535645,0.496582,0.024368,0.239990,0.539551,0.460449,0.078491,...,-0.132812,1.125977,0.368164,-0.191406,-0.378418,0.031616,-0.311523,TRAIN,29.0,2014
2,0.047485,-0.398682,-0.420410,0.882324,-0.114685,0.607910,0.151001,0.124695,-0.012108,-0.005211,...,-0.130127,-0.121155,0.790527,-0.147827,-0.451904,0.516602,-0.135986,TRAIN,38.0,2015
3,-0.395508,-0.464355,-0.336670,-0.156616,-0.396240,-0.449951,-0.033630,0.393066,0.552246,-0.076782,...,0.149780,1.133789,0.386230,0.066162,0.760742,0.355469,-0.658691,TRAIN,5.0,2005
4,0.103210,-0.125122,0.039490,0.651855,0.279053,0.020828,0.325439,-0.004528,0.264404,0.178101,...,0.056824,0.499023,0.038788,0.906250,-0.623047,-0.119080,0.394043,TRAIN,1.0,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138944,-0.254883,-0.069885,-0.821289,1.201172,-0.639160,-0.368164,0.802246,-0.076355,0.324219,0.030365,...,0.553223,0.697754,0.369629,1.799805,-0.534180,-0.112244,-0.230713,VALIDATE,51.0,2019
138945,0.667480,-0.046448,0.194214,0.251953,0.003784,0.495361,0.756348,-0.065125,-0.071777,0.123657,...,-0.436279,1.187500,0.360596,-1.391602,-0.752930,-0.068970,0.195923,VALIDATE,12.0,2019
138946,0.660645,-0.515137,-0.776367,0.222412,-1.073242,0.049652,0.335205,0.281982,1.385742,0.360840,...,-0.331787,-0.043549,0.609863,0.025223,0.232422,0.211304,0.060333,VALIDATE,18.0,2019
138947,0.427246,-0.276855,-0.203857,0.391113,-0.368896,-0.091003,1.030273,0.415039,0.506836,0.121399,...,-0.198730,1.151367,0.054382,-0.266113,-0.600098,0.258057,0.411377,VALIDATE,72.0,2019


In [11]:
# Write to Parquet so we do not need to repeat this process.
# index=True stores the DataFrame index alongside the columns.
# The path is built with os.path.join so it stays well-formed even if
# ROOT_DATA_DIR is configured with a trailing separator.
paper_df.to_parquet(
    os.path.join(ROOT_DATA_DIR, "ogb-labeled-papers.parquet"),
    engine='fastparquet',
    index=True,
)