## Converting our data from CSV to DGL
DGL requires a format of data. If we use this format directly, we can use a lot of the functionality 
it has to offer, without writing monotonous repetitive functions, breaking the DRY principle of software engineering.
This also helps us use the data without any modification to it

Our dataset will be called `author_data`<br>
 - **edges.csv**: consists of all the edges in our graph<br>
 - **meta.yaml**: consists of the metadata of our actual data, this will help dgl know we are using a similar data format <br>
 - **nodes.csv**: consists of all the nodes and all of their features <br>

In [1]:
import pandas as pd
import numpy as np

In [2]:
author_id = pd.read_csv("../data/author_id.csv")
author_id_to_number = {author_id: idx for idx, author_id in enumerate(author_id['Author'])}
author_id.Author

0      authorID_7c252_ab334_fb8fd_88e82_42c49
1      authorID_02cca_3803b_564ed_e11cc_f9f30
2      authorID_f10d9_1a759_6bf5a_67735_79ff1
3      authorID_36790_ecd55_c2030_dc553_685be
4      authorID_5cf4e_26bd3_d87da_5e03f_80a43
                        ...                  
328    authorID_81b8a_03f97_e8787_c53fe_1a86b
329    authorID_8f1f6_4db81_c40ea_10e1e_9080c
330    authorID_96442_94ac4_ffb30_91eef_01219
331    authorID_ff5a1_ae012_afa5d_4c889_c50ad
332    authorID_69f59_c273b_6e669_ac32a_6dd5e
Name: Author, Length: 333, dtype: object

In [3]:
edge_data = pd.read_csv("../data/co_author_relation.csv")
edge_data.Author1 = edge_data.Author1.map(author_id_to_number) 
edge_data.Author2 = edge_data.Author2.map(author_id_to_number) 
edge_data.rename(columns={"Author1": "src_id", "Author2":"dst_id"},inplace=True)
edge_data.to_csv("../data/author_data/edges.csv", index=False)

In [4]:
node_data = pd.read_csv("../data/author_coauthor_features.csv")
node_data.drop_duplicates(keep="first",inplace=True)
node_data.Author = node_data.Author.map(author_id_to_number)
node_data.rename(columns={"Author":"node_id"},inplace=True)
nodes = node_data['node_id'].to_list()
missing_elements = [item for item in range(min(nodes), max(nodes)) if item not in nodes]
for i in range(len(missing_elements)):
    node_data.loc[len(node_data) + i] = missing_elements[i]

for i in range(len(missing_elements)):
    node_data.iloc[len(node_data) - len(missing_elements) + 2: ,1:] = 0

node_data

Unnamed: 0,node_id,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,...,Feature215,Feature216,Feature217,Feature218,Feature219,Feature220,Feature221,Feature222,Feature223,Feature224
0,179,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,1,0,1,0
16,240,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51,149,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
59,272,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
92,34,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
346,296,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,305,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
350,309,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
352,327,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:


node_data['feat'] = node_data[node_data.columns[1:]].apply(
    lambda x: ', '.join(x.dropna().astype(str)),
    axis=1
)
node_data.drop(columns=[i for i in node_data.columns[1:-1]],inplace=True)
node_data = node_data.reset_index()
node_data.drop(columns=["index"],inplace=True)
df_len = len(node_data)
node_data.to_csv("../data/author_data/nodes.csv",index=False)
node_data

Unnamed: 0,node_id,feat
0,179,"0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0..."
1,240,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,149,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0..."
3,272,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0..."
4,34,"0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0..."
...,...,...
326,296,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
327,305,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
328,309,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
329,327,"0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
