## Data preprocessing on the given Neo4j dataset
The dump file has been loaded into its own Neo4j server, which is running in the background.

#### Importing needed libraries

In [3]:
import os

import numpy as np
import pandas as pd
from py2neo import Graph,Node,Relationship

In [4]:


# Connection settings for the Neo4j server. Every value can be overridden
# through an environment variable so that credentials do not have to be edited
# into the notebook; the fallbacks are the local values this notebook has been
# using, so running it without any environment set up behaves as before.
HOST = os.environ.get("NEO4J_HOST", "localhost")
PORT = os.environ.get("NEO4J_PORT", "7687")
url = f"neo4j://{HOST}:{PORT}"
USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): the fallback password is only acceptable for a throwaway local
# instance -- set NEO4J_PASSWORD for anything shared or long-lived.
PASSWD = os.environ.get("NEO4J_PASSWORD", "12345678")
graph = Graph(url, auth=(USERNAME, PASSWD))

#### Running a query to confirm that the connection is active

In [5]:
# Connectivity smoke test: ask the server how many nodes the graph contains.
query = "MATCH (n) RETURN COUNT(n)"

# Printing the returned cursor renders the single-row result as a text table.
print(graph.query(query))

 COUNT(n) 
----------
      347 



In [6]:

# Fetch both endpoint Author nodes of every directed CO_AUTHORED relationship.
# The query has no placeholders, so a plain (non-f) string is used.
query = """
MATCH (a1:Author)-[r:CO_AUTHORED]->(a2:Author) RETURN a1,a2
"""

# Materialise the cursor so the rows can be iterated more than once below;
# each row is accessed by the returned names "a1" and "a2".
cursor = graph.run(query)
result = cursor.data()

#### Transforming the data into a pandas dataframe, which will make the data much easier to work with.
An edge between two nodes (authors) signifies that those two authors have co-authored with each other, so the pairs of co-authors can be read directly from the edges.

In [7]:
# One (author, co-author) id pair per CO_AUTHORED relationship returned above.
edges = [(row['a1']['author_id'], row['a2']['author_id']) for row in result]

# Every distinct author id that appears at either end of an edge.
authors = {author_id for pair in edges for author_id in pair}

# Sort before building the frame: iteration order of a set of strings can
# change from one kernel run to the next, which would make the exported author
# list non-reproducible. key=str keeps the sort safe if an id is missing (None).
author_df = pd.DataFrame(sorted(authors, key=str), columns=['Author'])
edges_df = pd.DataFrame(edges, columns=['Author1', 'Author2'])
edges_df.head()

Unnamed: 0,Author1,Author2
0,authorID_9a049_b03f6_fc40b_fcf2f_13632,authorID_84a50_92e4a_5b6fe_968fd_523fb
1,authorID_9a049_b03f6_fc40b_fcf2f_13632,authorID_8b940_be7fb_78aaa_6b656_7dd7a
2,authorID_9a049_b03f6_fc40b_fcf2f_13632,authorID_2c7d5_490e6_05083_6f8f2_f0d49
3,authorID_9a049_b03f6_fc40b_fcf2f_13632,authorID_1be00_34108_2e25c_4e251_ca671
4,authorID_9a049_b03f6_fc40b_fcf2f_13632,authorID_d874e_4e4a5_df211_73b0f_83e31


In [8]:
# Number of FeatureN properties read from each Author node (Feature1..Feature224).
NUM_FEATURES = 224

# Build one row per CO_AUTHORED edge: the author's id, the author's features
# (Feature1..Feature224) and the co-author's features (Feature225..Feature448).
#
# NOTE(review): the co-author columns were missing from this loop, but the
# saved output of this cell lists Feature225..Feature448 interleaved with
# Feature1..Feature224 (with values that differ between rows of the same
# author), and the frame is exported as "author_coauthor_features.csv".
# Storing a2's features at an offset of NUM_FEATURES reproduces that layout --
# confirm this is the intended schema.
author_feature_data = []
for row in result:
    author_node = row['a1']
    coauthor_node = row['a2']
    author_data = {
        "Author": author_node['author_id']
    }
    for i in range(1, NUM_FEATURES + 1):
        # Insert the pair together so the column order matches the saved
        # output (Feature1, Feature225, Feature2, Feature226, ...).
        author_data[f'Feature{i}'] = author_node[f'Feature{i}']
        author_data[f'Feature{i + NUM_FEATURES}'] = coauthor_node[f'Feature{i}']
    author_feature_data.append(author_data)


df = pd.DataFrame(author_feature_data)
df.head()

Unnamed: 0,Author,Feature1,Feature225,Feature2,Feature226,Feature3,Feature227,Feature4,Feature228,Feature5,...,Feature220,Feature444,Feature221,Feature445,Feature222,Feature446,Feature223,Feature447,Feature224,Feature448
0,authorID_9a049_b03f6_fc40b_fcf2f_13632,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
1,authorID_9a049_b03f6_fc40b_fcf2f_13632,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
2,authorID_9a049_b03f6_fc40b_fcf2f_13632,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,1,0,0
3,authorID_9a049_b03f6_fc40b_fcf2f_13632,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
4,authorID_9a049_b03f6_fc40b_fcf2f_13632,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0


In [9]:
# Export the three frames as CSV files (without the pandas index column).
DATA_DIR = "../data"

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing; exist_ok avoids an error on re-runs.
os.makedirs(DATA_DIR, exist_ok=True)

author_df.to_csv(f"{DATA_DIR}/author_id.csv", index=False)
edges_df.to_csv(f"{DATA_DIR}/co_author_relation.csv", index=False)
df.to_csv(f"{DATA_DIR}/author_coauthor_features.csv", index=False)