In [None]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from py2neo import Graph, Node, Relationship
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Connect to the Neo4j graph database.
# Connection settings are read from the environment so credentials do not have
# to live in the notebook; the fallbacks are the values this notebook used
# originally, so it keeps working when the variables are unset.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://neo4j-allquanta:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "myneo")

graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the counts once and reuse them for both figures in the message.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

In [None]:
# Dataset window. Papers are selected with start_year <= q.year <=
# end_year - years_to_use, while the per-year columns below are generated for
# every year in [start_year, end_year].
years_to_use = 3
start_year = 2009
end_year = 2013

print("Getting dataset...", end=" ")
column_years = range(start_year, end_year + 1)

# One column per year, c<year>: the number of :CITES relationships pointing at
# the paper from Quanta nodes of that year, or NULL when that year is earlier
# than the paper's own year.
cites_str = ',\n    '.join(
    'CASE WHEN {yr} < q.year THEN NULL '
    'ELSE SIZE((q)<-[:CITES]-(:Quanta {{year: {yr}}})) END as c{yr}'.format(yr=yr)
    for yr in column_years)

# One column per year, tspr<year>, read from the q.tspr<year> property.
tspr_str = ',\n    '.join(
    'q.tspr{yr} as tspr{yr}'.format(yr=yr) for yr in column_years)

# The venue filter restricts the dataset to papers published in Nature.
# This note must stay outside the query text: Cypher comments start with `//`,
# so a `#` remark inside the string makes the server reject the query.
query = """
MATCH (q:Quanta)
WHERE 
    (q.doctype='Journal') AND 
    (q.lang='en') AND 
    EXISTS(q.fos) AND 
    (q.year>={} AND q.year <= {}) AND
    q.venue='Nature'
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    {},
    {}
LIMIT 1000
""".format(start_year, end_year-years_to_use, tspr_str, cites_str)
print(query)

# Run the query and report how long the round trip took.
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))

In [None]:
# Turn the task into binary classification: class 1 marks papers whose
# tspr2013 value is at or above the 95th percentile, class 0 everything else.
#
# The quantile is taken on the tspr2013 column alone. Calling quantile on the
# whole frame also visits the string columns (title, id), which raises on
# pandas versions where numeric_only defaults to False.
threshold = df['tspr2013'].quantile(0.95)

# Vectorised comparison instead of a row-wise apply. A missing tspr2013
# compares False here, so such papers get class 0 rather than being counted
# among the top 5%.
df['class'] = (df['tspr2013'] >= threshold).astype(int)


In [None]:
# For each paper (row), report whether every column holds a value.
# `NaN` is not a defined name, and `in` on a Series tests index labels rather
# than values, so missing data is detected with notna() and reduced per row.
df.notna().all(axis=1)