In [1]:
import json
import time
import pandas as pd
from py2neo import Graph, Node, Relationship
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeClassifier

In [3]:
graph = Graph("bolt://neo4j-allquanta:7687", auth=('neo4j','myneo'))
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    graph.database.primitive_counts['NumberOfNodeIdsInUse'], 
    graph.database.primitive_counts['NumberOfRelationshipIdsInUse']))

Connected to graph database with 278,432,359 nodes and 1,844,501,832 relationships!


In [6]:
years_to_use = 3
start_year = 2009
end_year = 2013

print("Getting dataset...", end=" ")
cites_str = ',\n    '.join(['CASE WHEN {} < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {{year: {}}})) END as c{}'.format(
    yr, yr, yr) for yr in range(start_year, end_year+1)])
tspr_str = ',\n    '.join(['q.tspr{} as tspr{}'.format(
    yr, yr) for yr in range(start_year, end_year+1)])
query = """
MATCH (q:Quanta)
WHERE 
    (q.doctype='Journal') AND 
    (q.lang='en') AND 
    EXISTS(q.fos) AND 
    (q.year>={} AND q.year <= {}) AND
    q.venue='Nature'   # ADDED THIS LINE IN FOR NATURE ONLY VENUE
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    {},
    {}
LIMIT 1000
""".format(start_year, end_year-years_to_use, tspr_str, cites_str)
print(query)
query_start_time = time.time()
df = graph.run(query).to_data_frame()
print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))

Getting dataset... 
MATCH (q:Quanta)
WHERE 
    (q.doctype='Journal') AND 
    (q.lang='en') AND 
    EXISTS(q.fos) AND 
    (q.year>=2009 AND q.year <= 2010) AND
    q.venue='Nature'
RETURN
    q.year as year,
    q.title as title,
    q.id as id,
    q.tspr2009 as tspr2009,
    q.tspr2010 as tspr2010,
    q.tspr2011 as tspr2011,
    q.tspr2012 as tspr2012,
    q.tspr2013 as tspr2013,
    CASE WHEN 2009 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2009})) END as c2009,
    CASE WHEN 2010 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2010})) END as c2010,
    CASE WHEN 2011 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2011})) END as c2011,
    CASE WHEN 2012 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2012})) END as c2012,
    CASE WHEN 2013 < q.year THEN NULL ELSE SIZE((q)<-[:CITES]-(:Quanta {year: 2013})) END as c2013
LIMIT 1000

Done (0.72 minutes).


In [18]:
# appends class to top 5 percentile making into classification problem

threshold = df.quantile(0.95).tspr2013
df['class'] = df.apply(lambda row: 0 if row.tspr2013 < threshold else 1, axis=1)


In [43]:
# extract features for each paper
df.apply(lambda row: NaN not in row)

Unnamed: 0,c2009,c2010,c2011,c2012,c2013,id,title,tspr2009,tspr2010,tspr2011,tspr2012,tspr2013,year,class
0,31.0,52,63,74,76,003e7f6d-ba6f-4e72-8d05-9fd2a35a390c,Bidirectional promoters generate pervasive tra...,0.192500,0.192500,0.235825,0.323400,0.396755,2009,0
1,,0,7,7,3,004f40b8-afc4-4539-af81-babe9bfbdbef,Genomics: In search of rare human variants,,0.150000,0.150000,0.150000,0.150000,2010,0
2,0.0,0,0,0,0,006df37d-8867-4b5b-a851-021cc659c5f1,Chief scientist quits California stem-cell agency,0.150000,0.150000,0.150000,0.150000,0.150000,2009,0
3,1.0,0,1,0,0,006f95f8-6b0b-4dce-9fb6-6358218cef79,Carbon trading: How to save a forest,0.150000,0.150000,0.150000,0.150000,0.150000,2009,0
4,,0,0,0,0,00a14842-1a89-4364-8c42-bd35082b138d,Psychology: A social animal revealed,,0.150000,0.150000,0.150000,0.150000,2010,0
5,,1,0,2,4,00cebaa9-dd01-49dd-9bbe-d5359ccbd1d4,Inferring echolocation in ancient bats.,,0.192500,0.192500,0.192500,0.192500,2010,0
6,,7,14,11,17,00ecfe4e-e24c-4b04-a604-6a3ba01e54f3,Mical links semaphorins to F-actin disassembly,,0.150000,0.150000,0.150000,0.150000,2010,0
7,,5,26,39,34,00fd5c82-8cd5-411e-8dd1-a26ab7183b1e,Rb regulates fate choice and lineage commitmen...,,0.150000,0.150000,0.150000,0.150000,2010,0
8,0.0,0,0,0,0,0100ae6c-f60d-49e1-8008-0c5f39c57b48,Taking a fossil primate on the road,0.150000,0.150000,0.150000,0.150000,0.150000,2009,0
9,,0,0,0,0,01016abb-8de8-4c1a-bef7-10131ef888f0,Teams set for first taste of Antarctic lakes,,0.150000,0.150000,0.150000,0.150000,2010,0
