### Imports

In [None]:
import json
import time
import pandas as pd
from py2neo import Graph, Node, Relationship

### Connect to graph

In [None]:
# Open a Bolt connection to the Neo4j instance and report how many node and
# relationship ids the store currently has in use.
graph = Graph("bolt://neo4j:7687", auth=('neo4j', 'myneo'))
print(
    "Connected to graph database with {:,} nodes and {:,} relationships!".format(
        graph.database.primitive_counts['NumberOfNodeIdsInUse'],
        graph.database.primitive_counts['NumberOfRelationshipIdsInUse'],
    )
)

### Build dataset

- Consider papers from 1995 onwards. 
- For each paper, extract features and citation data for the next X years (`years_to_use`).
- The prediction target is the Time-Scaled PageRank Y years later (`years_until_prediction`).


In [None]:
# Inclusive range of publication years to sample papers from.
starting_year = 2008
ending_year = 2008
# Number of post-publication years whose citation counts become features.
years_to_use = 5
# Horizon (in years) of the prediction target.
# NOTE(review): not referenced by the code visible here; the training query
# reads q.pageRank_2018 directly.
years_until_prediction = 10

In [None]:
def query_to_df(query_string):
    """Run a Cypher query on the module-level ``graph`` and return a DataFrame.

    The elapsed wall-clock time, in minutes, is printed once the result has
    been converted to a DataFrame.

    Args:
        query_string: Cypher text passed unchanged to ``graph.run``.

    Returns:
        The value of ``to_data_frame()`` called on the query result.
    """
    started_at = time.time()
    result_frame = graph.run(query_string).to_data_frame()
    elapsed_minutes = (time.time() - started_at) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return result_frame

def build_training_query(year, years_to_use, n_samples=10000):
    """Build the Cypher query that fetches training rows for one year.

    The query matches ``Quanta`` nodes with the given ``year`` whose
    ``doctype`` is 'Journal' and ``lang`` is 'en'. Each returned row holds
    ``q.id``, one column ``c<i>`` per i in 1..years_to_use counting the
    ``Quanta`` nodes with ``year == q.year + i`` that have a ``CITES``
    relationship pointing at the paper, and ``q.pageRank_2018`` aliased as
    ``pagerank``.

    Args:
        year: Publication year to filter on.
        years_to_use: Number of yearly citation-count columns to emit. Zero
            or a negative value emits none.
        n_samples: Value used for the query's LIMIT clause.

    Returns:
        The query text as a string.
    """
    citation_template = "SIZE((q)<-[:CITES]-(:Quanta {{year: q.year+{}}})) as c{}"
    citation_columns = [citation_template.format(offset, offset)
                        for offset in range(1, years_to_use + 1)]
    citations_by_year = ",\n\t".join(citation_columns)
    if citations_by_year:
        # This comma separates the last citation column from the pagerank
        # column. Emitting it with no citation columns would leave a stray
        # comma right after "q.id," and make the query invalid.
        citations_by_year += ','
    data_query = """
    MATCH (q:Quanta)
    WHERE q.year={} AND q.doctype='Journal' AND q.lang='en'
    RETURN
        q.id,
        {}
        q.pageRank_2018 as pagerank
    LIMIT {}
    """.format(year, citations_by_year, n_samples)
    return data_query

In [None]:
# Fetch one sample per publication year in [starting_year, ending_year] and
# stack them into a single frame. Reassigning df inside the loop would keep
# only the last year's rows.
yearly_frames = []
for year in range(starting_year, ending_year + 1):
    data_query = build_training_query(year, years_to_use, 10000)
    yearly_frames.append(query_to_df(data_query))
df = pd.concat(yearly_frames, ignore_index=True)

# Select columns by the aliases used in the query rather than by position.
# The frame's column order is not guaranteed, and in RETURN order a positional
# slice would pull q.id into the features and use the last citation count as
# the target.
feature_columns = ["c{}".format(i) for i in range(1, years_to_use + 1)]
X = df[feature_columns].values
y = df["pagerank"].values

In [None]:
# Notebook display cell: show the dimensions of the feature matrix X.
X.shape

In [None]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The fallback
# keeps the cell runnable on older installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out one third of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3)

# Fit a random forest with default hyperparameters on the training split.
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor()
regr.fit(X_train, y_train)
print(regr)

# Score the held-out split with the coefficient of determination.
from sklearn.metrics import r2_score
y_pred = regr.predict(X_test)
r2 = r2_score(y_test, y_pred)
print("R-squared: {}".format(r2))