In [None]:
import pandas as pd
import json
from py2neo import Graph, Node, Relationship
import numpy as np
import time

In [None]:
# Time Scaled PageRank
# For every publication year, take the (up to) 1000 highest-PageRank patents
# and express each one's PageRank as a distance from that year's mean, in units
# of that year's standard deviation.
graph = Graph("bolt://neo4j:7687")

start_time = time.time()

# The query is a plain string: it has no per-run parameters, so it is not passed
# through str.format (the previous .format(year, ...) call referenced a name
# that does not exist yet at this point of the notebook).
#
# The collected map carries every field that is read back after UNWIND
# (id, title, year, pagerank); a field missing from the map would come back null.
#
# NOTE(review): ABS() makes patents far *below* the yearly mean score as high as
# patents far above it -- confirm that this is intended.
# NOTE(review): a year whose pagerank values have zero spread yields a zero
# stdDev divisor -- confirm how such rows should be handled downstream.
query = """
MATCH (p:Patent)
WHERE p.title IS NOT NULL AND p.title <> "" AND (size(p.authors)>0)
WITH
    *,
    p.id AS id,
    p.title AS title,
    p.year AS year,
    p.pagerank AS pagerank
ORDER BY pagerank DESC
WITH
    year,
    COLLECT({id: id, title: title, year: year, pagerank: pagerank})[..1000] AS data,
    AVG(pagerank) AS avg_page_rank,
    stDev(pagerank) AS stdDev
UNWIND data AS d
RETURN
    d.id AS id,
    d.title AS title,
    d.year AS year,
    ABS(d.pagerank-avg_page_rank)/stdDev AS scaled_score;
"""

print(query)
df = graph.run(query).to_data_frame()

print("Finished all calculations in {:.2f} minutes.".format((time.time()-start_time)/60))


In [None]:
# Show the frame returned by the most recent query as the cell's rich output.
df

In [None]:
# Remove repeated rows in place, first by title and then by id (the first
# occurrence of each value is the one that is kept).
for dedupe_key in ('title', 'id'):
    df.drop_duplicates(subset=dedupe_key, inplace=True)
# df['authors'] = df['authors'].apply(lambda x: ', '.join(x))

# Write the de-duplicated ranking (index included) to CSV.
df.to_csv('/tmp/data/patents_by_tspagerank.csv')

In [None]:
# Run STREAMING PageRank once per cutoff year, from start_year to end_year
# (inclusive) in increments of step. Each run only loads works published
# strictly before the cutoff and keeps the 10000 highest-scoring ones.
# (pandas / numpy / time are imported in the notebook's first cell; `graph` is
# the connection opened in the time-scaled PageRank cell.)

start_time = time.time()
start_year, end_year, step = 1985, 2010, 5
dfs = []  # one result frame per cutoff year, concatenated in a later cell
for year in range(start_year, end_year+1, step):

    # < IS MUCH FASTER THAN <=
    print("Running PageRank on works from < {}...".format(year), end=" ")
    query_start_time = time.time()
    # The template has exactly one positional placeholder (the cutoff year);
    # the doubled braces are literal braces of the Cypher config map.
    query = """
    CALL algo.pageRank.stream(
    'MATCH (p:Quanta) WHERE p.year < {} RETURN id(p) as id',
    'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',
    {{graph:'cypher', iterations:20, write:false, concurrency:20}})
    YIELD node, score
    WITH 
        *,
        node.id AS id, 
        node.title AS title, 
        node.lang AS lang, 
        node.year AS year, 
        node.keywords AS keywords, 
        node.fos AS fos, 
        node.publisher AS publisher,
        score AS page_rank
    ORDER BY page_rank DESC
    LIMIT 10000
    RETURN 
        id,
        title,
        lang, 
        year, 
        keywords, 
        fos, 
        publisher,
        page_rank;    
    """.format(year)

    df = graph.run(query).to_data_frame()
    # `year` stays the work's own publication year; the cutoff used for this run
    # goes into a separate column so the stacked frames remain distinguishable.
    df['cutoff_year'] = year
    # Collect the frame: without this, `dfs` stays empty and the later
    # pd.concat(dfs) fails with "No objects to concatenate".
    dfs.append(df)
    # Terminates the progress line that was opened with end=" " above.
    print("Done ({:.2f} minutes).".format((time.time()-query_start_time)/60))

print("Finished all calculations in {:.2f} minutes.".format((time.time()-start_time)/60))

In [None]:
# If we don't scale things within the query and just run regular pagerank, we can use this to do time-scaled-pr
def normalize_and_sort(df, score_col='score', top_n=1000):
    """Standardise scores within each year and return the highest-scaled rows.

    For every row, the mean of its year's scores is subtracted and the result
    is divided by that year's standard deviation (pandas' default sample
    standard deviation), i.e. a per-year z-score.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'year' column and the column named by ``score_col``.
        The frame is not modified.
    score_col : str, default 'score'
        Name of the column holding the raw scores.
    top_n : int, default 1000
        Number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns, 'scores_minus_mean' and
        'scaled_scores', sorted by 'scaled_scores' in descending order and
        truncated to ``top_n`` rows. Years with a single row (undefined
        standard deviation) get NaN scaled scores, which sort last.

    Raises
    ------
    KeyError
        If 'year' or ``score_col`` is missing from ``df``.
    """
    scores_by_year = df.groupby('year')[score_col]
    # transform() broadcasts each year's statistic back onto the original rows,
    # so the arithmetic below is row-aligned with df.
    scores_minus_mean = df[score_col] - scores_by_year.transform('mean')
    scaled_scores = scores_minus_mean / scores_by_year.transform('std')

    scored = df.assign(scores_minus_mean=scores_minus_mean,
                       scaled_scores=scaled_scores)
    return scored.sort_values(['scaled_scores'], ascending=False).head(top_n)

In [None]:
# Get the top 1,000 scaled scores
# The streaming PageRank query returns its score under the name `page_rank`,
# while normalize_and_sort reads a `score` column, so the column is renamed on
# the way in (a no-op if the frame has no `page_rank` column).
# NOTE(review): `df` here is only the frame from the last query that ran --
# confirm that is the intended input.
results = normalize_and_sort(df.rename(columns={'page_rank': 'score'}))


In [None]:
# Stack the per-run frames collected in `dfs` into a single table.
all_results = pd.concat(dfs)

# The output file name encodes the year range and step of the run.
all_results_path = '/tmp/data/result/allResults_{}-{}-{}.csv'.format(
    start_year,
    end_year,
    step,
)

print("Writing all results to {}...".format(all_results_path), end=" ")
all_results.to_csv(
    path_or_buf=all_results_path,
    sep=",",
    header=True,
    index=True,
)
print("Done.")

In [None]:
def _primary_field(fos):
    """Return the first entry of a fields-of-study value, or None.

    None is returned when the value is missing, contains a null entry, or is
    an empty sequence (an empty list previously fell through to ``fos[0]`` and
    raised IndexError, because np.all() of an empty array is True).
    """
    if np.all(pd.notnull(fos)) and len(fos) > 0:
        return fos[0]
    return None


# all_results['title_clean'] = all_results['title'].str.replace(',',' ')
all_results['primary_field'] = all_results['fos'].apply(_primary_field)

# The pivot was written against an 'impact' column; the PageRank queries in this
# notebook name their score 'page_rank', so fall back to that when no 'impact'
# column is present.
# NOTE(review): confirm that 'page_rank' is the intended impact measure.
value_column = 'impact' if 'impact' in all_results.columns else 'page_rank'
result = all_results.pivot_table(index=['title','primary_field','publisher'], columns='year', values=value_column)

file_path = '/tmp/data/result/impact_{}-{}-{}.csv'.format(start_year, end_year,step)
print("Writing results to {}...".format(file_path), end=" ")
result.to_csv(path_or_buf=file_path, sep=",", header=True, index=True)
print("Done.")



In [None]:
# Extra stuff
# Stand-alone demonstration of time-scaled ranking on small fake data.
# The helpers are defined before they are used so the cell runs on a fresh kernel.

def normalize_scores_for_year_df(df):
    """Standardise one year's 'scores' column and sort it in descending order.

    The column mean is subtracted from every score and the result is divided by
    the column's standard deviation (pandas' default sample standard deviation).
    Only the 'scores' column is rescaled; any other columns are carried through
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'scores' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'scores' replaced by the scaled values, sorted by
        'scores' descending and limited to the first 1000 rows.
    """
    mean = df['scores'].mean()
    std = df['scores'].std()

    scaled_df = df.assign(scores=(df['scores'] - mean) / std)
    sorted_df = scaled_df.sort_values(['scores'], ascending=False).head(1000)

    return sorted_df


def combine_scores(dfs):
    """Concatenate a list of per-year frames into one frame.

    Row labels of the inputs are kept as they are. pandas raises ValueError
    when ``dfs`` is empty.
    """
    all_years_df = pd.concat(dfs)
    return all_years_df


def get_top_papers(df, n=1000):
    """Return the ``n`` rows of ``df`` with the largest 'scores' values, best first."""
    return df.sort_values(['scores'], ascending=False).head(n)


# Make fake data
a = {'scores': [1, 2, 4, 18, 25, 13]}
df1950 = pd.DataFrame(data=a)
b = {'scores': [2, 7, 4, 68, 20, 13]}
df1951 = pd.DataFrame(data=b)
c = {'scores': [9, 8, 4, 18, 5, 83]}
df1952 = pd.DataFrame(data=c)

# Normalize each dataframe which represents pagerank scores of papers published in a given year
d1 = normalize_scores_for_year_df(df1950)
d2 = normalize_scores_for_year_df(df1951)
d3 = normalize_scores_for_year_df(df1952)

years = [d1, d2, d3]
#years = ['df_' + year for year in range(1900, 2000)]

# Find the top papers from all the years combined and return the top 1000
all_years_df = combine_scores(years)
result = get_top_papers(all_years_df)

print(result)

#d = {'paper': ['A', 'BB', 'C', 'd', 'E', 'f'], 'scores': [1, 2, 4, 18, 25, 13]}
d = {'scores': [1, 2, 4, 18, 25, 13]}
#data2 = [{'scores': scores}]#, {'a': 5, 'b': 10, 'c': 20}, {'a': 0, 'b': 8, 'c': 30}]
# NOTE(review): this rebinds the notebook-wide name `df` to the fake data.
df = pd.DataFrame(data=d)

