# Extract Data From Graph 

In [None]:
import time, json, glob, os
from py2neo import Graph, Node, Relationship
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set(color_codes=True)
%matplotlib inline

In [None]:
# Open a Bolt connection to the Neo4j instance that holds the graph.
graph = Graph("bolt://neo4j-allquanta:7687", auth=('neo4j','myneo'))

# Print the store's "ids in use" counters as a quick connectivity check.
node_id_count = graph.database.primitive_counts['NumberOfNodeIdsInUse']
rel_id_count = graph.database.primitive_counts['NumberOfRelationshipIdsInUse']
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    node_id_count, rel_id_count))

In [None]:
def run_query(query, graph, print_query=False, run_query=True, 
              print_only=False, to_df=False):
    """Optionally print and/or execute a query against ``graph``.

    Parameters
    ----------
    query : str
        Query text, passed unchanged to ``graph.run``.
    graph : object
        Anything exposing ``run(query)``. When ``to_df`` is true the value
        returned by ``run`` must also offer ``to_data_frame()``.
    print_query : bool
        Print the query text before (possibly) executing it.
    run_query : bool
        Execute the query. (This parameter shadows the function name inside
        the body; the name is kept so existing keyword callers still work.)
    print_only : bool
        Shortcut that forces ``print_query=True`` and ``run_query=False``.
    to_df : bool
        Return the result converted with ``to_data_frame()``.

    Returns
    -------
    The data frame when the query was executed with ``to_df=True``;
    otherwise the placeholder value ``1`` (kept for backward compatibility).
    """
    if print_only:
        print_query, run_query = True, False

    if print_query:
        print(query)

    result = 1
    if not run_query:
        # Nothing was executed, so do not report a misleading completion time.
        return result

    # Time only the actual execution (and the optional data-frame conversion).
    start_time = time.time()
    cursor = graph.run(query)
    if to_df:
        result = cursor.to_data_frame()
    minutes_elapsed = (time.time() - start_time) / 60
    print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return result
    
    
# Venue-name lists of increasing size; top_10 extends top_5 with five more venues.
top_5 = [
    'Cell',
    'Nature',
    'Nature Biotechnology',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science',
]
top_10 = top_5 + [
    'Journal of the American Chemical Society',
    'JAMA',
    'The New England Journal of Medicine',
    'Nature Genetics',
    'Neuron',
]
top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

In [None]:
# Fetch the impact and collaboration properties of every Quanta node that has
# more than one prof and whose venue is "Nature".
# NOTE(review): the two rank properties are spelled differently
# (`pageRank_2018` vs `articleRank2018`) -- confirm both against the graph
# schema; a misspelled property would come back as null and be dropped below.
query = """
MATCH (q:Quanta)
WHERE q.num_profs > 1 AND q.venue="Nature" 
RETURN 
    q.pageRank_2018 as pagerank, 
    q.articleRank2018 as articlerank, 
    q.num_profs as num_profs, 
    q.num_authors as num_authors
"""
df = run_query(query, graph, to_df=True, print_only=False)

# Add a natural-log copy of every returned column. The column names are
# snapshotted first so the loop never walks over the columns it is adding.
for col in list(df.columns):
    df['log_{}'.format(col)] = df[col].apply(np.log)

# Drop rows with a missing value in any raw or log column.
df.dropna(inplace=True)

# A bare expression in the middle of a cell is never displayed, so print the
# summary explicitly.
print(df.describe(include='all'))

# Column groups reused by the plotting cells.
impact_metrics = ['pagerank', 'articlerank']
collab_metrics = ['num_authors', 'num_profs']

log_impact_metrics = ['log_{}'.format(s) for s in impact_metrics]
log_collab_metrics = ['log_{}'.format(s) for s in collab_metrics]

In [None]:
# Print a concise summary of the prepared frame (columns, dtypes, non-null counts).
df.info()
# Optional CSV export of the prepared frame; left disabled.
# df.to_csv(path_or_buf='/tmp/data/result/impactAndAuthors_Nature.csv')

# Plot Variables

In [None]:
# Scatter matrix of every log-transformed metric against every other one.
vars_to_include = log_impact_metrics + log_collab_metrics
pair_grid_options = dict(
    x_vars=vars_to_include,
    y_vars=vars_to_include,
    markers='o',
    diag_kind='auto',
    height=4,
)
g = sns.pairplot(df, **pair_grid_options)

In [None]:
# One 100-bin histogram (no KDE overlay) per metric, each on a fresh figure
# so that successive plots do not draw onto the same axes.
for col in [*log_impact_metrics, *collab_metrics]:
    plt.figure()
    ax = sns.distplot(df[col], bins=100, kde=False)

In [None]:
# Log impact metrics (rows) against raw collaboration counts (columns),
# drawn with large markers on a white background.
sns.set_style('white')
g = sns.pairplot(df, x_vars=collab_metrics, y_vars=log_impact_metrics,
                 height=8, plot_kws={'s': 200})

## Fitting Equations

In [None]:
from scipy import stats
def r2(x, y):
    """Return the squared Pearson correlation coefficient of ``x`` and ``y``."""
    # pearsonr yields (coefficient, p-value); only the coefficient is used.
    coefficient = stats.pearsonr(x, y)[0]
    return coefficient ** 2

# Joint regression plot of log PageRank against the number of profs, with the
# r2 helper supplied as the statistic function.
# NOTE(review): `stat_func` is reported as deprecated since seaborn 0.9 --
# confirm the installed seaborn version still accepts it.
sns.jointplot(
    x='num_profs',
    y='log_pagerank',
    data=df,
    kind="reg",
    height=12,
    stat_func=r2,
)

In [None]:
# Third-order polynomial regression of log PageRank on the number of profs.
sns.set_style('white')
g = sns.lmplot(x='num_profs', y='log_pagerank', data=df,
               order=3,
               height=10,
               legend_out=False,
               scatter_kws={'s': 5, 'alpha': 1})
# Fix the visible window of the axes.
g.set(xlim=(0, 100), ylim=(-2, 10))

## Clustering

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Cluster on the metric columns only. Columns that this notebook derives later
# ('kmeans' here, 'label' in the final cell) are excluded, so re-running the
# cell does not feed the previous cluster ids or the boolean label back into
# the fit. On a first top-to-bottom run this is the same input as df.values.
# NOTE(review): raw and log-scaled columns are clustered together without
# scaling -- confirm that is intended.
feature_frame = df.drop(columns=['kmeans', 'label'], errors='ignore')
kmeans = MiniBatchKMeans(n_clusters=3, random_state=0).fit(feature_frame.values)
df['kmeans'] = kmeans.labels_

# pairplot creates its own figure, so no explicit plt.figure() is needed
# (it only left an empty extra figure behind).
sns.set_style('white')
g = sns.pairplot(df, y_vars=log_impact_metrics, x_vars=collab_metrics, 
                 hue='kmeans', height=8, plot_kws=dict(s=100))

In [None]:
# Linear fit of log PageRank on the number of profs, one line per k-means
# cluster, all drawn on a single set of axes.
sns.set_style('white')
g = sns.lmplot(x='num_profs', y='log_pagerank', data=df,
               hue='kmeans',
               height=10,
               truncate=False,
               legend_out=False,
               scatter_kws={'s': 10, 'alpha': 1})
g.set(xlim=(0, 200), ylim=(-2, 7))

In [None]:
# Same per-cluster linear fit, but with each k-means cluster in its own
# column facet (and coloured by cluster as well).
g = sns.lmplot(x='num_profs', y='log_pagerank', data=df,
               col='kmeans',
               hue='kmeans',
               height=4,
               truncate=False,
               legend_out=False,
               scatter_kws={'s': 10, 'alpha': 1})
g.set(xlim=(0, 200), ylim=(-2.5, 7))

In [None]:
# Split the rows into "more than two profs" (True) and the rest (False), then
# fit and draw each group separately with its own marker.
sns.set_style('white')
df['label'] = df['num_profs'].gt(2)
g = sns.lmplot(
    x='num_profs',
    y='log_pagerank',
    data=df,
    hue='label',
    markers=['o', 'x'],
    height=10,
)
g.set(xlim=(0, 200), ylim=(-2.5, 7))