In [None]:
import json
from py2neo import Graph, Node, Relationship

# Authentication is not wired up yet; the server currently runs with NEO4J_AUTH=none.
# Alternative endpoint kept for reference:
# graph = Graph('bolt://localhost:7687', bolt=True)
graph = Graph("bolt://neo4j:7687")

# Look up the two primitive counters reported by the database as a quick
# confirmation that the connection works.
n_nodes, n_relationships = (
    graph.database.primitive_counts[counter_name]
    for counter_name in ('NumberOfNodeIdsInUse', 'NumberOfRelationshipIdsInUse')
)
print(
    "Connected to graph database with {:,} nodes and {:,} relationships!".format(
        n_nodes, n_relationships
    )
)

In [None]:
# Write queries to CSV files
import time

# Export title/venue/fos/year for every Quanta node published in Nature or
# Science from 1990 onward that has a `fos` property. The CSV is written by
# the APOC procedure, so the target path is presumably resolved on the Neo4j
# server rather than in this kernel -- confirm against the container mounts.
start_time = time.time()
query = """CALL apoc.export.csv.query("MATCH (q:Quanta) WHERE (q.venue=\\"Nature\\" OR q.venue=\\"Science\\") AND q.year>=1990 AND EXISTS(q.fos) RETURN q.title as title, q.venue as venue, q.fos as fos, q.year as year", "/import/result/AllQuantaInNatureScienceYear1990.csv", {})"""
# Fixed: this previously referenced the undefined name `querviy`, which
# raised NameError before the export could run.
graph.run(query).evaluate()
print(query)
print("Finished query and wrote results in {:.2f} seconds.".format(time.time()-start_time))

start_time = time.time()
query = """CALL apoc.export.csv.query("MATCH (q:Quanta) WHERE ( q.venue=\\"CA: A Cancer Journal for Clinicians\\" OR q.venue=\\"The New England Journal of Medicine\\" OR q.venue=\\"The Lancet\\" OR q.venue=\\"Chemical Reviews\\" OR q.venue=\\"Nature Reviews Drug Discovery\\" OR q.venue=\\"JAMA\\" OR q.venue=\\"Nature Reviews Cancer\\" OR q.venue=\\"Nature Reviews Immunology\\" OR q.venue=\\"Nature\\" OR q.venue=\\"Nature Reviews Genetics\\" OR q.venue=\\"Science\\" OR q.venue=\\"Chemical Society Reviews\\" OR q.venue=\\"Nature Materials\\" OR q.venue=\\"Nature Nanotechnology\\" OR q.venue=\\"Lancet Oncology\\" OR q.venue=\\"Reviews of Modern Physics\\" OR q.venue=\\"Nature Biotechnology\\" OR q.venue=\\"Nature Reviews Molecular Cell Biology\\" OR q.venue=\\"Nature Reviews Neuroscience\\" OR q.venue=\\"Nature Medicine\\" OR q.venue=\\"Nature Photonics\\" OR q.venue=\\"Nature Reviews Microbiology\\" OR q.venue=\\"Cell\\" OR q.venue=\\"Advances in Physics\\" OR q.venue=\\"Energy and Environmental Science\\" OR q.venue=\\"World Psychiatry\\" ) AND q.year>=1990 AND EXISTS(q.fos) RETURN q.title as title, q.venue as venue, q.fos as fos, q.year as year", "/import/result/AllQuantaWithIf30Year1990.csv.csv", {})"""
print(query)
# graph.run(query).evaluate()
print("Finished query and wrote results in {:.2f} seconds.".format(time.time()-start_time))

In [None]:
# Run ArticleRank
year = 2018
print("Running ArticleRank on works from <= {}...".format(year), end=" ")

# The node projection is limited to works published up to `year`, and the
# score is written to a year-suffixed property (e.g. articleRank2018).
# Doubled braces are literal braces in the Cypher config map.
query = (
    "\n"
    "CALL algo.articleRank(\n"
    "'MATCH (p:Quanta) WHERE p.year <= {y} RETURN id(p) as id',\n"
    "'MATCH (p1:Quanta)-[:CITES]->(p2:Quanta) RETURN id(p1) as source, id(p2) as target',\n"
    "{{graph:'cypher', writeProperty:'articleRank{y}', write: true}});\n"
).format(y=year)
print(query)
# Execution is currently disabled; only the query text is printed.
# graph.run(query).evaluate()

In [None]:
# Load libraries for topic modeling
import os
import gensim
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis.gensim
import seaborn as sns
import warnings
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
# Some settings
%matplotlib inline
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10

In [None]:
# Load query results as list of lists
import ast

# df = pd.read_csv('/tmp/data/result/AllQuantaInNatureScienceYear1990.csv')
df = pd.read_csv('/tmp/data/result/AllQuantaWithIf30Year1990.csv.csv')

# Process "Field of Study" field
# The CSV stores each `fos` value as the text of a list. It is parsed with
# ast.literal_eval rather than eval so that file contents can never be
# executed as arbitrary Python code; only literals are accepted.
df['fos'] = df['fos'].apply(ast.literal_eval)
# NOTE: tolist() hands back the same list objects that are stored in
# df['fos'], so mutating one of these lists in place also changes the frame.
train_text_fos = df['fos'].tolist()

# Process "title" field
def tokenize_title(title):
    """Turn a title into a list of lower-cased tokens.

    The title is lower-cased, passed through gensim's ``remove_stopwords``
    and then through ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        title: The title text. Values that are not strings (for example the
            NaN that pandas uses for an empty CSV cell, or None) are treated
            as "no title".

    Returns:
        list: The tokens of the title, or an empty list when ``title`` is
        not a string.
    """
    # A missing title used to fail on `.lower()`; it now contributes no tokens.
    if not isinstance(title, str):
        return []
    title = remove_stopwords(title.lower())
    return gensim.utils.simple_preprocess(title, deacc=True)
train_text_title = df['title'].apply(tokenize_title).tolist()

# Merge tokens from "title" and "fos"
# NOTE(review): extend() mutates the fos lists in place. train_text_fos comes
# from df['fos'].tolist(), so the title tokens are also appended to the lists
# held in df['fos']. Later cells that read df['fos'] see fos + title tokens;
# confirm that this is intended before changing it.
for fos_tokens, title_tokens in zip(train_text_fos, train_text_title):
    fos_tokens.extend(title_tokens)
train_text = train_text_fos

# Capitalize everything
train_text = [[w.upper() for w in line] for line in train_text]

# Show only a small sample; displaying the whole corpus floods the output.
train_text[:5]

In [None]:
# Create train corpus from query result
from gensim.corpora.dictionary import Dictionary

# Build the token dictionary from the tokenized documents, then convert every
# document with the dictionary's doc2bow.
train_dictionary = Dictionary(train_text)
train_corpus = list(map(train_dictionary.doc2bow, train_text))
print(
    "Training data with {:,} samples created.".format(len(train_corpus))
)

In [None]:
# Train model

import time
import logging

# gensim reports training progress through the logging module.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
start_time = time.time()

# LDA training is stochastic. Without a fixed seed the topic numbering differs
# on every run, which makes any hard-coded topic id meaningless after a
# retrain. The seed pins the result for a given corpus.
LDA_RANDOM_STATE = 42

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=train_corpus,
    num_topics=400,
    id2word=train_dictionary,
    chunksize=10000,
    random_state=LDA_RANDOM_STATE,
    )

# Fixed: the message previously said "NDA" instead of "LDA".
print("Trained LDA model with {} topics in {:.2f} minutes.".format(lda_model.num_topics, (time.time()-start_time)/60))

In [None]:
# Print the top topics
# Ask for as many topics as the model was trained with instead of repeating
# the literal 400, so this stays correct if the topic count is changed.
lda_model.print_topics(num_topics=lda_model.num_topics, num_words=6)

In [None]:
# Test LDA model a bit

# Find the topics most relevant to some words of interest
word_of_interest = "SYNTHETIC"
# .get() avoids a KeyError when the word is absent from the training data.
word_id = train_dictionary.token2id.get(word_of_interest)
if word_id is None:
    print("'{}' is not in the training dictionary.".format(word_of_interest))
else:
    print(lda_model.get_term_topics(word_id, minimum_probability=None))

# Make sure predictions are working as expected
row = df.loc[1]
text = [w.upper() for w in row['fos']]
mapped_text = train_dictionary.doc2bow(text)
# Indexing the model with a bag-of-words yields (topic_id, weight) pairs.
topic_weights = lda_model[mapped_text]
# Fixed: the first element of each pair is a *topic* id. It was previously
# used as a *token* id in train_dictionary, which returned an unrelated word.
# Each topic is now described by its three highest-weighted words.
topic_words = [
    ", ".join(word for word, _ in lda_model.show_topic(topic_id, topn=3))
    for topic_id, _ in topic_weights
]
print(topic_weights)
print(topic_words)
df.head()

In [None]:
# Prefer the public tqdm.notebook module; tqdm._tqdm_notebook is a private,
# deprecated alias. Fall back to it only on tqdm releases that predate the
# public module.
try:
    from tqdm.notebook import tqdm_notebook
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook

# Registers progress_apply on pandas objects.
tqdm_notebook.pandas()

def calculate_topics(token_list):
    """Infer the LDA topics of one document.

    Uses the notebook-level ``train_dictionary`` and ``lda_model``.

    Args:
        token_list: Iterable of string tokens for the document. Tokens are
            upper-cased before the dictionary lookup.

    Returns:
        list: ``[topic_weights, topic_words]`` where ``topic_weights`` is the
        result of ``lda_model[bow]`` (pairs of topic id and weight) and
        ``topic_words`` holds, for each of those topics in the same order, a
        comma-separated string of the topic's three highest-weighted words.
    """
    text = [w.upper() for w in token_list]
    mapped_text = train_dictionary.doc2bow(text)
    topic_weights = lda_model[mapped_text]
    # Fixed: the first element of each pair is a *topic* id. It was previously
    # used as a *token* id in train_dictionary, which returned an unrelated
    # word instead of the words that characterise the topic.
    topic_words = [
        ", ".join(word for word, _ in lda_model.show_topic(topic_id, topn=3))
        for topic_id, _ in topic_weights
    ]
    return [topic_weights, topic_words]

# Run topic inference once per row, then unpack the [weights, words] result
# of each row into its own column.
topic_weights_and_words = df['fos'].progress_apply(calculate_topics)
for position, column_name in enumerate(('topic_weight_list', 'topic_words')):
    df[column_name] = topic_weights_and_words.apply(
        lambda result, position=position: result[position]
    )

In [None]:
def split_data_frame_list(df, target_column):
    """Expand a list-valued column into one row per list element.

    Every row whose ``target_column`` value is a list is repeated once per
    element, with ``target_column`` replaced by that element and all other
    columns copied unchanged. Rows with an empty list therefore produce no
    output rows. Rows whose value is not a list are skipped and reported.

    The rows are collected with a plain loop rather than as a side effect of
    ``DataFrame.apply``: older pandas releases may call the applied function
    twice on the first row, which would duplicate that row's output. This
    also removes the dependency on tqdm's ``progress_apply`` being registered.

    Args:
        df: Input DataFrame; it is not modified.
        target_column: Name of the column holding the lists.

    Returns:
        pandas.DataFrame: A new frame with a fresh integer index and the same
        columns, in the same order, as ``df`` (also when no rows result).
    """
    expanded_rows = []
    skipped_rows = 0
    for record in df.to_dict("records"):
        values = record[target_column]
        if not isinstance(values, list):
            skipped_rows += 1
            continue
        for value in values:
            new_row = dict(record)
            new_row[target_column] = value
            expanded_rows.append(new_row)
    if skipped_rows:
        print("ERROR! Skipped {} row(s) whose '{}' value is not a list.".format(
            skipped_rows, target_column))
    # Passing the columns keeps their order and names even for an empty result.
    return pd.DataFrame(expanded_rows, columns=df.columns)

# Produce one row per entry of 'topic_weight_list'.
df_compiled = split_data_frame_list(df, 'topic_weight_list')

# Each entry is a topic/weight pair: spread the pairs over two new columns.
topic_weight_pairs = df_compiled['topic_weight_list'].values.tolist()
df_compiled[['topic_id', 'topic_weight']] = pd.DataFrame(topic_weight_pairs)

# The pair column is redundant once it has been split.
df_compiled = df_compiled.drop(columns='topic_weight_list')

In [None]:
# Keep only topic assignments with a non-zero weight. The explicit copy makes
# df_sum an independent frame, so adding columns below is safe.
df_sum = df_compiled[df_compiled['topic_weight'] != 0].copy()

# Fixed: the lambda had no body (a SyntaxError). The column name suggests the
# string form of the word list, so the words are joined with ", ".
df_sum['topic_words_str'] = df_sum['topic_words'].apply(
    lambda words: ", ".join(map(str, words))
)

# Summary statistics of the remaining topic weights.
print("Max: {:.2f}".format(df_sum['topic_weight'].max()))
print("Min: {:.2f}".format(df_sum['topic_weight'].min()))
print("Average: {:.2f}".format(df_sum['topic_weight'].mean()))
print("Median: {:.2f}".format(df_sum['topic_weight'].median()))
print("Most frequent value: {:.2f}".format(df_sum['topic_weight'].round(3).value_counts().idxmax()))

df_sum.head()

In [None]:
# Strip plots of topic weight per year, one panel per selected topic.
topic_ids_of_interest = [236, 386, 383, 389]

# sns.factorplot is the deprecated former name of sns.catplot and is absent
# from newer seaborn releases. catplot ships in the same seaborn versions as
# sns.lineplot, which this notebook already uses.
p = sns.catplot(x="year", y='topic_weight', col='topic_id', col_wrap=2,
                kind='strip', jitter=1,
                data=df_sum[df_sum['topic_id'].isin(topic_ids_of_interest)])
# Leave room above the panels for the figure title.
p.fig.subplots_adjust(top=0.85)
p.fig.suptitle("Scatterplot of Normalized Topic Weights, Split by Topic; All Weights.", fontsize=12)
p.set_xticklabels(rotation=90)

In [None]:
# Hand-written labels for the topics to compare.
topic_map = {236: 'CRISPR, PEPTIDES, CAS',
            386: 'MRNA, MESSENGER RNA, BIOINFORMATICS',
            11:  'STEM, BIOLOGY, CELL',
            382: 'CANCER, ONCOLOGY, MEDICINE',
            18:  'GENETICS, RNA INTERFERENCE, MOLECULAR BIOLOGY'}

# Select exactly the labelled topics; deriving the mask from topic_map keeps
# the two in sync.
mask = df_sum['topic_id'].isin(list(topic_map))
# Copy so that adding the 'Topic' column does not write into a slice of df_sum.
df_plot = df_sum[mask].copy()
df_plot['Topic'] = df_plot['topic_id'].map(topic_map)

sns.set(style="ticks", rc={"lines.linewidth": 1})
# sns.factorplot and its `size` argument are deprecated; catplot and `height`
# are the current equivalents.
p = sns.catplot(x="year", y='topic_weight', kind='point', hue='Topic', linestyles=":",
                height=8, aspect=1.5, data=df_plot)
p.set_xticklabels(rotation=90)
p.set_xlabels("Year")
p.set_ylabels("Topic Weight")

In [None]:
# Scale every topic's weights by that topic's maximum weight, so each topic
# peaks at 1 and the topics can be compared on one axis.
# This replaces a per-topic loop that assigned through chained indexing
# (df_plot['scaled_weight'][mask] = ...), which pandas does not guarantee to
# write back to df_plot, and that also printed a leftover debug line.
topic_max_weight = df_plot.groupby('topic_id')['topic_weight'].transform('max')
df_plot['scaled_weight'] = df_plot['topic_weight'] / topic_max_weight

In [None]:
# Smallest scaled weight recorded for topic 386.
df_plot.loc[df_plot['topic_id'] == 386, 'scaled_weight'].min()

In [None]:
# Point plot of the per-topic scaled weights by year.
# sns.factorplot and its `size` argument are deprecated; catplot and `height`
# are the current equivalents.
p = sns.catplot(x="year", y='scaled_weight', kind='point', hue='Topic',
                linestyles=":", height=8, aspect=1.5, data=df_plot)
p.set_xticklabels(rotation=90)
p.set_xlabels("Year")
p.set_ylabels("Topic Weight")

In [None]:
# Label every topic that occurs in df_sum with its three highest-weighted words.
# Fixed: the loop previously wrote to the undefined name `topic_terms`, so it
# raised NameError and topic_map stayed empty.
topic_map = {
    t: ", ".join(word for word, _ in lda_model.show_topic(t, topn=3))
    for t in tqdm_notebook(df_sum['topic_id'].unique())
}

df_sum['topic'] = df_sum['topic_id'].map(topic_map)


In [None]:
# Plot all 400 topics to look for the next CRISPR
# TODO: an unfinished, commented-out attempt to plot the topics in batches of
# `step_size` topic ids was removed from this cell, together with a stray
# indented token that made the cell fail with an IndentationError. All topics
# are drawn in a single grid.
# sns.factorplot and its `size` argument are deprecated; catplot and `height`
# are the current equivalents.
p = sns.catplot(x="year", y='topic_weight', kind='point', col='topic_id',
                linestyles=":", height=2, aspect=2, col_wrap=4,
                data=df_sum)
p.set_xticklabels(rotation=90)
p.set_xlabels("Year")
p.set_ylabels("Topic Weight")
plt.show()

In [None]:
# Line plot of the yearly weight of a single topic (id 46).
mask = df_sum['topic_id'].eq(46)
# df_sum[mask]['title'].apply(print)

sns.set_context('paper')
sns.set_style("whitegrid")
ax = sns.lineplot(data=df_sum[mask], x='year', y='topic_weight')
ax.set_xlabel("Year")
ax.set_ylabel("Topic Weight")
ax.set_title("ECOLOGY, DEPENDENT, PEPTIDE, BIOLOGICAL")
plt.show()

In [None]:
# Line plot comparing the yearly weights of topics 46, 65 and 337.
mask = df_sum['topic_id'].isin([46, 65, 337])
# df_sum[mask]['title'].apply(print)

sns.set_context('paper')
sns.set_style("darkgrid")
# The automatic legend is switched off; the labels are supplied by hand below.
ax = sns.lineplot(
    data=df_sum[mask],
    x='year',
    y='topic_weight',
    hue='topic',
    style='topic',
    legend=False,
)
ax.set_xlabel("Year")
ax.set_ylabel("Topic Weight")
plt.legend(
    ["ECOLOGY, DEPENDENT, PEPTIDE","LIVING, AUTISM, HIGHER","PHYSICS, NANOTECHNOLOGY, CHEMISTRY"],
    loc=4,
)
plt.show()