In [None]:
import json
import time
import pickle
import os
from functools import reduce

import pandas as pd
import numpy as np
import markovify

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from tqdm.autonotebook import tqdm

from py2neo import Graph, Node, Relationship

In [None]:
# Line-delimited JSON input; the training cell reads the columns
# 'a.id', 'cleanName', 'name', 'num_titles' and 'titles' from each record.
filepath = '/tmp/data/authorlist_titles.json'

# Count lines in pure Python instead of shelling out to `wc -l`:
#  * BSD/macOS `wc` left-pads its output, so split(' ')[0] was '' and int('') raised;
#  * no shell is involved, so unusual characters in the path are harmless;
#  * the file handle is closed deterministically.
# Unlike `wc -l`, a final line without a trailing newline is counted as well.
with open(filepath, 'rb') as f:
    nlines = sum(1 for _ in f)

# chunksize=1 makes the reader yield one single-row DataFrame per line.
reader = pd.read_json(filepath, lines=True, chunksize=1)

In [None]:
# An author needs strictly more than this many titles to get a model.
MIN_TITLES = 5

# Maps author id -> {'cleanName', 'name', 'num_titles', 'model'}.
models = {}

# The context manager closes the progress bar even if a record fails to
# parse or train, instead of leaving a dangling bar in the notebook output.
with tqdm(total=nlines) as t:
    for chunk in reader:
        # Each chunk is a single-row DataFrame (the reader uses chunksize=1),
        # so .values[0] pulls out that row's scalar / list value.
        aid = chunk['a.id'].values[0]
        cleanName = chunk['cleanName'].values[0]
        name = chunk['name'].values[0]
        num_titles = chunk['num_titles'].values[0]
        titles = chunk['titles'].values[0]

        if num_titles > MIN_TITLES:
            # NewlineText treats every line as one sentence, i.e. one title.
            corpus = '\n'.join(titles)
            mdl = markovify.NewlineText(corpus, state_size=2,
                                        retain_original=False, well_formed=False)
            models[aid] = {
                'cleanName': cleanName,
                'name': name,
                'num_titles': num_titles,
                'model': mdl}

        t.update()

In [None]:
# Neo4j connection settings. Each value can be overridden through an
# environment variable so the notebook runs against other deployments
# without editing the code; the fallbacks are the original values.
public_address = os.environ.get('NEO4J_HOST', '54.174.175.98')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
# NOTE(review): the fallback password below is a credential committed to the
# notebook. Set NEO4J_PASSWORD instead, then remove the fallback and rotate it.
neo4j_password = os.environ.get('NEO4J_PASSWORD', 'myneo')

graph = Graph('bolt://{}:7687'.format(public_address),
              auth=(neo4j_user, neo4j_password))

def run_query(query, graph, print_query=False, run_query=True, 
              print_only=False, to_df=False, verbose=True):
    """Optionally print and/or execute a query against ``graph``.

    Parameters
    ----------
    query : str
        Query text, passed unchanged to ``graph.run``.
    graph : object
        Anything exposing ``run(query)``; when ``to_df`` is true the object
        returned by ``run`` must expose ``to_data_frame()``.
    print_query : bool
        Print the query text before (possibly) running it.
    run_query : bool
        Execute the query. Note that this parameter shadows the function
        name inside the body; it is kept for backward compatibility.
    print_only : bool
        Shortcut for ``print_query=True, run_query=False``.
    to_df : bool
        Return the result converted with ``to_data_frame()``.
    verbose : bool
        Print the elapsed time after the query has run.

    Returns
    -------
    The data frame when ``to_df`` is true and the query ran; otherwise the
    placeholder value ``1`` (kept from the original implementation).
    """
    if print_only:
        print_query, run_query = True, False

    if print_query:
        print(query)

    if not run_query:
        # Nothing was executed, so do not report a misleading
        # "Query completed in 0.00 minutes." message.
        return 1

    start_time = time.time()
    result = graph.run(query)
    df = result.to_data_frame() if to_df else 1
    minutes_elapsed = (time.time() - start_time) / 60

    if verbose:
        print("Query completed in {:.2f} minutes.".format(minutes_elapsed))
    return df

In [None]:
# cleanName values of the authors whose titles are fetched and modelled.
author_list = ['jcraigventer',
               'edwardsboyden',
               'davidbotstein',
               'georgemchurch',
               'shuguangzhang',
               'josephmjacobson']

# Fetch one (author, title) frame per author and concatenate ONCE at the end.
# Re-concatenating inside the loop copies all accumulated rows on every
# iteration and relies on seeding with an empty DataFrame.
author_frames = []
for author in author_list:
    # The name is interpolated into the Cypher text; this is only safe
    # because author_list is a fixed, trusted list defined above.
    query = """
    match (a:Author)-[:AUTHORED]->(q:Quanta)
    where a.cleanName='{}'
    return a.cleanName as author, q.title as title    
    """.format(author)
    authordf = run_query(query, graph, to_df=True)
    author_frames.append(authordf)

# The original loop prepended each new frame, so the last author's rows came
# first; reversing the list keeps the exact same row order and index.
titledf = pd.concat(author_frames[::-1]) if author_frames else pd.DataFrame()

In [None]:
# markovify is already imported in the notebook's import cell; the redundant
# mid-notebook re-import was dropped.

# NOTE: this rebinds `models`. The id-keyed dict built by the earlier
# training cell is discarded; from here on `models` maps
# cleanName -> markovify model.
models = {}
for author in author_list:
    print("Training model for {}...".format(author))

    # All of this author's titles, one per line, as NewlineText expects.
    is_author = titledf['author'] == author
    authorcorpus = titledf.loc[is_author, 'title'].str.cat(sep="\n")

    authormodel = markovify.NewlineText(authorcorpus, state_size=2, retain_original=False)
    models[author] = authormodel

In [None]:
# Show five generated titles per author, each group under a header line.
# make_sentence may return None when no sentence is found within `tries`
# attempts; in that case the literal text "None" is printed.
for author in author_list:
    header = "\n=== {} ===".format(author)
    print(header)
    author_model = models[author]
    for sample_idx in range(5):
        sentence = author_model.make_sentence(tries=100)
        print("{}\n".format(sentence))

In [None]:
# Fetch every Quanta title, with no author filter.
query = """
match (q:Quanta)
return q.title as title    
"""
alltitledf = run_query(query, graph, to_df=True)

print("== Training model for all titles ==")

# One title per line, as NewlineText expects.
all_titles = alltitledf['title']
alltitlecorpus = all_titles.str.cat(sep="\n")

# Same settings as the per-author models, with well_formed spelled out.
model_options = {
    'state_size': 2,
    'retain_original': False,
    'well_formed': True,
}
alltitlemodel = markovify.NewlineText(alltitlecorpus, **model_options)

# Register the global model next to the per-author ones.
models['allauthors'] = alltitlemodel

print("== Done. ==\n")

# Print ten sample titles from the global model.
for attempt in range(10):
    generated = alltitlemodel.make_sentence(tries=100)
    print("{}\n".format(generated))

212

{'jcraigventer': 1.358974358974359,
 'edwardsboyden': 3.533333333333333,
 'davidbotstein': 1.1977401129943503,
 'georgemchurch': 1.0,
 'shuguangzhang': 5.72972972972973,
 'josephmjacobson': 21.2}

In [81]:

# Find the total number of publications for each author.
# Series.sum() counts the True entries in one vectorized step instead of
# iterating the Series element by element with the Python builtin sum().
npubs = {a: (titledf['author'] == a).sum() for a in author_list}

# Find the maximum number of publications
maxnpubs = max(npubs.values())

# Scaling factor per author: the largest publication count divided by the
# author's own count, so the most prolific author gets 1.0 and everyone
# else is scaled up proportionally.
baseline_weightings = {a: maxnpubs / npubs[a] for a in author_list}


# Get user weighting (must sum to 1)
user_weighting = {'jcraigventer':    0,
                  'edwardsboyden':   0,
                  'davidbotstein':   .5,
                  'georgemchurch':   0,
                  'shuguangzhang':   0,
                  'josephmjacobson': .5}

# Compare with a tolerance: an exact == 1 check rejects valid weightings
# such as .1 + .2 + .7, whose floating-point sum is not exactly 1.
total_user_weight = sum(user_weighting.values())
assert abs(total_user_weight - 1) < 1e-9, \
    "user_weighting must sum to 1, got {}".format(total_user_weight)

# Combine scaling and user weighting to get final weighting
final_weightings = {a: baseline_weightings[a] * w
                    for a, w in user_weighting.items()}

# Only authors with a positive user weight take part in the combined model.
# Select them once so models and weights are guaranteed to line up.
selected_authors = [a for a, w in user_weighting.items() if w > 0]
models_to_use = [models[a] for a in selected_authors]
weights_to_use = [final_weightings[a] for a in selected_authors]

# Build combined model
combinedmodel = markovify.combine(models_to_use, weights_to_use)

In [83]:
# Print ten titles sampled from the combined model, each followed by a
# blank line. A failed generation (None) is printed as the text "None".
for draw in range(10):
    generated_title = combinedmodel.make_sentence(tries=100)
    print("{}\n".format(generated_title))

Photoelectrochemical synthesis of DNA microarrays.

All-inorganic field effect transistors fabricated by printing

Desferrioxamine-mediated Iron Uptake in Saccharomyces cerevisiae

Yeast actin filaments display ATP-dependent sliding movement over surfaces coated with rabbit muscle myosin.

Gene expression profiling reveals molecularly and clinically distinct subtypes of prostate cancer

The future of humans as model organisms.

Robotics: Self-replication from random parts

Gene expression profiling identifies clinically relevant subtypes of glioblastoma multiforme

Precise Manipulation of Chromosomes in Vivo Enables Genome-Wide Codon Replacement

Evidence for posttranslational translocation of β-lactamase in Salmonella typhimurium

