In [6]:
import os
import time

import pandas as pd
from py2neo import Graph, Node, Relationship
from textgenrnn import textgenrnn

In [7]:
def query_to_df(query, graph):
    """Run a query against `graph` and return the result as a data frame.

    A start message is printed before the query is sent and the elapsed time
    (in minutes) is printed once the result has been converted.

    Args:
        query: Query text, passed unchanged to ``graph.run``.
        graph: Object exposing ``run(query)`` whose result provides a
            ``to_data_frame()`` method.

    Returns:
        Whatever ``graph.run(query).to_data_frame()`` returns.
    """
    # The message has no trailing newline, so flush it explicitly; otherwise it
    # may only show up after a query that can take minutes has finished.
    print("Starting query...", end=" ", flush=True)
    # perf_counter is monotonic: the measured duration cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    query_start_time = time.perf_counter()
    df = graph.run(query).to_data_frame()
    elapsed_minutes = (time.perf_counter() - query_start_time) / 60
    print("Done ({:.2f} minutes).".format(elapsed_minutes))
    return df

In [15]:
# Connection settings can be overridden through environment variables so the
# credentials do not have to live in the notebook. The fallbacks are the values
# this cell was originally written with.
# TODO: drop the password fallback once NEO4J_PASSWORD is set everywhere.
# NOTE(review): 7474 is Neo4j's default HTTP port while Bolt defaults to 7687;
# this may explain the ServiceUnavailable timeout -- confirm the server config.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://matlaber10.media.mit.edu:7474")
neo4j_user = os.environ.get("NEO4J_USER", 'neo4j')
neo4j_password = os.environ.get("NEO4J_PASSWORD", 'myneo')
graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Read the counters once and reuse them for both numbers in the message.
primitive_counts = graph.database.primitive_counts
print("Connected to graph database with {:,} nodes and {:,} relationships!".format(
    primitive_counts['NumberOfNodeIdsInUse'],
    primitive_counts['NumberOfRelationshipIdsInUse']))

ServiceUnavailable: Timed out trying to establish connection to ('18.85.22.126', 7474)

In [26]:
def gen_query(venue):
    """Build the Cypher query that collects the titles published in `venue`.

    The venue is embedded as a single-quoted Cypher string literal, so
    backslashes and single quotes in it are escaped first; a name such as
    "Parkinson's Disease" would otherwise end the literal early and produce an
    invalid (or unintended) query.

    Args:
        venue: Venue name to match against ``q.venue``.

    Returns:
        The query text; it returns one row with a single ``titles`` column.
    """
    # Escape backslashes before quotes so the backslash added for a quote is
    # not itself doubled.
    safe_venue = str(venue).replace("\\", "\\\\").replace("'", "\\'")
    # NOTE(review): the pattern matches once per (author, paper) pair, so a
    # title presumably appears once per author in the collected list --
    # confirm whether COLLECT(DISTINCT ...) is wanted.
    query = """
    MATCH (a:Author)-[:AUTHORED]-(q:Quanta)
    WHERE q.venue = '{}'
    RETURN COLLECT(q.title) AS titles
    """.format(safe_venue)
    return query

In [9]:
# Venue name lists of increasing size; the loops below iterate over them.
top_5 = [
    'Cell',
    'Nature',
    'Nature Biotechnology',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science',
]

# The ten-venue list is the five venues above followed by five more.
top_10 = top_5 + [
    'Journal of the American Chemical Society',
    'JAMA',
    'The New England Journal of Medicine',
    'Nature Genetics',
    'Neuron',
]

top_42 = [
    'Angewandte Chemie',
    'Blood',
    'Cancer Cell',
    'Cancer Discovery',
    'Cancer Research',
    'Cell',
    'Cell Host & Microbe',
    'Cell Metabolism',
    'Cell Stem Cell',
    'Chemistry & Biology',
    'The EMBO Journal',
    'Genes & Development',
    'Immunity',
    'Journal of Neurology',
    'Journal of the American Chemical Society',
    'JAMA',
    'Journal of Biological Chemistry',
    'Journal of Cell Biology',
    'Journal of Clinical Investigation',
    'Journal of Experimental Medicine',
    'Journal of Medicinal Chemistry',
    'The Lancet',
    'Nature Cell Biology',
    'Nature Chemical Biology',
    'Nature Chemistry',
    'Nature Medicine',
    'Nature Methods',
    'Nature',
    'Nature Biotechnology',
    'The New England Journal of Medicine',
    'Neuron',
    'Nature Genetics',
    'Nature Immunology',
    'Nature Neuroscience',
    'Nature Structural & Molecular Biology',
    'PLOS Biology',
    'PLOS Genetics',
    'PLOS Pathogens',
    'Proceedings of the National Academy of Sciences of the United States of America',
    'Science Signaling',
    'Science Translational Medicine',
    'Science',
]

In [41]:
# Export one CSV of collected titles per venue.
# The files go to 'Train/', the directory the title-generation functions read
# from; writing to a differently named folder leaves those readers without
# input on a fresh run.
os.makedirs('Train', exist_ok=True)  # to_csv does not create missing directories
for venue in top_5:
    query = gen_query(venue)
    df = query_to_df(query, graph)
    df.to_csv('Train/{}_titles.csv'.format(venue), index = False, encoding = "UTF-8")

Starting query... Done (0.58 minutes).
Starting query... Done (1.60 minutes).
Starting query... Done (0.10 minutes).
Starting query... Done (1.84 minutes).
Starting query... Done (1.58 minutes).


In [28]:
import ast
import random
def gen_titles_short(venue, n_gen=5, n_train=10, epochs=1, batch=128):
    """Train a fresh textgenrnn model on a small random sample of titles.

    Reads 'Train/{venue}_titles.csv', whose first row holds the string form of
    a Python list of titles in its 'titles' column, samples `n_train` titles
    from it (with replacement), trains a new model on the sample and writes
    generated titles to 'Gen/{venue}_generated.txt'.

    Args:
        venue: Venue name used to build the input and output file names.
        n_gen: Number of texts to generate.
        n_train: Number of titles drawn for training; the same title may be
            drawn more than once.
        epochs: Number of training epochs.
        batch: Training batch size.

    Returns:
        None. The result is the generated file plus printed progress.
    """
    # Read with the same encoding the export step writes and gen_titles reads.
    df_titles = pd.read_csv('Train/{}_titles.csv'.format(venue), encoding="UTF-8")
    # The cell stores the list as text; literal_eval parses it back without
    # executing arbitrary code.
    tl = ast.literal_eval(df_titles.iloc[0]['titles'])
    print('Venue: ', venue)
    print('Number of titles:', len(tl))
    print('Sampling: ', n_train, 'titles')
    # Same with-replacement sampling as repeated random.choice calls.
    texts = random.choices(tl, k=n_train)
    textgen = textgenrnn()
    textgen.train_on_texts(texts, num_epochs=epochs, batch_size=batch, verbose=2)
    # Make sure the output folder exists before the file is written into it.
    os.makedirs('Gen', exist_ok=True)
    # One entry gives the same constant temperature as a list of repeated 0.2
    # values (textgenrnn indexes the list modulo its length), and the list no
    # longer depends on n_train.
    textgen.generate_to_file('Gen/{}_generated.txt'.format(venue),
                    n=n_gen,
                    temperature=[.2],
                    max_gen_length=100,
                    progress=False)

In [29]:
# Run the quick sampled training/generation for each venue in top_5, using the
# function's default arguments.
for venue in top_5:
    gen_titles_short(venue)

Venue:  Cell
Number of titles: 99673
Sampling:  10 titles
Training on 779 character sequences.
Epoch 1/1
 - 4s - loss: 2.9063
####################
Temperature: 0.2
####################
Prooo in a connected patterrr and the of Recornioted the Contenated Recornive of Patri Meetlants of the Structs of Drama and Mattreaa active of the Seriela Patteerels of Ara Conters Recorging the Serious Recornively of the Arretter Patterrels and a connected elected the the Serious Mattreaa to Rece

Praost and a connected porn of Reception of Statter Strop of Recorging and I activated in anythe the election of any of a bye the or the Matche Recornive of Recomingging the Drama the Contenated Policon and In Marterrrral Recornive of Patterrrel Recornioted and a connecte of any of a recorned of Me

Conten of Receptiooo of Darran of Drama in any of any of a bye by the Morro Recogning Drach Contenartion of Recorging on and I activated in anythe the Macche and a connected and a connecte of and a connected peopl

PS4 ESO SMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM

####################
Temperature: 1.0
####################
17-ymm pulm ear Obtalk

MEMMS

16 tritn-Newisem Attemblach 11



In [None]:
def gen_titles(venue, n_gen=10, epochs=1, batch=128):
    """Train a fresh textgenrnn model on all titles of a venue.

    Reads 'Train/{venue}_titles.csv', whose first row holds the string form of
    a Python list of titles in its 'titles' column, trains a new model on the
    full list and writes generated titles to 'Gen/{venue}_generated.txt'.

    Args:
        venue: Venue name used to build the input and output file names.
        n_gen: Number of texts to generate.
        epochs: Number of training epochs.
        batch: Training batch size.

    Returns:
        None. The result is the generated file plus printed progress.
    """
    df_titles = pd.read_csv('Train/{}_titles.csv'.format(venue), encoding = "UTF-8")
    # The cell stores the list as text; literal_eval parses it back without
    # executing arbitrary code.
    texts = ast.literal_eval(df_titles.iloc[0]['titles'])
    print('Venue: ', venue)
    print('Number of titles:', len(texts))
    textgen = textgenrnn()
    textgen.train_on_texts(texts, num_epochs=epochs, batch_size=batch, verbose=0)
    # Make sure the output folder exists before the file is written into it.
    os.makedirs('Gen', exist_ok=True)
    # One entry gives the same constant temperature as a list with one 0.2 per
    # training title (textgenrnn indexes the list modulo its length), without
    # building a list as long as the whole corpus.
    textgen.generate_to_file('Gen/{}_generated.txt'.format(venue),
                    n=n_gen,
                    temperature=[.2],
                    max_gen_length=100,
                    progress=False)