In [2]:
%matplotlib inline
import numpy as np
import pandas
import matplotlib.pyplot as plt
import tensorflow

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
# --- Configuration -----------------------------------------------------------
# Fixed sequence length handed to pad_sequences(maxlen=...) further down.
maxlen = 100
max_words = 10000 # Top 10000 words
# File passed to SimpleTokenizer when the tokenizer is constructed.
dictionary_file = "dictionary.json"
# JSON file holding the Keras model architecture (read by model_from_json).
model_file = "model.json"

# arXiv astro-ph category -> integer class index.
target_name_dict = { 'astro-ph.GA' : 0,
                     'astro-ph.SR' : 1,
                     'astro-ph.IM' : 2,
                     'astro-ph.EP' : 3,
                     'astro-ph.HE' : 4,
                     'astro-ph.CO' : 5
                   }
# Class names ordered by their index so that target_name[i] is the label for
# class i (the list is indexed with np.argmax of a prediction row). Sorting by
# the mapped value keeps this correct even if the dict literal is reordered.
target_name = sorted(target_name_dict, key=target_name_dict.get)

In [5]:
# SimpleTokenizer comes from the project-local `simpletokenizer` module.
from simpletokenizer import SimpleTokenizer

# Build the tokenizer from the dictionary file named in the configuration cell.
# NOTE(review): presumably this loads the word -> index vocabulary used at
# training time — confirm against the simpletokenizer implementation.
tokenizer = SimpleTokenizer(dictionary_file)

In [6]:
# Single-sentence smoke test for the tokenizer.
texts = ["We found a new planet!"]
# Convert the list of texts with the tokenizer; the result is fed to
# pad_sequences in the next cell.
seq = tokenizer.texts_to_sequences(texts)

In [7]:
# Bring the sequence to the fixed length `maxlen`; the printed result below
# shows the zeros placed in front of the word indices.
seq_pad = pad_sequences(seq, maxlen=maxlen)
print(seq_pad)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   7 103   4  81 225]]


In [8]:
from tensorflow.keras.models import model_from_json, model_from_config
import json

In [9]:
# Rebuild the network architecture from its JSON description stored in
# `model_file`. Only the architecture is created here; the weights are loaded
# from the HDF5 file in a later cell.
with open(model_file, 'r') as json_file:
    architecture = json_file.read()

# The file handle is only needed for reading, so the model is built after the
# `with` block has closed it.
model = model_from_json(architecture)

In [10]:
# Print the layer-by-layer overview of the model rebuilt from JSON.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
flatten_12 (Flatten)         (None, 10000)             0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)                320032    
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 6)                 198       
Total params: 1,320,230
Trainable params: 320,230
Non-trainable params: 1,000,000
_________________________________________________________________


In [11]:
# Compile with the Adam optimizer, binary cross-entropy loss and accuracy metric.
# NOTE(review): the visible cells only run inference, so this call may not be
# required — confirm. The per-class scores printed further down do not sum to 1,
# which suggests independent per-class outputs consistent with this loss;
# verify against the training notebook.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [12]:
# Load the trained weights from the HDF5 file into the rebuilt model.
# NOTE(review): with by_name=True, weights are presumably matched by layer
# name, so a layer whose name is missing from the file would keep its initial
# weights without an error — confirm that the group names in the HDF5 file
# (listed in the last cell) match the layer names shown by model.summary().
model.load_weights("arxiv_explore_keras_Jan20.h5", by_name=True)

In [16]:
# Sample abstracts / sentences to classify. These strings are model inputs and
# are kept verbatim (including their wording) so the predictions are unchanged.
texts = [
    "In this paper, we analyze a suite of isolated galaxy simulations. We find that spiral density wave theory are correct. In particular, it correctly predict the growth of two-armed spiral structure. The star formation are triggered by the spiral waves. The pattern speed is consistent with the observation of corotation in the galaxy sample.",
    "We discovered a new forming planet. This planet has ten Jupiter-mass and is embedded in a protoplanetary disks.",
    "We show that the mass fraction of GMC gas (n>100 cm^-3) in dense (n>>10^4 cm^-3) star-forming clumps, observable in dense molecular tracers (L_HCN/L_CO(1-0)), is a sensitive probe of the strength and mechanism(s) of stellar feedback. Using high-resolution galaxy-scale simulations with pc-scale resolution and explicit models for feedback from radiation pressure, photoionization heating, stellar winds, and supernovae (SNe), we make",
    "We have built a new telescope.",
    "We have observed a new sun spot.",
    "We found that Pluto is indeed a Planet.",
    "We found a new neutron star. This neutron star has a very strong magnetic field.",
    "We discovered the B-modes in the cosmological microwave background, which are the imprints of the primodal density fluctuation. This has a great impact on the understanding of cosmology and inflation.",
]

# Tokenize the texts, then bring every sequence to the fixed length `maxlen`.
seq = tokenizer.texts_to_sequences(texts)
seq = pad_sequences(seq, maxlen=maxlen)

# `Sequential.predict_proba` is deprecated and has been removed from recent
# tf.keras releases; `predict` returns the same output-layer scores.
proba = model.predict(seq)

# Print the label order once, then for each input its score vector followed by
# the name of the highest-scoring class.
print(target_name)
for p in proba:
    print(p)
    print(target_name[np.argmax(p)])

['astro-ph.GA', 'astro-ph.SR', 'astro-ph.IM', 'astro-ph.EP', 'astro-ph.HE', 'astro-ph.CO']
[0.23718327 0.19181998 0.0080737  0.0327327  0.12006622 0.21885613]
astro-ph.GA
[0.18009089 0.2798115  0.05312736 0.13350321 0.14886184 0.165439  ]
astro-ph.SR
[0.20542327 0.13640682 0.01302256 0.01928018 0.11911584 0.32082808]
astro-ph.CO
[0.1695698  0.24559225 0.13564122 0.1293914  0.1728481  0.20802929]
astro-ph.SR
[0.17413644 0.26029035 0.09075928 0.13118671 0.1619396  0.18836707]
astro-ph.SR
[0.17517659 0.26908582 0.06914829 0.12940335 0.15682735 0.1762344 ]
astro-ph.SR
[0.0746217  0.21029827 0.01081681 0.01241784 0.28738827 0.12792465]
astro-ph.HE
[6.4840280e-02 1.8388242e-02 1.0399697e-03 3.0502304e-04 4.1551776e-02
 8.3405989e-01]
astro-ph.CO


In [25]:
# Print the model overview again; this repeats the summary cell that follows
# model construction (the layer names are compared with the HDF5 groups below).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 100)          1000000   
_________________________________________________________________
flatten_12 (Flatten)         (None, 10000)             0         
_________________________________________________________________
dense_29 (Dense)             (None, 32)                320032    
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_30 (Dense)             (None, 6)                 198       
Total params: 1,320,230
Trainable params: 320,230
Non-trainable params: 1,000,000
_________________________________________________________________


In [14]:
import h5py

In [21]:
# List the top-level group names stored in the weights file so they can be
# compared with the layer names reported by model.summary().
# The context manager closes the file even if reading or printing raises.
with h5py.File("arxiv_explore_keras_Jan20.h5", "r") as h5f:
    for k in h5f.keys():
        print(k)

dense_29
dense_30
dropout_7
embedding_15
flatten_12


dense_25
dense_26
dropout_5
embedding_13
flatten_10
