In [1]:
import numpy as np
import pandas


In [2]:
# Open the preprocessed arXiv astro-ph dataset read-only. HDFStore defaults to
# mode="a", which creates an empty file when the path is wrong and opens
# existing data for writing; mode="r" fails fast on a bad path and cannot
# modify the file on disk.
store = pandas.HDFStore("arxiv_data/astroph_2016_preprocessed.h5", mode="r")

In [3]:
# Load the DataFrame saved under the 'df' key of the HDF5 store.
df = store['df']

In [4]:
# Close the HDF5 store; `df` has already been read from it.
store.close()

In [5]:
df.shape

(15842, 7)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15842 entries, 0 to 15841
Data columns (total 7 columns):
title         15842 non-null object
abstract      15842 non-null object
categories    15842 non-null object
created       15842 non-null datetime64[ns]
id            15842 non-null object
doi           14028 non-null object
label         15842 non-null float64
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 990.1+ KB


In [7]:
# Model inputs are the abstracts; targets are the numeric category labels.
texts, labels = df['abstract'], df['label']

In [8]:
# The 'label' column is float64 in the frame; cast to a plain integer array so
# the values can be used as class indices (one-hot encoding, list lookups).
labels = np.asarray(labels, dtype=int)

# Tokenizing the data

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [10]:
maxlen = 100  # length every token sequence is padded/truncated to
training_samples = 2000  # number of shuffled samples used for training
validation_samples = 10000  # number of samples, right after the training slice, used for validation
max_words = 10000  # vocabulary cap passed to the Tokenizer; also the embedding matrix row count

# Build the vocabulary from the abstracts, then turn every abstract into a
# list of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [11]:
# Vocabulary learned by the tokenizer: word -> integer index.
word_index = tokenizer.word_index
print("Found %s unique tokens" % len(word_index))

# Inverse lookup (integer index -> word), used to decode sequences back to text.
word_index_reverse = {idx: token for token, idx in word_index.items()}

Found 30581 unique tokens


In [12]:
# Force every sequence to exactly `maxlen` entries. No padding/truncating
# arguments are given, so the Keras defaults apply (documented as 'pre' for
# both: zeros are prepended, and long sequences keep their last tokens).
data = pad_sequences(sequences=sequences, maxlen=maxlen)

In [13]:
# Shuffle samples with a fixed seed so the train/validation split is
# reproducible. The same permutation is applied to inputs and labels to keep
# them aligned.
# NOTE: this cell rebinds `data` and `labels` in place, so re-running it
# without re-running the cells above permutes the already-shuffled arrays again.
np.random.seed(1234)
indices = np.arange(texts.shape[0])
np.random.shuffle(indices)

data = data[indices]
labels = labels[indices]

In [14]:
# First `training_samples` rows train the model; the following
# `validation_samples` rows validate it. Rows past `val_end` are left unused.
val_end = training_samples + validation_samples

x_train, y_train = data[:training_samples], labels[:training_samples]
x_val, y_val = data[training_samples:val_end], labels[training_samples:val_end]

In [15]:
y_train[0]

3

In [16]:
# Category name -> integer class index used in the 'label' column.
target_name_dict = { 'astro-ph.GA' : 0,
                     'astro-ph.SR' : 1,
                     'astro-ph.IM' : 2,
                     'astro-ph.EP' : 3,
                     'astro-ph.HE' : 4,
                     'astro-ph.CO' : 5
                   }
# Inverse lookup as a list: target_name[i] is the category whose index is i.
# Sort by the stored index instead of relying on dict iteration order, so the
# list stays correct even if entries are reordered or the interpreter does not
# preserve insertion order.
target_name = sorted(target_name_dict, key=target_name_dict.get)

In [17]:
target_name

['astro-ph.GA',
 'astro-ph.SR',
 'astro-ph.IM',
 'astro-ph.EP',
 'astro-ph.HE',
 'astro-ph.CO']

In [18]:
from keras.utils.np_utils import to_categorical

In [19]:
# One-hot encode the integer labels. The width is fixed explicitly: without
# num_classes, to_categorical infers it from max(label) + 1 separately for each
# array, so a split that happens to miss the highest class would produce a
# narrower matrix than the model's output layer expects.
num_classes = len(target_name_dict)
y_train_one_hot = to_categorical(y_train, num_classes=num_classes)
y_val_one_hot = to_categorical(y_val, num_classes=num_classes)

In [20]:
y_train_one_hot[0]

array([0., 0., 0., 1., 0., 0.], dtype=float32)

# Parsing GloVe file

In [21]:
import os
glove_dir = "./glove.6B/"

# Parse the GloVe text file into {word: float32 vector}. Each line is a token
# followed by its whitespace-separated vector components.
embeddings_index = {}
# The GloVe files are UTF-8; state the encoding explicitly instead of relying
# on the platform default, and use a context manager so the handle is closed
# even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [22]:
embeddings_dim = 100

# Row i holds the GloVe vector of the word with tokenizer index i. Rows for
# words missing from GloVe (and row 0) stay all-zero.
embedding_matrix = np.zeros((max_words, embeddings_dim))
for word, i in word_index.items():
    if i >= max_words:
        # Outside the vocabulary cap; no row exists for this index.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

# Model Definition

In [23]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [24]:
# Embedding lookup -> flatten the (maxlen, embeddings_dim) output -> one hidden
# dense layer -> 6-way softmax over the astro-ph categories.
model = Sequential([
    Embedding(max_words, embeddings_dim, input_length=maxlen),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(6, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                640064    
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 390       
Total params: 1,640,454
Trainable params: 1,640,454
Non-trainable params: 0
_________________________________________________________________


In [25]:
# Load the GloVe matrix into the Embedding layer (layer 0) and freeze it so
# training does not update the pretrained vectors. This runs before
# model.compile(); per the Keras docs, a change to `trainable` only takes
# effect once the model is (re)compiled.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable= False

In [26]:
# Compile with categorical cross-entropy (targets are one-hot) and track
# accuracy, train for 10 epochs on the training slice while validating on the
# validation slice, then write the learned weights to disk.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train_one_hot,
                   epochs=10,
                   batch_size=32,
                   validation_data = (x_val, y_val_one_hot))
model.save_weights('pre_trained_glove_model_2.h5')

Train on 2000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
# Decode one sample (index into the shuffled `data`) back to words.
jj = 14000
for d in data[jj]:
    # pad_sequences fills short sequences with 0, which is not a key of
    # word_index_reverse; skip padding instead of raising KeyError on abstracts
    # shorter than `maxlen`.
    if d == 0:
        continue
    print(word_index_reverse[d])

and
sampling
the
eos
with
five
fiducial
densities
between
times
the
nuclear
saturation
density
results
in
optimal
errors
for
the
smallest
number
of
parameters
specifically
it
the
radii
of
the
assumed
eos
to
within
less
than
km
for
the
extreme
mock
equations
of
state
and
to
within
less
than
km
for
of
a
sample
of
proposed
physically
motivated
equations
of
state
such
a
parametrization
is
also
able
to
reproduce
the
maximum
mass
to
within
m
sun
and
the
moment
of
inertia
of
a
m
sun
neutron
star
to
within
less
than
for
of
the
proposed
sample
of
equations
of
state


In [28]:
labels[jj]

4

In [29]:
target_name[labels[jj]]

'astro-ph.HE'

In [30]:
# Add a leading batch axis so the single sample has shape (1, maxlen).
model.predict(data[jj][np.newaxis, :])

array([[0.8595854 , 0.06731834, 0.00543686, 0.01210889, 0.0184207 ,
        0.03712979]], dtype=float32)

In [31]:
target_name[labels[jj]]

'astro-ph.HE'

In [32]:
target_name

['astro-ph.GA',
 'astro-ph.SR',
 'astro-ph.IM',
 'astro-ph.EP',
 'astro-ph.HE',
 'astro-ph.CO']

In [33]:
data.shape

(15842, 100)

In [34]:
labels.shape

(15842,)

In [35]:
# Loss and accuracy on a single validation sample; length-1 slices keep the
# batch dimension that evaluate() expects.
model.evaluate(x_val[1000:1001], y_val_one_hot[1000:1001])



[5.27219295501709, 0.0]