In [1]:
import numpy as np
import pandas as pd

# arXiv astro-ph sub-category -> integer class id. The id is simply the position
# of the category in the tuple below, so the order here fixes the encoding.
target_name_dict = {
    category: class_id
    for class_id, category in enumerate((
        'astro-ph.GA',
        'astro-ph.SR',
        'astro-ph.IM',
        'astro-ph.EP',
        'astro-ph.HE',
        'astro-ph.CO',
    ))
}
# Inverse lookup: integer class id -> arXiv sub-category name.
label2target = dict((class_id, category) for category, class_id in target_name_dict.items())

In [2]:
# Read the stored frame once and pull out the two columns used below.
# The context manager closes the HDF5 file even if the read raises; the
# original opened the store by hand and only closed it on the success path.
with pd.HDFStore("../data/2014astroph_p.h5", mode="r") as store:
    frame = store['/df']

abstracts = frame['abstract']        # pandas Series of abstract texts
labels = np.array(frame['label'])    # positional array of class ids (see label2target)

In [3]:
# Show one randomly chosen abstract together with its category.
j = np.random.randint(len(labels))
print(j, label2target[labels[j]])
# `labels` is a positional NumPy array, so the matching abstract must also be
# fetched by position; plain `abstracts[j]` is a label lookup on the Series
# index and only agrees with `labels[j]` when that index happens to be 0..n-1.
print(abstracts.iloc[j])

8489 astro-ph.HE
I review our current state of knowledge about non-thermal radiation from the
Galactic Centre (GC) and Inner Galaxy. Definitionally, the Galactic nucleus is
at the bottom of the Galaxy's gravitational well, rendering it a promising
region to seek the signatures of dark matter decay or annihilation. It also
hosts, however, the Milky Way's resident supermassive black hole and up to 10%
of current massive star formation in the Galaxy. Thus the Galactic nucleus is a
dynamic and highly-energized environment implying that extreme caution must be
exercised in interpreting any unusual or unexpected signal from (or emerging
from) the region as evidence for dark matter-related processes. One spectacular
example of an `unexpected' signal is the discovery within the last few years of
the `Fermi Bubbles' and, subsequently, their polarised radio counterparts.
These giant lobes extend ~7 kpc from the nucleus into both north and south
Galactic hemispheres. Hard-spectrum, microwave emis

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [5]:
# Settings shared by the tokenising, padding, splitting and model cells below.
max_words = 10000        # vocabulary cap passed to the Tokenizer (top 10000 words)
maxlen = 150             # length every token sequence is padded/truncated to
training_samples = 6000  # rows used for training; the remaining rows form the test split

In [6]:
# Learn the vocabulary from every abstract, then encode each abstract as a list
# of integer word ids (num_words caps the vocabulary used for encoding).
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)

# word -> id mapping built by the tokenizer.
word_index = tokenizer.word_index
print("Found %s unique tokens" % len(word_index))

# id -> word, the inverse of word_index.
word_index_reverse = {token_id: word for word, token_id in word_index.items()}

Found 30677 unique tokens


In [7]:
# Bring every encoded abstract to exactly `maxlen` ids.
data = pad_sequences(sequences=sequences, maxlen=maxlen)

# One fixed permutation, applied to features and labels alike so rows stay paired.
indices = np.arange(abstracts.shape[0])
np.random.seed(1234)
np.random.shuffle(indices)

data = data[indices]

# Keep `labels` in its original (file) order and give the shuffled copy its own
# name. The original did `labels = labels[indices]`; because `data` is rebuilt
# from `sequences` on every run but `labels` was overwritten in place, running
# this cell a second time permuted the labels twice and silently broke the
# pairing between `data` and `labels`.
labels_shuffled = labels[indices]
assert len(data) == len(labels_shuffled), "features and labels must have the same number of rows"

# First `training_samples` shuffled rows train the model; the rest are held out.
x_train = data[:training_samples]
y_train = labels_shuffled[:training_samples]

x_test = data[training_samples:]
y_test = labels_shuffled[training_samples:]

In [8]:
# Decoding token ids back to words, following
# https://stackoverflow.com/questions/41971587/how-to-convert-predicted-sequence-back-to-text-in-keras

# id -> word lookup (inverse of tokenizer.word_index).
reverse_word_map = {token_id: word for word, token_id in tokenizer.word_index.items()}


def sequence_to_text(list_of_indices):
    """Translate a sequence of token ids into a list of words.

    Ids that have no entry in ``reverse_word_map`` come back as ``None``
    (presumably the padding id 0 -- confirm against the tokenizer).
    """
    return [reverse_word_map.get(token_id) for token_id in list_of_indices]


# Decoded word lists, one per row of `data` (same row order as `data`).
my_texts = [sequence_to_text(row) for row in data]

# Alternative via the tokenizer itself. NOTE(review): this decodes `sequences`
# rather than `data`, so its rows are presumably not in the same order/length
# as `my_texts` -- confirm before using the two interchangeably.
my_texts_2 = tokenizer.sequences_to_texts(sequences=sequences)

In [9]:
# Spot-check one random training row: its category and its decoded tokens.
j = np.random.randint(len(x_train))
train_category = label2target[y_train[j]]
print(j, train_category)
# my_texts follows the row order of `data`, so index j matches x_train[j].
print(my_texts[j])

2788 astro-ph.CO
['ia', 'data', 'in', 'recent', 'years', 'we', 'use', 'the', 'union2', '1', 'data', 'to', 'give', 'a', 'simple', 'classification', 'of', 'such', 'studies', 'for', 'the', 'first', 'time', 'because', 'the', 'maximum', 'anisotropic', 'direction', 'is', 'independent', 'of', 'isotropic', 'dark', 'energy', 'models', 'we', 'adopt', 'two', 'cosmological', 'models', 'lambda', 'cdm', 'w', 'cdm', 'for', 'the', 'hemisphere', 'comparison', 'analysis', 'and', 'lambda', 'cdm', 'model', 'for', 'dipole', 'fit', 'approach', 'in', 'hemisphere', 'comparison', 'method', 'the', 'matter', 'density', 'and', 'the', 'equation', 'of', 'state', 'of', 'dark', 'energy', 'are', 'adopted', 'as', 'the', 'diagnostic', 'in', 'the', 'lambda', 'cdm', 'model', 'and', 'w', 'cdm', 'model', 'respectively', 'in', 'dipole', 'fit', 'approach', 'we', 'fit', 'the', 'fluctuation', 'of', 'distance', 'modulus', 'we', 'find', 'that', 'there', 'is', 'a', 'null', 'signal', 'for', 'the', 'hemisphere', 'comparison', 'metho

In [10]:
from keras.utils.np_utils import to_categorical
# One-hot encode the integer class ids. The width is pinned to the number of
# known categories: without num_classes, to_categorical sizes each matrix from
# the largest id present in that split, so train and test could end up with
# different widths (or fewer columns than the model's output layer).
y_train_one_hot = to_categorical(y_train, num_classes=len(target_name_dict))
y_test_one_hot = to_categorical(y_test, num_classes=len(target_name_dict))

In [11]:
from keras.models import Sequential
import keras.layers as layers

In [36]:
# Classifier: learned word embeddings -> flatten -> two small dense layers -> softmax.
embeddings_dim = 100
# Output width follows the category table instead of a hard-coded 6, so the
# model stays consistent with the label encoding if categories change.
n_classes = len(target_name_dict)

model = Sequential()
# One `embeddings_dim`-vector per token id, for each of the `maxlen` positions.
model.add(layers.Embedding(max_words, embeddings_dim, input_length=maxlen))
# Concatenate the per-position vectors into one flat feature vector.
model.add(layers.Flatten())
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(rate=0.3))
model.add(layers.Dense(32, activation='relu'))
# One probability per category.
model.add(layers.Dense(n_classes, activation='softmax'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 150, 100)          1000000   
_________________________________________________________________
flatten_8 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 32)                480032    
_________________________________________________________________
dropout_4 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_23 (Dense)             (None, 6)                 198       
Total params: 1,481,286
Trainable params: 1,481,286
Non-trainable params: 0
_________________________________________________________________


In [37]:
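# Disabled experiment: load a pre-computed embedding matrix into the Embedding
# layer and freeze it. NOTE: `embedding_matrix` is not defined anywhere in this
# notebook, so these lines cannot simply be uncommented as they stand.
#model.layers[0].set_weights([embedding_matrix])
#model.layers[0].trainable= False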

In [38]:
# Multi-class setup: softmax outputs scored with categorical cross-entropy,
# tracking accuracy under the name 'acc'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Fit on the training split; 30% of it is set aside by Keras for validation.
history = model.fit(
    x_train,
    y_train_one_hot,
    batch_size=32,
    epochs=5,
    validation_split=0.3,
)
#model.save_weights('pre_trained_glove_model_2.h5')

Train on 4200 samples, validate on 1800 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
# Score the model on the held-out split.
results = model.evaluate(x_test, y_test_one_hot)
# Predicted class id per test row = index of the largest softmax output.
# This replaces Sequential.predict_classes, which is deprecated and absent from
# newer Keras releases; for a softmax output it computed the same argmax.
class_prediction = np.argmax(model.predict(x_test), axis=-1)



In [40]:
# Display what model.evaluate returned for the held-out split: the loss first,
# followed by the 'acc' metric requested in model.compile.
results

[1.4146753831172552, 0.641374373615172]

In [45]:
# Spot-check one random held-out row: encoded input, decoded words,
# true category and the model's predicted category.
jj = np.random.randint(len(x_test))
true_category = label2target[y_test[jj]]
predicted_category = label2target[class_prediction[jj]]

print(x_test[jj])
# my_texts follows the row order of `data`; test rows start after the first
# `training_samples` rows, hence the offset.
print(my_texts[training_samples + jj])
print(true_category)
print("prediction: ", predicted_category)

[ 501  263  220   93    3 1124 2322    2   76   66    3  373   77    5
    1  111  786  532  318  142   65  990    8 1555   13 5715 6251 1001
  549  799    5    1   36  391  186 1294  746   13    4  337    2   25
    5   30  254   26    1 1012  117  686 1000 4167    3    4  705  460
  145 1995  111  140 4182  901    4   82 1091  224 2030    2   77   24
    6   33  307   13    4   65  278   24  100 2562 1260  250  269  746
    1  655    2    1  990   34 2026 1856 1006    3 5963 2369 2164 1771
  146  990    8 1157  193   77  646   13  128 1006   23   12 5362    6
  697    5   76   24   39 2369  852  835 1000 3716    1  123 1112   52
  301 7274    1  873 2266   12  241    3    1  765 2369  429   12  423
  481   46 2317 1160  269   10  572   20   79   22]
['curves', 'v', 'c', 'r', 'and', 'greater', 'amounts', 'of', 'low', 'density', 'and', 'hot', 'gas', 'in', 'the', 'disk', 'mid', 'plane', 'ii', 'when', 'stellar', 'feedback', 'is', 'modeled', 'by', 'temporarily', 'switching', 'off', 'radia

In [48]:
# Classify a hand-written sentence with the trained model.
abstract_testing = "this is a new extrasolar system"
# Hand the raw string to the tokenizer so it goes through the same text
# preprocessing that was applied when the vocabulary was fitted. The original
# pre-split on single spaces and passed a list of words, which skips that
# preprocessing, so capitalised or punctuated words would not match the vocabulary.
seq_testing = tokenizer.texts_to_sequences([abstract_testing])
data_testing = pad_sequences(sequences=seq_testing, maxlen=maxlen)
print(data_testing)
# One row of class probabilities per input text.
classes_testing = model.predict(data_testing)
print("prediction: ", classes_testing[0])
print("predicted category: ", label2target[np.argmax(classes_testing[0])])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0   16    8    4   75 2312  118]]
prediction:  [0.18530409 0.6776955  0.04168797 0.02523528 0.03012412 0.039953  ]
predicted category:  astro-ph.SR
