In [1]:
%matplotlib inline
import time

# python 3
import urllib.error
from urllib.request import urlopen
import datetime
#from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
#import bibtexparser

pd.set_option('mode.chained_assignment','warn')

In [2]:
store = pd.HDFStore("astroph_2016_preprocessed.h5")
#store['df'] = df
df = store['df']
store.close()

In [3]:
df.index.shape

(15842,)

In [4]:
# split test and train set
np.random.seed(42)
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [5]:
training_set, test_set = split_train_test(df, 0.2)

training_data = training_set.abstract
testing_data = test_set.abstract
training_label = training_set.label.astype('int64')
testing_label = test_set.label.astype('int64')
#training_data = df.abstract[:1500]
#testing_data = df.abstract[1500:]
#training_label = df.label[:1500].astype('int64')
#testing_label = df.label[1500:].astype('int64')

In [6]:
target_name_dict = { 'astro-ph.GA' : 0,
                     'astro-ph.SR' : 1,
                     'astro-ph.IM' : 2,
                     'astro-ph.EP' : 3,
                     'astro-ph.SR' : 4,
                     'astro-ph.HE' : 5,
                     'astro-ph.CO' : 6,
                   }
target_name = [k for k, v in target_name_dict.items()]

print(training_data[100])
print(target_name[training_label[100]])

we present comparison of numerical simulations of propagation of mhd waves excited by subphotospheric perturbations  in two different   deep  and  shallow   magnetostatic models of the sunspots  the  deep  sunspot model distorts both the shape of the wavefront and its amplitude stronger than the  shallow  model  for both sunspot models  the surface gravity waves  f mode  are affected by the sunspots stronger than the acoustic p modes  the wave amplitude inside the sunspot depends on the photospheric strength of the magnetic field and the distance of the source from the sunspot axis  for the source located at  mm from the center of the sunspot  the wave amplitude increases when the wavefront passes through the central part of the sunspot  for the source distance of  mm  the wave amplitude inside the sunspot is always smaller than outside  for the same source distance from the sunspot center but for the models with different strength of the magnetic field  the wave amplitude inside the s

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data)
X_train_counts.shape

(12674, 28005)

In [9]:
count_vect.vocabulary_.keys()



In [10]:
count_vect.vocabulary_.get(u'spiral')

23391

In [11]:
# compute the term frequency
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

# alternative
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(12674, 28005)

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, training_label)

In [14]:
docs_new = ['spiral density wave', 'protoplanetary disk', 'galactic dynamics', 'theory']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, target_name[category]))

'spiral density wave' => astro-ph.GA
'protoplanetary disk' => astro-ph.SR
'galactic dynamics' => astro-ph.GA
'theory' => astro-ph.CO


In [15]:
# building a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [16]:
text_clf.fit(training_data, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [19]:
import numpy as np
docs_test = testing_data
predicted = text_clf.predict(docs_test)
np.mean(predicted == testing_label)   

0.6985479797979798

In [20]:
from sklearn import metrics
print(metrics.classification_report(testing_label, predicted, target_names=target_name))

              precision    recall  f1-score   support

 astro-ph.GA       0.63      0.93      0.75       697
 astro-ph.SR       0.64      0.74      0.68       653
 astro-ph.IM       0.99      0.25      0.40       296
 astro-ph.EP       0.99      0.21      0.34       321
 astro-ph.HE       0.79      0.75      0.77       608
 astro-ph.CO       0.74      0.81      0.77       593

   micro avg       0.70      0.70      0.70      3168
   macro avg       0.79      0.62      0.62      3168
weighted avg       0.75      0.70      0.67      3168



In [21]:
metrics.confusion_matrix(testing_label, predicted)

array([[650,  11,   0,   0,  19,  17],
       [116, 482,   0,   1,  40,  14],
       [ 43,  62,  75,   0,  44,  72],
       [ 61, 172,   0,  67,   8,  13],
       [ 71,  26,   0,   0, 457,  54],
       [ 92,   6,   1,   0,  12, 482]])

In [22]:
# GRID SEARCH
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [23]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [24]:
gs_clf = gs_clf.fit(training_data[:400], training_label[:400])



In [25]:
target_name[gs_clf.predict(['spiral density wave theory'])[0]]

'astro-ph.GA'

In [26]:
gs_clf.best_score_

0.6925

In [27]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [28]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
forest_clf = RandomForestClassifier(random_state=42)

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data)
#X_train_counts.shape
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [31]:
#forest_clf.fit(X_train_counts, training_label)
forest_clf.fit(X_train_tfidf, training_label)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [33]:
docs_new = ['spiral density wave', 'protoplanetary disk', 'galactic dynamics', 'theory']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = forest_clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, target_name[category]))

'spiral density wave' => astro-ph.GA
'protoplanetary disk' => astro-ph.SR
'galactic dynamics' => astro-ph.GA
'theory' => astro-ph.CO


In [34]:
# building a pipeline
from sklearn.pipeline import Pipeline
text_forest_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('forest_clf', RandomForestClassifier()),
])

In [35]:
text_forest_clf.fit(training_data, training_label)



Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [36]:
target_name[text_forest_clf.predict(['spiral density wave'])[0]]

'astro-ph.GA'

In [37]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_sgd_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('forest_clf', SGDClassifier(tol=1e-3)),
])

In [38]:
text_sgd_clf.fit(training_data, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [39]:
target_name[text_sgd_clf.predict(['gravitational wave'])[0]]

'astro-ph.CO'

In [40]:
predicted = text_sgd_clf.predict(testing_data)
np.mean(predicted == testing_label)   

0.8263888888888888

In [41]:
metrics.confusion_matrix(testing_label, predicted)

array([[601,  29,   9,   5,  30,  23],
       [ 44, 524,  13,  22,  45,   5],
       [ 12,  20, 217,   7,  20,  20],
       [  4,  32,   8, 268,   5,   4],
       [ 42,  25,   8,   0, 488,  45],
       [ 35,   5,  12,   0,  21, 520]])

In [42]:
text_sgd_clf.decision_function(['spiral density wave'])

array([[ 0.58211358, -1.13209957, -0.67344831, -1.06867204, -1.6597623 ,
        -1.44547037]])

In [47]:
prob = text_forest_clf.predict_proba(['gravitational wave'])[0]
print("probability:")
for i, p in enumerate(prob):
    print("%s : %f " % (target_name[i], p))

probability:
astro-ph.GA : 0.100000 
astro-ph.SR : 0.000000 
astro-ph.IM : 0.200000 
astro-ph.EP : 0.100000 
astro-ph.HE : 0.300000 
astro-ph.CO : 0.300000 


In [48]:
target_name

['astro-ph.GA',
 'astro-ph.SR',
 'astro-ph.IM',
 'astro-ph.EP',
 'astro-ph.HE',
 'astro-ph.CO']

In [49]:
myabstract = "Grand-design spiral structure has been observed by ALMA in protoplanetary disks, such as Elias 2-27. We investigate the possibility of the spirals being  unstable global normal modes described by the spiral density wave theory in self-gravitating disks, which was originally developed in the context of disk galaxies. The instability corresponds to the two-armed bisymmetric spirals is studied, which is caused by the WASER mechanism that involves the over-reflection at the corotation radius. The spiral normal modes differs from the dynamical spiral structures occur in a gravitationally-unstable disk. In particular, we study the criteria of these spirals in terms of the radial profile of the Toomre's $Q$ parameter. "

In [50]:
text_sgd_clf.predict([myabstract, 'proto-planetary'])

array([0, 3])

In [51]:
text_forest_clf.predict_proba([myabstract])

array([[0.3, 0.2, 0.3, 0.1, 0.1, 0. ]])