In [64]:
%matplotlib inline
import time

# python 3
import urllib.error
from urllib.request import urlopen
import datetime
#from itertools import ifilter
from collections import Counter, defaultdict
import xml.etree.ElementTree as ET

from bs4 import BeautifulSoup
import matplotlib.pylab as plt
import pandas as pd
import numpy as np
#import bibtexparser

pd.set_option('mode.chained_assignment','warn')

In [137]:
store = pd.HDFStore("astroph_Aug2016_preprocessed.h5")
#store['df'] = df
df = store['df']
store.close()

In [138]:
df.index.shape

(1843,)

In [139]:
# split test and train set
np.random.seed(42)
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [140]:
training_set, test_set = split_train_test(df, 0.2)

training_data = training_set.abstract
testing_data = test_set.abstract
training_label = training_set.label.astype('int64')
testing_label = test_set.label.astype('int64')
#training_data = df.abstract[:1500]
#testing_data = df.abstract[1500:]
#training_label = df.label[:1500].astype('int64')
#testing_label = df.label[1500:].astype('int64')

In [141]:
target_name_dict = { 'astro-ph.GA' : 0,
                     'astro-ph.SR' : 1,
                     'astro-ph.IM' : 2,
                     'astro-ph.EP' : 3,
                     'astro-ph.SR' : 4,
                     'astro-ph.HE' : 5,
                     'astro-ph.CO' : 6,
                   }
target_name = [k for k, v in target_name_dict.items()]

print(training_data[100])
print(target_name[training_label[100]])

despite astrophysical importance of binary star systems  detections are limited to those located in small ranges of separations  distances  and masses and thus it is necessary to use a variety of observational techniques for a complete view of stellar multiplicity across a broad range of physical parameters  in this paper  we report the detections and measurements of  binaries discovered from observations of microlensing events moa  blg  and ogle  blg   determinations of the binary masses are possible by simultaneously measuring the einstein radius and the lens parallax  the measured masses of the binary components are     and     for moa  blg  and     and     for ogle  blg  and thus both lens components of moa  blg  and one component of ogle  blg  are m dwarfs  demonstrating the usefulness of microlensing in detecting binaries composed of low mass components  from modeling of the light curves considering full keplerian motion of the lens  we also measure the orbital parameters of the 

In [142]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data)
X_train_counts.shape

(1475, 11684)

In [143]:
count_vect.vocabulary_.keys()



In [144]:
count_vect.vocabulary_.get(u'spiral')

9822

In [145]:
# compute the term frequency
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

# alternative
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1475, 11684)

In [146]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, training_label)

In [147]:
docs_new = ['spiral density wave', 'protoplanetary disk', 'galactic dynamics', 'theory']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, keylist[category]))

'spiral density wave' => astro-ph.GA
'protoplanetary disk' => astro-ph.SR
'galactic dynamics' => astro-ph.GA
'theory' => astro-ph.CO


In [148]:
# building a pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [149]:
text_clf.fit(training_data, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [150]:
import numpy as np
docs_test = testing_data
predicted = text_clf.predict(docs_test)
np.mean(predicted == testing_label)   

0.57608695652173914

In [151]:
from sklearn import metrics
print(metrics.classification_report(testing_label, predicted, target_names=keylist))

             precision    recall  f1-score   support

astro-ph.GA       0.41      0.99      0.58        88
astro-ph.SR       0.86      0.47      0.61        78
astro-ph.IM       0.00      0.00      0.00        36
astro-ph.EP       1.00      0.06      0.11        34
astro-ph.HE       0.75      0.69      0.72        74
astro-ph.CO       0.80      0.60      0.69        58

avg / total       0.65      0.58      0.53       368



  'precision', 'predicted', average, warn_for)


In [152]:
metrics.confusion_matrix(testing_label, predicted)

array([[87,  0,  0,  0,  1,  0],
       [33, 37,  0,  0,  7,  1],
       [25,  1,  0,  0,  8,  2],
       [29,  2,  0,  2,  0,  1],
       [15,  3,  0,  0, 51,  5],
       [22,  0,  0,  0,  1, 35]])

In [153]:
# GRID SEARCH
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
}

In [154]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [155]:
gs_clf = gs_clf.fit(training_data[:400], training_label[:400])

In [160]:
target_name[gs_clf.predict(['spiral density wave theory'])[0]]

'astro-ph.GA'

In [161]:
gs_clf.best_score_

0.6925

In [162]:
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 1)


In [212]:
from sklearn.ensemble import RandomForestClassifier

In [213]:
forest_clf = RandomForestClassifier(random_state=42)

In [214]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(training_data)
#X_train_counts.shape
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [215]:
#forest_clf.fit(X_train_counts, training_label)
forest_clf.fit(X_train_tfidf, training_label)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [216]:
docs_new = ['spiral density wave', 'protoplanetary disk', 'galactic dynamics', 'theory']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = forest_clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, keylist[category]))

'spiral density wave' => astro-ph.SR
'protoplanetary disk' => astro-ph.CO
'galactic dynamics' => astro-ph.GA
'theory' => astro-ph.CO


In [217]:
# building a pipeline
from sklearn.pipeline import Pipeline
text_forest_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('forest_clf', RandomForestClassifier()),
])

In [218]:
text_forest_clf.fit(training_data, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [219]:
target_name[text_forest_clf.predict(['spiral density wave'])[0]]

'astro-ph.HE'

In [205]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

text_sgd_clf = Pipeline([('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('forest_clf', SGDClassifier(tol=1e-3)),
])

In [206]:
text_sgd_clf.fit(training_data, training_label)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...'l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.001, verbose=0, warm_start=False))])

In [207]:
target_name[text_sgd_clf.predict(['gravitational wave'])[0]]

'astro-ph.HE'

In [208]:
predicted = text_sgd_clf.predict(testing_data)
np.mean(predicted == testing_label)   

0.76630434782608692

In [209]:
metrics.confusion_matrix(testing_label, predicted)

array([[78,  2,  0,  1,  3,  4],
       [ 7, 57,  1,  1, 10,  2],
       [ 2,  3, 22,  1,  6,  2],
       [ 3,  4,  0, 27,  0,  0],
       [ 4,  5,  1,  1, 54,  9],
       [ 8,  1,  2,  0,  3, 44]])

In [210]:
text_sgd_clf.decision_function(['spiral density wave'])

array([[ 0.00487084, -0.80824136, -1.28927103, -1.28194684, -0.73309226,
        -1.12089483]])

In [221]:
text_forest_clf.predict_proba(['gravitational wave'])

array([[ 0.1,  0.1,  0.2,  0. ,  0.4,  0.2]])

In [222]:
target_name

['astro-ph.GA',
 'astro-ph.SR',
 'astro-ph.IM',
 'astro-ph.EP',
 'astro-ph.HE',
 'astro-ph.CO']

In [224]:
myabstract = "Grand-design spiral structure has been observed by ALMA in protoplanetary disks, such as Elias 2-27. We investigate the possibility of the spirals being  unstable global normal modes described by the spiral density wave theory in self-gravitating disks, which was originally developed in the context of disk galaxies. The instability corresponds to the two-armed bisymmetric spirals is studied, which is caused by the WASER mechanism that involves the over-reflection at the corotation radius. The spiral normal modes differs from the dynamical spiral structures occur in a gravitationally-unstable disk. In particular, we study the criteria of these spirals in terms of the radial profile of the Toomre's $Q$ parameter. "

In [230]:
text_sgd_clf.predict([myabstract, 'proto-planetary'])

array([3, 3])