In [38]:
# json for parsing our JSONL files from the data set
import json

# numpy and pandas for efficient manipulation of data structures
import numpy as np
import pandas as pd

from itertools import chain

# spacy for embedding tokens from the dataset into vectors
import spacy
# or word2vec, we're trying things
from gensim.models import Word2Vec

# scikit-learn for the various models to try
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.naive_bayes import MultinomialNB



In [39]:
# read in the JSONL file
raw = []
with open("train.jsonl", "r") as f:
    for line in f:
        raw.append(json.loads(line))

# throw it into a dataframe
df = pd.DataFrame(raw)
df.head(5)

Unnamed: 0,bibcode,label_studio_id,ner_ids,ner_tags,section,tokens,unique_id
0,2019MNRAS.486.5558S,487,"[62, 62, 62, 62, 62, 62, 62, 15, 62, 62, 62, 6...","[O, O, O, O, O, O, O, B-Instrument, O, O, O, O...",fulltext,"[Whilst, a, reasonable, harmonic, fit, to, the...",fulltext_487_2019MNRAS.486.5558S
1,2018MNRAS.478.5533F,1129,"[62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 6...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",fulltext,"[comparison, once, the, angular, positions, of...",fulltext_1129_2018MNRAS.478.5533F
2,2018MNRAS.480.3062L,1086,"[62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 6...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",acknowledgments,"[ACKNOWLEDGEMENTS, The, authors, thank, an, an...",acknowledgments_1086_2018MNRAS.480.3062L
3,2016MNRAS.457.1786M,1135,"[22, 62, 62, 62, 62, 21, 13, 44, 44, 21, 62, 1...","[B-Person, O, O, O, O, B-Organization, B-Grant...",acknowledgments,"[BDM, gratefully, acknowledges, support, from,...",acknowledgments_1135_2016MNRAS.457.1786M
4,2019MNRAS.482L...9B,559,"[62, 62, 62, 22, 53, 22, 53, 22, 53, 22, 53, 2...","[O, O, O, B-Person, I-Person, B-Person, I-Pers...",acknowledgments,"[ACKNOWLEDGEMENTS, We, thank, Dougal, Mackey,,...",acknowledgments_559_2019MNRAS.482L...9B


In [40]:
# it'll be far more effective to store NER IDs vs names
# but names are nice, so map them all out
tag_map = {}
for id_list, name_list in zip(df["ner_ids"], df["ner_tags"]):
    for tag_id, tag_name in zip(id_list, name_list):
        tag_map[tag_id] = tag_name

In [41]:
# are there any null or NaN values we need to be worried about?
print("checking for null values\n", df.isnull().sum(), "\n")
print("checking for NaN values\n", df.isna().sum())

checking for null values
 bibcode            0
label_studio_id    0
ner_ids            0
ner_tags           0
section            0
tokens             0
unique_id          0
dtype: int64 

checking for NaN values
 bibcode            0
label_studio_id    0
ner_ids            0
ner_tags           0
section            0
tokens             0
unique_id          0
dtype: int64


In [42]:
# are there tokens with special characters?
all_tokens = np.concatenate(df["tokens"])
special = []
for token in all_tokens:
    if not token.isalpha():
        special.append(token)
len(np.unique(special))

28606

In [43]:
# yes, lots. maybe we will deal with that later,
# that could affect accuracy, but who knows yet

In [89]:
# either this cell or the next should be enabled. toggle cells with 'r' and 'y'

# alternatively, use Word2Vec for this
# TODO: understand parameters better

# the min count not being zero is a problem, sometimes words can't be encoded
min_count = 0
# the vector size makes a huge difference on how successful various algorithsm below are
vector_size = 20

model = Word2Vec(sentences=df["tokens"], vector_size=vector_size, window=5, min_count=min_count, sg=1)
# vectorize each word
X_train = np.zeros([len(all_tokens), vector_size])
for i, token in enumerate(all_tokens):
    X_train[i] = model.wv[token]

y_train = np.concatenate(df["ner_ids"].to_numpy())

# so the same with the validation data
raw = []
with open("validate.jsonl", "r") as f:
    for line in f:
        raw.append(json.loads(line))

# throw it into a dataframe
df_validate = pd.DataFrame(raw)

model_validate = Word2Vec(sentences=df_validate["tokens"], vector_size=vector_size, window=5, min_count=min_count, sg=1)
# vectorize each word
all_test_tokens = np.concatenate(df_validate["tokens"])
X_test = np.zeros([len(all_test_tokens), vector_size])
for i, token in enumerate(all_test_tokens):
    X_test[i] = model_validate.wv[token]

y_test = np.concatenate(df_validate["ner_ids"].to_numpy())

In [83]:
# sanity test on data
print(X_train.shape, y_train.shape)
print(X_test.shape, y_train.shape)

(573132, 10) (573132,)
(447366, 10) (573132,)


In [61]:
# pull out the categorizations
labels = np.unique(y_train).tolist()
# also get their names for prettier reports
tag_map = dict(zip(np.concatenate(df["ner_ids"]), np.concatenate(df["ner_tags"])))
# remove tag for 'O', we don't want that in reports
# it's so common it heavily biases the f-scores
del(tag_map[62])
# now create lists for the IDs and tags in the same order
# sklearn.metrics.classification_report requires two different parameters
tag_ids = sorted(tag_map.keys())
tag_names = [tag_map[i] for i in tag_ids]

In [62]:
# split the data into training vs testing
# TODO: when properly working on the DEAL data, it is already split
# move this to a multi-file pipeline
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

# basic stats for sanity check on matrix sizes
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(573132, 100) (573132,)
(447366, 100) (447366,)


In [63]:
# quick helper function for printing classification reports
def report(y_pred):
    print(classification_report(
        y_pred=y_pred,
        y_true=y_test,
        labels=tag_ids,
        target_names=tag_names,
        zero_division=np.nan
    )
)

In [90]:
# simple stochastic gradient descent
sgd = SGDClassifier()
# fit and predict
sgd.partial_fit(X_train, y_train, labels)
y_pred = sgd.predict(X_test)

In [91]:
report(y_pred)

                           precision    recall  f1-score   support

                B-Archive        nan      0.00      0.00       153
        B-CelestialObject        nan      0.00      0.00      2285
  B-CelestialObjectRegion        nan      0.00      0.00       150
        B-CelestialRegion        nan      0.00      0.00       102
               B-Citation       0.00      0.00      0.00      4820
          B-Collaboration        nan      0.00      0.00       238
      B-ComputingFacility        nan      0.00      0.00       360
               B-Database        nan      0.00      0.00       199
                B-Dataset        nan      0.00      0.00       222
 B-EntityOfFutureInterest        nan      0.00      0.00        52
                  B-Event        nan      0.00      0.00        37
             B-Fellowship        nan      0.00      0.00       326
                B-Formula        nan      0.00      0.00      1541
                  B-Grant        nan      0.00      0.00     

In [76]:
# build a simple perceptron
per = Perceptron(verbose=1, n_jobs=-1, max_iter=25)
# train it and predict
per.partial_fit(X_train, y_train, labels)
y_pred = per.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 17.27, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.024629
Total training time: 0.09 seconds.
Norm: 8.65, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.032421
Total training time: 0.09 seconds.
-- Epoch 1
-- Epoch 1
Norm: 10.12, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.001953
Total training time: 0.10 seconds.
Norm: 20.68, NNZs: 20, Bias: -2.000000, T: 573132, Avg. loss: 0.003837
Total training time: 0.10 seconds.
-- Epoch 1
-- Epoch 1
Norm: 13.59, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.007136
Total training time: 0.10 seconds.
Norm: 20.37, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.004445
Total training time: 0.10 seconds.
Norm: 9.76, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.003436
Total training time: 0.11 seconds.
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 16.14, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.006481
Total training time: 0.11 secon

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s


-- Epoch 1
Norm: 10.70, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.019323
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 26.62, NNZs: 20, Bias: -1.000000, T: 573132, Avg. loss: 0.262665
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 10.18, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.010216
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 29.20, NNZs: 20, Bias: 1.000000, T: 573132, Avg. loss: 0.019945
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 6.59, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.001256
Total training time: 0.09 seconds.
-- Epoch 1
Norm: 7.80, NNZs: 20, Bias: -1.000000, T: 573132, Avg. loss: 0.001050
Total training time: 0.10 seconds.
-- Epoch 1
Norm: 9.52, NNZs: 20, Bias: 2.000000, T: 573132, Avg. loss: 0.007352
Total training time: 0.10 seconds.
Norm: 16.74, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.007654
Total training time: 0.10 seconds.
Norm: 7.46, NNZs: 20, Bias: 0.000000, T: 573132, Avg. loss: 0.000221
Total training 

[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    0.8s finished


In [77]:
report(y_pred)

                           precision    recall  f1-score   support

                B-Archive        nan      0.00      0.00       153
        B-CelestialObject       0.02      0.12      0.03      2285
  B-CelestialObjectRegion        nan      0.00      0.00       150
        B-CelestialRegion        nan      0.00      0.00       102
               B-Citation       0.01      0.02      0.01      4820
          B-Collaboration       0.00      0.00      0.00       238
      B-ComputingFacility       0.01      0.19      0.01       360
               B-Database       0.00      0.00      0.00       199
                B-Dataset        nan      0.00      0.00       222
 B-EntityOfFutureInterest        nan      0.00      0.00        52
                  B-Event       0.00      0.00      0.00        37
             B-Fellowship       0.00      0.00      0.00       326
                B-Formula       0.00      0.00      0.00      1541
                  B-Grant       0.06      0.09      0.07     

In [78]:
# try a passive aggressive classifer
pa = PassiveAggressiveClassifier()
# fit and predict
pa.partial_fit(X_train, y_train, labels)
y_pred = pa.predict(X_test)

In [79]:
# evaluate how the perceptron did
report(y_pred)

                           precision    recall  f1-score   support

                B-Archive        nan      0.00      0.00       153
        B-CelestialObject       0.00      0.00      0.00      2285
  B-CelestialObjectRegion        nan      0.00      0.00       150
        B-CelestialRegion       0.00      0.00      0.00       102
               B-Citation       0.05      0.48      0.09      4820
          B-Collaboration        nan      0.00      0.00       238
      B-ComputingFacility       0.00      0.00      0.00       360
               B-Database        nan      0.00      0.00       199
                B-Dataset        nan      0.00      0.00       222
 B-EntityOfFutureInterest        nan      0.00      0.00        52
                  B-Event        nan      0.00      0.00        37
             B-Fellowship       0.00      0.00      0.00       326
                B-Formula        nan      0.00      0.00      1541
                  B-Grant        nan      0.00      0.00     

In [80]:
# try a naive bayes classifier for multinomial models
# this requires no negative values in the input
# so make it a pipeline fronted by a minmaxscaler that
# will scale all the features to fit into (0, 10)
# TODO: play around with the feature range, (0, 10) performs better than (0, 1). why? what are good values?
nb = Pipeline([
    ('scaling', MinMaxScaler(feature_range=(0, 10))),
    ('multinominalNB', MultinomialNB(alpha=0.01)),
])
# and fit and train
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [81]:
report(y_pred)

                           precision    recall  f1-score   support

                B-Archive        nan      0.00      0.00       153
        B-CelestialObject        nan      0.00      0.00      2285
  B-CelestialObjectRegion        nan      0.00      0.00       150
        B-CelestialRegion        nan      0.00      0.00       102
               B-Citation        nan      0.00      0.00      4820
          B-Collaboration        nan      0.00      0.00       238
      B-ComputingFacility        nan      0.00      0.00       360
               B-Database        nan      0.00      0.00       199
                B-Dataset        nan      0.00      0.00       222
 B-EntityOfFutureInterest        nan      0.00      0.00        52
                  B-Event        nan      0.00      0.00        37
             B-Fellowship        nan      0.00      0.00       326
                B-Formula        nan      0.00      0.00      1541
                  B-Grant        nan      0.00      0.00     

In [52]:
# let's try something different. for embeddings, let's use astroBERT directly
# credit to https://huggingface.co/adsabs/astroBERT
from transformers import AutoTokenizer, TFAutoModel
import torch

In [53]:
# load model from huggingface
remote_model_path = 'adsabs/astroBERT'
# instantiate the tokenizer
astroBERT_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=remote_model_path,
                                                    auth_token=True,
                                                    add_special_tokens=True,
                                                    do_lower_case=False,)

In [54]:
# instantiate the model
astroBERT_automodel = TFAutoModel.from_pretrained(remote_model_path, 
                                                  token=True,)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.embeddings.position_ids', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already u

In [55]:
all_tokens = list(chain(*df["tokens"]))

In [56]:
# tokenize our, errr, tokens
tokens = astroBERT_tokenizer(all_tokens, padding=True, return_tensors="tf")

In [57]:
tokens

{'input_ids': <tf.Tensor: shape=(573132, 37), dtype=int32, numpy=
array([[16338, 16133, 16341, ..., 16340, 16340, 16340],
       [16338, 16348, 16341, ..., 16340, 16340, 16340],
       [16338, 25274, 16341, ..., 16340, 16340, 16340],
       ...,
       [16338, 22117, 16341, ..., 16340, 16340, 16340],
       [16338,  7483, 16341, ..., 16340, 16340, 16340],
       [16338, 22406, 16341, ..., 16340, 16340, 16340]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(573132, 37), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(573132, 37), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype

In [None]:
# this dies miserably with a kernel crash. don't know why?
# output = astroBERT_automodel(**tokens, output_hidden_states=False)
output