In [35]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from scipy.sparse import vstack
import json

In [37]:
# read in the JSONL file
raw = []
with open("/tmp/train.jsonl", "r") as f:
    for line in f:
        raw.append(json.loads(line))

# throw it into a dataframe
df = pd.DataFrame(raw)
# for now, select just the first 100 records
df = df[:750]

# parse them into the format in the medium.com article
# just the sentence (which doubles as a record id), the token, and its tag
dataset = []
sentence = 1
for row in df.itertuples():
    for i in range(len(row.tokens)):
        record = {"Sentence #": sentence, "Word": row.tokens[i], "Tag": row.ner_tags[i]}
        dataset.append(record)
    sentence += 1

# throw that back into the dataframe
df = pd.DataFrame(dataset)

# do some basic analysis on how clean the data is
print("checking for null values")
print(df.isnull().sum())

print("checking for NaN values")
print(df.isna().sum())

# check for unique values in the columns
(df["Sentence #"].nunique(), df.Word.nunique(), df.Tag.nunique())

# check for how tags are distributed
df.groupby('Tag').size().reset_index(name='counts')

checking for null values
Sentence #    0
Word          0
Tag           0
dtype: int64
checking for NaN values
Sentence #    0
Word          0
Tag           0
dtype: int64


Unnamed: 0,Tag,counts
0,B-Archive,91
1,B-CelestialObject,1470
2,B-CelestialObjectRegion,134
3,B-CelestialRegion,52
4,B-Citation,2891
...,...,...
58,I-Telescope,280
59,I-TextGarbage,39
60,I-URL,1
61,I-Wavelength,658


In [39]:
# data set is too large to just convert to a vector all at once
# chunk it

# vectorize the data and split it into training vs testing
# first, pull out the feature columns
X = df.drop("Tag", axis=1)
# convert it to a list of dictionaries
data_dicts = X.to_dict(orient="records")

# function to yield chunks of data
def chunk_data(data, batch_size=10000):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# vectorize the chunks
v = DictVectorizer(sparse=False)
# fit on the whole data model since different chunks 
# might have different features
v.fit(data_dicts)

# process the chunks
chunks = []
batch_size = 10000
for i in range(0, len(data_dicts), batch_size):
    part = v.transform(data_dicts[i:i + batch_size])
    chunks.append(part)

# re-assemble
X = vstack(chunks)

# extract the targets
y = df.Tag.values

# pull out the categorizations
classes = np.unique(y).tolist()

# split the data into training vs testing, !!! don't need to do that with DEAL data, it's already split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

# basic stats for sanity check on matrix sizes
print(X_train.shape, y_train.shape)

(188673, 28878) (188673,)


In [44]:
# remove "O" from the classes for evaluation, because it's so common
# that it will overwhelm our success rate by randomly guessing "O"
new_classes = classes.copy()
_ = new_classes.pop()

In [40]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)


In [41]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.00      0.00      0.00        23
        B-CelestialObject       0.00      0.00      0.00       378
  B-CelestialObjectRegion       0.00      0.00      0.00        36
        B-CelestialRegion       0.00      0.00      0.00         7
               B-Citation       0.00      0.00      0.00       715
          B-Collaboration       0.00      0.00      0.00        24
      B-ComputingFacility       0.00      0.00      0.00        32
               B-Database       0.00      0.00      0.00        38
                B-Dataset       0.00      0.00      0.00        29
 B-EntityOfFutureInterest       0.00      0.00      0.00        13
                  B-Event       0.00      0.00      0.00         6
             B-Fellowship       0.00      0.00      0.00        50
                B-Formula       0.00      0.00      0.00       232
                  B-Grant       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
# build a simple perceptron
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5000, early_stopping=False)
# train it
per.partial_fit(X_train, y_train, classes)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.2s


-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 464.52, NNZs: 121, Bias: 0.220000, T: 188673, Avg. loss: 74.001078
Total training time: 0.05 seconds.
Norm: 498.36, NNZs: 94, Bias: -0.010000, T: 188673, Avg. loss: 72.743708
Total training time: 0.05 seconds.
Norm: 307.30, NNZs: 165, Bias: -0.170000, T: 188673, Avg. loss: 131.496493
Total training time: 0.02 seconds.
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 273.75, NNZs: 80, Bias: -0.030000, T: 188673, Avg. loss: 57.377810
Total training time: 0.04 seconds.
Norm: 268.40, NNZs: 2693, Bias: 0.390000, T: 188673, Avg. loss: 2099.290162
Total training time: 0.04 seconds.
Norm: 56.87, NNZs: 113, Bias: 0.060000, T: 188673, Avg. loss: 89.330814
Total training time: 0.03 seconds.
Norm: 50.68, NNZs: 64, Bias: 0.030000, T: 188673, Avg. loss: 56.447660
Total training time: 0.04 seconds.
Norm: 398.64, NNZs: 1067, Bias: 0.070000, T: 188673, Avg. loss: 1070.972495
Total training time: 0.04 seconds.
-- Epoch 1
-- Epoch 1
-- Epoch 1

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s


-- Epoch 1
Norm: 324.36, NNZs: 1561, Bias: 0.430000, T: 188673, Avg. loss: 1165.416301
Total training time: 0.02 seconds.
Norm: 49.89, NNZs: 50, Bias: 0.060000, T: 188673, Avg. loss: 39.371416
Total training time: 0.05 seconds.
Norm: 153.67, NNZs: 485, Bias: -0.140000, T: 188673, Avg. loss: 379.587917
Total training time: 0.08 seconds.
Norm: 359.06, NNZs: 1745, Bias: 0.620000, T: 188673, Avg. loss: 2072.035420
Total training time: 0.08 seconds.
-- Epoch 1
Norm: 93.98, NNZs: 268, Bias: 0.230000, T: 188673, Avg. loss: 220.031832
Total training time: 0.04 seconds.
-- Epoch 1
-- Epoch 1
Norm: 273.13, NNZs: 60, Bias: -0.030000, T: 188673, Avg. loss: 29.703250
Total training time: 0.02 seconds.
Norm: 368.40, NNZs: 82, Bias: 0.100000, T: 188673, Avg. loss: 44.050214
Total training time: 0.05 seconds.
-- Epoch 1
Norm: 253.19, NNZs: 580, Bias: -0.180000, T: 188673, Avg. loss: 535.686901
Total training time: 0.02 seconds.
Norm: 75.36, NNZs: 525, Bias: -0.250000, T: 188673, Avg. loss: 358.052061


[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done  55 out of  63 | elapsed:    0.7s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:    0.7s finished


In [46]:
# evaluate how the perceptron did
print(classification_report(y_pred=per.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.00      0.00      0.00        23
        B-CelestialObject       0.00      0.00      0.00       378
  B-CelestialObjectRegion       0.00      0.00      0.00        36
        B-CelestialRegion       0.00      0.00      0.00         7
               B-Citation       0.00      0.00      0.00       715
          B-Collaboration       0.00      0.00      0.00        24
      B-ComputingFacility       0.00      0.00      0.00        32
               B-Database       0.00      0.00      0.00        38
                B-Dataset       0.00      0.00      0.00        29
 B-EntityOfFutureInterest       0.00      0.00      0.00        13
                  B-Event       0.00      0.00      0.00         6
             B-Fellowship       0.00      0.00      0.00        50
                B-Formula       0.00      0.00      0.00       232
                  B-Grant       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

In [48]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.50      0.17      0.26        23
        B-CelestialObject       0.80      0.63      0.71       378
  B-CelestialObjectRegion       0.07      0.17      0.10        36
        B-CelestialRegion       0.00      0.00      0.00         7
               B-Citation       0.85      0.42      0.56       715
          B-Collaboration       0.20      0.04      0.07        24
      B-ComputingFacility       0.00      0.00      0.00        32
               B-Database       0.82      0.24      0.37        38
                B-Dataset       0.50      0.07      0.12        29
 B-EntityOfFutureInterest       1.00      0.23      0.38        13
                  B-Event       0.00      0.00      0.00         6
             B-Fellowship       0.56      0.10      0.17        50
                B-Formula       0.37      0.06      0.10       232
                  B-Grant       0.78      0.19      0.30     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [49]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

In [50]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                           precision    recall  f1-score   support

                B-Archive       0.00      0.00      0.00        23
        B-CelestialObject       0.00      0.00      0.00       378
  B-CelestialObjectRegion       0.00      0.00      0.00        36
        B-CelestialRegion       0.00      0.00      0.00         7
               B-Citation       1.00      0.00      0.01       715
          B-Collaboration       0.00      0.00      0.00        24
      B-ComputingFacility       0.00      0.00      0.00        32
               B-Database       0.00      0.00      0.00        38
                B-Dataset       0.00      0.00      0.00        29
 B-EntityOfFutureInterest       0.00      0.00      0.00        13
                  B-Event       0.00      0.00      0.00         6
             B-Fellowship       0.00      0.00      0.00        50
                B-Formula       0.00      0.00      0.00       232
                  B-Grant       0.00      0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [51]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

In [57]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), 
                                                     s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

  self.grouped = self.data.groupby('Sentence #').apply(agg_func)


In [58]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features
    
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, label in sent]
def sent2tokens(sent):
    return [token for token, label in sent]

In [59]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [63]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# blindly following https://stackoverflow.com/questions/66059532/attributeerror-crf-object-has-no-attribute-keep-tempfiles
# comments there call out that this won't work for tuning hyperparameters
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [68]:
y_pred = crf.predict(X_test)
# below throws an error that has been a bug in this unmaintained package since 2017
# could either pull in 3rd party fixes, or downgrade 7 years...
#print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes))

# naively trying the below also generates an error about legacy label representation
#print(classification_report(y_test, y_pred, labels=new_classes))

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.