In [None]:
!pip install river

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import river
import json
import datetime

In [None]:
# Imports to make the notebook experience nicer
import pprint

# Read Sample Data

In [None]:
read_start = datetime.datetime.now()

with open('combined_names_newest.json') as f:
    dataset = json.load(f)

# NB: this is getting all used pt_ids, also reformatting in something more like what we might use
pts = set()
for item in dataset:
    item_pts = []
    # NB: this replaces the provision_type_ids list;
    # the file has it as a list of strings like this: "label-true/false"
    # we reformat the list so it is a list of tuples: ['label', bool]
    for label in item['labels']['provision_type_ids']:
        l = label.rsplit('-', 1)
        if l[1] == 'true':
            l[1] = True
        else:
            l[1] = False
        pts.add(l[0])
        item_pts.append(l)
        # if l[0] == 'Governing Law':
        #     pts.add(l[0])
        #     item_pts.append(l)
    item['labels']['provision_type_ids'] = item_pts


read_end = datetime.datetime.now()
print('file load:', (read_end - read_start))

file load: 0:00:03.330297


# Learn

In [None]:
# Decide how much to learn on -- we learn *up to* the slice_point
slice_point = len(dataset) - int(len(dataset) / 4)

In [None]:
# Set up transformers and estimators
transformers = [
    river.feature_extraction.TFIDF(on='compounded_core_lemma_text', lowercase=False, ngram_range=(2,1))
]

estimators = {}
for pt in pts:
    #estimators[pt] = river.naive_bayes.GaussianNB()
    estimators[pt] = river.linear_model.LogisticRegression(optimizer=river.optim.SGD(.1))

In [None]:
print(len(estimators))

70


In [None]:
# Learn
learn_start = datetime.datetime.now()

for item in dataset[:slice_point]:
    transformers[0] = transformers[0].learn_one(item['features'])
    transformed = transformers[0].transform_one(item['features'])
    for label in item['labels']['provision_type_ids']:
        estimators[label[0]] = estimators[label[0]].learn_one(transformed, label[1])

learn_end = datetime.datetime.now()
print('learning:', (learn_end - learn_start))

learning: 0:07:32.559624


# Predict

In [None]:
predict_start = datetime.datetime.now()

pred_dataset = dataset[-300:]
preds = []

for item in pred_dataset:
    transformed = transformers[0].transform_one(item['features'])
    item_preds = {}
    for estimator in estimators:
        pred = estimators[estimator].predict_proba_one(transformed)
        if pred[True] > 0.1:
            item_preds[estimator] = pred[True]
    # item_output = {
    #     'text': item['features']['compounded_core_lemma_text'],
    #     'preds': item_preds
    # }
    preds.append(item_preds)

predict_end = datetime.datetime.now()
print('predict time:', (predict_end - predict_start))
#pprint.pprint(preds)

predict time: 0:00:00.095597


# Metrics

In [None]:
class MLMetrics():

    def __init__(self):
        self.accuracy90 = river.metrics.Accuracy()
        self.precision90 = river.metrics.Precision()
        self.recall90 = river.metrics.Recall()
        self.accuracy50 = river.metrics.Accuracy()
        self.precision50 = river.metrics.Precision()
        self.recall50 = river.metrics.Recall()
        self.accuracy10 = river.metrics.Accuracy()
        self.precision10 = river.metrics.Precision()
        self.recall10 = river.metrics.Recall()
        self.F1 = river.metrics.F1()
        self.R2 = river.metrics.R2()

    def update(self, y, pred):
        eval_90 = pred > 0.9
        eval_50 = pred > 0.5
        eval_10 = pred > 0.1
        self.accuracy90.update(y, eval_90)
        self.precision90.update(y, eval_90)
        self.recall90.update(y, eval_90)
        self.accuracy50.update(y, eval_50)
        self.precision50.update(y, eval_50)
        self.recall50.update(y, eval_50)
        self.accuracy10.update(y, eval_10)
        self.precision10.update(y, eval_10)
        self.recall10.update(y, eval_10)
        self.F1.update(y, pred)
        self.R2.update(y, pred)

In [None]:
metrics_start = datetime.datetime.now()

overall_metrics = MLMetrics()
class_metrics = {}
for estimator in estimators:
    class_metrics[estimator] = MLMetrics()

for item, pred in zip(pred_dataset, preds):
    for label, conf in pred.items():
        true_label = [label, True]
        if true_label in item['labels']['provision_type_ids']:
            overall_metrics.update(True, conf)
            class_metrics[label].update(True, conf)
        else:
            overall_metrics.update(False, conf)
            class_metrics[label].update(False, conf)

metrics_end = datetime.datetime.now()
print('metrics time:', (metrics_end - metrics_start))

print(vars(overall_metrics))

metrics time: 0:00:00.052225
{'accuracy90': Accuracy: 52.55%, 'precision90': Precision: 100.00%, 'recall90': Recall: 3.40%, 'accuracy50': Accuracy: 70.70%, 'precision50': Precision: 96.57%, 'recall50': Recall: 41.83%, 'accuracy10': Accuracy: 49.11%, 'precision10': Precision: 49.11%, 'recall10': Recall: 100.00%, 'F1': F1: 0.00%, 'R2': R2: 0.230444}
