In [17]:
!pip install river
!pip install pandas
!pip install vowpalwabbit


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
import river
import json
import datetime
import pandas as pd
import tokenize
import vowpalwabbit

In [19]:
# Imports to make the notebook experience nicer
import pprint

# Read Sample Data

In [22]:
# NON-PANDAS
read_start = datetime.datetime.now()

with open('combined_names_newest.json') as f:
    dataset = json.load(f)

# NB: this is getting all used pt_ids, also reformatting in something more like what we might use
pts = set()
for item in dataset:
    item_pts = []
    # NB: this replaces the provision_type_ids list;
    # the file has it as a list of strings like this: "label-true/false"
    # we reformat the list so it is a list of tuples: ['label', bool]
    for label in item['labels']['provision_type_ids']:
        l = label.rsplit('-', 1)
        if l[1] == 'true':
            l[1] = True
        else:
            l[1] = False
        pts.add(l[0])
        item_pts.append(l)
        # if l[0] == 'Governing Law':
        #     pts.add(l[0])
        #     item_pts.append(l)
    item['labels']['provision_type_ids'] = item_pts


read_end = datetime.datetime.now()
print('file load:', (read_end - read_start))



file load: 0:00:02.584920


In [5]:
# PANDAS-FRIENDLY
read_start = datetime.datetime.now()

with open('combined_names_newest.json') as f:
    dataset = json.load(f)

# NB: this is getting all used pt_ids, also reformatting in something more like what we might use
pts = set()
for item in dataset:
    item_pts = {}
    # NB: this replaces the provision_type_ids list;
    # the file has it as a list of strings like this: "label-true/false"
    # we reformat the list so it is a list of tuples: ['label', bool]
    for label in item['labels']['provision_type_ids']:
        l = label.rsplit('-', 1)
        if l[1] == 'true':
            l[1] = True
        else:
            l[1] = False
        pts.add(l[0])
        item_pts[l[0]] = l[1]
        # if l[0] == 'Governing Law':
        #     pts.add(l[0])
        #     item_pts.append(l)
    item['labels']['provision_type_ids'] = item_pts


read_end = datetime.datetime.now()
print('file load:', (read_end - read_start))

print(dataset[0])

file load: 0:00:01.432386
{'id': 's6avtdg0pl_QbjGMOcU1', 'features': {'core_lemma_text': 'i Delivery amount return amount credit support amount', 'compounded_core_lemma_text': 'i Deliveryamount returnamount creditsupportamount'}, 'labels': {'provision_type_ids': {'CSA Delivery Amount, Return Amount, and Credit Support Amount': True, 'CSA Credit Support Obligations': True, 'CSA Paragraph 11/13': True, 'CSA Eligible Credit Support / Collateral': False, 'CSA Resolution Time': False, 'CSA Dispute Resolution': False, 'CSA Demands and Notices': False, 'CSA Other Provisions': False, 'CSA Valuation Time': False, 'CSA Valuation and Timing': False, 'CSA Exchange Date': False, 'Addresses for Transfers': False, 'CSA Threshold': False, 'CSA Thresholds': False, 'CSA Minimum Transfer Amount': False, 'CSA Credit Support Amount': False, 'CSA Value': False, 'CSA Return Amount': False, 'CSA Base Currency and Eligible Currency': False, 'CSA Delivery Amount': False, 'CSA Valuation Agent': False, 'CSA Valua

In [21]:
test_dataset = dataset[-10:]

normalized = pd.json_normalize(test_dataset)
#pprint.pprint(normalized)
# # create a list of strings
columns = ['id', 'features.core_lemma_text', 'labels.provision_type_ids["CSA Delivery Amount, Return Amount, and Credit Support Amount"]']


# # Passing a dictionary
# # key: column name
# # value: series of values
df = pd.DataFrame(normalized)
#print(df)
#new_series = df.squeeze()
#print(new_series)

ser = pd.Series(df['features.core_lemma_text'],index=df["id"])
pprint.pprint(ser)

id
v2Oh0IETww3WAd2-tiwF    NaN
q36Vd5XFPTk2S8gHHPoi    NaN
j8E7ZSgG7qqEvjIrNHk_    NaN
p9TpwUaWXTvrqd1y-vHG    NaN
r9T4zjsW5Rcw3OttNaVb    NaN
f8oRo6v1THrrT7WBhMSA    NaN
y1h5Av4u-dqNs5Ijd9BA    NaN
o4LyYnXaqwb1UMC8Fb4c    NaN
v2-IBGNEmOqs6NNyIbxU    NaN
d1HbEkbM6zwPUB6mhW0G    NaN
Name: features.core_lemma_text, dtype: object


# Overrides

In [50]:
feature_given_class_time = datetime.timedelta()
feature_given_class_count = 0
class ARGaussianNB(river.naive_bayes.GaussianNB):

    def p_feature_given_class(self, f: str, c: str) -> float:
        global feature_given_class_count
        #global feature_given_class_time
        #start_time = datetime.datetime.now()
        response = river.naive_bayes.GaussianNB.p_feature_given_class(self, f, c)
        #end_time = datetime.datetime.now()
        #feature_given_class_time += end_time - start_time
        feature_given_class_count += 1
        return response
        # feat_dict = self.feature_counts[f]
        # feat_count = feat_dict[c]
        # num = feat_count + self.alpha
        # #num = self.feature_counts.get(f, {}).get(c, 0.0) + self.alpha
        # den = self.class_counts[c] + self.alpha * 2
        # return num / den

# Learn

In [38]:
# Decide how much to learn on -- we learn *up to* the slice_point
slice_point = len(dataset) - int(len(dataset) / 4)

In [39]:
# Set up transformers and estimators
transformers = [river.feature_extraction.BagOfWords(on='compounded_core_lemma_text', lowercase=False,)]
#transformers = [river.feature_extraction.TFIDF(on='compounded_core_lemma_text', lowercase=True, ngram_range=(1,1))]

estimators = {}
for pt in pts:
    estimators[pt] = ARGaussianNB()

In [40]:
print(len(estimators))

70


In [41]:
# Learn
learn_start = datetime.datetime.now()



for item in dataset[:slice_point]:
    #transformed = transformers[0].transform_one(item['features'])
    transformers[0] = transformers[0].learn_one(item['features'])
    transformed = transformers[0].transform_one(item['features'])
    for label in item['labels']['provision_type_ids']:
      estimators[label[0]] = estimators[label[0]].learn_one(transformed, label[1])

learn_end = datetime.datetime.now()
print('learning:', (learn_end - learn_start))

learning: 0:00:23.482656


# Predict

In [28]:
# TODO: convert dataset[:slice_point] into a pandas dataframe, and try learn_many on it for Bernoulli.
# See if there is a performance difference.
pred_dataset = dataset[-10:]
series_data = {}
for item in pred_dataset:
    series_data[item['id']] = item['features']['core_lemma_text']

real_series = pd.Series(series_data)


# normalized = pd.json_normalize(pred_dataset)
# #pprint.pprint(normalized)
# # # create a list of strings
# columns = ['id', 'features.core_lemma_text']


# # # Passing a dictionary
# # # key: column name
# # # value: series of values
# df = pd.DataFrame(normalized, columns=columns)
# new_series = df.squeeze()
# pprint.pprint(new_series)

# ser = pd.Series(data=new_series['features.core_lemma_text'], index=new_series['id'])
# pprint.pprint(ser)

# # unseen = pd.Series(["Taiwanese Taipei", "Chinese Shanghai"])
# # pprint.pprint(unseen)

predict_start = datetime.datetime.now()

preds = []
test_transformer = river.feature_extraction.BagOfWords(lowercase=False)
transformed = test_transformer.transform_many(X=real_series)
#transformed = transformers[0].transform_many(X=unseen)
for estimator in estimators:
    #pipeline = transformers[0] | estimators[estimator]
    #pred = pipeline.predict_proba_many(ser)
    pred = estimators[estimator].predict_proba_many(transformed)
    preds.append(pred)

predict_end = datetime.datetime.now()
print('predict time:', (predict_end - predict_start))
#print('feature_given_class time:', feature_given_class_time)
print('feature_given_class_count:', feature_given_class_count)
#pprint.pprint(preds)

  self[col] = value


KeyboardInterrupt: ignored

In [42]:
predict_start = datetime.datetime.now()

pred_dataset = dataset[slice_point:]
preds = []

for item in pred_dataset:
    transformed = transformers[0].transform_one(item['features'])
    item_preds = {}
    for estimator in estimators:
        pred = estimators[estimator].predict_proba_one(transformed)
        if pred[True] > 0.1:
            item_preds[estimator] = pred[True]
    # item_output = {
    #     'text': item['features']['compounded_core_lemma_text'],
    #     'preds': item_preds
    # }
    preds.append(item_preds)

predict_end = datetime.datetime.now()
print('predict time:', (predict_end - predict_start))
#print('feature_given_class time:', feature_given_class_time)
print('feature_given_class_count:', feature_given_class_count)
#pprint.pprint(preds)

predict time: 0:00:43.516979
feature_given_class_count: 0


# Metrics

In [43]:
# perhaps the thing to do here would be to instead find, for each strategy:
# 1. the probability at which we get 100% precision, and what the recall at that probability is; and
# 2. the probability at which we get 100% recall, and what the precision at that probability is
class MLMetrics():

    def __init__(self):
        self.accuracy90 = river.metrics.Accuracy()
        self.precision90 = river.metrics.Precision()
        self.recall90 = river.metrics.Recall()
        self.accuracy50 = river.metrics.Accuracy()
        self.precision50 = river.metrics.Precision()
        self.recall50 = river.metrics.Recall()
        self.accuracy10 = river.metrics.Accuracy()
        self.precision10 = river.metrics.Precision()
        self.recall10 = river.metrics.Recall()
        self.F1 = river.metrics.F1()
        self.R2 = river.metrics.R2()

    def update(self, y, pred):
        eval_90 = pred > 0.9
        eval_50 = pred > 0.5
        eval_10 = pred > 0.1
        self.accuracy90.update(y, eval_90)
        self.precision90.update(y, eval_90)
        self.recall90.update(y, eval_90)
        self.accuracy50.update(y, eval_50)
        self.precision50.update(y, eval_50)
        self.recall50.update(y, eval_50)
        self.accuracy10.update(y, eval_10)
        self.precision10.update(y, eval_10)
        self.recall10.update(y, eval_10)
        self.F1.update(y, pred)
        self.R2.update(y, pred)

In [44]:
metrics_start = datetime.datetime.now()

overall_metrics = MLMetrics()
class_metrics = {}
for estimator in estimators:
    class_metrics[estimator] = MLMetrics()

for item, pred in zip(pred_dataset, preds):
    for label, conf in pred.items():
        true_label = [label, True]
        if true_label in item['labels']['provision_type_ids']:
            overall_metrics.update(True, conf)
            class_metrics[label].update(True, conf)
        else:
            overall_metrics.update(False, conf)
            class_metrics[label].update(False, conf)

metrics_end = datetime.datetime.now()
print('metrics time:', (metrics_end - metrics_start))

print(vars(overall_metrics))

metrics time: 0:00:00.353191
{'accuracy90': Accuracy: 75.42%, 'precision90': Precision: 77.19%, 'recall90': Recall: 58.69%, 'accuracy50': Accuracy: 74.05%, 'precision50': Precision: 70.32%, 'recall50': Recall: 65.85%, 'accuracy10': Accuracy: 41.90%, 'precision10': Precision: 41.90%, 'recall10': Recall: 100.00%, 'F1': F1: 30.56%, 'R2': R2: 0.140416}


In [None]:
# first_estimator = next(iter(estimators))
# pprint.pprint(estimators[first_estimator].feature_counts)
