In [1]:
import glob
import os
import json
import typing
import weave

import base_types
#weave.use_frontend_devmode()

In [2]:
PROJECT = 'text-extract32'

In [3]:
!ls dataset

Articles_of_Incorporation_Real_Example_1.txt
Articles_of_Incorporation_Real_Example_2.txt
Articles_of_Incorporation_Real_Example_3.txt
Highly_Varied_Article_of_Incorporation_1.txt
Highly_Varied_Article_of_Incorporation_10.txt
Highly_Varied_Article_of_Incorporation_2.txt
Highly_Varied_Article_of_Incorporation_3.txt
Highly_Varied_Article_of_Incorporation_4.txt
Highly_Varied_Article_of_Incorporation_5.txt
Highly_Varied_Article_of_Incorporation_6.txt
Highly_Varied_Article_of_Incorporation_7.txt
Highly_Varied_Article_of_Incorporation_8.txt
Highly_Varied_Article_of_Incorporation_9.txt
labels.json


In [4]:
example_doc = open('dataset/Articles_of_Incorporation_Real_Example_1.txt').read()
print(example_doc)


Articles of Incorporation of TechBoost Corp

Article I: Name
The name of the corporation is TechBoost Corp.

Article II: Purpose
The purpose of this corporation is to engage in any lawful act or activity for which corporations may be organized under the California Corporation Code.

Article III: Registered Agent
The address of the registered agent is 1234 Silicon Valley Blvd, San Jose, CA 95131.

Article IV: Share Structure
The corporation is authorized to issue 1,000,000 shares of Common Stock.

Article V: Directors
The names and addresses of the initial directors are as follows:
- Alice Johnson, 5678 Tech Park Dr, Palo Alto, CA 94306
- Bob Smith, 9101 Innovation Way, Mountain View, CA 94043

Article VI: Incorporators
The names and addresses of the incorporators are as follows:
- Emily Davis, 1112 Startup St, San Francisco, CA 94103
- Mark Lee, 1314 Venture Ave, Santa Clara, CA 95050

Article VII: Dissolution
Upon dissolution, assets will be distributed for one or more exempt purpose

In [7]:
import re

def predict_name(doc: str) -> typing.Any:
    match = re.search(r'name.*is ([^.]*)(\.|\n)', doc)
    return match.group(1) if match else None

def predict_shares(doc: str) -> typing.Any:
    match = re.search(r'[Ss]hares.*?([\d,]+)', doc)
    return match.group(1).replace(',', '') if match else None

@weave.op()
def predict(doc: str) -> typing.Any:
    return {
        'name': predict_name(doc),
        'shares': predict_shares(doc)
    }

In [10]:
weave.use(predict(example_doc))

{'name': 'TechBoost Corp', 'shares': None}

In [9]:
from weave.monitoring import init_monitor
init_monitor(f'shawn/{PROJECT}/stream')



In [11]:
# Read in our dataset
def read_dataset():
    dataset_rows = []
    raw_labels = json.load(open(os.path.join("dataset", "labels.json")))
    for p in glob.glob(os.path.join("dataset", "*.txt")):
        example_id = os.path.basename(p).replace(".", "_")
        label = raw_labels.get(example_id)
        if label:
            dataset_rows.append(
                {"id": example_id, "example": open(p).read(), "label": label})
    return weave.WeaveList(dataset_rows)

@weave.type()
class Dataset:
    rows: list[typing.Any]

# Construct and publish to W&B
dataset = Dataset(read_dataset())
dataset_ref = weave.storage.publish(dataset, f"{PROJECT}/dataset")

In [12]:
for row in dataset_ref.get().rows:
    print(weave.use(predict(row['example'])))

{'name': None, 'shares': '31783'}
{'name': 'Gibson, Hunt and Davidson', 'shares': None}
{'name': 'HealthFirst Solutions LLC', 'shares': '500000'}
{'name': 'GreenLeaf Inc', 'shares': None}
{'name': 'TechBoost Corp', 'shares': None}
{'name': None, 'shares': '41141'}
{'name': None, 'shares': '73981'}
{'name': None, 'shares': '41300'}
{'name': None, 'shares': '98608'}
{'name': None, 'shares': '5732'}
{'name': None, 'shares': '78821'}
{'name': None, 'shares': '76197'}
{'name': None, 'shares': '54183'}


In [14]:
@weave.type()
class Model:
    pass
    
@weave.type()
class RegexModel(Model):
    @weave.op()
    def predict(self, doc: str) -> typing.Any:
        return {
            'name': predict_name(doc),
            'shares': predict_shares(doc)
        }

In [15]:
import op_evaluate
weave.use(op_evaluate.evaluate_multi_task_f1(dataset_ref.get(), RegexModel()))

{'eval_table': <ArrowWeaveList: TypedDict(property_types={'dataset_id': String(), 'output': TypedDict(property_types={'name': UnionType(members=[NoneType(), String()]), 'shares': UnionType(members=[String(), NoneType()])}, not_required_keys=set()), 'latency': Float(), 'summary': TypedDict(property_types={'name_correct': UnionType(members=[NoneType(), Boolean()]), 'name_negative': UnionType(members=[NoneType(), Boolean()]), 'shares_correct': UnionType(members=[NoneType(), Boolean()]), 'shares_negative': UnionType(members=[NoneType(), Boolean()]), 'directors_correct': UnionType(members=[NoneType(), Boolean()]), 'directors_negative': UnionType(members=[NoneType(), Boolean()]), 'correct': UnionType(members=[NoneType(), Int()]), 'negative': UnionType(members=[NoneType(), Int()]), 'tp': UnionType(members=[NoneType(), Int()]), 'fp': UnionType(members=[NoneType(), Int()]), 'tn': UnionType(members=[NoneType(), Int()]), 'fn': UnionType(members=[NoneType(), Int()]), 'precision': UnionType(members

In [16]:
import json
from weave.monitoring import openai

@weave.type()
class OpenAIChatModel(Model):
    model_name: str
    temperature: float

    @weave.op()
    def predict(self, doc: str) -> typing.Any:
        response = openai.ChatCompletion.create(
            model=self.model_name,
            messages=[
                {'role': 'user',
                 'content': 'Extract company name (name) and number of shares (shares) from the following Articles of Incorporation document, as a json object: %s' % doc}])
        result = response['choices'][0]['message']['content']
        parsed = json.loads(result)
        return {
            'name': parsed['name'],
            'shares': int(parsed['shares'])
        }

In [17]:
weave.use(op_evaluate.evaluate_multi_task_f1(dataset_ref.get(), OpenAIModel('gpt-3.5-turbo')))

{'eval_table': <ArrowWeaveList: TypedDict(property_types={'dataset_id': String(), 'output': TypedDict(property_types={'name': UnionType(members=[String(), NoneType()]), 'shares': UnionType(members=[Int(), NoneType()])}, not_required_keys=set()), 'latency': Float(), 'summary': TypedDict(property_types={'name_correct': UnionType(members=[NoneType(), Boolean()]), 'name_negative': UnionType(members=[NoneType(), Boolean()]), 'shares_correct': UnionType(members=[NoneType(), Boolean()]), 'shares_negative': UnionType(members=[NoneType(), Boolean()]), 'directors_correct': UnionType(members=[NoneType(), Boolean()]), 'directors_negative': UnionType(members=[NoneType(), Boolean()]), 'correct': UnionType(members=[NoneType(), Int()]), 'negative': UnionType(members=[NoneType(), Int()]), 'tp': UnionType(members=[NoneType(), Int()]), 'fp': UnionType(members=[NoneType(), Int()]), 'tn': UnionType(members=[NoneType(), Int()]), 'fn': UnionType(members=[NoneType(), Int()]), 'precision': UnionType(members=[N