In [None]:
import glob
import os
import json
import typing
import weave

In [None]:
# Project name; interpolated into the path passed to weave.init(f'shawn/{PROJECT}') below.
PROJECT = 'text-extract35'

In [None]:
# List the contents of the local dataset/ directory that the following cells read from.
!ls dataset

In [None]:
# Load one sample document and print it in full. The context manager closes the
# file handle as soon as the text has been read instead of leaving it to the
# garbage collector.
with open('dataset/Articles_of_Incorporation_Real_Example_1.txt') as example_file:
    example_doc = example_file.read()
print(example_doc)

In [None]:
import re

def predict_name(doc: str) -> typing.Any:
    """Extract the company name from a document with a regular expression.

    Looks for the word ``name`` followed later on the same line by ``is ``,
    and captures the text after ``is `` up to (but not including) the next
    period or end of line.

    Args:
        doc: Full text of the document.

    Returns:
        The captured name as a string, or ``None`` when the pattern does not
        match anywhere in ``doc``.
    """
    # The capture excludes newlines as well as periods. With the previous
    # `[^.]*` the capture ran across line breaks up to the next period in the
    # document, so a name that ended its line without a period dragged the
    # following lines into the prediction.
    match = re.search(r'name.*is ([^.\n]*)(\.|\n)', doc)
    if match is None:
        return None
    return match.group(1)

def predict_shares(doc: str) -> typing.Any:
    """Extract the number of shares from a document with a regular expression.

    Looks for ``shares`` / ``Shares`` followed, on the same line, by a number
    that may contain comma thousands separators.

    Args:
        doc: Full text of the document.

    Returns:
        The number as a string of digits with all commas removed
        (``"1,000"`` -> ``"1000"``), or ``None`` when no match is found.
    """
    # The capture must start with a digit. The previous `[\d,]+` was satisfied
    # by a lone comma (e.g. "shares, ..."), which yielded an empty string
    # instead of the number that follows.
    match = re.search(r'[Ss]hares.*?(\d[\d,]*)', doc)
    if match is None:
        return None
    return match.group(1).replace(',', '')

@weave.op()
def predict(doc: str) -> typing.Any:
    """Run both regex extractors on ``doc`` and bundle their results.

    Returns:
        A dict with key ``'name'`` (result of ``predict_name``) and key
        ``'shares'`` (result of ``predict_shares``).
    """
    name = predict_name(doc)
    shares = predict_shares(doc)
    return {'name': name, 'shares': shares}

In [None]:
# Try the regex extractor on the sample document loaded above.
# NOTE(review): this call runs before weave.init(...) in the next cell — confirm that is intended.
predict(example_doc)

In [None]:
# Initialise Weave with the path 'shawn/<PROJECT>'.
# NOTE(review): 'shawn' is hard-coded; presumably it is the account/entity name — confirm and
# replace it with your own before running.
weave.init(f'shawn/{PROJECT}')

In [None]:
# Read in our dataset
def read_dataset():
    """Load the labelled examples from the local ``dataset`` directory.

    ``dataset/labels.json`` maps example ids to labels. An example id is the
    text file's base name with every ``.`` replaced by ``_`` (``foo.txt``
    becomes ``foo_txt``). Text files whose id has no label, or a falsy one,
    are skipped.

    Returns:
        A list of dicts with keys ``"id"``, ``"example"`` (the file's text)
        and ``"label"``, ordered by file path.
    """
    with open(os.path.join("dataset", "labels.json")) as labels_file:
        raw_labels = json.load(labels_file)

    dataset_rows = []
    # glob.glob returns paths in arbitrary order; sorting keeps the row order
    # identical from one run to the next.
    for p in sorted(glob.glob(os.path.join("dataset", "*.txt"))):
        example_id = os.path.basename(p).replace(".", "_")
        label = raw_labels.get(example_id)
        if label:
            # Only labelled files are opened; the handle is closed right away.
            with open(p) as example_file:
                example = example_file.read()
            dataset_rows.append(
                {"id": example_id, "example": example, "label": label})
    return dataset_rows

# Construct a weave.Dataset from the labelled rows and publish it to W&B under the
# name "dataset". The value returned by weave.publish is kept as dataset_ref; later
# cells read rows through dataset_ref.get().rows and pass dataset_ref to the evaluator.
dataset = weave.Dataset(read_dataset())
dataset_ref = weave.publish(dataset, "dataset")

In [None]:
# Sanity check: run the regex-based predict op on every example of the published
# dataset and print each prediction dict.
for row in dataset_ref.get().rows:
    print(predict(row['example']))

In [None]:
@weave.type()
class RegexModel(weave.Model):
    """Baseline model that extracts each field with a regular expression."""

    @weave.op()
    def predict(self, doc: str) -> typing.Any:
        """Return ``{'name': ..., 'shares': ...}`` extracted from ``doc``.

        Each field is produced by its module-level regex helper
        (``predict_name`` and ``predict_shares`` respectively).
        """
        extractors = (
            ('name', predict_name),
            ('shares', predict_shares),
        )
        return {field: extract(doc) for field, extract in extractors}

In [None]:
# op_evaluate is not defined in this notebook; presumably it is a helper module that
# lives next to it — TODO confirm.
import op_evaluate
# Evaluate the regex baseline against the published dataset reference.
op_evaluate.evaluate_multi_task_f1(dataset_ref, RegexModel())

In [None]:
import json
from weave.monitoring import openai

@weave.type()
class Prompt:
    """Prompt template wrapper with a single ``text`` field."""
    # Template string; OpenAIChatModel.predict calls text.format(doc=...) on it.
    text: str

@weave.type()
class OpenAIChatModel(weave.Model):
    """Model that asks an OpenAI chat model to extract the fields as JSON."""
    # Passed as ``model=`` to the chat completion call.
    model_name: str
    # Prompt whose ``text`` is formatted with ``doc=<document text>``.
    prompt: Prompt

    @weave.op()
    def predict(self, doc: str) -> typing.Any:
        """Extract the company name and share count from ``doc``.

        Sends the formatted prompt as a single user message, parses the
        reply's content as JSON and reads its ``name`` and ``shares`` entries.

        Args:
            doc: Full text of the document.

        Returns:
            A dict ``{'name': <parsed name>, 'shares': <int>}``.

        Raises:
            json.JSONDecodeError: If the reply is not valid JSON.
            KeyError: If the parsed object lacks ``name`` or ``shares``.
            ValueError: If ``shares`` cannot be converted to an integer.
        """
        response = openai.ChatCompletion.create(
            model=self.model_name,
            messages=[
                {'role': 'user',
                 'content': self.prompt.text.format(doc=doc)}])
        result = response['choices'][0]['message']['content']
        parsed = json.loads(result)
        shares = parsed['shares']
        if isinstance(shares, str):
            # int() rejects comma-grouped numbers such as "1,000,000"; strip
            # the separators first, as the regex baseline's predict_shares does.
            shares = shares.replace(',', '')
        return {
            'name': parsed['name'],
            'shares': int(shares)
        }

In [None]:
# Instantiate the chat-based extractor with the model name and the prompt template
# (presumably bound to model_name and prompt in declaration order — TODO confirm).
# OpenAIChatModel.predict fills the template's {doc} placeholder with the document text.
model = OpenAIChatModel(
    'gpt-3.5-turbo',
    Prompt("Extract company name (name) and number of shares (shares) from the following Articles of Incorporation document, as a json object: {doc}"))

In [None]:
# Uncomment to run a single prediction on the sample document (this calls the OpenAI API):
# model.predict(example_doc)

In [None]:
# Evaluate the OpenAI-backed model on the same published dataset reference that was
# used for RegexModel.
op_evaluate.evaluate_multi_task_f1(dataset_ref, model)