In [2]:
from typing import Optional, List
from pydantic.v1 import BaseModel
import datetime

# Define a class that inherits from BaseModel
class User(BaseModel):
    id: int
    name: str
    friends: List[int] = []

# Create an instance of the class
user = User(id=123, name="John Doe", friends=[1, 2, 3])
user.dict()

{'id': 123, 'name': 'John Doe', 'friends': [1, 2, 3]}

In [3]:
user.schema_json()

'{"title": "User", "type": "object", "properties": {"id": {"title": "Id", "type": "integer"}, "name": {"title": "Name", "type": "string"}, "friends": {"title": "Friends", "default": [], "type": "array", "items": {"type": "integer"}}}, "required": ["id", "name"]}'

## Extract XML

In [4]:
import re

# Sample string with XML content
xml_string = """<retrieved>
- Reocín is a mining complex comprised of the following operational areas for the production of separate lead and zinc concentrates: an underground mine, an open pit mine, a concentrator (mill) mine waste management, and infrastructure. Figure 2-1 shows a location map of Reocín. Figure 2-2 shows an overview of the various facilities at the Reocín mine. Zinc ore has been continuously mined at the Reocín site since 1857. Asturiana purchased the mine in 1981.
- Hinojedo roasting plant is located approximately 6 kilometres north-east of the Reocín mine and processes zinc concentrates from the Reocín mine to produce calcine. The calcine is transported to San Juan de Nieva zinc operation for processing. Additionally, lead concentrates from Reocín are stored here prior to loadout and shipment to external lead smelting facilities. The Hinojedo facility dates to 1929, with a major rehabilitation program completed in 1996 to bring the plant to its current operating configuration. The facility was acquired by Asturiana in 1980.
- The San Juan de Nieva zinc smelter/refinery operation utilises conventional technologies for the production of zinc ingots, zinc alloys, zinc for plating and zinc dust. By-products from San Juan de Nieva are sulphuric acid, mercury and germanium oxide. The unit’s operations at San Juan de Nieva include: concentrate receiving and storage, roasting and acid plant, leaching and purification, electrowinning and casting. The San Juan de Nieva smelter was originally commissioned in 1960 and has undergone several expansions, withthe most recent being completed in August 2001.
- The Arnao manufacturing facility produces zinc oxide dust, rolled zinc products, lead anodes, and zinc wire. The Arnao factory is reported to have been in operation as early as the 1830s. Asturiana purchased the facility in 1980.
</retrieved>"""

# Regular expression to match content between <tag> and </tag>
pattern = r"<retrieved>((.|\n)*?)<\/retrieved>"

# Search for the pattern in the string
match = re.search(pattern, xml_string)

# Extract all matches if found
if match:
    content = match.group(1)
    print(content)
else:
    print("No match found")



- Reocín is a mining complex comprised of the following operational areas for the production of separate lead and zinc concentrates: an underground mine, an open pit mine, a concentrator (mill) mine waste management, and infrastructure. Figure 2-1 shows a location map of Reocín. Figure 2-2 shows an overview of the various facilities at the Reocín mine. Zinc ore has been continuously mined at the Reocín site since 1857. Asturiana purchased the mine in 1981.
- Hinojedo roasting plant is located approximately 6 kilometres north-east of the Reocín mine and processes zinc concentrates from the Reocín mine to produce calcine. The calcine is transported to San Juan de Nieva zinc operation for processing. Additionally, lead concentrates from Reocín are stored here prior to loadout and shipment to external lead smelting facilities. The Hinojedo facility dates to 1929, with a major rehabilitation program completed in 1996 to bring the plant to its current operating configuration. The facility w

# Regex to process query

In [5]:
import re

# List of queries
queries = [
    "what's the mineral site's name this document about?",
    "whAt's the mineral site's location, coordinate reference system used, the country and state/province where the mineral site is located in?",
    "wHat's the mineral site's resources or reserves? What are the mineral commodities, their categories (e.g. indicated, inferred, measured, probable, proven), ore tonnage, grade, cutoff grade, date reported, and zone?",
    "What's the mineral site's deposit type?"
]

# Regex to find and replace "what's " and "?"
regex_pattern = r"what's\s|\?"

# Applying regex replace to each query
cleaned_queries = [re.sub(regex_pattern, "", query, flags=re.IGNORECASE) for query in queries]

cleaned_queries


["the mineral site's name this document about",
 "the mineral site's location, coordinate reference system used, the country and state/province where the mineral site is located in",
 "the mineral site's resources or reserves What are the mineral commodities, their categories (e.g. indicated, inferred, measured, probable, proven), ore tonnage, grade, cutoff grade, date reported, and zone",
 "the mineral site's deposit type"]

# Read txt to string

In [2]:
import json
with open("../data/asset/parsed_result_w_gt/Bongará_Zn_3-2019/Bongará_Zn_3-2019.txt", "r") as file:
    content = file.read()

d = {"input": content}

# wirte d to a file
with open("test.json", "w") as file:
    file.write(json.dumps(d, indent=4))

# LangChain Evaluation

In [6]:
from langchain_benchmarks import clone_public_dataset, registry
from loguru import logger


task = registry["Email Extraction"]
logger.info(f"Cloning {task.name} dataset")
logger.info(f"Task description {task.description}")

clone_public_dataset(task.dataset_id, dataset_name=task.name)



[32m2024-03-09 21:25:41.668[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mCloning Email Extraction dataset[0m
[32m2024-03-09 21:25:41.669[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mTask description A dataset of 42 real emails deduped from a spam folder, with semantic HTML tags removed, as well as a script for initial extraction and formatting of other emails from an arbitrary .mbox file like the one exported by Gmail.

Some additional cleanup of the data was done by hand after the initial pass.

See https://github.com/jacoblee93/oss-model-extraction-evals.
    [0m


Dataset Email Extraction already exists. Skipping.
You can access the dataset at https://smith.langchain.com/o/070d9f00-e34e-5115-a9ff-2f39ea17056a/datasets/26c0f964-36c0-479c-8ad9-978293de69f5.


In [7]:
type(task.schema)

pydantic.v1.main.ModelMetaclass

In [8]:
task.schema.schema()

{'title': 'Email',
 'description': 'Relevant information about an email.',
 'type': 'object',
 'properties': {'sender': {'title': 'Sender',
   'description': "The sender's name, if available",
   'type': 'string'},
  'sender_phone_number': {'title': 'Sender Phone Number',
   'description': "The sender's phone number, if available",
   'type': 'string'},
  'sender_address': {'title': 'Sender Address',
   'description': "The sender's address, if available",
   'type': 'string'},
  'action_items': {'title': 'Action Items',
   'description': 'A list of action items requested by the email',
   'type': 'array',
   'items': {'type': 'string'}},
  'topic': {'title': 'Topic',
   'description': 'High level description of what the email is about',
   'type': 'string'},
  'tone': {'description': 'The tone of the email.',
   'allOf': [{'$ref': '#/definitions/ToneEnum'}]}},
 'required': ['action_items', 'topic', 'tone'],
 'definitions': {'ToneEnum': {'title': 'ToneEnum',
   'description': 'The tone 

In [9]:
task.instructions

ChatPromptTemplate(input_variables=['input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are an expert researcher.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='What can you tell me about the following email? Make sure to extract the question in the correct format. Here is the email:\n ```\n{input}\n```'))])

In [10]:
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0).bind_functions(
    functions=[task.schema],
    function_call=task.schema.schema()["title"],
)

output_parser = JsonOutputFunctionsParser()
extraction_chain = task.instructions | llm | output_parser | (lambda x: {"output": x})

In [11]:
extraction_chain.invoke(
    {
        "input": "Hello Dear MR. I want you to send me gold to get rich."
        " First buy an envelope. Then open it and put some gold inside. "
        "Then close it and finally mail it to my address at 12345 My Gold Way."
        " You can call me any time at 000-1212-1111."
    }
)

{'output': {'sender': 'Unknown',
  'sender_phone_number': '000-1212-1111',
  'sender_address': '12345 My Gold Way',
  'action_items': ['Buy an envelope',
   'Put gold inside',
   'Close the envelope',
   "Mail it to sender's address"],
  'topic': 'Request to send gold',
  'tone': 'positive'}}

In [12]:
from langsmith.client import Client

from langchain_benchmarks.extraction import get_eval_config

client = Client()
eval_llm = ChatOpenAI(model="gpt-4", model_kwargs={"seed": 42})
eval_config = get_eval_config(eval_llm)

test_run = client.run_on_dataset(
    dataset_name=task.name,
    llm_or_chain_factory=extraction_chain,
    evaluation=eval_config,
    verbose=True,
    project_metadata={
        "arch": "openai-functions",
    },
)

View the evaluation results for project 'excellent-gate-30' at:
https://smith.langchain.com/o/070d9f00-e34e-5115-a9ff-2f39ea17056a/datasets/26c0f964-36c0-479c-8ad9-978293de69f5/compare?selectedSessions=a3c743dd-2851-4467-9b8f-a84dfc962820

View all tests for Dataset Email Extraction at:
https://smith.langchain.com/o/070d9f00-e34e-5115-a9ff-2f39ea17056a/datasets/26c0f964-36c0-479c-8ad9-978293de69f5
[------------------------------------------------->] 42/42

Unnamed: 0,feedback.json_edit_distance,feedback.score_string:accuracy,error,execution_time,run_id
count,42.0,42.0,0.0,42.0,42
unique,,,0.0,,42
top,,,,,d9c6a0ed-eb1f-45c6-b4f9-9bd4860e04fa
freq,,,,,1
mean,0.572561,0.457143,,1.139226,
std,0.178292,0.251961,,0.382229,
min,0.190883,0.1,,0.539119,
25%,0.441978,0.3,,0.914502,
50%,0.583348,0.3,,1.110949,
75%,0.696266,0.7,,1.302855,


## JSON evaluators

In [14]:
from langchain.evaluation import JsonEditDistanceEvaluator
from flatten_json import flatten

evaluator = JsonEditDistanceEvaluator()
# Equivalently
# evaluator = load_evaluator("json_edit_distance")

pred = {
    "basic_info": {
        "name": "Bleiberg Property"
    },
    "location_info": {
        "location": "POINT(13.658889 46.625556)",
        "crs": "WGS84",
        "country": "Austria",
        "state_or_province": "Carinthia"
    },
    "mineral_inventory": [
        {
            "commodity": "Zinc",
            "category": "Inferred",
            "ore_unit": "million tonnes",
            "ore_value": 13.23,
            "grade_unit": "percent",
            "grade_value": 6.0,
            "cutoff_grade_unit": "unknown",
            "cutoff_grade_value": -1,
            "contained_metal": -1,
            "date": "2017-05",
            "zone": "Kalkscholle, Josefischolle, Riedhardscholle"
        },
        {
            "commodity": "Lead",
        }
    ],
    "deposit_type_candidate": [
        {
            "observed_name": "MVT zinc-lead"
        }
    ]
}

ref = {
    "basic_info": {
        "name": "BLEIBERG PROPERTY"
    },
    "location_info": {
        "location": "POINT(394000E, 5164500N)",
        "crs": "WGS84",
        "country": "Austria",
        "state_or_province": "Southern Carinthia"
    },
    "mineral_inventory": [
        {
            "commodity": "Zinc",
            "category": "Estimated",
            "ore_unit": "tonnes",
            "ore_value": 12247399,
            "grade_unit": "percent",
            "grade_value": 6,
            "cutoff_grade_unit": "unknown",
            "cutoff_grade_value": -1,
            "date": "2017-05",
            "zone": "Bleiberg"
        },
        {
            "commodity": "Lead",
        }
    ],
    "deposit_type_candidate": [
        {
            "observed_name": "MVT zinc-lead"
        }
    ]
}


result = []
for key in ["basic_info", "location_info", "mineral_inventory", "deposit_type_candidate"]:
    result = evaluator.evaluate_strings(
        prediction=["MVT Zinc-lead"], reference=["Mississippi Valley Type Zinc-lead"]
    )
print(result)

{'score': 0.5405405405405406}
