In [1]:
from typing import List, Optional
import itertools
import requests

import pandas as pd
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic, create_extraction_chain
from kor.documents.html import MarkdownifyHTMLProcessor
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms.openai import OpenAI

In [2]:
# Using gpt-3.5-turbo which is pretty cheap, but has worse quality
llm = ChatOpenAI(temperature=0, openai_api_key="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", model_name="gpt-4")
# llm = OpenAI(temperature=0, openai_api_key="YOUR_API_KEY", model)

In [3]:
class train(BaseModel):
    departure: str = Field(
        description="出发地点",
    )
    destination: str = Field(
        description="到达地点",
    )
    departure_time: str = Field(
        description="出发时间",
    )
    arrival_time: str = Field(
        description="到达时间",
    )
    train_number: str = Field(
        description="火车班次号码",
    )
    train_money: str = Field(
        description="价格",
    )
    time: str = Field(
        description="全程时间",
    )


schema, extraction_validator = from_pydantic(
    train,
    description="提取有关火车时刻表的信息，包括它们的出发、目的地、出发时间、到达时间、车次、价格和全程时间。",
    examples=[
        (
            """
            抢票成功率：07:40青岛2时50分D291610:30 灌南127* 5月20日09:30开售,可预约抢票,开售自动抢
                抢* **二等座**127抢票
                * **一等座**203抢票
                * **无座**127抢票
            """,
            {"departure": "青岛", "destination": "灌南", "departure_time": "07:40", "arrival_time": "10:30", "train_number": "D2916", "train_money": "127", "time": "2时50分"},
        ),
        (
            """
            抢票成功率：10:27青岛北2时26分G155312:53 灌南123* 5月20日09:30开售,可预约抢票,开售自动抢
            抢* **二等座**123抢票
            * **一等座**197抢票
            * **商务座**370抢票
            """,
            {"departure": "青岛北", "destination": "灌南", "departure_time": "10:27", "arrival_time": "12:53", "train_number": "G1553", "train_money": "123", "time": "2时26分"},
        )
        
    ],
    many=True,
)

In [4]:
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="csv",
    validator=extraction_validator,
    input_formatter="triple_quotes",
)

In [5]:
print(chain.prompt.format_prompt(text="[user input]").to_string())

Your goal is to extract structured information from the user's input that matches the form described below. When extracting information please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below.

```TypeScript

train: Array<{ // 提取有关火车时刻表的信息，包括它们的出发、目的地、出发时间、到达时间、车次、价格和全程时间。
 departure: string // 出发地点
 destination: string // 到达地点
 departure_time: string // 出发时间
 arrival_time: string // 到达时间
 train_number: string // 火车班次号码
 train_money: string // 价格
 time: string // 全程时间
}>
```


Please output the extracted information in CSV format in Excel dialect. Please use a | as the delimiter. 
 Do NOT add any clarifying information. Output MUST follow the schema above. Do NOT add any additional columns that do not appear in the schema.



Input: """

            抢票成功率：07:40青岛2时50分D291610:30 灌南127* 5月20日09:30开售,可预约抢票,开售自动抢
                抢* **二等座**127抢票
                * **一等座**203抢票
                * **无座**127抢票
            


In [6]:
url = "https://trains.ctrip.com/webapp/train/list?ticketType=0&dStation=%E9%9D%92%E5%B2%9B%E6%9C%BA%E5%9C%BA&aStation=%E9%9D%92%E5%B2%9B%E5%8C%97&dDate=2023-06-03&rDate=&trainsType=gaotie-dongche&hubCityName=&highSpeedOnly=0"
response = requests.get(url)  # Please see comment at top about using Selenium or

In [7]:
doc = Document(page_content=response.text)
md = MarkdownifyHTMLProcessor().process(doc)
md

Document(page_content='青岛机场到青岛北火车票预订与代购-高铁票价,动车票价-高铁订票,动车订票网-携程火车票订购中心酒店[国内酒店](//www.ctrip.com)[海外酒店](//hotels.ctrip.com/international/?intl=1)机票[国内/国际/中国港澳台](//flights.ctrip.com/online/channel/domestic)[特价机票](//flights.ctrip.com/fuzzysearch/search)[航班动态](//flights.ctrip.com/actualtime/search?newpchpheader=1)[值机选座](//m.ctrip.com/webapp/flight/postservice/iframes/index.html?type=bookseat&newpchpheader=1)[退票改签](//my.ctrip.com/myinfo/flight)[机场攻略](//flights.ctrip.com/booking/airport-guides.html)[定制包机](//flights.ctrip.com/itinerary/charter/book)火车票[国内火车票](//trains.ctrip.com)[国际/中国港澳台](//trains.ctrip.com/overseasTrains)旅游[旅游首页](//vacations.ctrip.com)[周末游](//vacations.ctrip.com/around?startcity=14)[跟团游](//vacations.ctrip.com/grouptravel)[自由行](//vacations.ctrip.com/freetravel)[私家团](//vacations.ctrip.com/privategroup)[邮轮](//cruise.ctrip.com/newpackage)[一日游](//huodong.ctrip.com/things-to-do/list?pshowcode=1daytrip&sourceFrom=vacation&newheader=1)[主题游](//vacations.ctrip.com/themetravel)[定制旅游](//

In [8]:
md.page_content = md.page_content.split("### 中转方案推荐")[0]

In [9]:
split_docs = RecursiveCharacterTextSplitter().split_documents([md])
print(split_docs[0].page_content)
len(split_docs)

青岛机场到青岛北火车票预订与代购-高铁票价,动车票价-高铁订票,动车订票网-携程火车票订购中心酒店[国内酒店](//www.ctrip.com)[海外酒店](//hotels.ctrip.com/international/?intl=1)机票[国内/国际/中国港澳台](//flights.ctrip.com/online/channel/domestic)[特价机票](//flights.ctrip.com/fuzzysearch/search)[航班动态](//flights.ctrip.com/actualtime/search?newpchpheader=1)[值机选座](//m.ctrip.com/webapp/flight/postservice/iframes/index.html?type=bookseat&newpchpheader=1)[退票改签](//my.ctrip.com/myinfo/flight)[机场攻略](//flights.ctrip.com/booking/airport-guides.html)[定制包机](//flights.ctrip.com/itinerary/charter/book)火车票[国内火车票](//trains.ctrip.com)[国际/中国港澳台](//trains.ctrip.com/overseasTrains)旅游[旅游首页](//vacations.ctrip.com)[周末游](//vacations.ctrip.com/around?startcity=14)[跟团游](//vacations.ctrip.com/grouptravel)[自由行](//vacations.ctrip.com/freetravel)[私家团](//vacations.ctrip.com/privategroup)[邮轮](//cruise.ctrip.com/newpackage)[一日游](//huodong.ctrip.com/things-to-do/list?pshowcode=1daytrip&sourceFrom=vacation&newheader=1)[主题游](//vacations.ctrip.com/themetravel)[定制旅游](//vacations.ctrip.com/cus

2

In [10]:
from langchain.callbacks import get_openai_callback

In [11]:
with get_openai_callback() as cb:
    document_extraction_results = await extract_from_documents(
        chain, split_docs, max_concurrency=5, use_uid=False, return_exceptions=True
    )
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Error in on_llm callback: 'OpenAICallbackHandler' object has no attribute 'on_llm'
Error in on_llm callback: 'OpenAICallbackHandler' object has no attribute 'on_llm'


Total Tokens: 5713
Prompt Tokens: 5221
Completion Tokens: 492
Successful Requests: 2
Total Cost (USD): $0.18614999999999998


In [12]:
validated_data = list(
    itertools.chain.from_iterable(
        extraction["validated_data"] for extraction in document_extraction_results
    )
)

In [13]:
result = pd.DataFrame(record.dict() for record in validated_data)

result[(~result['train_money'].isin(['无票', '未知'])) & (~result['time'].isin(['无票', '未知']))].query('departure == "青岛北"')

Unnamed: 0,departure,destination,departure_time,arrival_time,train_number,train_money,time


In [14]:
result

Unnamed: 0,departure,destination,departure_time,arrival_time,train_number,train_money,time
0,青岛机场,青岛,08:54,09:41,G6901,25,0时47分
1,青岛机场,青岛北,08:54,09:17,G6901,19,0时23分
2,青岛机场,青岛北,10:23,10:47,D8161,18,0时24分
3,青岛机场,青岛,11:14,12:01,G1061,25,0时47分
4,青岛机场,青岛北,11:14,11:38,G1061,19,0时24分
5,青岛机场,青岛北,12:38,13:03,G6915,19,0时25分
6,青岛机场,青岛北,13:11,13:35,G223,19,0时24分
7,青岛机场,青岛北,13:58,14:22,G2071,19,0时24分
8,青岛机场,青岛北,15:16,15:39,G6955,19,0时23分
9,青岛机场,青岛北,15:39,16:03,G207,19,0时24分
