In [1]:
from typing import List, Optional
import itertools
import requests

import pandas as pd
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic, create_extraction_chain
from kor.documents.html import MarkdownifyHTMLProcessor
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Using gpt-3.5-turbo which is pretty cheap, but has worse quality.
# The model is named explicitly so the notebook keeps using the model this
# comment describes instead of depending on the library's default.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [3]:
class ShowOrMovie(BaseModel):
    """A single movie or TV show entry to extract from the page text.

    Only ``name`` is required. Every other field is optional and is given an
    explicit ``None`` default, so a record that omits it still validates.
    """

    name: str = Field(
        description="The name of the movie or tv show",
    )
    season: Optional[str] = Field(
        default=None,
        description="Season of TV show. Extract as a digit stripping Season prefix.",
    )
    year: Optional[str] = Field(
        default=None,
        description="Year when the movie / tv show was released",
    )
    latest_episode: Optional[str] = Field(
        default=None,
        description="Date when the latest episode was released",
    )
    link: Optional[str] = Field(
        default=None,
        description="Link to the movie / tv show.",
    )

    # rating -- not included because rating on rottentomatoes is in the html elements
    # you could try extracting it by using the raw HTML (rather than markdown)
    # or you could try doing something similar on imdb

    @validator("name")
    def name_must_not_be_empty(cls, v):
        """Reject an empty or whitespace-only name.

        Returns the name unchanged when it is acceptable.

        Raises:
            ValueError: if the name is empty or contains only whitespace.
        """
        if not v or not v.strip():
            raise ValueError("Name must not be empty")
        return v


# Derive the kor schema and its matching validator from the pydantic model.
# The description only names fields that ShowOrMovie defines: the model has no
# rating field, so the prompt no longer asks for a rating.
# many=True is passed because a page lists several shows.
schema, extraction_validator = from_pydantic(
    ShowOrMovie,
    description="Extract information about popular movies/tv shows including their name, year and link.",
    examples=[
        (
            "[Rain Dogs Latest Episode: Apr 03](/tv/rain_dogs)",
            {"name": "Rain Dogs", "latest_episode": "Apr 03", "link": "/tv/rain_dogs"},
        )
    ],
    many=True,
)

In [4]:
# Assemble the extraction chain from the LLM and the schema.
# The "csv" encoder and the "triple_quotes" input formatter are selected by
# name, and the validator derived from the pydantic model is attached.
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="csv",
    validator=extraction_validator,
    input_formatter="triple_quotes",
)

In [5]:
url = "https://www.rottentomatoes.com/browse/tv_series_browse/sort:popular"
# Please see comment at top about using Selenium (or a similar tool) to
# download the page instead of a plain HTTP request.
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops the notebook here on a 4xx/5xx reply
# rather than passing an error page on to the extraction steps.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [13]:
# Wrap the text of the HTTP response body in a langchain Document.
doc = Document(page_content=response.text)

In [15]:
# Run the document through kor's MarkdownifyHTMLProcessor; the result is
# treated from here on as the markdown form of the page.
md = MarkdownifyHTMLProcessor().process(doc)

In [16]:
# Split the processed document into chunks, using RecursiveCharacterTextSplitter
# with its default settings.
split_docs = RecursiveCharacterTextSplitter().split_documents([md])

In [18]:
# Print the text of the last chunk to inspect what the splitter produced.
print(split_docs[-1].page_content)

Join The Newsletter

Get the freshest reviews, news, and more delivered right to your inbox!

[Join The Newsletter](https://optout.services.fandango.com/rottentomatoes)

Follow Us

* 
* 
* 
* 
* 

Copyright © Fandango. All rights reserved.

[Join Newsletter](https://optout.services.fandango.com/rottentomatoes)
* [Privacy Policy](//www.fandango.com/policies/privacy-policy)
* [Terms and Policies](//www.fandango.com/policies/terms-and-policies)
* [Cookie Settings](javascript:void(0))
* [California Notice](//www.fandango.com/californianotice)
* [Ad Choices](//www.fandango.com/policies/cookies-and-tracking#cookie_management)
* 
* [Accessibility](/faq#accessibility)

* V3.1
* [Privacy Policy](//www.fandango.com/policies/privacy-policy)
* [Terms and Policies](//www.fandango.com/policies/terms-and-policies)
* [Cookie Settings](javascript:void(0))
* [California Notice](//www.fandango.com/californianotice)
* [Ad Choices](//www.fandango.com/policies/cookies-and-tracking#cookie_management)
* [Acce

In [19]:
# Number of chunks that will be passed to the extraction chain.
len(split_docs)

4

In [20]:
from langchain.callbacks import get_openai_callback

In [21]:
# Run the extraction chain over all chunks inside get_openai_callback(), then
# print the token and cost counters exposed by the callback object.
# NOTE: the top-level `await` relies on the notebook's running event loop.
# NOTE(review): return_exceptions=True presumably means a failing chunk is
# returned as an exception object in the results instead of aborting the whole
# run -- confirm against kor's documentation.
with get_openai_callback() as cb:
    document_extraction_results = await extract_from_documents(
        chain, split_docs, max_concurrency=5, use_uid=False, return_exceptions=True
    )
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 5633
Prompt Tokens: 4804
Completion Tokens: 829
Successful Requests: 4
Total Cost (USD): $0.011265999999999998


In [22]:
# The extraction was requested with return_exceptions=True, so a chunk whose
# extraction failed may appear in the results as an exception object rather
# than a result mapping. Indexing such an object with ["validated_data"] would
# raise, so failures are separated out and reported instead.
failed_extractions = [
    outcome
    for outcome in document_extraction_results
    if isinstance(outcome, BaseException)
]
if failed_extractions:
    print(f"Skipping {len(failed_extractions)} chunk(s) whose extraction failed")

# Flatten the per-chunk lists of validated records into a single list.
validated_data = list(
    itertools.chain.from_iterable(
        extraction["validated_data"]
        for extraction in document_extraction_results
        if not isinstance(extraction, BaseException)
    )
)

In [23]:
# Total number of validated records gathered across all chunks.
len(validated_data)

46

In [24]:
# Convert each validated record to a plain dict and tabulate the records;
# as the cell's last expression, the frame is rendered as the cell output.
pd.DataFrame([entry.dict() for entry in validated_data])

Unnamed: 0,name,season,year,latest_episode,link
0,"XO, Kitty",1.0,,,/tv/xo_kitty/s01
1,Fear the Walking Dead,8.0,,,/tv/fear_the_walking_dead/s08
2,High Desert,1.0,,,/tv/high_desert/s01
3,SisterS,1.0,2023.0,,/tv/sisters_2023/s01
4,Primo,1.0,,,/tv/primo/s01
5,Working: What We Do All Day,1.0,,,/tv/working_what_we_do_all_day/s01
6,Rainn Wilson and the Geography of Bliss,1.0,,,/tv/rainn_wilson_and_the_geography_of_bliss/s01
7,The Secrets of Hillsong,1.0,,,/tv/the_secrets_of_hillsong/s01
8,Queen Cleopatra,1.0,,,/tv/queen_cleopatra/s01
9,From,2.0,,,/tv/from/s02
